# Imports

In [13]:
import urllib.request
from bs4 import BeautifulSoup
import logging
from transformers import pipeline

# Data scraping and summarization

In [14]:
# Al jazeera Europe first page
req = urllib.request.Request('https://www.aljazeera.com/news/')
page = urllib.request.urlopen(req)
soup = BeautifulSoup(page.read(), 'html.parser')

# Get links into a list
link = []
raw_links = soup.find_all('a', class_='u-clickable-card__link')
for raw_link in raw_links:
    link.append('https://www.aljazeera.com' + raw_link.get('href'))

In [15]:
# Retrieve links from Al Jazeera main page
urls = link
amount_of_links = len(urls)
link_number = 0
containers = []

while link_number < amount_of_links:
    # Article load
    req = urllib.request.Request(urls[link_number])
    page = urllib.request.urlopen(req)
    soup = BeautifulSoup(page.read(), 'html.parser')

    # Title of the article
    title = soup.find_all(['h1'])
    title = title[0].text

    # Main image and alt text of the article
    img = soup.find('img', {'fetchpriority': 'high'})

    # In case image or alt text is not found
    try:
        image_text = img['alt']
    except:
        image_text = ''
    try:
        image_url = 'https://www.aljazeera.com' + img['src']
    except:
        image_url = ''

    # Text of the article
    results = soup.find_all(['p', 'li'])
    text = [results.text for results in results]
    article = ' '.join(text)
    article = article.replace('.', '.<end>')
    article = article.replace('!', '?<end>')
    article = article.replace('?', '!<end>')
    sentences = article.split('<end>')
    del sentences[-1]  # Last sentence is not useful

    # Splitting sentences into chunks
    max_chunk = 500
    chunks = []
    current_chunk = 0

    # Split sentences into chunks of words with a maximum length of `max_chunk`
    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            # Extend the current chunk if it can accommodate the sentence
            if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                chunks[current_chunk].extend(sentence.split(' '))
            else:
                # Move to the next chunk if the current one is full
                current_chunk += 1
                chunks.append(sentence.split(' '))
        else:
            chunks.append(sentence.split(' '))
    # Join the words in each chunk back into sentences
    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = ' '.join(chunks[chunk_id])

    # Disable logging
    logging.getLogger("transformers").setLevel(logging.ERROR)

    # Summarize text in chunks
    summarizer = pipeline('summarization')
    sentence_res = summarizer(chunks, max_length=100, min_length=20, do_sample=False)
    all_sentences_summ = ' '.join([summ['summary_text'] for summ in sentence_res])
    all_sentences_summ = all_sentences_summ.replace(' .', '.')

    # Summarize alt text
    image_text_res = summarizer(image_text, max_length=40, min_length=5, do_sample=False)
    image_text_summ = ' '.join([summ['summary_text'] for summ in image_text_res])
    image_text_summ = image_text_summ.replace(' .', '.')

    containers.append({
            'text': title,                         # Title of the article
            'link': urls[link_number],             # Link to the title
            'image_url': image_url,                # Link to the main image
            'image_text': image_text_summ,         # Summarized alt text of the image
            'all_sentences': all_sentences_summ    # Summarized text
        })
    link_number += 1

In [16]:
# Contence of first contaianer
containers[0]

{'text': 'UN chief calls for international force in Haiti during visit',
 'link': 'https://www.aljazeera.com/news/2023/7/1/un-chief-calls-for-international-force-in-haiti-during-visit',
 'image_url': '',
 'image_text': ' CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots of our featured destinations for next week. Visit CNN iReport.com/',
 'all_sentences': " Antonio Guterres calls for international 'rapid action force' to help Haitian police tackle spiralling gang violence. The world body's leader has raised the alarm about the situation in the Western Hemisphere’s poorest country, which has been racked by gang violence, a worsening public health situation, and political instability.  The UN refugee agency said some 73,500 people fled Haiti last year. The United Nations has said 5.2 million – nearly half Haiti’s population – need humanitarian assistance in 2023."}

# Visualization in a HTML template

In [17]:
from jinja2 import Template
from IPython.display import display, HTML

In [18]:
template = Template("""
<html>
<head>
    <title>Al Jazeera Summarizer</title>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css">
    <style>
        body {
            background-color: #222;
            color: white;
        }
        .container {
            margin: 50px auto;
            max-width: 800px;
        }
        h1 {
            font-size: 24px;
            font-weight: bold;
        }
        .img-fluid {
            max-width: 100%;
            margin-top: 20px;
        }
        .image-text {
            color: #999;
            font-size: 18px;
        }
        .all-sentences {
            font-size: 20px;
            margin-top: 20px;
        }
        .small-link {
            font-size: 14px;
            color: blue;
        }
    </style>
</head>
<body>
    <div class="container text-center">
        {% for container in containers %}
        <div class="row">
            <div class="col-md-12 text-center">
                <h1>{{ container.text }}</h1>
            </div>
            <div class="col-md-12 text-center">
                <h5><a href="{{ container.link }}" class="small-link">{{ container.link }}</a></h5>
            </div>
        </div>
        <div class="row">
            <div class="col-md-12 text-center">
                <img src="{{ container.image_url }}" class="img-fluid">
                <p class="image-text">{{ container.image_text }}</p>
            </div>
        </div>
        <div class="row">
            <div class="col-md-12">
                <p class="all-sentences">{{ container.all_sentences }}</p>
            </div>
        </div>
        <hr>
        {% endfor %}
    </div>
</body>
</html>
""")

In [20]:
html_code = template.render(containers=containers)
display(HTML(html_code))