Data structure (fields stored for each scraped article)
- Title
- Link
- Abstract
- Date
- Content

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
from tqdm import tqdm
from enchant.checker import SpellChecker
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.graph_objects as go

# Collect Data

In [3]:
# Fetch the Mozambique listing page; title, link, abstract and date are parsed from it below.
# A timeout is passed because requests otherwise waits indefinitely on a stalled connection.
r = requests.get("https://www.aljazeera.com/where/mozambique/", timeout=30)
# Fail here on an HTTP error status instead of silently parsing an error page later.
r.raise_for_status()

In [4]:
soup = BeautifulSoup(r.content, "html.parser")
host = "https://www.aljazeera.com"

# Only the first ten article cards on the listing page are kept.
articles = soup.find_all(class_="gc__content")[:10]

results = []
for card in articles:
    # The anchor inside the title wrapper carries both the headline text and the relative URL.
    heading_link = card.find(class_="gc__title").a
    record = {
        # "\xad" (soft hyphen) is removed from the headline and the excerpt.
        "title": heading_link.span.string.replace("\xad", ""),
        "link": host + heading_link["href"],
        "abstract": card.find(class_="gc__excerpt").p.string.replace("\xad", ""),
        # The date text is taken from the second <span> inside the date element.
        "date": card.find(class_="gc__date__date").find_all("span")[1].string,
    }
    results.append(record)


In [72]:
# Download every article page and store its paragraph text under result["content"].
for result in results:
    # Distinct names keep the listing-page `r` / `soup` intact, so the parsing cell
    # above can be re-run without accidentally parsing an article page.
    # A timeout is passed because requests otherwise waits indefinitely.
    article_response = requests.get(result["link"], timeout=30)
    # Fail on an HTTP error status instead of parsing an error page.
    article_response.raise_for_status()
    article_soup = BeautifulSoup(article_response.content, "html.parser")

    # Paragraphs are read from the first element inside the main content area whose
    # class matches "all-content". A raw string without the stray backslash avoids
    # the invalid "\-" escape sequence; the pattern matches the same text.
    paragraphs = (
        article_soup.find("main", id="main-content-area")
        .find(class_=re.compile(r"all-content"))
        .find_all("p")
    )

    # Each direct child of a <p> that has a single string is stripped and appended
    # with a leading space (so the content starts with a space, as before).
    result["content"] = "".join(
        " " + node.string.strip()
        for paragraph in paragraphs
        for node in paragraph.contents
        if node.string
    )


# Data Preprocessing

In [29]:
# An article is treated as English when at most 10% of its words are flagged
# as misspelled by the en_US spell checker.
max_error_rate = 0.1

def is_in_english(quote):
    """Return True when `quote` looks like English text.

    The text is run through an en_US SpellChecker and the number of flagged
    words is compared with `max_error_rate` times the number of
    whitespace-separated words in `quote`.

    Args:
        quote: The text to check.

    Returns:
        True if the flagged-word count does not exceed the allowed share of
        words, False otherwise. Text with no words has nothing flagged and
        therefore returns True.
    """
    checker = SpellChecker("en_US")
    checker.set_text(quote)
    # Iterating the checker yields one item per flagged word.
    error_count = sum(1 for _ in checker)
    # Compare against the word count; len(quote) would count characters and
    # make the 10% threshold almost impossible to exceed.
    word_count = len(quote.split())
    return error_count <= max_error_rate * word_count

# remove non-English articles
print("Check if articles are in English:")
# Build a new list instead of calling results.remove() inside the loop:
# removing from a list while iterating over it skips the following element.
results = [result for result in tqdm(results) if is_in_english(result["content"])]

Check if articles are in English:


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 13.96it/s]


# Sentiment Analysis

In [34]:
# nltk.download('vader_lexicon')  # uncomment on the first run to fetch the VADER lexicon

# Sanity check: print every score VADER returns for the first article's content
# on a single line, as "name:value," pairs with no trailing newline.
sia = SentimentIntensityAnalyzer()
ss = sia.polarity_scores(results[0]['content'])
print(''.join(f'{name}:{score},' for name, score in ss.items()), end='')


neg:0.106,neu:0.827,pos:0.067,compound:-0.9332,

In [38]:
# Attach the VADER compound score of the title, abstract and content to every article.
sia = SentimentIntensityAnalyzer()
for result in results:
    for score_key, text_key in (
        ("sentiment_title", "title"),
        ("sentiment_abstract", "abstract"),
        ("sentiment_content", "content"),
    ):
        result[score_key] = sia.polarity_scores(result[text_key])["compound"]

In [45]:
# Serialise the scored articles to a JSON file next to the notebook.
with open("10_most_recent_articles.json", 'w') as out_file:
    out_file.write(json.dumps(results))

# Visualize

In [55]:
# To plot from the saved file instead of the in-memory list, uncomment:
# with open("10_most_recent_articles.json") as f:
#     results = json.load(f)

data = [entry["sentiment_content"] for entry in results]
total = len(results)

# Bins of width 0.1 spanning -1.0 to 1.0.
histogram = go.Histogram(x=data, xbins={"start": -1.0, "end": 1.0, "size": 0.1})
fig = go.Figure(data=[histogram])

# update_layout returns the figure, so leaving it as the last expression renders the plot.
fig.update_layout(
    title_text=f"Sentiment Distribution of {total} Most Recent Articles",
    xaxis_title_text='Sentiment',
    yaxis_title_text='Count',
)
