# 1. Download and Configure Modules

In [176]:
# Al Jazeera listing page for Mozambique; this is the page opened with wd.get(url)
# further down to collect the article cards.
url = "https://www.aljazeera.com/where/mozambique/"

In [177]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install html5print
%pip install flair
%pip install -U kaleido
%pip install selenium
# Install the Chromium driver package; -y keeps apt from waiting on a confirmation prompt.
!apt-get update
!apt-get install -y chromium-chromedriver
# Copy the driver into a directory on PATH so Selenium can launch it by name.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:4 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:6 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu bioni

In [178]:
import sys
# NOTE(review): sys.path is Python's module search path, so adding the chromedriver
# binary here does not by itself make the executable discoverable. The install cell
# also copies the driver into /usr/bin, which is presumably what Selenium relies on
# -- confirm whether this line is still needed.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [179]:
from selenium import webdriver
from bs4 import BeautifulSoup

# The listing page is rendered dynamically, so a real (headless) browser is driven
# with Selenium instead of fetching the HTML with a plain HTTP request.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')               # run without a display
chrome_options.add_argument('--no-sandbox')             # commonly required when Chrome runs as root in a container
chrome_options.add_argument('--disable-dev-shm-usage')  # avoid crashes when /dev/shm is small
# Pass only `options`. The positional driver-path argument is deprecated in Selenium 4
# and rejected by newer releases; the driver is already on PATH (copied to /usr/bin in
# the install cell), so Selenium finds it by its default name.
wd = webdriver.Chrome(options=chrome_options)

# 2. Web Scraping



## Get basic info



In [180]:
def get_info_from_div(div_soup):
  """Extract the summary fields of one article card from the listing page.

  Args:
    div_soup: parsed element (BeautifulSoup tag) for a single article card.

  Returns:
    dict with the keys:
      "title"        - headline text with soft hyphens removed
      "excerpt"      - summary text, or "" when the card has no excerpt
      "img"          - absolute image URL, or None when the card has no image
      "publish_date" - date text from the card, or "" when it is absent
      "article"      - absolute URL of the full article

  Raises:
    AttributeError: if the card has no title heading or no link inside it;
      a card without a link cannot be followed, so this is not masked.
  """
  from urllib.parse import urljoin

  base_url = "https://www.aljazeera.com"

  def strip_soft_hyphens(text):
    # The site inserts soft hyphens (U+00AD) as invisible line-break hints.
    return text.replace('\xad', '')

  article_info = {}

  # 1. Get title (fall back to the link's own text if there is no inner <span>)
  title_link = div_soup.find('h3', class_="gc__title").a
  title_span = title_link.span
  title_node = title_span if title_span is not None else title_link
  article_info["title"] = strip_soft_hyphens(title_node.text)

  # 2. Get summary (optional)
  excerpt_div = div_soup.find('div', class_="gc__excerpt")
  excerpt_p = excerpt_div.p if excerpt_div is not None else None
  article_info["excerpt"] = strip_soft_hyphens(excerpt_p.text) if excerpt_p is not None else ""

  # 3. Get image url (optional). urljoin leaves already-absolute URLs untouched,
  #    whereas plain concatenation would produce a broken address for them.
  img_tag = div_soup.find('img')
  img_src = img_tag.get('src') if img_tag is not None else None
  article_info["img"] = urljoin(base_url, img_src) if img_src else None

  # 4. Get publish time (optional); the date is the last <span> of the date block
  date_div = div_soup.find('div', class_="gc__date__date")
  date_spans = date_div.find_all('span') if date_div is not None else []
  article_info["publish_date"] = strip_soft_hyphens(date_spans[-1].text) if date_spans else ""

  # 5. Get article url
  article_info["article"] = urljoin(base_url, title_link['href'])

  return article_info

# Load the listing page in the headless browser and parse the rendered HTML.
wd.get(url)

source = wd.page_source
soup = BeautifulSoup(source, features='lxml')

# Article cards live in two containers: the featured list at the top of the page
# and the news feed below it. Either may be absent from the page.
recent_four_articles = soup.find('ul', class_="featured-articles-list")
recent_articles = soup.find('section', id="news-feed-container")

# Collect the card summaries, featured articles first, then the feed.
recent_ten_articles = []
for card_container in (recent_four_articles, recent_articles):
  if not card_container:
    continue
  recent_ten_articles.extend(
    get_info_from_div(card_soup) for card_soup in card_container.find_all('article')
  )


## Get full content

In [181]:
from tqdm import tqdm

# Visit every article collected from the listing page and attach its body text.
# Each dict from recent_ten_articles is extended in place with a 'full_content' key;
# only articles whose body could be located end up in final_article_results.
final_article_results = []
for recent_ten_article in tqdm(recent_ten_articles):
  wd.get(recent_ten_article['article'])
  article_soup = BeautifulSoup(wd.page_source, features='lxml')

  # Select the body container by its two descriptive classes. Matching the full
  # class string (including the generated "css-..." class) is an exact string
  # comparison, so any change to that class makes the lookup miss and the
  # article would be dropped without notice.
  article_content_soup = article_soup.select_one('div.wysiwyg.wysiwyg--all-content')
  if article_content_soup is None:
    # Report the skip instead of silently losing the article.
    tqdm.write(f"No article body found, skipping: {recent_ten_article['article']}")
    continue

  # Only direct <p> children are body paragraphs; one paragraph per line.
  paragraphs = article_content_soup.find_all('p', recursive=False)
  recent_ten_article['full_content'] = ''.join(paragraph.text + '\n' for paragraph in paragraphs)

  final_article_results.append(recent_ten_article)
  

100%|██████████| 14/14 [01:01<00:00,  4.37s/it]


# Save and Read JSON

In [182]:
import json
# ensure_ascii=False writes non-ASCII characters (e.g. typographic quotes in titles)
# verbatim, so the file encoding is pinned to UTF-8 on both the write and the read
# instead of depending on the platform default.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(final_article_results, f, ensure_ascii=False)

# Reload from disk so the following cells work from the persisted data.
with open('data.json', 'r', encoding='utf-8') as f:
    data_from_json = json.load(f)

# Sentiment analysis

In [183]:
from flair.models import TextClassifier
from flair.data import Sentence
sia = TextClassifier.load('en-sentiment')
def flair_prediction(x):
    """Classify the sentiment of a text with the loaded flair model.

    Args:
        x: text to classify.

    Returns:
        A (label, confidence) tuple: ("pos", score) or ("neg", score) with the
        model confidence rounded to 4 decimals, or ("neu", 0) when the model
        produced no label or an unexpected one.
    """
    sentence = Sentence(x)
    sia.predict(sentence)

    # No prediction attached (presumably possible for empty text): report neutral
    # instead of failing on labels[0].
    if not sentence.labels:
        return "neu", 0
    label = sentence.labels[0]

    # Read the structured fields rather than parsing str(label), whose format is
    # not a stable interface. Rounding to 4 decimals keeps the precision that the
    # previously parsed string form carried.
    confidence = round(float(label.score), 4)
    if label.value == "POSITIVE":
        return "pos", confidence
    if label.value == "NEGATIVE":
        return "neg", confidence
    return "neu", 0

2022-06-09 06:45:37,832 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


In [184]:
# Annotate every reloaded article dict in place with the label ("sentiment") and
# confidence ("score") that flair_prediction returns for its full text.
for article in tqdm(data_from_json):
  article["sentiment"], article["score"]  = flair_prediction(article["full_content"])

100%|██████████| 13/13 [00:10<00:00,  1.21it/s]


# Data visualization

In [185]:
# Build the two parallel series for the plot while echoing each record.
score_list = []
date_list = []
for article in data_from_json:
  # Echo link, headline, "label score" and date, followed by a blank separator line.
  print(
    article['article'],
    article["title"],
    f'{article["sentiment"]} {article["score"]}',
    article['publish_date'],
    '',
    sep='\n',
  )

  # Negative articles are plotted below zero; everything else keeps its score.
  is_negative = article["sentiment"] == "neg"
  signed_score = 0-article["score"] if is_negative else article["score"]

  score_list.append(signed_score)
  date_list.append(article['publish_date'])

https://www.aljazeera.com/news/2022/5/23/floods-hit-south-africas-kwazulu-natal-province-again
Floods hit South Africa’s KwaZulu-Natal province again
neg 0.9898
23 May 2022

https://www.aljazeera.com/news/2022/3/18/mozambique-cyclone-gombe-death-toll-rises-to-53
Mozambique: Cyclone Gombe death toll rises to 53
neg 0.996
18 Mar 2022

https://www.aljazeera.com/news/2022/3/4/mozambique-announces-new-prime-minister-and-finance-minister
Mozambique announces new prime minister after cabinet reshuffle
neg 0.9038
4 Mar 2022

https://www.aljazeera.com/economy/2022/3/1/analysis-can-african-gas-replace-russian-supplies-to-europe
Analysis: Can African gas replace Russian supplies to Europe?
neg 0.9908
1 Mar 2022

https://www.aljazeera.com/news/2022/1/27/at-least-70-dead-from-tropical-storm-ana-in-southern-africa
Dozens dead from Tropical Storm Ana in southern Africa
neg 0.9945
27 Jan 2022

https://www.aljazeera.com/news/2022/1/12/southern-africa-bloc-sadc-extends-mozambique-mission
Southern Africa

In [186]:
# Show the series in the order they were built, then flip both lists in place
# so that the plot below receives them in the opposite order.
# NOTE(review): reverse() mutates the lists, so this cell is not idempotent --
# running it a second time without re-running the previous cell flips the order
# back. Re-run the previous cell first when re-executing.
print(score_list)
print(date_list)
date_list.reverse()
score_list.reverse()

[-0.9898, -0.996, -0.9038, -0.9908, -0.9945, 0.6893, -0.9069, -0.7817, -0.9964, -0.9748, -0.9716, -0.9977, -0.9997]
['23 May 2022', '18 Mar 2022', '4 Mar 2022', '1 Mar 2022', '27 Jan 2022', '12 Jan 2022', '24 Sep 2021', '8 Aug 2021', '10 Jul 2021', '23 Jun 2021', '20 Jun 2021', '9 Jun 2021', '14 May 2021']


In [187]:
import plotly.graph_objects as go

# Signed sentiment score per article, plotted in the order held by date_list/score_list.
fig = go.Figure([go.Scatter(x=date_list, y=score_list)])
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="Sentiment of recent Al Jazeera articles on Mozambique",
    xaxis_title="Publish date",
    yaxis_title="Signed sentiment score (negative < 0 < positive)",
)
fig.show()