<a href="https://colab.research.google.com/github/aliceczr/guardian_LDA/blob/main/Guardian_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing dependecies

In [1]:
!pip install gensim # Library for topic modelling
!pip install nltk  # Library for NLP
!pip install spacy # Library for NLP
!pip install pyLDAvis # Library for implementing LDA


Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [2]:
import requests
import gensim
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel
import nltk
import spacy
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
import pyLDAvis
import pyLDAvis.gensim_models
from string import punctuation
from spacy.lang.en import stop_words
from gensim.models import Phrases
from gensim.models.phrases import Phraser

## Data extraction and cleaning

In this part we are going to be using The Guardian API to get the data we will inject in the LDA model
 - First thing is to visit the website to get your api key
  - You can get your key here: https://open-platform.theguardian.com/


In [3]:
api_key = 'c79b24f9-27cd-46bc-ab7e-e1b098fbb95a'
page_size = 50
total_articles_needed = 200
url_template = 'https://content.guardianapis.com/search?q=technology&api-key={api_key}&show-fields=all&page={page}&page-size={page_size}'

articles_list = []
page = 1

while len(articles_list) < total_articles_needed:
    url = url_template.format(api_key=api_key, page=page, page_size=page_size)
    response = requests.get(url)
    data = response.json()

    for article in data['response']['results']:
        articles_list.append(article['fields']['body'])

        if len(articles_list) >= total_articles_needed:
            break

    page += 1




  and should_run_async(code)


Since the extraction process gives us text with HTML tags, these tags are treated as words to the model, which can result in poorly defined topics. To address this, we need to use the bs4 library with BeautifulSoup to obtain plain text, ensuring that only meaningful words are used for topic modeling.

In [61]:
cleaned_articles = []
bad_chars = [";", ",", ":", '!', "*", ',', '–', '{', '}', '(', ')', '[', ']', "'", '”', '.','£', '’','-','“',"say","say","$"]

for article in articles_list:
    soup = BeautifulSoup(article, 'html.parser')
    text = soup.get_text()
    cleaned_text = ''.join(i for i in text if i not in bad_chars)
    cleaned_articles.append(cleaned_text)


  and should_run_async(code)


In [31]:
nltk.download('stopwords')
nlp = spacy.load('en_core_web_sm')

  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
docs = [nlp(doc) for doc in cleaned_articles] # Pre processing the data

# Stemming and stopword removal
processed_docs = []
for doc in docs:
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    processed_docs.append(tokens)

  and should_run_async(code)


In [33]:
id2word = corpora.Dictionary(processed_docs)
corpus = [id2word.doc2bow(doc) for doc in processed_docs]

  and should_run_async(code)


In [56]:
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=5, random_state=100, update_every=1, chunksize=10, passes=10, alpha='auto', per_word_topics=True)



  and should_run_async(code)


In [57]:
for idx, topic in lda_model.print_topics(-1):
    print(f'Tópico: {idx} \nPalavras: {topic}')

Tópico: 0 
Palavras: 0.018*"say" + 0.015*"
" + 0.015*" " + 0.009*"government" + 0.007*"service" + 0.007*"company" + 0.005*"UK" + 0.005*"outage" + 0.005*"library" + 0.005*"system"
Tópico: 1 
Palavras: 0.014*"say" + 0.011*"technology" + 0.011*"not" + 0.011*"people" + 0.008*" " + 0.006*"time" + 0.006*"s" + 0.006*"use" + 0.005*"way" + 0.005*"new"
Tópico: 2 
Palavras: 0.029*"AI" + 0.012*"company" + 0.010*" " + 0.009*"technology" + 0.007*"say" + 0.007*"model" + 0.006*"human" + 0.006*"tech" + 0.005*"  " + 0.005*"Trump"
Tópico: 3 
Palavras: 0.007*"song" + 0.007*"light" + 0.007*"art" + 0.006*"insect" + 0.006*"Kowalkiewicz" + 0.006*"Lenker" + 0.006*"artist" + 0.005*"film" + 0.005*"movie" + 0.005*"music"
Tópico: 4 
Palavras: 0.017*" " + 0.016*"say" + 0.006*"year" + 0.005*"need" + 0.005*"project" + 0.005*"work" + 0.005*"home" + 0.004*"heat" + 0.004*"price" + 0.004*"cost"


  and should_run_async(code)


In [58]:
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_vis)

  and should_run_async(code)


In [59]:
from gensim.models import CoherenceModel

# Calcula a perplexidade do modelo (quanto menor, melhor)
perplexity = lda_model.log_perplexity(corpus)
print(f'Perplexidade do modelo: {perplexity}')

# Calcula a coerência do modelo
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coerência do modelo: {coherence_lda}')


  and should_run_async(code)


Perplexidade do modelo: -8.67159672470842
Coerência do modelo: 0.39907535795079485
