# Topic Modeling on News Articles


## Objective
### Automatically detect topics in a collection of news articles by analyzing the text and identifying common themes.

### Step 1: Data Collection : News Dataset

In [19]:
# Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
import pyLDAvis.gensim_models
import pyLDAvis
import matplotlib.pyplot as plt
import requests
import json
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import CoherenceModel



In [20]:
# Google News API
API_KEY = '019c362ec11d44fb91aa17a4b10c2d86'

# Fetch news articles from News API
def fetch_news_articles(query, num_articles=5):
    url = 'https://newsapi.org/v2/everything'
    params = {
        'q': query,
        'pageSize': num_articles,  # Number of articles to retrieve
        'apiKey': API_KEY,
        'language': 'en',  # Language of articles
    }

    response = requests.get(url, params=params)
    data = response.json()
    
    # Extract article content
    articles = [article['description'] for article in data['articles'] if article['description']]
    return articles


## Step 2: Text Preprocessing

In [21]:
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim import corpora

# Ensure required NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords, stemmer, and lemmatizer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())  # Tokenization
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]  # Stopword removal
    
    # Apply stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

# Example usage
query = 'machine learning'
articles = fetch_news_articles(query)  # Ensure articles are fetched
processed_articles = [preprocess(article) for article in articles]

# Create a dictionary that maps each word to a unique id
dictionary = corpora.Dictionary(processed_articles)

# Create a corpus: Bag of Words format for each document
corpus = [dictionary.doc2bow(article) for article in processed_articles]

print("Processed Articles:", processed_articles)
print("Dictionary:", dictionary.token2id)
print("Corpus:", corpus)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\artha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\artha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\artha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processed Articles: [['scroll', 'found', 'shadow', 'vesuviu', 'librari', 'ancient', 'text', 'besid', 'illumin', 'machin', 'learn', 'comput', 'vision'], ['despit', 'limit', 'maker', 'mariovgg', 'think', 'ai', 'video', 'could', 'one', 'day', 'replac', 'game', 'engin'], ['upon', 'time', 'machin', 'learn', 'arcan', 'field', 'preserv', 'preciou', 'research', 'hole', 'grand', 'academ', 'institut', 'progress', 'slow', 'hard', 'today', 'howev'], ['ai', 'learn', 'play', 'simul', 'environ'], ['initi', 'iphon', 'iphon', 'pro', 'use', 'task']]
Dictionary: {'ancient': 0, 'besid': 1, 'comput': 2, 'found': 3, 'illumin': 4, 'learn': 5, 'librari': 6, 'machin': 7, 'scroll': 8, 'shadow': 9, 'text': 10, 'vesuviu': 11, 'vision': 12, 'ai': 13, 'could': 14, 'day': 15, 'despit': 16, 'engin': 17, 'game': 18, 'limit': 19, 'maker': 20, 'mariovgg': 21, 'one': 22, 'replac': 23, 'think': 24, 'video': 25, 'academ': 26, 'arcan': 27, 'field': 28, 'grand': 29, 'hard': 30, 'hole': 31, 'howev': 32, 'institut': 33, 'preci

## Step 3: a) LDA Model Training

In [22]:
# Build LDA Model
lda_model = gensim.models.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)

# Print topics found by the model
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


(0, '0.065*"iphon" + 0.038*"ai" + 0.037*"task" + 0.037*"pro" + 0.037*"use"')
(1, '0.038*"machin" + 0.038*"learn" + 0.038*"today" + 0.038*"research" + 0.038*"hole"')
(2, '0.067*"learn" + 0.038*"found" + 0.038*"text" + 0.038*"vesuviu" + 0.038*"librari"')


## Step 3: b) Word Embeddings with Word2Vec

In [23]:
from gensim.models import Word2Vec

# Train Word2Vec on the preprocessed articles
word2vec_model = Word2Vec(sentences=processed_articles, vector_size=100, window=5, min_count=1, workers=4)


# Get all words in the vocabulary
vocabulary = list(word2vec_model.wv.index_to_key)

# Pick a word from the vocabulary to check similar words
print(vocabulary[:10])  # Look at the first 10 words in the vocabulary

# Example: Pick a word that exists in the vocabulary
similar_words = word2vec_model.wv.most_similar(vocabulary[0], topn=5)
print(f"Most similar words to '{vocabulary[0]}': {similar_words}")


['learn', 'iphon', 'ai', 'machin', 'vision', 'one', 'could', 'video', 'think', 'mariovgg']
Most similar words to 'learn': [('illumin', 0.218916654586792), ('mariovgg', 0.21620631217956543), ('pro', 0.19549766182899475), ('preciou', 0.16923967003822327), ('environ', 0.1518188714981079)]


## Step 4: Topics Visualization

In [24]:
# Prepare for visualization
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Display the visualization in the notebook
pyLDAvis.display(lda_display)

# Optionally, save the visualization to an HTML file
pyLDAvis.save_html(lda_display, 'lda_visualization.html')


## Step 5: Coherence Score Calculation

In [25]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_articles, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')


Coherence Score: 0.5330683838522655
