In [17]:
import pandas as pd
import numpy as np


In [18]:
#load the datast
stories=pd.read_csv("stories.csv")
stories

Unnamed: 0,body,topic
0,,['39822b5f-e37e-43e8-b997-7142fe55c3ea']
1,,['0d817400-3f5d-41e0-929c-c31fdbe75d31']
2,,['83a09c6b-5f2f-421f-ae50-b38acca7e008']
3,,['6fbf954a-03f9-4782-a65f-783271c9c447']
4,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5..."
...,...,...
5176,News. More local help will soon be on the way....,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9632..."
5177,"with March 1, we start what is called Meteorol...",['9a06646a-e1df-4fca-888e-69658420556b']
5178,overseas. A massive Russian convoy is headed t...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
5179,"And this morning, the National Hockey League s...","['9ff54ded-904b-4e0c-85ce-a3617f5cb913', 'b492..."


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
import nltk
import spacy

In [20]:
# download the nltk stopwords and spacy english language model
nltk.download('stopwords')
nlp = spacy.load('en_core_web_sm')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
#prepare the data and get the vectorizer
def preprocessing_step(text):
  custom_stop_words = ['and', 'the', 'in', 'of', 'to', 'a', 'is', 'that','an','but'] 
  # Combine the NLTK stopwords with the custom stop words
  stop_words = set(stopwords.words('english')).union(set(custom_stop_words))
  vectorizer = CountVectorizer(stop_words=list(stop_words))
  X = vectorizer.fit_transform(text)
  return X,vectorizer

X,vectorizer= preprocessing_step(stories['body'])





In [22]:
# get the LatentDirichletAllocation model and train it 
from sklearn.decomposition import LatentDirichletAllocation

# number of topics to extract
n_topics = 3 
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_model.fit(X)

In [23]:
pip install --upgrade scikit-learn


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
# get the top words for each topic
n_top_words = 5
feature_names = vectorizer.get_feature_names_out()

# function to get the top words for a given topic
def get_top_words(topic):

  return [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
     

# passing the topics to the dataframe
stories['topic_text'] = None

for i, row in stories.iterrows():
    row_text = row['body']
    vectorized_text = vectorizer.transform([row_text])
    topic_dist = lda_model.transform(vectorized_text)[0]
    top_topic = np.argmax(topic_dist)
    top_words = get_top_words(lda_model.components_[top_topic])
    stories.at[i, 'topic_text'] = ', '.join(top_words)


stories


Unnamed: 0,body,topic,topic_text
0,,['39822b5f-e37e-43e8-b997-7142fe55c3ea'],"ukraine, president, us, russia, today"
1,,['0d817400-3f5d-41e0-929c-c31fdbe75d31'],"ukraine, president, us, russia, today"
2,,['83a09c6b-5f2f-421f-ae50-b38acca7e008'],"ukraine, president, us, russia, today"
3,,['6fbf954a-03f9-4782-a65f-783271c9c447'],"ukraine, president, us, russia, today"
4,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5...","new, police, say, city, year"
...,...,...,...
5176,News. More local help will soon be on the way....,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9632...","ukraine, president, us, russia, today"
5177,"with March 1, we start what is called Meteorol...",['9a06646a-e1df-4fca-888e-69658420556b'],"going, morning, school, today, right"
5178,overseas. A massive Russian convoy is headed t...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913'],"ukraine, president, us, russia, today"
5179,"And this morning, the National Hockey League s...","['9ff54ded-904b-4e0c-85ce-a3617f5cb913', 'b492...","ukraine, president, us, russia, today"


In [25]:
# save the model 
import pickle

pickle.dump(lda_model, open('trained_model.pkl', 'wb'))



In [26]:
#get the test data from task1
test_data=pd.read_csv("news.csv")
test_data['body']

0     Well knew. This morning police need your help ...
1     a call. San Francisco firefighters rescued a m...
2     Paul. Meanwhile, the state set a record in ene...
3     Emergency crews in Florida continue to search ...
4     But even though the state never ordered rollin...
5     aid. And today, president Joe Biden and first ...
6     In the last month, there have been numerous da...
7     and the warriors are playing the Boston Celtic...
8     And San Leandro police searching for the perso...
9     The updated Bivalent Coronavirus booster shot ...
10    temperatures tonight, only back down to the un...
11    And National Guard teams are also on the groun...
12    A man died after falling from an escalator aft...
13    All right. Well happening today. Friends and f...
14    detectives are searching for six people they s...
15    San Jose police say a security guard trying to...
16    San Jose police just announcing today an arres...
17    shooting. Surveillance video shows how a c

In [27]:
#passing the test data to preprocessing_step function
X_test,test_vectorizer= preprocessing_step(test_data['body'])

In [28]:
# loading the saved model
loaded_model = pickle.load(open('trained_model.pkl', 'rb'))
loaded_model.fit(X_test)


In [29]:
### passing topics to dataset

n_top_words = 5
feature_names = test_vectorizer.get_feature_names_out()

test_data['topic_text'] = None

for i, row in test_data.iterrows():
    row_text = row['body']
    vectorized_text = test_vectorizer.transform([row_text])
    topic_dist = loaded_model.transform(vectorized_text)[0]
    top_topic = np.argmax(topic_dist)
    top_words = get_top_words(loaded_model.components_[top_topic])
    test_data.at[i, 'topic_text'] = ', '.join(top_words)


test_data


Unnamed: 0,first_words,last_words,source_video_id,body,start,end,topic_text
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell ph...",18246,Well knew. This morning police need your help ...,464928,504300,"say, power, police, guard, blackouts"
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean. ocean.,12387,a call. San Francisco firefighters rescued a m...,359020,0,"like, two, years, game, games"
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conserva...",16859,"Paul. Meanwhile, the state set a record in ene...",60704,100410,"russo, last, police, also, custody"
3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246,Emergency crews in Florida continue to search ...,505290,534518,"like, two, years, game, games"
4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859,But even though the state never ordered rollin...,100910,283306,"say, power, police, guard, blackouts"
5,"aid. And today, president Joe Biden and first",to view the destruction caused by Hurricane Ian.,18246,"aid. And today, president Joe Biden and first ...",546494,592450,"like, two, years, game, games"
6,"In the last month, there have been numerous",are necessary to crack down on those hackers.,18246,"In the last month, there have been numerous da...",614910,699230,"like, two, years, game, games"
7,and the warriors are playing the Boston Celtics,that. We'll see if they get it tonight.,12387,and the warriors are playing the Boston Celtic...,419994,649122,"like, two, years, game, games"
8,And San Leandro police searching for the person,footage to try to piece together more informat...,16859,And San Leandro police searching for the perso...,578612,618980,"like, two, years, game, games"
9,The updated Bivalent Coronavirus booster shot ...,on their vaccinations getting severe illness f...,16859,The updated Bivalent Coronavirus booster shot ...,619310,659730,"like, two, years, game, games"


In [30]:
###converting the topic column to UUID 
import uuid


test_data['uuid_topic'] = None

# Iterating over the rows and producing UUIDs for each topic
for a, row in test_data.iterrows():
    text_topic = row['topic_text']
    generated_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, text_topic)
    test_data.at[a, 'uuid_topic'] = str(generated_uuid)

test_data

Unnamed: 0,first_words,last_words,source_video_id,body,start,end,topic_text,uuid_topic
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell ph...",18246,Well knew. This morning police need your help ...,464928,504300,"say, power, police, guard, blackouts",5f08968c-e229-598a-aba5-daaf4a2d920a
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean. ocean.,12387,a call. San Francisco firefighters rescued a m...,359020,0,"like, two, years, game, games",bcbeb58a-3bed-50a9-a5e4-7a342c3c74c3
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conserva...",16859,"Paul. Meanwhile, the state set a record in ene...",60704,100410,"russo, last, police, also, custody",7c1c33b3-ec59-5aa6-9312-944d96095276
3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246,Emergency crews in Florida continue to search ...,505290,534518,"like, two, years, game, games",bcbeb58a-3bed-50a9-a5e4-7a342c3c74c3
4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859,But even though the state never ordered rollin...,100910,283306,"say, power, police, guard, blackouts",5f08968c-e229-598a-aba5-daaf4a2d920a
5,"aid. And today, president Joe Biden and first",to view the destruction caused by Hurricane Ian.,18246,"aid. And today, president Joe Biden and first ...",546494,592450,"like, two, years, game, games",bcbeb58a-3bed-50a9-a5e4-7a342c3c74c3
6,"In the last month, there have been numerous",are necessary to crack down on those hackers.,18246,"In the last month, there have been numerous da...",614910,699230,"like, two, years, game, games",bcbeb58a-3bed-50a9-a5e4-7a342c3c74c3
7,and the warriors are playing the Boston Celtics,that. We'll see if they get it tonight.,12387,and the warriors are playing the Boston Celtic...,419994,649122,"like, two, years, game, games",bcbeb58a-3bed-50a9-a5e4-7a342c3c74c3
8,And San Leandro police searching for the person,footage to try to piece together more informat...,16859,And San Leandro police searching for the perso...,578612,618980,"like, two, years, game, games",bcbeb58a-3bed-50a9-a5e4-7a342c3c74c3
9,The updated Bivalent Coronavirus booster shot ...,on their vaccinations getting severe illness f...,16859,The updated Bivalent Coronavirus booster shot ...,619310,659730,"like, two, years, game, games",bcbeb58a-3bed-50a9-a5e4-7a342c3c74c3


In [31]:
#save the results in a new file
test_data.to_csv("finaldataset.csv",index=False)