# AI & ML Video Assignment

# Building a Web News Article Clustering System Latent Semantic Association

Perform topic modelling on the dataset using PLSA.
Dataset Link - https://www.kaggle.com/therohk/india-headlines-news-dataset
Output/explain the following steps in the code:
a) The vocabulary of words used.
b) Words document Co-occurrence matrix
c) Words under each topic and the score
For the given dataset, use the first 100,000 headlines and 20 topics

In [5]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF 
from sklearn.preprocessing import normalize 

In [6]:
#read the dataset

df = pd.read_csv("data.csv",error_bad_lines=False)
df.head(20)

Unnamed: 0,publish_date,headline_category,headline_text
0,20010101,sports.wwe,win over cena satisfying but defeating underta...
1,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
2,20010102,unknown,Fissures in Hurriyat over Pak visit
3,20010102,unknown,America's unwanted heading for India?
4,20010102,unknown,For bigwigs; it is destination Goa
5,20010102,unknown,Extra buses to clear tourist traffic
6,20010102,unknown,Dilute the power of transfers; says Riberio
7,20010102,unknown,Focus shifts to teaching of Hindi
8,20010102,unknown,IT will become compulsory in schools
9,20010102,unknown,Move to stop freedom fighters' pension flayed


In [7]:
data_text = df[["headline_text"]].astype("str")
data_text.shape

(3297172, 1)

In [8]:
data_text = data_text.loc[1:100000,:]
data_text.shape

(100000, 1)

In [9]:
#removing stopwords

stopw = stopwords.words("english")
stopw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [10]:
def stopwords_remove(x):
    terms = x.split()
    terms = [w for w in terms if w not in stopw]
    sentence = ' '.join(terms)
    return sentence

In [11]:
data_text['Refined_headlines'] = data_text['headline_text'].apply(lambda x: stopwords_remove(x))

In [12]:
data_text.head()

Unnamed: 0,headline_text,Refined_headlines
1,Status quo will not be disturbed at Ayodhya; s...,Status quo disturbed Ayodhya; says Vajpayee
2,Fissures in Hurriyat over Pak visit,Fissures Hurriyat Pak visit
3,America's unwanted heading for India?,America's unwanted heading India?
4,For bigwigs; it is destination Goa,For bigwigs; destination Goa
5,Extra buses to clear tourist traffic,Extra buses clear tourist traffic


In [13]:
def word_count(x):
    terms = x.split()
    return len(terms)
data_text['word_count']=data_text['Refined_headlines'].apply(lambda x: word_count(x))

In [14]:
data_text.head()

Unnamed: 0,headline_text,Refined_headlines,word_count
1,Status quo will not be disturbed at Ayodhya; s...,Status quo disturbed Ayodhya; says Vajpayee,6
2,Fissures in Hurriyat over Pak visit,Fissures Hurriyat Pak visit,4
3,America's unwanted heading for India?,America's unwanted heading India?,4
4,For bigwigs; it is destination Goa,For bigwigs; destination Goa,4
5,Extra buses to clear tourist traffic,Extra buses clear tourist traffic,5


In [15]:
#vectorizing the dataset

headline_sentences = [''.join(text) for text in data_text['Refined_headlines']]

vectorizer = CountVectorizer(analyzer='word',max_features=5000)


In [17]:
# creating word documents matrix

x_counts = vectorizer.fit_transform(headline_sentences)
x_counts.toarray().shape

(100000, 5000)

In [18]:
#vocabulary of word used


feature_names = vectorizer.get_feature_names()
feature_names

['000',
 '000cr',
 '01',
 '02',
 '10',
 '100',
 '10th',
 '11',
 '12',
 '120',
 '13',
 '14',
 '15',
 '150',
 '16',
 '17',
 '18',
 '19',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '21',
 '22',
 '23',
 '24',
 '25',
 '250',
 '26',
 '27',
 '28',
 '29',
 '2nd',
 '30',
 '300',
 '31',
 '32',
 '33',
 '34',
 '35',
 '350',
 '36',
 '38',
 '39',
 '3c',
 '40',
 '400',
 '42',
 '43',
 '44',
 '45',
 '47',
 '48',
 '49',
 '4th',
 '50',
 '500',
 '51',
 '52',
 '54',
 '55',
 '57',
 '58',
 '60',
 '600',
 '63',
 '64',
 '65',
 '70',
 '73',
 '75',
 '80',
 '800',
 '8888',
 '90',
 'aamir',
 'ab',
 'abducted',
 'abduction',
 'abdullah',
 'abhishek',
 'abide',
 'ablaze',
 'about',
 'abroad',
 'absence',
 'abu',
 'abuse',
 'abvp',
 'academic',
 'academy',
 'acb',
 'accept',
 'accepts',
 'access',
 'accident',
 'accidents',
 'account',
 'accounts',
 'accused',
 'accuses',
 'acid',
 'acquire',
 'acquires',
 'acquisition',
 'acquittal',
 'acquitted',
 'across',
 'act',
 'acting',
 'action',
 

In [19]:
x_counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# weight creation technique on words


transformer = TfidfTransformer(smooth_idf=False)
x_tfidf = transformer.fit_transform(x_counts)
x_tfidf

<100000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 374810 stored elements in Compressed Sparse Row format>

In [21]:
#fitting NMF model

num_topics = 20
model = NMF(n_components = num_topics, init='nndsvd')
model.fit(x_tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=20, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [22]:
# finding top words for each topic 


def get_nmf_topics(model, n_top_words):
    
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {}
    for i in range(num_topics):
        
        words_ids = model.components_[i].argsort()[:-n_top_words-1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic #'+'{:02d}'.format(i+1)] = words
        
    return pd.DataFrame(word_dict)
   

In [23]:
get_nmf_topics(model,10)

Unnamed: 0,Topic #01,Topic #02,Topic #03,Topic #04,Topic #05,Topic #06,Topic #07,Topic #08,Topic #09,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14,Topic #15,Topic #16,Topic #17,Topic #18,Topic #19,Topic #20
0,the,govt,india,city,new,police,bjp,no,killed,it,meet,rs,held,us,power,cm,day,case,cong,says
1,speaking,state,pak,round,year,delhi,up,please,two,time,today,cr,man,pak,supply,state,life,hc,polls,pm
2,man,may,china,sports,chief,station,polls,yet,militants,life,pm,lakh,two,indian,water,centre,one,murder,leader,minister
3,art,staff,talks,old,gets,chief,congress,time,road,back,all,crore,murder,may,tariff,seeks,today,plea,poll,chief
4,week,set,terrorism,scan,look,firing,poll,change,kashmir,bangalore,begins,000,seized,indo,hike,work,security,bail,chief,vajpayee
5,world,hc,world,briefs,soon,custody,sp,up,one,indian,party,worth,fake,war,state,gujarat,strike,court,may,we
6,counsellor,plans,musharraf,water,set,traffic,gujarat,takers,injured,get,national,net,three,ties,may,minister,celebrated,accused,up,vhp
7,good,bihar,pakistan,up,policy,arrest,modi,water,among,industry,tomorrow,10,woman,help,cuts,congress,world,sc,ncp,congress
8,way,asks,air,jan,get,probe,sena,decision,four,big,water,plan,leader,visit,cut,probe,valentine,gets,leaders,jaya
9,life,schools,visit,visit,york,attack,pm,minister,encounter,students,discuss,get,racket,terrorism,get,visit,celebrations,cbi,demands,still


In [30]:
# score of the words in topic


model.components_

array([[0.        , 0.        , 0.        , ..., 0.        , 0.00152007,
        0.00178157],
       [0.00703785, 0.00342503, 0.        , ..., 0.        , 0.        ,
        0.00445012],
       [0.01181424, 0.        , 0.00636285, ..., 0.00189262, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.00037952, 0.00287066,
        0.00452545],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00047199,
        0.02636155],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00189912,
        0.00151906]])