In [1]:
import re
import numpy as np
import pandas as pd

import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
data = pd.read_csv('scraped_articles_train.csv', error_bad_lines=False);

In [3]:
data

Unnamed: 0.1,Unnamed: 0,Title,Text,Source
0,0,Watch Latest News Videos,india\n\nCheck out the latest news videos from...,https://timesofindia.indiatimes.com/world
1,1,Hindi TV News: Check Latest News on Hindi TV S...,Vijayendra: Need to know what happened to Sushant,http://photogallery.indiatimes.com
2,2,Internet platforms can’t be allowed to monetis...,"Netflix’s new documentary, ‘The Social Dilemma...",https://timesofindia.indiatimes.com
3,3,Rework Special Marriage Act for love and liber...,"Sometime in 2014, responding to all the rhetor...",https://timesofindia.indiatimes.com
4,4,Five winners of the post-pandemic global econo...,Even as Covid-19 continues to bubble up in hot...,https://timesofindia.indiatimes.com
...,...,...,...,...
298,298,Venezuelan health workers are getting cash bon...,(CNN) Venezuela's frontline health workers are...,https://edition.cnn.com/americas
299,299,30 cases of Covid-19 have been linked to a kar...,(CNN) A Canadian karaoke bar could face fines ...,https://edition.cnn.com/americas
300,300,Amazon tribes are using drones to track defore...,The 28-year-old belongs to a 250-strong tribe ...,https://edition.cnn.com/americas
301,301,Amazon tribes are using technology to protect ...,Mandu Uru Eu Wau Wau is a member of the Uru-Eu...,https://edition.cnn.com/americas


In [4]:
data.drop(data.columns[data.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
data.head()

Unnamed: 0,Title,Text,Source
0,Watch Latest News Videos,india\n\nCheck out the latest news videos from...,https://timesofindia.indiatimes.com/world
1,Hindi TV News: Check Latest News on Hindi TV S...,Vijayendra: Need to know what happened to Sushant,http://photogallery.indiatimes.com
2,Internet platforms can’t be allowed to monetis...,"Netflix’s new documentary, ‘The Social Dilemma...",https://timesofindia.indiatimes.com
3,Rework Special Marriage Act for love and liber...,"Sometime in 2014, responding to all the rhetor...",https://timesofindia.indiatimes.com
4,Five winners of the post-pandemic global econo...,Even as Covid-19 continues to bubble up in hot...,https://timesofindia.indiatimes.com


In [5]:
data = data[['Title','Text']].dropna()
#articles is a list of all articles
articles = data['Text'].tolist()
articles[2]

'Netflix’s new documentary, ‘The Social Dilemma’, should be compulsory viewing for Indians. It helps explain the unprecedented spread of hate speech and communal falsehoods. It features executives and IT nerds from top internet companies — Facebook, Twitter, Instagram, Google, YouTube. They say they started by believing that the internet would be a great democratiser, providing voice and knowledge to millions lacking it. Alas, it is also producing terrible polarisation, lies and strife.\n\nThis is not because internet companies have bad people or evil intentions. The problem lies in their profit model. They offer great free services. Their profits come from advertising, often subliminal advertising. This means flashing messages for very brief periods below the normal human perception level, reaching the subconscious. Subliminal advertising has long been used by conventional advertisers too, highlighted in books like Vance Packard’s ‘The Hidden Persuaders’. But the smartphone has taken 

In [6]:
from nltk.stem.snowball import SnowballStemmer
import nltk
stemmer = SnowballStemmer("english")
def clean_text(document):
    document = re.sub('[^\w_\s-]', ' ',document)       #remove punctuation marks and other symbols
    tokens = nltk.word_tokenize(document)              #Tokenize sentences
    cleaned_article = ' '.join([stemmer.stem(item) for item in tokens])    #Stemming each token
    return cleaned_article

In [7]:
#cleaned, tokenized and stemmed article
cleaned_articles = list(map(clean_text, articles))
cleaned_articles[2]

'netflix s new documentari the social dilemma should be compulsori view for indian it help explain the unpreced spread of hate speech and communal falsehood it featur execut and it nerd from top internet compani facebook twitter instagram googl youtub they say they start by believ that the internet would be a great democratis provid voic and knowledg to million lack it ala it is also produc terribl polaris lie and strife this is not becaus internet compani have bad peopl or evil intent the problem lie in their profit model they offer great free servic their profit come from advertis often sublimin advertis this mean flash messag for veri brief period below the normal human percept level reach the subconsci sublimin advertis has long been use by convent advertis too highlight in book like vanc packard s the hidden persuad but the smartphon has taken this psycholog manipul to new height free internet programm aim to get the attent of as mani viewer as possibl for as long as possibl they 

In [8]:
# writing function for the entire dataset
from nltk.stem import WordNetLemmatizer
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text,pos='v'))

#Tokenize and Lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>3:
            result.append(lemmatize_stemming(token))
            
    return result

In [9]:
processed_docs = []

for doc in cleaned_articles:
    processed_docs.append(preprocess(doc))

In [10]:
len(processed_docs)

293

In [11]:
print(processed_docs[:5])

[['india', 'check', 'latest', 'news', 'video', 'time', 'india', 'cover', 'wide', 'ring', 'topic', 'news', 'video', 'break', 'news', 'polit', 'news', 'polit', 'debat', 'current', 'affair', 'news', 'busi', 'news', 'world', 'news', 'scienc', 'news', 'educ', 'news', 'watch', 'video', 'news', 'event', 'happen', 'video', 'stori', 'world', 'on', 'stay', 'updat', 'time', 'india', 'news', 'video'], ['vijayendra', 'need', 'know', 'happen', 'sushant'], ['netflix', 'documentari', 'social', 'dilemma', 'compulsori', 'view', 'indian', 'help', 'explain', 'unprec', 'spread', 'hate', 'speech', 'communal', 'falsehood', 'featur', 'execut', 'nerd', 'internet', 'compani', 'facebook', 'twitter', 'instagram', 'googl', 'youtub', 'start', 'believ', 'internet', 'great', 'democrati', 'provid', 'voic', 'knowledg', 'million', 'lack', 'produc', 'terribl', 'polari', 'strife', 'becaus', 'internet', 'compani', 'peopl', 'evil', 'intent', 'problem', 'profit', 'model', 'offer', 'great', 'free', 'servic', 'profit', 'come',

In [12]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [13]:
print (dictionary)

Dictionary(9287 unique tokens: ['affair', 'break', 'busi', 'check', 'cover']...)


In [14]:
#Lets see if dictionary created succesfully
count=0
for k,v in dictionary.iteritems():
    print (k, v)
    count +=1
    if count >20:
        break

0 affair
1 break
2 busi
3 check
4 cover
5 current
6 debat
7 educ
8 event
9 happen
10 india
11 latest
12 news
13 on
14 polit
15 ring
16 scienc
17 stay
18 stori
19 time
20 topic


In [15]:
#remove rare and repeatative words
dictionary.filter_extremes(no_below=15,no_above=0.1,keep_n=100000)

In [16]:
print (dictionary)

Dictionary(432 unique tokens: ['check', 'cover', 'educ', 'latest', 'polit']...)


In [17]:
#Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
#words and how many times those words appear. Save this to 'bow_corpus'
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [18]:
bow_corpus[0]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1)]

In [19]:
#preview 
document_num = 10
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]],
                                                     bow_doc_x[i][1]))

Word 8 ("attempt") appears 1 time.
Word 10 ("billion") appears 1 time.
Word 11 ("book") appears 1 time.
Word 12 ("communiti") appears 1 time.
Word 17 ("explain") appears 1 time.
Word 25 ("individu") appears 1 time.
Word 33 ("network") appears 2 time.
Word 40 ("pressur") appears 1 time.
Word 62 ("address") appears 1 time.
Word 66 ("collect") appears 1 time.
Word 74 ("dream") appears 1 time.
Word 75 ("effect") appears 1 time.
Word 81 ("invest") appears 1 time.
Word 90 ("privat") appears 3 time.
Word 97 ("statement") appears 1 time.
Word 101 ("advanc") appears 2 time.
Word 113 ("econom") appears 2 time.
Word 116 ("emerg") appears 1 time.
Word 133 ("matter") appears 1 time.
Word 139 ("potenti") appears 2 time.
Word 140 ("prove") appears 1 time.
Word 143 ("relat") appears 1 time.
Word 155 ("small") appears 2 time.
Word 157 ("spend") appears 1 time.
Word 161 ("surpri") appears 1 time.
Word 168 ("activ") appears 3 time.
Word 176 ("parti") appears 7 time.
Word 188 ("firm") appears 1 time.
Word

In [20]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 10, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [21]:
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.042*"cricket" + 0.023*"coach" + 0.018*"dream" + 0.018*"tournament" + 0.016*"grand" + 0.014*"score" + 0.013*"young" + 0.012*"england" + 0.012*"court" + 0.011*"titl"


Topic: 1 
Words: 0.025*"footbal" + 0.019*"invest" + 0.016*"owner" + 0.016*"premier" + 0.016*"english" + 0.014*"success" + 0.014*"health" + 0.012*"titl" + 0.012*"money" + 0.011*"spend"


Topic: 2 
Words: 0.040*"lakh" + 0.036*"tour" + 0.024*"hyundai" + 0.023*"segment" + 0.022*"option" + 0.022*"photo" + 0.021*"variant" + 0.020*"latest" + 0.019*"movi" + 0.016*"transmiss"


Topic: 3 
Words: 0.017*"human" + 0.017*"court" + 0.015*"virus" + 0.013*"fear" + 0.012*"action" + 0.012*"connect" + 0.011*"emerg" + 0.011*"lockdown" + 0.011*"small" + 0.010*"self"


Topic: 4 
Words: 0.047*"cent" + 0.043*"demand" + 0.042*"delhi" + 0.025*"polici" + 0.023*"sector" + 0.020*"sale" + 0.019*"manufactur" + 0.016*"infrastructur" + 0.016*"august" + 0.015*"improv"


Topic: 5 
Words: 0.027*"road" + 0.022*"juli" + 0.018*"minist" + 0.018

In [22]:
#LDA using TF-IDF
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

[(0, 0.3026955564121304),
 (1, 0.33283839242555835),
 (2, 0.34747123238377764),
 (3, 0.2787530139522639),
 (4, 0.5850460891378325),
 (5, 0.28775053967256903),
 (6, 0.29252304456891626),
 (7, 0.29749869484243646)]
Topic: 0 Word: 0.021*"food" + 0.014*"sure" + 0.011*"invest" + 0.011*"debut" + 0.010*"hous" + 0.010*"movi" + 0.009*"money" + 0.009*"grind" + 0.009*"learn" + 0.009*"excit"
Topic: 1 Word: 0.026*"februari" + 0.020*"januari" + 0.012*"sunday" + 0.011*"ride" + 0.009*"train" + 0.009*"photo" + 0.008*"championship" + 0.008*"black" + 0.008*"england" + 0.007*"rate"
Topic: 2 Word: 0.016*"cent" + 0.014*"sale" + 0.012*"word" + 0.011*"premium" + 0.010*"custom" + 0.010*"mind" + 0.010*"wear" + 0.010*"free" + 0.009*"segment" + 0.008*"explain"
Topic: 3 Word: 0.042*"choic" + 0.019*"websit" + 0.015*"ensur" + 0.015*"cricket" + 0.011*"european" + 0.011*"dream" + 0.011*"arriv" + 0.010*"review" + 0.010*"union" + 0.009*"plea"
Topic: 4 Word: 0.011*"race" + 0.011*"victori" + 0.010*"tour" + 0.010*"black" +

In [23]:
for index, score in sorted(lda_model_tfidf[bow_corpus[250]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.3784719705581665	 
Topic: 0.014*"minist" + 0.012*"human" + 0.010*"health" + 0.010*"region" + 0.008*"athlet" + 0.008*"london" + 0.007*"polici" + 0.007*"twitter" + 0.007*"polit" + 0.007*"sign"

Score: 0.17108547687530518	 
Topic: 0.042*"choic" + 0.019*"websit" + 0.015*"ensur" + 0.015*"cricket" + 0.011*"european" + 0.011*"dream" + 0.011*"arriv" + 0.010*"review" + 0.010*"union" + 0.009*"plea"

Score: 0.14408209919929504	 
Topic: 0.014*"cricket" + 0.008*"cost" + 0.008*"photo" + 0.007*"manufactur" + 0.007*"project" + 0.007*"march" + 0.007*"score" + 0.007*"school" + 0.007*"connect" + 0.007*"turn"

Score: 0.12701639533042908	 
Topic: 0.011*"race" + 0.011*"victori" + 0.010*"tour" + 0.010*"black" + 0.010*"premier" + 0.009*"delhi" + 0.009*"winner" + 0.009*"version" + 0.008*"teammat" + 0.008*"rest"

Score: 0.10376241058111191	 
Topic: 0.016*"footbal" + 0.013*"deal" + 0.012*"ring" + 0.009*"polit" + 0.009*"york" + 0.008*"relea" + 0.008*"cricket" + 0.007*"invest" + 0.007*"consum" + 0.007*"r

In [26]:
#Testing model on unseen document
unseen_document = 'One in five maple species is threatened in the wild, according to the first full assessment of extinction risks.Known for the vivid colour of their autumn leaves, the trees are popular in parks and gardens.But in their natural habitats, they face a myriad of threats, including unsustainable logging, climate change, deforestation and forest fires.Botanists are calling for urgent action to protect rare maple trees.And they say seeds should be stored as an insurance policy against extinction.The assessment of all 158 species of maple is part of an effort to map the conservation status of all tree species by the end of 2020. It was carried out by the group, Botanic Gardens Conservation International.Conservation manager Dan Crowley told BBC News:  Maples are some of our most familiar trees, particularly in autumn when they give us those wonderful displays of yellow, orange, red and purple colours.And whilst they are common in some of our open spaces, spaces where they are highly valued, several species are also highly threatened in the wild.'

In [27]:
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.7803139686584473	 Topic: 0.017*"human" + 0.017*"court" + 0.015*"virus" + 0.013*"fear" + 0.012*"action"
Score: 0.16252639889717102	 Topic: 0.049*"footbal" + 0.040*"photo" + 0.027*"athlet" + 0.017*"women" + 0.016*"school"
