## Topic Modeling

In [1]:
import gensim # LDA
import pandas as pd # working with dataframe
import nltk # tokenization, remove stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer # stemming and lemmatization 
from sklearn.cluster import KMeans # Unsupervised clustering KMeans
from sklearn.metrics import accuracy_score # measuring accuracy of LDA topic modeling and KMeans clustering

In [2]:
# WordNet data is needed by WordNetLemmatizer; when the package is already
# present the call only reports that it is up to date.
nltk.download(info_or_id='wordnet')

[nltk_data] Downloading package wordnet to /Users/bita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Reading data into dataframe

In [3]:
# Raw articles, decoded with the 'unicode_escape' codec.
data = pd.read_csv('Articles.csv', encoding='unicode_escape')

# Numeric encoding of the categorical NewsType column.
NewsType = {'business': 1, 'sports': 0}

# Build the working frame in one chain: drop the columns that are not used and
# encode NewsType. The result of `replace` is assigned back explicitly instead
# of calling `replace(..., inplace=True)` on a column selection, which acts on
# an intermediate object and does not update the frame under pandas
# Copy-on-Write. Labels missing from the mapping are left unchanged.
news_df = (
    data
    .drop(['Date', 'Heading'], axis=1)
    .assign(NewsType=lambda frame: frame['NewsType'].replace(NewsType))
)
news_df

Unnamed: 0,Article,NewsType
0,KARACHI: The Sindh government has decided to b...,1
1,HONG KONG: Asian markets started 2015 on an up...,1
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1
3,HONG KONG: Asian markets tumbled Tuesday follo...,1
4,NEW YORK: US oil prices Monday slipped below $...,1
...,...,...
2687,strong>DUBAI: Dubai International Airport and ...,1
2688,"strong>BEIJING: Former Prime Minister, Shaukat...",1
2689,strong>WASHINGTON: Uber has grounded its fleet...,1
2690,strong>BEIJING: The New Development Bank plans...,1


### Preprocessing Articles

In [4]:
def preprocess(article):
    """Tokenize an article and normalise its tokens for topic modeling.

    The text is tokenized with ``gensim.utils.simple_preprocess``. Tokens that
    are gensim stopwords or have three characters or fewer are discarded. Each
    remaining token is lemmatized as a verb with WordNet and then stemmed with
    the English Snowball stemmer.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    list of str
        Normalised tokens, in the order they appear in the article.
    """
    # Build the stemmer and the lemmatizer once per call rather than creating
    # a new lemmatizer for every single token.
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        stemmer.stem(lemmatizer.lemmatize(token, pos='v'))  # lemmatize first, then stem
        for token in gensim.utils.simple_preprocess(article)
        if token not in stopwords and len(token) > 3
    ]

In [5]:
# One token list per article, aligned with the index of news_df.
preprocessed = news_df['Article'].apply(preprocess)

### Creating dictionary and corpus

In [6]:
# Vocabulary built from the tokens of every preprocessed article.
dictionary = gensim.corpora.Dictionary(preprocessed)

# Prune the vocabulary by document frequency (no_below=15, no_above=0.5).
dictionary.filter_extremes(no_below=15, no_above=0.5)

# Bag-of-words corpus: one doc2bow result per article, in article order.
corpus = list(map(dictionary.doc2bow, preprocessed))

### LDA model

In [7]:
# Train a two-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic: without a fixed seed, which topic receives id 0
# or 1 (and therefore every per-document assignment made later) can change
# between runs. NOTE(review): multicore training may still show small
# run-to-run differences even with a seed - confirm if exact reproducibility
# is required.
topic_model = gensim.models.LdaMulticore(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Print the top terms of every extracted topic.
for topic_id, topic_terms in topic_model.print_topics():
    print(f'Topic {topic_id}:\n{topic_terms}\n\n')

Topic 0:
0.013*"match" + 0.012*"pakistan" + 0.012*"england" + 0.011*"cricket" + 0.011*"wicket" + 0.010*"world" + 0.008*"final" + 0.008*"second" + 0.008*"captain" + 0.007*"india"


Topic 1:
0.019*"percent" + 0.011*"market" + 0.011*"price" + 0.010*"pakistan" + 0.008*"million" + 0.007*"countri" + 0.007*"billion" + 0.007*"month" + 0.007*"trade" + 0.006*"crude"




### Determine topic for each document 

In [11]:
# Number of topics comes from the trained model instead of a hard-coded 2.
num_topics = topic_model.num_topics

# Topic distribution of every document: a list of (topic_id, probability) pairs.
t = [
    topic_model.get_document_topics(bow, minimum_probability=0.0)
    for bow in corpus
]

# For each document, build a dense probability vector and pick its most
# probable topic as the main topic.
x = []           # main topic id per document, in corpus order
topic_dict = {}  # document position -> topic probabilities (KMeans features later)
for doc_idx, doc_topics in enumerate(t):
    # Fill by topic id rather than by list position: the returned list is
    # sparse, so a topic with a vanishingly small probability may be absent,
    # which would shift positions or raise IndexError with positional access.
    probs = [0.0] * num_topics
    for topic_id, prob in doc_topics:
        probs[topic_id] = prob
    topic_dict[doc_idx] = probs
    # On ties the lowest topic id wins (first occurrence of the maximum).
    x.append(probs.index(max(probs)))

In [12]:
# Attach the main LDA topic of each document as a new column.
news_df = news_df.assign(selected_topic=x)
news_df

Unnamed: 0,Article,NewsType,selected_topic
0,KARACHI: The Sindh government has decided to b...,1,1
1,HONG KONG: Asian markets started 2015 on an up...,1,1
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1,1
3,HONG KONG: Asian markets tumbled Tuesday follo...,1,1
4,NEW YORK: US oil prices Monday slipped below $...,1,1
...,...,...,...
2687,strong>DUBAI: Dubai International Airport and ...,1,1
2688,"strong>BEIJING: Former Prime Minister, Shaukat...",1,1
2689,strong>WASHINGTON: Uber has grounded its fleet...,1,1
2690,strong>BEIJING: The New Development Bank plans...,1,1


### Clustering based on LDA topics

In [14]:
# Feature frame: one row per document, one column per topic probability.
topic_df = pd.DataFrame.from_dict(topic_dict, orient='index')

# Cluster the documents on their topic probabilities.
# random_state makes the clustering repeatable across runs; n_init is set
# explicitly so the result does not depend on the library's version-specific
# default. Cluster ids are arbitrary labels: cluster 1 is not guaranteed to
# correspond to LDA topic 1 or to NewsType 1.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(topic_df)
y_kmeans = kmeans.labels_

In [15]:
# Attach the KMeans cluster label of each document as a new column.
news_df = news_df.assign(predicted_topic=y_kmeans)
news_df

Unnamed: 0,Article,NewsType,selected_topic,predicted_topic
0,KARACHI: The Sindh government has decided to b...,1,1,1
1,HONG KONG: Asian markets started 2015 on an up...,1,1,1
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1,1,1
3,HONG KONG: Asian markets tumbled Tuesday follo...,1,1,1
4,NEW YORK: US oil prices Monday slipped below $...,1,1,1
...,...,...,...,...
2687,strong>DUBAI: Dubai International Airport and ...,1,1,1
2688,"strong>BEIJING: Former Prime Minister, Shaukat...",1,1,1
2689,strong>WASHINGTON: Uber has grounded its fleet...,1,1,1
2690,strong>BEIJING: The New Development Bank plans...,1,1,1
