# HW03: Distance and Topic Model

Remember that these homework assignments are graded on completion. **You can skip one section without losing credit.**

## Load and Pre-process Text

In [None]:
#Import the AG news dataset (same as hw01)
#Download them from here 
#!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv

import pandas as pd
import nltk
import numpy as np
import random
# The AG News train.csv referenced above has no header row. Column names are
# therefore supplied explicitly; letting pandas infer a header would silently
# turn the first article into column labels and drop it from the data.
df = pd.read_csv('train.csv', header=None, names=["label", "title", "lead"])

# Numeric class ids used in the raw file -> readable topic names.
label_map = {1:"world", 2:"sport", 3:"business", 4:"sci/tech"}

def replace_label(x):
    """Return the topic name for the numeric class id `x`.

    Raises KeyError when `x` is not one of the keys of `label_map`.
    """
    return label_map[x]

df["label"] = df["label"].apply(replace_label)
# Title and lead paragraph are analysed together as one text per article.
df["text"] = df["title"] + " " + df["lead"]
df.head()

In [None]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer

# A fixed seed keeps the 200-snippet sample (and every result derived from it)
# identical across kernel restarts. `.copy()` makes `dfs` an independent frame
# so the columns added below are never written into a view of `df`.
dfs = df.sample(200, random_state=123).copy()
nlp = spacy.load('en_core_web_sm')

def _lemma_string(doc):
    """Join the lower-cased lemmas of all non-punctuation, non-stop-word tokens of `doc`."""
    return ' '.join(word.lemma_.lower() for word in doc
                    if not word.is_punct and not word.is_stop)

def preprocess(text):
    """Parse `text` with spaCy and return its filtered, lower-cased lemmas as one string."""
    return _lemma_string(nlp(text))

##TODO pre-process text as you did in HW02
# Parse each snippet once and derive all three columns from the same Doc,
# instead of running the spaCy pipeline three times per text.
parsed = dfs['text'].apply(nlp)
dfs['sentences'] = parsed.apply(lambda doc: list(doc.sents))
dfs['tokens'] = parsed.apply(lambda doc: list(doc))
dfs['tokens_lower'] = parsed.apply(_lemma_string)

In [None]:
# Preview the first five sampled snippets together with their derived columns.
dfs.head(n=5)

In [None]:
##TODO vectorize the pre-processed text using CountVectorizer
vectorizer = CountVectorizer(min_df=0.01,
                             max_df=0.9,
                             max_features = 1000,
                             stop_words = 'english',
                             ngram_range = (1, 2))

# Sparse document-term count matrix: one row per sampled snippet.
X = vectorizer.fit_transform(dfs['tokens_lower'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense copy with readable column names (one column per uni-/bigram).
cv_df = pd.DataFrame(X.toarray(), columns=feature_names)

## Cosine Similarity and PCA

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

##TODO compute the cosine similarity for the first 200 snippets and for the first snippet,
# show the three most similar snippets and their respective cosine similarity scores
similarity = cosine_similarity(cv_df.iloc[:200])

# Scores of the first snippet against every snippet, highest first.
first_snippet = np.sort(similarity[0])[::-1]

# Rank neighbours by position instead of looking scores up by value: with
# tied scores a value lookup returns several rows at once and prints them
# repeatedly. The snippet itself (position 0) is excluded explicitly rather
# than assumed to be the top-ranked entry.
ranking = np.argsort(-similarity[0], kind='stable')
top_three = [idx for idx in ranking if idx != 0][:3]

print('First snippet:')
print(dfs.iloc[0]['text'])
print('\n')

for idx in top_three:
    print(dfs.iloc[idx]['text'])
    print(f'Cosine similarity is : {similarity[0, idx]}')
    print('\n')

In [None]:
from sklearn.decomposition import PCA
##TODO reduce the vectorized data using PCA
# The estimator is created once; random_state pins the randomized SVD so the
# projection (and the neighbours found below) are the same on every run.
pca = PCA(n_components = 3, svd_solver = 'randomized', random_state = 123)
pca_df = pca.fit_transform(cv_df)
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print('Explained variance ratio:', pca.explained_variance_ratio_)

##TODO compute again cosine similarity with the reduced version for the first 200 snippets
similarity_pca = cosine_similarity(pca_df[:200])

##TODO for the first snippet, show again its three most similar snippets

# Scores of the first snippet against every snippet, highest first.
first_snipp = np.sort(similarity_pca[0])[::-1]

# Rank by position rather than by matching score values, so tied scores do
# not print the same rows several times; the snippet itself is skipped
# explicitly.
ranking_pca = np.argsort(-similarity_pca[0], kind='stable')
top_three_pca = [idx for idx in ranking_pca if idx != 0][:3]

print('First snippet:')
print(dfs.iloc[0]['text'])
print('\n')

for idx in top_three_pca:
    print(dfs.iloc[idx]['text'])
    print(f'Cosine similarity is : {similarity_pca[0, idx]}')
    print('\n')
    
# Yes, the results change massively. After PCA, the most similar texts are thematically much closer
# to the first snippet (Darfur / US). Interestingly, the Yahoo article gets a very high similarity score.

Compare the cosine similarity before and after PCA reduction. Did the results change? 

## Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

##TODO create the clusters found with k-means clustering and 10 clusters
no_clusters = 10
# `n_jobs` is no longer passed: the argument was removed from KMeans in
# scikit-learn 1.0 and raises a TypeError there.
k_means = KMeans(n_clusters=no_clusters, random_state = 123)
k_means.fit(X)
clusters = k_means.labels_.tolist()
dfs['cluster'] = clusters

##TODO find the optimal number of clusters in a range from 2 to 50 using the silhouette score
# Baseline score for the 10-cluster solution (previously computed and discarded).
print('Silhouette score for 10 clusters:', silhouette_score(X, k_means.labels_))
s_score = list()

# The scan uses the same seed as the fits above, so the k selected here is
# reproducible. range(2, 50) covers k = 2..49; the plotting and selection
# cells map positions in `s_score` back to k with the same range.
for n in range(2, 50):
    kmeans = KMeans(n_clusters = n, random_state = 123)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.labels_))

In [None]:
import matplotlib.pyplot as plt
# s_score[i] belongs to k = i + 2, so the x axis is derived from the number
# of scores actually collected.
plt.plot(range(2, 2 + len(s_score)), s_score)
plt.xlabel('No clusters')
plt.ylabel('Silhouette score')
# Call show(): the bare attribute `plt.show` only echoes the function object.
plt.show()

In [None]:
# Take the maximum over ALL scores. Positions in `s_score` start at 0 for
# k = 2, so slicing with [2:50] would wrongly ignore k = 2 and k = 3.
opt_sil_score = max(s_score)
# s_score[i] was computed with k = i + 2 (the scan starts at two clusters).
opt_num_cluster = 2 + s_score.index(opt_sil_score)
print('The optimal number of clusters is %s' %opt_num_cluster)

In [None]:
##TODO create the clusters using the optimal number of clusters obtained before

# `n_jobs` is no longer passed: the argument was removed from KMeans in
# scikit-learn 1.0 and raises a TypeError there.
k_means = KMeans(n_clusters=opt_num_cluster, random_state = 123)
k_means.fit(X)
clusters = k_means.labels_.tolist()
dfs['cluster_optimal'] = clusters

In [None]:
##TODO compare the documents in cluster "1" under the two specifications, does the cluster look 
# cleaner after having searched for the optimal number of clusters?
# Print cluster 1 under BOTH specifications; showing only `cluster_optimal`
# leaves nothing to compare against.
cluster_id = 1
for column in ['cluster', 'cluster_optimal']:
    members = dfs.loc[dfs[column] == cluster_id, 'text']
    print(f'{column} == {cluster_id}: {len(members)} documents')
    for text in members:
        print(' -', text)
    print()

## Topic Modeling: LDA

For this part you will need to use LDA Mallet. If you cannot get Mallet to run, you can use the simple LDA algorithm instead.

In [None]:
from gensim.corpora import Dictionary
try:
    from gensim.models.wrappers import LdaMallet
except ImportError:
    # gensim 4.0 removed the `wrappers` package; the plain LdaModel is used
    # for training, so a missing Mallet wrapper must not abort this cell.
    LdaMallet = None
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
from random import shuffle

##TODO create a dictionary with the pre-processed tokenized text
# and filter it according to frequencies and keeping 1000 vocabularies


# NOTE: this redefines the one-argument `preprocess` from the pre-processing
# cell above with a different signature and return type (list of tokens
# instead of a joined string). Re-run that earlier cell before reusing it.
def preprocess(text, nlp):
    """Return the lower-cased lemmas of `text` as a list of tokens.

    `nlp` is the callable that turns `text` into a spaCy Doc. Punctuation,
    stop words, pure-digit tokens and tokens of at most two characters are
    dropped.
    """
    return [word.lemma_.lower() for word in nlp(text)
            if not word.is_punct and not word.is_stop and not word.is_digit 
            and len(word) > 2]

documents = list()

# The full pipeline is used instead of a bare tokenizer:
# `nlp.Defaults.create_tokenizer` no longer exists in spaCy 3, and a
# tokenizer-only Doc carries no lemmas there, which would leave every
# document as a list of empty strings.
for doc in dfs['text']:
    for paragraph in doc.split("\n\n"):
        documents.append(preprocess(paragraph, nlp))

# Seeded shuffle so the corpus order, and with it the trained topic models,
# is reproducible. Uses the `random` module imported in the first cell.
random.Random(123).shuffle(documents)

dictionary = corpora.Dictionary(documents)
dictionary.filter_extremes(no_below = 5, no_above = 0.7, keep_n = 1000)

##TODO create the doc_term_matrix
# Bag-of-words representation: one list of (token_id, count) pairs per document.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents]

In [None]:
##TODO train a LDA Mallet model with 5, 10 and 15 topics
##TODO compute the coherence score for each of these model 
#and print the topics from the model with highest coherence score

scores = list()

for num_topics in [5, 10, 15]:

    # random_state makes training, and therefore the coherence ranking, repeatable.
    lda = LdaModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary, passes=2,
                   random_state = 123)
    coherence = CoherenceModel(model = lda, texts = documents, corpus = doc_term_matrix,
                              dictionary=dictionary, coherence='c_v')
    # show_topics() returns only 10 topics by default; request all of them so
    # the 15-topic model is not silently truncated.
    scores.append((num_topics, coherence.get_coherence(),
                   lda.show_topics(num_topics=num_topics, formatted=True), lda))
    
lda_models = pd.DataFrame(scores, columns=["Number of Topics", "Coherence Scores", "Topics", 'LDA Model'])


# topics from model with highest coherence score
# idxmax picks exactly one row, even if two models tie on the score.
best_row = lda_models['Coherence Scores'].idxmax()
lda_models.loc[best_row, 'Topics']

In [None]:
import pyLDAvis
try:
    # pyLDAvis 3 renamed the gensim adapter module to `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older pyLDAvis releases still ship it as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis
##TODO using LDAvis visualize the topics using the optimal number of topics

# idxmax selects exactly one row: the model with the highest coherence score.
best_model = lda_models.loc[lda_models['Coherence Scores'].idxmax(), 'LDA Model']
pyLDAvis.enable_notebook()
gensimvis.prepare(best_model, doc_term_matrix, dictionary)