# Introduction to Text Mining Part 3 - Exercises with answers

## Exercise 1

#### Task 1
##### Load libraries that are used in this module.

#### Result:

In [None]:
# Helper packages.
import os 
import pickle
import pandas as pd
import numpy as np

# Cosine similarity and clustering packages.
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram, fcluster
import gensim
from gensim import matutils
from gensim.models.coherencemodel import CoherenceModel

# Network creation and visualization.
import networkx as nx
from pyvis.network import Network

# Other plotting tools.
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

#### Task 2 
##### Set `main_dir` to the location of your `booz-allen-hamilton` folder.
##### Make `data_dir` from the `main_dir` and concatenate remainder of the path to data directory.
##### Make `plot_dir` from the `main_dir` and concatenate remainder of the path to plots directory.

#### Result:

In [None]:
from pathlib import Path

# Set `home_dir` to the home directory of the current user.
home_dir = Path.home()

# Set `main_dir` to the location of your `booz-allen-hamilton` folder.
main_dir = home_dir / "Desktop" / "booz-allen-hamilton"

# Make `data_dir` from the `main_dir` and remainder of the path to data directory.
data_dir = main_dir / "data"

# Make `plot_dir` from the `main_dir` and remainder of the path to plots directory.
# (It must not point at the data directory, otherwise saved plots land next to the data.)
plot_dir = main_dir / "plots"

#### Task 3 
##### Set the working directory to `data_dir`.
##### Check if the working directory is updated to `data_dir`.

#### Result:

In [None]:
# Make the data directory the current working directory.
os.chdir(data_dir)

# Confirm the change by printing where we are now.
current_dir = os.getcwd()
print(current_dir)

#### Task 4
##### Load the pickled files from the previous exercises:

##### 'dictionary_ex.sav', 'corpus_tfidf_ex.sav', 'bow_corpus_ex.sav', 'ex_DTM.sav' and 'ex_titles_clean.sav' and name them as
##### 'dictionary_ex', 'corpus_tfidf_ex', 'bow_corpus_ex', 'DTM_ex' and 'processed_docs_ex'.

#### Result:

In [None]:
# Load every object pickled in the previous exercises.
# Each file is opened in a `with` block so its handle is closed right after reading.
# NOTE: `pickle.load` can run arbitrary code - only load files you created or trust.
with open('dictionary_ex.sav', 'rb') as pickle_file:
    dictionary_ex = pickle.load(pickle_file)

with open('corpus_tfidf_ex.sav', 'rb') as pickle_file:
    corpus_tfidf_ex = pickle.load(pickle_file)

with open('bow_corpus_ex.sav', 'rb') as pickle_file:
    bow_corpus_ex = pickle.load(pickle_file)

with open('ex_DTM.sav', 'rb') as pickle_file:
    DTM_ex = pickle.load(pickle_file)

with open('ex_titles_clean.sav', 'rb') as pickle_file:
    processed_docs_ex = pickle.load(pickle_file)

#### Task 5
##### Load UN agreement titles data from original file, 'UN_agreement_titles.csv'.
##### Load pre-saved word counts array we pickled in the previous session, `ex_word_counts_array.sav`.

#### Result:

In [None]:
# Load the original UN agreement titles.
UN = pd.read_csv('UN_agreement_titles.csv')

# Load the pre-saved word counts array; the `with` block closes the file after reading.
# NOTE: `pickle.load` can run arbitrary code - only load files you created or trust.
with open("ex_word_counts_array.sav", "rb") as pickle_file:
    word_counts_array_ex = pickle.load(pickle_file)

#### Task 6
##### Now we are going to run LDA on our `corpus_tfidf_ex` object.
##### Choose the same parameters as we did in the slides.
##### Save the model as `lda_model_tfidf_ex` and print.

#### Result:

In [None]:
# Train a 5-topic LDA model on the TF-IDF weighted corpus, using 2 passes.
lda_model_tfidf_ex = gensim.models.LdaModel(
    corpus = corpus_tfidf_ex,
    id2word = dictionary_ex,
    num_topics = 5,
    passes = 2,
)

In [None]:
# Print the trained LDA model object.
print(lda_model_tfidf_ex)

#### Task 7
##### Look at the output of your LDA model, print each of the 5 topics and the top words within each topic.
##### Then, take the first doc from `processed_docs_ex` and classify it within one of the five topics. Which one is it most similar to?

#### Result:

In [None]:
# Print every topic of the model together with its top words.
topic_template = 'Topic: {} Word: {}'
for topic_id, top_words in lda_model_tfidf_ex.print_topics(-1):
    print(topic_template.format(topic_id, top_words))

In [None]:
# Classify the first doc in processed_docs_ex.
# The first document sits at index 0 of the corpus (index 1 would be the second document).
# Topics are listed from the highest score to the lowest.
for index, score in sorted(lda_model_tfidf_ex[corpus_tfidf_ex[0]], key = lambda tup: tup[1], reverse = True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf_ex.print_topic(index, 10)))

# The topic printed first has the highest score, i.e. it is the closest to title 1.
# Note: the model above is trained without a fixed seed, so the winning topic may differ between runs.

## Exercise 2

#### Task 1 
##### Find the topic coherence for the LDA model.
##### Save it as `coherence_lda_ex` and print it out.

#### Result:

In [None]:
# Build a coherence model for the LDA model using the 'c_v' measure.
coherence_model_lda = CoherenceModel(
    model = lda_model_tfidf_ex,
    texts = processed_docs_ex,
    dictionary = dictionary_ex,
    coherence = 'c_v',
)

# Compute the coherence score and print it out.
coherence_lda_ex = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda_ex)

#### Task 2 
##### Define the convenience function `compute_coherence_values` and tweak parameters as you think needed.
##### Set the seed to 1.
##### Compute the `model_list` and `coherence_values` using the function above.
##### There are more titles in this dataset, so more topics may be needed. Hence, set `limit` to `80`.

#### Result:

In [None]:
# Convenience function defined in class.
def compute_coherence_values(dictionary, corpus, texts, limit, start = 2, step = 3, random_state = None):
    """Train one LDA model per candidate number of topics and score each with 'c_v' coherence.

    Parameters
    ----------
    dictionary : passed as `id2word` to each LDA model and as `dictionary` to each coherence model.
    corpus : corpus every LDA model is trained on.
    texts : documents handed to the coherence model as `texts`.
    limit : exclusive upper bound for the number of topics.
    start : first number of topics to try.
    step : increment between two candidate numbers of topics.
    random_state : optional value forwarded to `LdaMulticore(random_state=...)`.
        The default `None` is the library default, so existing calls behave as before.

    Returns
    -------
    (model_list, coherence_values) : two lists of equal length, one entry per value
        in `range(start, limit, step)`, in that order.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.LdaMulticore(corpus = corpus, id2word = dictionary,
                                           num_topics = num_topics, random_state = random_state)
        model_list.append(model)
        coherencemodel = CoherenceModel(model = model, texts = texts, dictionary = dictionary, coherence = 'c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Set the seed to 1 before fitting the models.
np.random.seed(1)

# Fit one model for every 6th number of topics from 2 up to (not including) 80
# and collect each model together with its coherence value.
model_list, coherence_values = compute_coherence_values(
    dictionary = dictionary_ex,
    corpus = corpus_tfidf_ex,
    texts = processed_docs_ex,
    limit = 80,
    start = 2,
    step = 6,
)

#### Task 3 
##### Plot the findings from `model_list` and `coherence_values`.
##### Set `x` as the range where `start` is `2`, `limit` is `80` and `step` `6`.
##### Plot `x` against `coherence_values`. Label the axes accordingly.
##### What would improve our LDA model? What number of topics make the most sense?

#### Result:

In [None]:
# Plot the coherence score for each candidate number of topics.
import matplotlib.pyplot as plt

# Same range as the one used to compute `coherence_values`.
limit = 80
start = 2
step = 6
x = range(start, limit, step)

plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence of strings. A string in plain parentheses is still a
# string, so the legend would be labelled with its first character only; the trailing
# comma makes it a one-element tuple.
plt.legend(("coherence_values",), loc = 'best')
plt.show()


- It looks like the optimal number of topics is either around 30 or around 70.

#### Task 4
##### Prepare the visualization object for plotting, and assign it to a new variable `vis`.
##### Display the results.
##### What can you infer about the topics by looking at this plot?


##### Adjust the slider from 0 to 1. What can you tell about the relevant terms in topic 3?

#### Result:

In [None]:
# Build the LDA visualization data from the model, the corpus and the dictionary.
vis = pyLDAvis.gensim.prepare(
    lda_model_tfidf_ex,  #<- model object
    corpus_tfidf_ex,     #<- corpus object
    dictionary_ex,       #<- dictionary object
)

# Show the visualization.
pyLDAvis.display(vis)

**Inference about the topics:**
Based on the distances between the circles, topics `1` and `2` seem to be the most similar in meaning.
Based on the circle sizes, topic `1` has the largest number of documents.

**Relevant terms in topic 3:** `Amend`, `Convent` and `protocol` seem to be the most relevant terms in topic 3.

## Exercise 3

#### Task 1 
##### Obtain the topic probabilities for the 1st document in our corpus.
##### Which topic has the highest probability for document 1?

#### Result:

In [None]:
# Pick the first document of the corpus (indices start at 0).
doc_num = 0

# Pull out the tf_idf weights of that document.
doc_vec = corpus_tfidf_ex[doc_num]
print(doc_vec)

# Ask the model for the topic probabilities of that document.
# Note: The probability values may change for different seed values
doc_topics = lda_model_tfidf_ex.get_document_topics(bow = doc_vec)
print(doc_topics)

- Topic `2` has the highest probability for document `1`. (remember, it's listed as `1` because the index starts at `0`)

#### Task 2 
##### Now get the best topic and its probability for the document programmatically.
##### What is the best topic and its probability in document 1?

#### Result:

In [None]:
# Set the seed.
np.random.seed(2)

# Start from topic 0 with probability 0; any topic that is at least as probable replaces it.
best_topic, max_prob = 0, 0

# Walk through the (topic, probability) pairs of the document.
for topic_id, topic_prob in doc_topics:
    if topic_prob >= max_prob:  #<- current topic is as probable as the best so far
        best_topic, max_prob = topic_id, topic_prob  #<- it becomes the new best

# Bundle the document index, its best topic and that topic's probability.
doc_topic_pair_ex = (doc_num, best_topic, max_prob)
print(doc_topic_pair_ex)

# We can see that for document 1, the best topic is 2 and its probability is 76.1%
# Note: the probability values may change for different seed values

#### Task 3 
##### Define a function, GetDocTopicPair(), to extract this information for a document given an LDA model.

##### Put it all together into a function that returns a tuple with the index of the document, the best fit topic, and its probability.

#### Result:

In [None]:
def GetDocTopicPair(doc_num, corpus, lda_model_tfidf):
    """Return `(doc_num, best_topic, max_prob)` for one document of `corpus`.

    The document at index `doc_num` is passed to
    `lda_model_tfidf.get_document_topics`, which is expected to yield
    `(topic, probability)` pairs. The pair with the highest probability wins;
    on a tie the pair that comes later wins. If no pair has a probability of
    at least 0 (for instance when there are no pairs), the result is
    `(doc_num, 0, 0)`.
    """
    # Weights of the requested document.
    weights = corpus[doc_num]

    # Running best, starting at topic 0 with probability 0.
    best_topic, max_prob = 0, 0
    for topic_id, topic_prob in lda_model_tfidf.get_document_topics(weights):
        if topic_prob >= max_prob:
            best_topic, max_prob = topic_id, topic_prob

    return (doc_num, best_topic, max_prob)

#### Task 4 
##### Apply the above function to each document in our corpus by using a loop.
##### What does the list of tuples contain?

#### Result:

In [None]:
# Number of documents, as recorded by the dictionary.
num_docs = dictionary_ex.num_docs

# Build the document-topic tuple of every document index, in order.
doc_topic_pairs_ex = [
    GetDocTopicPair(doc_idx, corpus_tfidf_ex, lda_model_tfidf_ex)
    for doc_idx in range(num_docs)
]

print(doc_topic_pairs_ex[:10])

- The list of tuples pairs each document index with its best-fit topic and that topic's probability.

#### Task 5 
##### Create a dataframe called `doc_topic_df_ex` and assign the list of tuples to it.


#### Result:

In [None]:
# Names for the three values held in each tuple.
column_names = ["doc_id", "best_topic", "best_probability"]

# Turn the list of tuples into a dataframe and label its columns.
doc_topic_df_ex = pd.DataFrame(doc_topic_pairs_ex)
doc_topic_df_ex.columns = column_names

print(doc_topic_df_ex.head())

#### Task 6 
##### Retrieve all documents with a word count of at least 3 and assign the original index from the UN data to our `doc_topic_df_ex` dataframe.
##### Print the last 5 rows.


#### Result:

In [None]:
# Flag the articles we kept, i.e. those with a word count of 3 or more.
kept_mask = word_counts_array_ex >= 3

# Positions (along the first axis) of the flagged articles.
valid_snippets_ex = np.nonzero(kept_mask)[0]
print(len(valid_snippets_ex))

In [None]:
# Use the original article indices as the row labels of the dataframe.
doc_topic_df_ex.index = valid_snippets_ex

# Show the last 5 rows.
print(doc_topic_df_ex.tail(n = 5))

#### Task 7 
##### Retrieve all documents assigned to topic 2. Save it in `topic2_docs` and output the top ten documents assigned to that topic.
##### Print the number of documents assigned to topic 2.

#### Result:

In [None]:
# Keep the documents whose best topic is topic 2 (stored as 1 because topic indices
# start at 0), then order them from the most to the least probable.
topic2_docs = (
    doc_topic_df_ex
    .query("best_topic==1")
    .sort_values(by = "best_probability", ascending = False)
)
print(topic2_docs.head(10))

In [None]:
# Let's see how many documents were assigned to that topic:
# the first value of the printed shape is the number of rows, i.e. documents.
print(topic2_docs.shape)

#### Task 8 
##### Get the indices of the top 15 documents in the topic and then the headlines of the top 15 documents in the topic from the 
##### original UN dataframe.

#### Result:

In [None]:
# Let's get the index labels of the top 15 documents that were assigned to that topic.
# A plain slice is used here; a trailing comma would turn it into a 1-tuple key.
top_15 = topic2_docs.index[:15]

# Look up those documents in the original UN dataframe and inspect their titles.
UN_articles_topic2 = UN.loc[top_15, :]
print(UN_articles_topic2[['title']])

#### Task 9 
##### Save the LDA visualization as a HTML file called `UN_LDAvis`.

#### Result:

In [None]:
# Save the plot as a self-contained HTML file.
# `plot_dir` is a `pathlib.Path`, so the file name is joined with `/`
# (adding a string with `+` raises a TypeError); the result is passed as a string.
pyLDAvis.save_html(vis, str(plot_dir / "UN_LDAvis.html"))

## Exercise 4

#### Task 1 
##### Generate a TDM from corpus weighted with TF-IDF, name it `TDM_tf_idf_ex`
##### Check the type and the dimensions of the TDM.
##### How many terms and documents are there in the 2D array?

#### Result:

In [None]:
# The DTM has one row per document and one column per term.
num_docs_ex, num_terms_ex = DTM_ex.shape[0], DTM_ex.shape[1]

# Turn the TF-IDF weighted corpus into a dense TDM matrix.
TDM_tf_idf_ex = matutils.corpus2dense(
    corpus_tfidf_ex,
    num_terms = num_terms_ex,
    num_docs = num_docs_ex,
)

print(type(TDM_tf_idf_ex))
print(TDM_tf_idf_ex.shape)

#### Task 2
##### Convert the above TDM into a DTM called `DTM_tf_idf_ex`.
##### Print the dimensions of the matrix.
##### Save the matrix created as a dataframe called `DTM_df_ex`.

#### Result:

In [None]:
# Swap the axes of the TDM to obtain the DTM.
DTM_tf_idf_ex = TDM_tf_idf_ex.T

print(DTM_tf_idf_ex.shape)

In [None]:
# Wrap the TF-IDF weighted DTM in a dataframe.
DTM_df_ex = pd.DataFrame(
    data = DTM_tf_idf_ex,
    index = valid_snippets_ex,  #<- set index to original article index
    columns = DTM_ex.columns,
)
print(DTM_df_ex.head())

#### Task 3 
##### Compute cosine similarity for the `DTM_tf_idf_ex` matrix.
##### Print the shape of the matrix.
##### Save the similarity matrix as a dataframe called `similarity_df_ex`.

#### Result:

In [None]:
# Compute the cosine similarity between the rows of the DTM.
similarity_ex = cosine_similarity(DTM_tf_idf_ex)

# Inspect the type and the shape of the result.
print(type(similarity_ex))

print(similarity_ex.shape)

# Wrap the similarity matrix in a dataframe labelled on both axes
# with the original article indices.
similarity_df_ex = pd.DataFrame(
    data = similarity_ex,
    index = valid_snippets_ex,
    columns = valid_snippets_ex,
)