# Advanced Text Mining Part 3 - Exercises with answers

## Exercise 1

#### Task 1
##### Load libraries that are used in this module.

#### Result:

In [None]:
# Helper packages.
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Packages for loading pre-trained word-embedding model
import gensim
from gensim.models import Word2Vec

#### Task 2
##### Set `main_dir` to the location of your `booz-allen-hamilton` folder.
##### Make `data_dir` from the `main_dir` and concatenate remainder of the path to data directory.
##### Make `plots_dir` from the `main_dir` and concatenate remainder of the path to plots directory.

#### Result:

In [None]:
from pathlib import Path

# Set `home_dir` to the home directory of the current user.
home_dir = Path.home()

# Set `main_dir` to the location of your `booz-allen-hamilton` folder.
main_dir = home_dir / "Desktop" / "booz-allen-hamilton"

# Make `data_dir` from the `main_dir` and remainder of the path to data directory.
data_dir = main_dir / "data"

# Make `plots_dir` from the `main_dir` and remainder of the path to plots directory.
plots_dir = main_dir / "plots"

# Keep the previously used name `plot_dir` as an alias of `plots_dir` so that
# any code written against the old name keeps working.
plot_dir = plots_dir

#### Task 3 
##### Set the working directory to `data_dir`.
##### Check if the working directory is updated to `data_dir`.

#### Result:

In [None]:
# Change the working directory to `data_dir`, so that relative paths are
# resolved against the data folder from here on.
os.chdir(data_dir)
# Check the working directory by printing the current one.
print(os.getcwd())

#### Task 4
#####  Load the corpus from `UN_agreement_titles.csv` into a new variable `agreements`.

#### Result:

In [None]:
# Load the corpus of agreement titles from the CSV file in `data_dir`.
# `data_dir` is a `pathlib.Path`, so the file name is joined with `/`;
# `Path + str` is not supported and raises a TypeError.
agreements = pd.read_csv(data_dir / 'UN_agreement_titles.csv')

#### Task 5
##### Make a series from the dataframe that contains only the `title` column of `agreements` and name it `titles`.

#### Result:

In [None]:
# Create a series from the dataframe, name it `titles`.
# Selecting a single column of `agreements` by name yields that column only.
titles = agreements["title"]

#### Task 6
##### Tokenize each title in the series `titles` and assign it to `ex_titles_tokenized`.
##### Assign the first tokenized title to `ex_titles_words` and print it out.

#### Result:

In [None]:
# Tokenize each title into a large list of tokenized titles.
# Iterating over the series values directly keeps the titles in order and does
# not depend on the index labels being exactly 0..n-1.
ex_titles_tokenized = [word_tokenize(title) for title in titles]

# First tokenized title.
ex_titles_words = ex_titles_tokenized[0]
print(ex_titles_words)

#### Task 7
##### Define a cleaning function that converts tokens to lower case, removes stop words, and removes punctuation and any non-alphabetical tokens for each title in the list `ex_titles_tokenized`, and that returns `ex_titles_clean_list` and `ex_titles_clean`. (The function is run in the next task.)

#### Result:

In [None]:
def cleaning(ex_titles_clean, ex_titles_tokenized):
    """Clean tokenized documents and return them as strings and as token lists.

    For every document the tokens are converted to lower case, stop words
    (taken from the module-level `stop_words`) are removed, and tokens that
    are not purely alphabetical (punctuation, numbers, ...) are dropped.

    Args:
        ex_titles_clean: Pre-allocated list with at least as many slots as
            there are documents in `ex_titles_tokenized`. Slot `i` is
            overwritten in place with the cleaned tokens of document `i`.
        ex_titles_tokenized: List of documents, each a list of string tokens.

    Returns:
        A tuple `(ex_titles_clean_list, ex_titles_clean)` where
        `ex_titles_clean_list` holds each cleaned document joined into a
        single space-separated string, and `ex_titles_clean` is the same
        list object that was passed in, now holding the cleaned token lists.
    """
    # Build the lookup once: set membership is constant time, while testing
    # against the stop-word list rescans it for every single token.
    stop_word_set = set(stop_words)

    # Process words in all documents.
    for i, tokens in enumerate(ex_titles_tokenized):
        # 1. Convert to lower case.
        lowered = (token.lower() for token in tokens)

        # 2. Remove stopwords.
        # 3. Remove punctuation and any non-alphabetical characters.
        ex_titles_clean[i] = [
            word for word in lowered
            if word not in stop_word_set and word.isalpha()
        ]

    ex_titles_clean_list = [' '.join(snippet) for snippet in ex_titles_clean]
    return ex_titles_clean_list, ex_titles_clean

#### Task 8
##### Create a list `ex_titles_clean_not_stemmed` of placeholders for the clean titles, with the same length as `ex_titles_tokenized`.
##### Clean the tokens of each title in `ex_titles_tokenized` using the cleaning function, and save the results as `ex_titles_clean_list` and `ex_titles_clean`.

#### Result:

In [None]:
# Create a placeholder list for the clean titles, one slot per tokenized title.
ex_titles_clean_not_stemmed = [None] * len(ex_titles_tokenized)

# Run the cleaning function. It fills the placeholder list in place and returns
# the cleaned titles both as joined strings and as lists of tokens.
ex_titles_clean_list,ex_titles_clean = cleaning(ex_titles_clean_not_stemmed,ex_titles_tokenized)

#### Task 9
##### Initialize `CountVectorizer`
##### Transform the list of titles into DTM and show output as a matrix
##### Convert the matrix into a pandas dataframe for easier manipulation and print the top rows of the dataframe

#### Result:

In [None]:
# Initialize the vectorizer with its default settings.
vec = CountVectorizer()

# Learn the vocabulary from the cleaned titles and build the document-term matrix.
X = vec.fit_transform(ex_titles_clean_list)
# Show the DTM as a dense array.
print(X.toarray())

In [None]:
# Convert the DTM into a dataframe whose columns are named after the vocabulary terms.
# NOTE(review): newer scikit-learn releases are understood to replace
# `get_feature_names` with `get_feature_names_out` - confirm against the
# installed version.
ex_DTM_not_stemmed = pd.DataFrame(X.toarray(), columns = vec.get_feature_names())
# Print the top rows of the dataframe.
print(ex_DTM_not_stemmed.head())

#### Task 10

##### Using `ex_titles_clean`, create a `Word2Vec` model and name it `ex_model`.
##### Be sure to use the same parameters as we did in the module. 

##### Print the `vector_size` of `ex_model`. 
##### Also, just like we did in the module, see what similar words come up for `administration` and `united` in this model. 
#### Result:

In [None]:
# Train a Word2Vec model on the cleaned, tokenized titles.
# NOTE(review): `size` and `iter` look like the keyword names of older gensim
# releases; newer releases are understood to call them `vector_size` and
# `epochs` - confirm against the installed version.
ex_model = Word2Vec(ex_titles_clean, 
                 size = 200, 
                 min_count = 3,
                 iter = 5)

In [None]:
# Print the dimensionality of the word vectors of the trained model.
print(ex_model.vector_size)

In [None]:
# Print the words the model considers most similar to 'administration'.
# NOTE(review): presumably this fails if the word is not in the model's
# vocabulary (e.g. filtered out by `min_count`) - confirm.
print(ex_model.wv.most_similar('administration'))

In [None]:
# Print the words the model considers most similar to 'united'.
print(ex_model.wv.most_similar('united'))

## Exercise 2

#### Task 1

##### Save the path to the file with the pre-trained glove embeddings as `glove_file`.
##### We will be loading the file with vector size of 200. 

##### Define the `LoadGloveModel()` function as we did in class to extract word embeddings from the glove file.
##### Save the outputs from `LoadGloveModel` function as `ex_glove_model`. 

#### Result: 

In [None]:
# Number of glove dimensions.
GLOVE_DIM = 200

# Path to the file with the pre-trained glove embeddings.
# `data_dir` is a `pathlib.Path`, so the file name is joined with `/`;
# `Path + str` is not supported and raises a TypeError.
glove_file = data_dir / "glove.6B.200d.txt"

In [None]:
# Note: the file is opened with an explicit UTF-8 encoding so that reading it
# does not depend on the platform's default encoding.

def LoadGloveModel(glove_file):
    """Load GloVe word embeddings from a text file into a dictionary.

    Every non-empty line is expected to hold a word followed by the
    whitespace-separated components of its vector.

    Args:
        glove_file: Path of the GloVe text file to read.

    Returns:
        A dict that maps each word to a 1-D numpy array of floats.

    Raises:
        ValueError: If a vector component cannot be converted to a float.
    """
    print("Loading Glove Model")
    model = {}
    # `with` guarantees that the file handle is closed, even if parsing fails.
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            splitLine = line.split()
            # Skip blank lines; indexing their empty token list would raise
            # an IndexError.
            if not splitLine:
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.", len(model), " words loaded!")
    return model

# Load embeddings from file into a dictionary of word -> vector.
ex_glove_model = LoadGloveModel(glove_file)

#### Task 2
##### Check the first few embeddings of `ex_glove_model`. 
##### Create a frequency count of each word in the corpus using `ex_DTM_not_stemmed`  and save it to `ex_corpus_freq_dist`.
##### Save `ex_corpus_freq_dist` as a dataframe named `ex_word_counts`.

#### Result:

In [None]:
# Show the first five (word, vector) pairs of the loaded embeddings.
dict(list(ex_glove_model.items())[0:5])

In [None]:
# Sum every column of the DTM to get the corpus-wide count of each word,
# and save the resulting series as a dictionary of word -> count.
ex_corpus_freq_dist = ex_DTM_not_stemmed.sum(axis = 0).to_dict()
# Show the first five entries.
dict(list(ex_corpus_freq_dist.items())[0:5])

In [None]:
# Extract word counts for exploratory analysis:
# one row per word, with the columns `word` and `count`.
ex_word_counts = pd.DataFrame(list(ex_corpus_freq_dist.items()), columns = ['word', 'count'])

In [None]:
# Print the top rows of the word-count dataframe.
print(ex_word_counts.head())

#### Task 3
##### Initialize the following variables as shown:

In [None]:
# Initialize embeddings matrix.
# Vocabulary size: `ex_word_counts` has one row per word.
ex_DICT_SIZE = len(ex_word_counts.index)
# One zero-initialized row of length GLOVE_DIM per vocabulary word.
ex_word_emb_matrix = np.zeros((ex_DICT_SIZE, GLOVE_DIM))
# The words, in the same order as the rows of `ex_word_counts`.
ex_words = list(ex_word_counts.word)
# Number of documents (titles) in the corpus.
ex_NUM_MESSAGES = len(ex_titles_clean_not_stemmed)

##### Create a loop to extract the vectors from `ex_glove_model` and save to `ex_word_emb_matrix`.
##### Print its shape and the first vector.
#### Result:

In [None]:
# Copy the glove vector of every vocabulary word into its row of the matrix.
for i in range(ex_DICT_SIZE):
    w = ex_words[i]
    vect = ex_glove_model.get(w)

    if vect is None:
        # No pre-trained vector for this word: its row stays all zeros.
        continue

    ex_word_emb_matrix[i] = vect

In [None]:
# Print the shape of the embeddings matrix (vocabulary size x GLOVE_DIM).
print(ex_word_emb_matrix.shape)

In [None]:
# Print the embedding vector of the first vocabulary word.
print(ex_word_emb_matrix[0])

#### Task 4
##### Convert `ex_DTM_not_stemmed` to a  numpy array.
##### Compute sums of all word counts for each title and save as `ex_DTM_row_sums`.
##### Create `titles_embeddings_matrix` by multiplying `ex_DTM_not_stemmed` with `ex_word_emb_matrix`.

#### Result:

In [None]:
# Convert dataframe to a numpy array.
# Note: this rebinds `ex_DTM_not_stemmed` from a dataframe to an array, so the
# cell cannot be re-run without first rebuilding the dataframe.
ex_DTM_not_stemmed = ex_DTM_not_stemmed.to_numpy()

# Compute sums of all word counts for each title (one sum per row).
ex_DTM_row_sums = np.sum(ex_DTM_not_stemmed, axis=1)

# Multiply the counts by the word vectors: every row becomes the
# count-weighted sum of the embeddings of the words in that title.
titles_embeddings_matrix = ex_DTM_not_stemmed.dot(ex_word_emb_matrix)

#### Task 5
##### Compute the weighted average of each document by using a loop to average `titles_embeddings_matrix` using `ex_DTM_row_sums`.
##### Save `titles_embeddings_matrix` as a dataframe named `titles_emb_df` and print the results.

#### Result:

In [None]:
# Turn the weighted sum of each title into a weighted average by dividing the
# row by the number of counted words in that title.
for i in range(ex_NUM_MESSAGES):
    # A title without any counted word has a row sum of 0. Dividing by it
    # would fill the row with NaN (0/0), which scikit-learn's
    # `cosine_similarity` rejects, so such rows are left as zero vectors.
    if ex_DTM_row_sums[i] > 0:
        titles_embeddings_matrix[i] = np.true_divide(titles_embeddings_matrix[i], ex_DTM_row_sums[i])

In [None]:
# Save the averaged embeddings as a data frame (one row per title).
titles_emb_df = pd.DataFrame(titles_embeddings_matrix)
# Print the top rows of the dataframe.
print(titles_emb_df.head())

## Exercise 3

#### Task 1
##### Take a look at the first title (`titles[0]`)
##### Extract its vector representation from `titles_emb_df`, convert it to a numpy array and save it to `target_titles_emb`. Print the first 5 values.
* Note: You can use `pd.set_option('display.max_colwidth', -1)` and `pd.set_option('display.max_rows', 2000)` to see the whole (non-truncated) message in a column of a dataframe or in a series.

#### Result:

In [None]:
# Display the first title (the target document).
titles[0]

In [None]:
# Show the full contents of a column and up to 2000 rows instead of truncated output.
# NOTE(review): newer pandas releases are understood to reject -1 for
# `display.max_colwidth` in favor of None - confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 2000)

# Display the first 100 titles.
titles[:100]

In [None]:
# Average embedding of the first title, as a numpy array.
target_titles_emb = titles_emb_df.loc[0].to_numpy()
# Print its first five components.
print(target_titles_emb[0:5])

#### Task 2
##### Find cosine similarity `ex_similarity_scores` for `titles_emb_df` and `target_titles_emb`. Don't forget to reshape `target_titles_emb`.
##### Convert `ex_similarity_scores` into a dataframe named `ex_similarity_scores_df`. Set the index to `titles.index`.

#### Result:

In [None]:
# Cosine similarity between every title embedding and the target embedding.
# The target vector is reshaped into a single-row 2-D array first.
ex_similarity_scores = cosine_similarity(titles_emb_df, target_titles_emb.reshape(1, -1))
# Show the first five scores.
ex_similarity_scores[0:5]

In [None]:
# Wrap the scores in a dataframe with a single `similarity_score` column,
# indexed like `titles` so that a score can be matched to its title.
ex_similarity_scores_df = pd.DataFrame(ex_similarity_scores, 
                                    columns = ['similarity_score'],
                                    index = titles.index)
# Print the top rows of the dataframe.
print(ex_similarity_scores_df.head())

#### Task 3
##### Sort values of `ex_similarity_scores_df` in descending order.
##### Print the 3 documents that are most similar to the target document.

#### Result:

In [None]:
# Rank all titles by their similarity to the target, most similar first,
# and display the top rows.
ex_similarity_scores_df.sort_values('similarity_score', ascending = False).head()

In [None]:
# Print the target title itself.
print(titles[0])

In [None]:
# NOTE(review): the index 371 is hard-coded, presumably taken from the sorted
# scores above - confirm that it matches the output of your run.
print(titles[371])

In [None]:
# NOTE(review): the index 609 is hard-coded, presumably taken from the sorted
# scores above - confirm that it matches the output of your run.
print(titles[609])

#### Task 4
##### Plot a histogram to see the distribution of cosine similarity scores using the similarity scores from `ex_similarity_scores_df`.

#### Result:

In [None]:
# Draw a 20-bin histogram of the similarity scores.
fig = plt.figure(figsize=(15, 10))
cm = plt.cm.PRGn
n, bins, patches = plt.hist(
    ex_similarity_scores_df['similarity_score'], 20, color='green'
)

# Recolor the bars one by one along the colormap. Dividing the bar number by
# 25 (not by the 20 bins) keeps the colors within the first part of the map.
for i, p in enumerate(patches):
    p.set_facecolor(cm(i / 25))

# Title and axis labels.
fig.suptitle('Distribution of Cosine Similarity Scores', fontsize=20)
plt.xlabel("Similarity score", fontsize=18)
plt.ylabel("Number of titles", fontsize=18)
plt.show()