# Advanced Text Mining Part 1 - Exercises with answers

## Exercise 1

#### Task 1
##### Load libraries that are used in this module.

#### Result:

In [None]:
# Helper packages.
import os 
import pickle
import pandas as pd
import numpy as np

# Cosine similarity and clustering packages.
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram, fcluster
import gensim
from gensim import matutils

# Network creation and visualization.
import networkx as nx
from pyvis.network import Network

# Other plotting tools.
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

#### Task 2
##### Set `main_dir` to the location of your `booz-allen-hamilton` folder.
##### Make `data_dir` from `main_dir` by appending the remainder of the path to the data directory.
##### Make `plot_dir` from `main_dir` by appending the remainder of the path to the plots directory.

#### Result:

In [None]:
from pathlib import Path

# `home_dir` is the current user's home directory.
home_dir = Path.home()

# `main_dir` is the `booz-allen-hamilton` folder on the Desktop.
main_dir = home_dir.joinpath("Desktop", "booz-allen-hamilton")

# `data_dir` is the `data` folder inside `main_dir`.
data_dir = main_dir.joinpath("data")

# `plot_dir` is the `plots` folder inside `main_dir`.
plot_dir = main_dir.joinpath("plots")

#### Task 3 
##### Set the working directory to `data_dir`.
##### Check if the working directory is updated to `data_dir`.

#### Result:

In [None]:
# Make `data_dir` the current working directory.
os.chdir(data_dir)

# Print the current working directory to confirm the change.
working_dir = os.getcwd()
print(working_dir)

#### Task 4
##### Load the pickled files from the previous exercises:

##### 'similarity_df_ex.sav', 'similarity_ex.sav', 'valid_snippets_ex.sav', 'doc_topic_df_ex.sav' and 'corpus_tfidf_ex.sav', and name them
##### `similarity_df_ex`, `similarity_ex`, `valid_snippets_ex`, `doc_topic_df_ex` and `corpus_tfidf_ex`, respectively.

#### Result:

In [None]:
# NOTE: `pickle.load` can execute arbitrary code while unpickling, so only
# load `.sav` files that come from a trusted source.
def _load_pickle(filename):
    """Return the object stored in the pickle file `filename`.

    The file is opened in a `with` block so that its handle is closed as
    soon as the object has been read, even if unpickling raises.
    """
    with open(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Load each pickled object from the current working directory.
similarity_df_ex = _load_pickle('similarity_df_ex.sav')
similarity_ex = _load_pickle('similarity_ex.sav')
valid_snippets_ex = _load_pickle('valid_snippets_ex.sav')
doc_topic_df_ex = _load_pickle('doc_topic_df_ex.sav')
corpus_tfidf_ex = _load_pickle('corpus_tfidf_ex.sav')

#### Task 5
##### Load the UN agreement titles data from the original file, 'UN_agreement_titles.csv'.

#### Result:

In [None]:
# Read the UN agreement titles from the CSV file in the working directory.
un_titles_file = 'UN_agreement_titles.csv'
UN = pd.read_csv(un_titles_file)

#### Task 6
##### Create a graph from the similarity matrix `similarity_df_ex`.
##### Convert the graph into a dataframe in the form of an edge list called `edgelist_df_ex`.
##### Print the shape of `edgelist_df_ex`.

#### Result:

In [None]:
# Build a graph by reading the similarity matrix as an adjacency matrix.
graph = nx.from_pandas_adjacency(similarity_df_ex)

# Flatten the graph into an edge list dataframe.
edgelist_df_ex = nx.to_pandas_edgelist(graph)

# Preview the first rows of the edge list.
edge_preview = edgelist_df_ex.head()
print(edge_preview)

In [None]:
# Print the (rows, columns) shape of the edge list dataframe.
print(edgelist_df_ex.shape)

#### Task 7
##### Create a cosine similarity score distribution by plotting the weights of the edges.
##### Filter out all pairs of documents with weights below 0.4 and above 0.8.
##### Print the head and shape of the new `edgelist_df_ex`.

#### Result:

In [None]:
# Histogram of the edge weights (i.e. the cosine similarity scores).
edge_weights = edgelist_df_ex['weight']
plt.hist(edge_weights)
plt.title('Cosine similarity score distribution')
plt.xlabel('Cosine similarity score')
plt.show()

In [None]:
# Keep only the edges whose weight lies strictly between 0.4 and 0.8.
weight_filter = 'weight>0.4 and weight<0.8'
edgelist_df_ex = edgelist_df_ex.query(weight_filter)

# Preview the first rows of the filtered edge list.
filtered_preview = edgelist_df_ex.head()
print(filtered_preview)

In [None]:
# Print the (rows, columns) shape of the filtered edge list dataframe.
print(edgelist_df_ex.shape)

#### Task 8
##### Create an empty network object `network_ex` with the following base parameters:
    - height - 100%
    - width - 60%
    - bgcolor - #FFFFFF
    - font_color - #000000

#### Result:

In [None]:
# Base parameters for the (initially empty) network object.
network_options = {
    "height": "100%",
    "width": "60%",
    "bgcolor": "#FFFFFF",
    "font_color": "#000000",
}
network_ex = Network(**network_options)

# Configure the physics layout and the edge smoothing of the network.
network_ex.force_atlas_2based()
network_ex.set_edge_smooth('dynamic')
print(network_ex)

#### Task 9
##### Populate the empty network with edge and node data. Use `edgelist_df_ex` and
##### zip the three necessary columns - source, target, and weight - into an iterable of tuples.
##### Print network nodes and network edges of your choice.

#### Result:

In [None]:
# Walk the edge list one (source, target, weight) triple at a time.
edge_triples = zip(
    edgelist_df_ex['source'],
    edgelist_df_ex['target'],
    edgelist_df_ex['weight'],
)
for source_id, target_id, edge_weight in edge_triples:
    # Register both endpoints, using the node id as label and hover title.
    for node_id in (source_id, target_id):
        network_ex.add_node(node_id, node_id, title=node_id)
    # Connect the endpoints with an edge whose value is the weight.
    network_ex.add_edge(source_id, target_id, value=edge_weight)

In [None]:
# Show the first five nodes and the first five edges of the network.
print(network_ex.nodes[:5])
print(network_ex.edges[:5])
# NOTE(review): `shape` is read from the pyvis network object, not from a
# dataframe — confirm this is the intended output.
print(network_ex.shape)

#### Task 10
##### Get the neighbor map for each node.
##### Print the document IDs that are most similar to document 25.

#### Result:

In [None]:
# Build the adjacency list: each node id mapped to its neighboring node ids.
neighbor_map = network_ex.get_adj_list()

# Print the neighbors of document 25, i.e. the documents it shares an edge with.
doc_id = 25
print(neighbor_map[doc_id])

#### Task 11
##### Add the neighbor node information into the hover over tooltip.
##### Print information for node 5.
##### Save the network graph as `UN_similar_snippets` and show it in browser.

#### Result:

In [None]:
# Add neighbor data to node hover data.
for node in network_ex.nodes:
    title = "Most similar articles: <br>"
    neighbors = list(neighbor_map[node["id"]])
    title = title + "<br>".join(str(neighbor) for neighbor in neighbors)
    node["title"] = title

print(network_ex.nodes[5])

In [None]:
# Save html and show graph in browser.
# `plot_dir` is a `pathlib.Path`, so `plot_dir + "/..."` raises a TypeError;
# join with `/` and pass the result to pyvis as a plain string file name.
network_ex.show(str(plot_dir / "UN_similar_snippets.html"))

#### Task 12
##### Hover over a node of your choice to see the list of all its neighbors. For example, node 924 is used below.
##### Print the articles from the edgelist `edgelist_df_ex` with their weights.
##### Look up the articles closest to the node and print them.
##### Modify the graph's appearance by using `physics` parameter and re-save the graph.
##### Optional: Try using `nodes` and `edges` parameters to change the appearance of the graph.

#### Result:

In [None]:
# Select the edges whose source node is 924 and print them with their weights.
source_filter = "source==924"
edgelist_df_subset_ex = edgelist_df_ex.query(source_filter)
print(edgelist_df_subset_ex)

In [None]:
# Print the subset of edges whose source node is 924.
print(edgelist_df_subset_ex)

In [None]:
# Print the first column of the row at position 924 of `UN`.
print(UN.iat[924, 0])

In [None]:
# Print the first column of the row at position 450 of `UN`.
print(UN.iat[450, 0])

In [None]:
# Print the first column of the row at position 914 of `UN`.
print(UN.iat[914, 0])
# These 3 articles are similar because their snippets all start and end
# the same way: Development Credit Agreement...

In [None]:
# Add the `physics` control panel to the rendered graph so that its
# look can be adjusted interactively.
button_groups = ['physics']
network_ex.show_buttons(filter_=button_groups)

In [None]:
# Save html and show graph in browser.
# `plot_dir` is a `pathlib.Path`, so `plot_dir+"/..."` raises a TypeError;
# join with `/` and pass the result to pyvis as a plain string file name.
network_ex.show(str(plot_dir / "UN_similar_snippets.html"))

## Exercise 2

#### Task 1 
##### Compute the distance matrix `distance_ex` from `similarity_ex`.
##### Create the linkage matrix based on `distance_ex` and print the first 10 rows.
##### Print the shape of the matrix and the first 4 links.
##### Print the 110th link. Which clusters are linked? What is the distance between them? How many observations are there in the new cluster?


#### Result:

In [None]:
# Turn similarities into distances: distance = 1 - similarity.
distance_ex = 1 - similarity_ex

# Build the linkage matrix with the `ward` clustering algorithm.
# NOTE(review): if `distance_ex` is a square 2-D matrix, SciPy's `ward`
# treats its rows as observation vectors rather than as precomputed
# distances — confirm this is intended.
linkage_matrix_ex = ward(distance_ex)

# First 10 rows of the linkage matrix.
print(linkage_matrix_ex[:10])

# Shape of the linkage matrix.
print(linkage_matrix_ex.shape)

# First 4 links.
print(linkage_matrix_ex[:4])

# The 110th link (index 109, because indexing starts at 0).
print(linkage_matrix_ex[109])

The 110th link was between clusters `426` and `1050`, with a distance of `0`; the new cluster contains `3` observations.

#### Task 2 
##### Visualize the hierarchical clusters with right orientation and leaf font size 14. Set figsize to (15, 40).


#### Result:

In [None]:
# Create a 15 x 40 figure to draw the hierarchical clusters on.
fig, axes = plt.subplots(figsize=(15, 40))

# Draw the dendrogram with right orientation, the valid snippets as leaf
# labels and a leaf font size of 14.
# Note that `axes` is rebound to the value returned by `dendrogram`.
dendrogram_options = {
    "orientation": "right",
    "labels": valid_snippets_ex,
    "leaf_font_size": 14,
}
axes = dendrogram(linkage_matrix_ex, **dendrogram_options)

#### Task 3 
##### Split the dendrogram based on maximum clusters. Set the maximum number of clusters named `k` as 3.

#### Result:

In [None]:
# Upper bound on the number of flat clusters.
k = 3

# Cut the hierarchy into at most `k` flat clusters (`maxclust` criterion)
# and get the cluster label of each snippet.
cluster_labels = fcluster(linkage_matrix_ex, t=k, criterion='maxclust')
print(cluster_labels)

#### Task 4 
##### Create a variable with the valid snippets of `UN` and name it `UN_valid_articles`.
##### Add `cluster_labels` to `UN_valid_articles` and name the column as `hclust_label`.
##### Sort `doc_topic_df_ex` by `doc_id` and save.
##### Add a column called `LDA_topic_label` to `UN_valid_articles` from `best_topic` in `doc_topic_df_ex`.
##### Save the plot and the data in png and csv format respectively.

#### Result:

In [None]:
# Keep only the rows of `UN` selected by `valid_snippets_ex`. `.copy()` makes
# an independent dataframe, so adding columns below does not trigger
# pandas' SettingWithCopyWarning.
UN_valid_articles = UN.loc[valid_snippets_ex].copy()

# Attach the hierarchical cluster label of each snippet.
UN_valid_articles['hclust_label'] = cluster_labels

# Sort the document-topic dataframe by document id.
doc_topic_df_ex = doc_topic_df_ex.sort_values(by = "doc_id")

# NOTE(review): this assignment aligns on index labels, not on row position —
# confirm the index of `doc_topic_df_ex` matches that of `UN_valid_articles`.
UN_valid_articles['LDA_topic_label'] = doc_topic_df_ex['best_topic']

# `plot_dir` and `data_dir` are `pathlib.Path` objects, which cannot be joined
# to a string with `+` (TypeError); build the output paths with `/` instead.
fig.savefig(plot_dir / 'UN_hclust.png')
UN_valid_articles.to_csv(data_dir / 'UN_snippets_with_cluster_labels.csv')