In [None]:

#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## ADVANCED TEXT MINING PART3 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs



In [None]:
#=================================================-
#### Slide 3: Import packages  ####

# Helper packages.
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Packages for loading pre-trained word-embedding model
import gensim
from gensim.models import Word2Vec



In [None]:
#=================================================-
#### Slide 4: Directory settings  ####

from pathlib import Path
# `home_dir` is the home directory of the current user.
home_dir = Path.home()

# `main_dir` is the `booz-allen-hamilton` folder located on the Desktop.
main_dir = home_dir.joinpath("Desktop", "booz-allen-hamilton")

# `data_dir` is the `data` folder inside `main_dir`.
data_dir = main_dir.joinpath("data")



In [None]:
#=================================================-
#### Slide 5: Working directory  ####

# Move into the data directory so that relative file names resolve there.
os.chdir(data_dir)
# Confirm the switch by printing the directory we are now in.
working_dir = os.getcwd()
print(working_dir)



In [None]:
#=================================================-
#### Slide 6: Loading text data  ####

# Read the corpus from a csv file in the current working directory.
NYT = pd.read_csv(filepath_or_buffer = 'NYT_article_data.csv')



In [None]:
#=================================================-
#### Slide 7: Recap: corpus pre-processing steps  ####

# Number of entries in the `snippet` column.
num_docs = NYT["snippet"].shape[0]
print(num_docs)

# Tokenize every snippet; the result holds one list of tokens per snippet.
NYT_tokenized = list(map(word_tokenize, NYT["snippet"]))



In [None]:
#=================================================-
#### Slide 8: Cleaning function  ####

def prep_text(text_tokenized, custom_stop_words=None):
    """Clean tokenized documents for text mining.

    Every token is lower-cased; stop words and tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) are then dropped.

    Parameters
    ----------
    text_tokenized : sequence of sequences of str
        One sequence of tokens per document.
    custom_stop_words : iterable of str, optional
        Words to remove. When omitted, the module-level `stop_words`
        list is used, which is the original behavior.

    Returns
    -------
    clean_text_list : list of str
        One space-joined string of the remaining tokens per document.
    clean_text : list of list of str
        The remaining tokens per document.
    """
    if custom_stop_words is None:
        custom_stop_words = stop_words
    # A set makes each membership check constant-time; scanning the original
    # list for every token is needlessly slow on a large corpus.
    stop_word_set = set(custom_stop_words)

    clean_text = [
        [
            word
            # 1. Convert to lower case.
            for word in (token.lower() for token in tokens)
            # 2. Remove stop words.
            # 3. Remove punctuation and any non-alphabetical characters.
            if word not in stop_word_set and word.isalpha()
        ]
        for tokens in text_tokenized
    ]

    clean_text_list = [' '.join(snippet) for snippet in clean_text]

    return clean_text_list, clean_text



In [None]:
#=================================================-
#### Slide 9: Prep NYT text for analysis  ####

# Clean the tokenized snippets; keep both the token lists and the joined strings.
NYT_clean_list, NYT_clean = prep_text(NYT_tokenized)

# Show the first three entries of each representation.
for preview in (NYT_clean[:3], NYT_clean_list[:3]):
    print(preview)



In [None]:
#=================================================-
#### Slide 10: Recap: create a DTM  ####

# Initialize `CountVectorizer`.
vec = CountVectorizer()

# Transform the list of snippets into DTM.
X = vec.fit_transform(NYT_clean_list)
print(X.toarray()) #<- show output as a matrix

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older installs.
if hasattr(vec, "get_feature_names_out"):
    # Convert the returned array to a list so the printed preview keeps the
    # same list formatting as before.
    feature_names = list(vec.get_feature_names_out())
else:
    feature_names = vec.get_feature_names()
print(feature_names[:10])



In [None]:
#=================================================-
#### Slide 11: Recap: create a DTM (cont'd)  ####

# Convert the matrix into a pandas dataframe for easier manipulation.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use `get_feature_names_out()` when the installed version provides it.
if hasattr(vec, "get_feature_names_out"):
    dtm_columns = vec.get_feature_names_out()
else:
    dtm_columns = vec.get_feature_names()
DTM_not_stemmed = pd.DataFrame(X.toarray(), columns = dtm_columns)
print(DTM_not_stemmed.head())



In [None]:
#=================================================-
#### Slide 22: gensim.models.Word2Vec  ####

# gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`; the old
# keywords raise TypeError there. Pick the names matching the installed
# version so this cell runs on both gensim 3.x and 4.x.
if int(gensim.__version__.split(".")[0]) >= 4:
    w2v_kwargs = {"vector_size": 50, "epochs": 15}
else:
    w2v_kwargs = {"size": 50, "iter": 15}

# Train 50-dimensional vectors on the cleaned token lists, ignoring words
# that occur fewer than 3 times.
# NOTE(review): per the gensim docs, `seed` alone does not guarantee identical
# vectors across runs unless training is limited to one worker thread --
# confirm whether exact reproducibility is required here.
model = Word2Vec(NYT_clean, min_count = 3, seed = 2, **w2v_kwargs)
print(model.vector_size)

print(NYT_clean[0])




In [None]:
#=================================================-
#### Slide 23: Word2Vec: most similar words  ####

# Words the trained model ranks as most similar to 'government'.
government_neighbors = model.wv.most_similar('government')
print(government_neighbors)



In [None]:
#=================================================-
#### Slide 24: Word2Vec: most similar words  ####

# The five words the trained model ranks as most similar to 'trade'.
trade_neighbors = model.wv.most_similar('trade', topn = 5)
print(trade_neighbors)



In [None]:
#=================================================-
#### Slide 26: Exercise 1  ####





In [None]:
#=================================================-
#### Slide 33: Load GloVe text files  ####

# Number of glove dimensions.
GLOVE_DIM = 200

# Path to the pre-trained glove embeddings.
# `data_dir` is a `pathlib.Path`, and `Path + str` raises TypeError, so the
# file name is appended with the `/` operator instead. Wrapping in `Path(...)`
# keeps this working if `data_dir` was set as a plain string.
glove_file = Path(data_dir) / "glove.6B.200d.txt"



In [None]:
#=================================================-
#### Slide 34: Load GloVe text files  ####

def LoadGloveModel(glove_file):
    """Read GloVe embeddings from a text file into a dictionary.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as the float components of its vector.

    Parameters
    ----------
    glove_file : str or path-like
        Location of the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each word to a numpy array holding its embedding.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print("Loading GloVe Model")
    model = {}
    # The context manager closes the file even if parsing fails part-way.
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            splitLine = line.split()
            # Skip blank lines (e.g. a trailing empty line) instead of
            # failing with IndexError on `splitLine[0]`.
            if not splitLine:
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.", len(model), " words loaded!")
    return model

# Read the embeddings stored at `glove_file` into a word -> vector dictionary.
glove_model = LoadGloveModel(glove_file = glove_file)



In [None]:
#=================================================-
#### Slide 35: Load GloVe text files  ####

# Peek at the first three word -> embedding pairs.
{word: glove_model[word] for word in list(glove_model)[:3]}



In [None]:
#=================================================-
#### Slide 37: Word count of corpus  ####

# Sum each DTM column to get the corpus-wide count per word, saved as a dictionary.
corpus_freq_dist = DTM_not_stemmed.sum(axis = 0).to_dict()
# Print the preview explicitly: a bare expression in the middle of a cell is
# evaluated and then silently discarded.
print(dict(list(corpus_freq_dist.items())[0:5]))

# Extract word counts for exploratory analysis.
word_counts = pd.DataFrame(list(corpus_freq_dist.items()), columns = ['word', 'count'])
print(word_counts.head())



In [None]:
#=================================================-
#### Slide 38: Word embeddings matrix  ####

# Vocabulary, in the row order of `word_counts`.
words = list(word_counts.word)
DICT_SIZE = len(words)
NUM_MESSAGES = len(NYT_tokenized)

# One row per vocabulary word, initialized to zeros.
word_emb_matrix = np.zeros((DICT_SIZE, GLOVE_DIM))

# Copy in the GloVe vector of every word that has one; words missing from
# `glove_model` keep their all-zero row.
for row, token in enumerate(words):
    embedding = glove_model.get(token)
    if embedding is None:
        continue
    word_emb_matrix[row] = embedding




In [None]:
#=================================================-
#### Slide 39: Word embeddings matrix  ####

# Dimensions of the embeddings matrix, followed by its first row.
print(np.shape(word_emb_matrix))
print(word_emb_matrix[0, :])



In [None]:
#=================================================-
#### Slide 40: NYT Article embeddings matrix  ####

# Convert dataframe to a numpy array.
# `np.asarray` accepts both a DataFrame and an ndarray, so re-running this
# cell after `DTM_not_stemmed` has already been converted no longer fails
# with AttributeError (an ndarray has no `.to_numpy()`).
DTM_not_stemmed = np.asarray(DTM_not_stemmed)

# Compute sums of all word counts for each NYT snippet (one total per row).
DTM_row_sums = np.sum(DTM_not_stemmed, axis=1)

# For every snippet, add up the embeddings of its words weighted by their counts.
NYT_embeddings_matrix = DTM_not_stemmed.dot(word_emb_matrix)
print(DTM_not_stemmed.shape)



In [None]:
#=================================================-
#### Slide 41: NYT Article embeddings  ####

# Turn each snippet's summed embedding into an average by dividing by the
# snippet's total word count.
# A snippet with no words left after cleaning has a row sum of 0; dividing by
# it would fill the row with NaN, so such rows are skipped and keep their
# all-zero embedding.
has_words = DTM_row_sums > 0
NYT_embeddings_matrix[has_words] = np.true_divide(
    NYT_embeddings_matrix[has_words],
    DTM_row_sums[has_words][:, np.newaxis],
)

# Save as a dataframe; no explicit IDs are attached, so rows carry the
# default integer index.
NYT_emb_df = pd.DataFrame(NYT_embeddings_matrix)

print(NYT_emb_df.head())



In [None]:
#=================================================-
#### Slide 43: Exercise 2  ####





In [None]:
#=================================================-
#### Slide 49: Find documents similar to snippet 25  ####

# Keep the snippet column handy and display the snippet stored under label 1.
# NOTE(review): the slide title mentions snippet 25, but the code looks up
# index 1 -- confirm which one is intended.
NYT_snippets = NYT.snippet
NYT_snippets[1]



In [None]:
#=================================================-
#### Slide 50: Find documents similar to snippet 25  ####

# Average embedding of the target snippet (row label 1).
target_NYT_emb = NYT_emb_df.loc[1].to_numpy()
# Print the preview explicitly: a bare expression in the middle of a cell is
# evaluated and then silently discarded.
print(target_NYT_emb[0:5])
# Display the vector as a single-row 2-D array; `reshape` returns a new array
# object, so `target_NYT_emb` itself stays 1-D.
target_NYT_emb.reshape(1, -1)



In [None]:
#=================================================-
#### Slide 51: Compute cosine similarity   ####

# Cosine similarity between every snippet embedding and the target embedding;
# the target is reshaped to a single-row 2-D array for the call.
similarity_scores = cosine_similarity(NYT_emb_df, target_NYT_emb.reshape(1, -1))
# Print the preview explicitly: a bare expression in the middle of a cell is
# evaluated and then silently discarded.
print(similarity_scores[0:5])

# Label the scores with the index of `NYT` so they can be traced back to snippets.
similarity_scores_df = pd.DataFrame(similarity_scores,
                                    columns = ['similarity_score'],
                                    index = NYT.index)

print(similarity_scores_df.head())



In [None]:
#=================================================-
#### Slide 52: View results  ####

# Rank snippets from most to least similar and show the top of the ranking.
ranked_scores = similarity_scores_df.sort_values(by = 'similarity_score', ascending = False)
ranked_scores.head()



In [None]:
#=================================================-
#### Slide 53: View results  ####

# Print the snippets stored under labels 168 and 112.
# NOTE(review): these labels are hard-coded -- presumably taken from the
# ranking above; confirm they still match after re-running.
for snippet_label in (168, 112):
    print(NYT_snippets[snippet_label])



In [None]:
#=================================================-
#### Slide 54: Cosine similarity score distribution plot  ####

# Plot results: histogram of the similarity scores in 20 bins.
fig = plt.figure(figsize=(15,10))
cm = plt.cm.PRGn
n, bins, patches = plt.hist(similarity_scores_df['similarity_score'],
                            20, color='green')

# Recolor each bar with a color taken from the PRGn colormap.
for i, p in enumerate(patches):
    plt.setp(p, 'facecolor', cm(i/25)) # notice the i/25

# Title, axis labels and rendering apply to the whole figure, so they run once
# after the loop. Inside the loop, `plt.show()` rendered the figure after the
# first bar was recolored and was then called again for every remaining bar.
fig.suptitle('Distribution of Cosine Similarity Scores', fontsize=20)
plt.xlabel("Similarity score", fontsize=18)
plt.ylabel("Number of NYT snippets", fontsize=18)
plt.show()



In [None]:
#=================================================-
#### Slide 56: Exercise 3  ####



