# Ex1.2 WordToVec

Make sure that all other kernels are shut down; otherwise you might run out of resources.

In [None]:
import time
# Start a high-resolution timer; the final cell uses `start` to report the total run time.
start = time.perf_counter()

You will experiment with word embeddings of all the State of the Union speeches. Ideally words that are close in meaning should get vectors that are close.

In [None]:
# Imports needed packages.
import gzip
import gensim 
import logging

# Configures the logging: show records of level INFO and above,
# each formatted as "time : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# This is to suppress warnings about the planned deprecation of some methods:
# every warning of category DeprecationWarning is ignored from here on.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

We define a function that will load the data.

In [None]:
def read_input(input_file):
    """Yield the tokenized words of each line of a text file.

    The file is read lazily, one line at a time. Each line (one speech in
    this notebook) is passed through gensim's ``simple_preprocess()``, so
    every yielded item is a list of words.

    Args:
        input_file: Path of the text file to read.

    Yields:
        The result of ``gensim.utils.simple_preprocess()`` for each line,
        in file order.
    """
    logging.info("reading file %s...this may take a while", input_file)
    with open(input_file, 'r') as f:
        # Iterating over the file object streams it line by line and stops
        # at end of file, so no manual readline()/break loop is needed.
        for line in f:
            # Do some pre-processing using the gensim library.
            yield gensim.utils.simple_preprocess(line)

All the speeches have previously been put in a single file, sotu.txt, with one speech per line.

In [None]:
# Location of the corpus: a text file with one speech per line.
data_file = 'sotu.txt'

# Keep the untokenized text of every speech (trailing whitespace removed)
# so that individual speeches can be displayed later.
with open(data_file) as file:
    raw_speeches = list(map(str.rstrip, file))

# Tokenize every speech; documents[i] holds the words of raw_speeches[i].
documents = list(read_input(data_file))
logging.info("Done reading data file")

In [None]:
# Each entry in the documents list is itself a list of words that came from one speech.
# Here, for instance, is the speech at index 0.
documents[0]
# The list is long and might clutter your screen. If you want it to go away after you
# have looked at it, right-click on the output and select "Clear Outputs".

In [None]:
# documents[0] is itself a list, so documents is a list of lists.
type(documents[0])

We build the vocabulary and train the model.

In [None]:
# You can reduce verbosity if you like by raising the log level, say to ERROR.
# You can right click on the result and select "Clear Outputs" if it gets too cluttered.
# setLevel() is used instead of assigning to logging.root.level because it also
# clears the loggers' cached level checks, so a changed level takes effect at once.
logging.getLogger().setLevel(logging.INFO)

# Passing the corpus to the constructor builds the vocabulary and trains the model.
model = gensim.models.Word2Vec(
    documents,
    vector_size=150,  # Number of dimensions in the vector space.
    window=10,  # Maximum distance between the current and predicted word within a sentence.
    min_count=2,  # Ignore words that occur fewer than this many times in the corpus.
    workers=10,  # Number of threads to use.
    epochs=20,  # Number of passes over the corpus.
)

In [None]:
# Now that we have a model, let's search for some word associations:
# ask for the 10 words whose vectors are most similar to the vector of `word`.
word = "sea"
model.wv.most_similar(word, topn = 10)
# Try a few other words.

## Document to Vector

DocToVec is the same idea as WordToVec, but now each document gets a vector, and the distance between similar documents (ones with many words in common) should be small.

In [None]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [None]:
# Doc2Vec trains on TaggedDocument objects: the words of a document paired with a list of tags.
# Each speech gets a single integer tag, namely its position in the documents list.
tagged_documents = [
    TaggedDocument(words=speech_words, tags=[speech_index])
    for speech_index, speech_words in enumerate(documents)
]

In [None]:
# Take a look at the TaggedDocument at index 0: the words of speech 0 together with its tag list.
tagged_documents[0]

In [None]:
# Build the Doc2Vec model, which learns one vector for each document in tagged_documents.
d2v_model = Doc2Vec(
    tagged_documents,
    vector_size=100,
    window=10,
    min_count=500,  # Words whose total frequency is below this are ignored.
    workers=2,
    epochs=10,
    dm=1,  # Selects the distributed memory algorithm rather than bag of words.
    alpha=0.025,  # Initial learning rate.
    min_alpha=0.001,  # The learning rate tapers down to this value.
)

In [None]:
# Run further training passes over the tagged documents to refine the document vectors.
# setLevel() (rather than assigning to logging.root.level) also clears the loggers'
# cached level checks, so INFO messages really are silenced from here on.
logging.getLogger().setLevel(logging.ERROR)
d2v_model.train(
    tagged_documents,
    total_examples=d2v_model.corpus_count,
    epochs=10,
    start_alpha=0.002,
    # The learning rate moves from start_alpha to end_alpha over the course of training,
    # so end_alpha must stay positive: a negative rate would undo the learning.
    end_alpha=0.0002,
)

Which documents are most similar to document 57?

In [None]:
# `dv` holds the trained document vectors (`docvecs` is a deprecated alias for it).
# List the documents whose vectors are most similar to that of document 57.
d2v_model.dv.most_similar(57)

In [None]:
# If you are curious, you can take a look at the original texts and compare them.
# They are rather long, but here is speech 57:
raw_speeches[57]

## Clustering

Clustering is a machine learning technique in which similar items are placed in the same "cluster" while dissimilar items are placed in different clusters. We can cluster the speeches.

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# The document vectors that will be clustered (`dv` replaces the deprecated `docvecs` alias).
d2v_model.dv

In [None]:
# We create a KMeans model instance.
kmeans_model = KMeans(
    n_clusters=4,
    init='k-means++',
    max_iter=100,
)
# The doc-to-vec model holds one vector per document. Since the distance between
# the vectors represents similarity, we cluster the vectors themselves.
# (`dv` replaces the deprecated `docvecs` alias.)
doc_vectors = d2v_model.dv.vectors
kmeans_model.fit(doc_vectors)
# Each vector now has a label: the index of the cluster to which it belongs.
labels = kmeans_model.labels_.tolist()
# In order to draw 2D pictures we use PCA to create a 2D projection
# that loses as little information as possible.
pca = PCA(n_components=2).fit(doc_vectors)
datapoint = pca.transform(doc_vectors)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# In order to draw 2D pictures we use PCA to create a 2D projection
# that loses as little information as possible
pca = PCA(n_components=2).fit(d2v_model.docvecs.vectors)
datapoint = pca.transform(d2v_model.docvecs.vectors)

plt.figure
# These will be the colors given to the clusters
label_col = ['#FFFF00', '#008000', '#0000FF', '#800080']
color = [label_col[i] for i in labels]
plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)

# We draw the centroids
centroids = kmeans_model.cluster_centers_
centroidpoint = pca.transform(centroids)
plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker = '^', s = 150, c = '#000000')
plt.show()

Rather than a graphic, we might want a listing of the cluster memberships.

In [None]:
print(kmeans_model.labels_)
print('\n')

# Map each cluster index to the list of document indices assigned to it.
# The keys come from the model's own cluster count, so this stays correct
# if n_clusters is changed.
clusters = {cluster_id: [] for cluster_id in range(kmeans_model.n_clusters)}
for doc_index, cluster_id in enumerate(kmeans_model.labels_):
    clusters[cluster_id].append(doc_index)
for cluster_id, members in clusters.items():
    print("Cluster", cluster_id, members, '\n')

In [None]:
# Here is the cluster in which speech 57 lies, shown as the list of all speech indices in that cluster:
clusters[kmeans_model.labels_[57]]
# Are the speeches that the most_similar() method said were closest to speech 57 in the same cluster?

In [None]:
# Report how long the notebook took to run, in minutes.
end = time.perf_counter()
elapsed_minutes = (end - start) / 60
print("Time taken: in min", elapsed_minutes)