In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

import numpy as np
import pandas as pd
from gensim.models import word2vec

import re # For regular expressions

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## (a) Load the dataset

In [2]:
def load_data():
    """ Read tweets from the file.
        Return:
            list of lists (list_words), with words from each of the processed tweets
    """
    tweets = pd.read_csv('tweets.csv', names=['text'])
    list_words = []
    ### iterate over all tweets from the dataset
    for i in tweets.index:
      ### remove non-letter.
      text = re.sub("[^a-zA-Z ]", "", tweets.iloc[i].text)
      ### tokenize
      words = text.split()

      stop_words = set(stopwords.words("english"))
      
      new_words = []
      ### iterate over all words of a tweet
      for w in words:
        ## TODO: remove the stop words and convert a word (w) to the lower case
        if w.lower() not in stop_words:
          new_words.append(w.lower())
        
      list_words.append(new_words)
    return list_words

# check a few samples of twitter corpus
twitter_corpus = load_data()
print(twitter_corpus[:3])

[['upgrade', 'b', 'lasgtbwi', 'brainer', 'week', 'thanks'], ['flight', 'delayed', 'orig', 'departure', 'time', 'adhere', 'original', 'time', 'drop', 'bags', 'arrive', 'gate', 'new', 'time', 'works', 'airport', 'delays'], ['currently', 'flight', 'delayed', 'hour', 'reparked', 'kickoff', 'passenger', 'mins', 'later', 'let', 'back', 'offer', 'drinks', 'food', 'terrible', 'flown']]


## (b) Create co-occurrence matrix

In [3]:
def distinct_words(corpus):
    """ get a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1
    # ------------------
    # TODO:
    for tweet in corpus:
        for word in tweet:
            if word not in corpus_words:
                corpus_words.append(word)
    
    num_corpus_words = len(corpus_words)
    corpus_words = sorted(corpus_words)

    # ------------------
    return corpus_words, num_corpus_words

words, num_words = distinct_words(twitter_corpus)
print(words[:10], num_words)

['aa', 'aaaaand', 'aaaahhh', 'aaaand', 'aampb', 'aawish', 'aba', 'abacus', 'abat', 'abc'] 10841


In [4]:
def compute_co_occurrence_matrix(corpus, window_size=5):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 5).    
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape = [number of corpus words x number of corpus words]): 
                Co-occurence matrix of word counts. 
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    M = np.zeros((num_words, num_words))
    word2Ind = {word:i for i, word in enumerate(words)}
    
    # ------------------
    # TODO:
    for tweet in corpus:
        for idx, word in enumerate(tweet):
            snippet = tweet[idx-window_size:idx+window_size]
            for winword in snippet:
                if word == winword:
                    continue
                M[word2Ind[word]][word2Ind[winword]] += 1

    # ------------------

    return M, word2Ind

M, word2Ind = compute_co_occurrence_matrix(twitter_corpus)

## (c) SVD

In [5]:
# -----------------------------
# Run SVD
# Note: This may take several minutes
svd = np.linalg.svd(M)
# ------------------------------

In [24]:
u, s, vt = svd
svdvec = u[:,0:50]
svdvec[0]

array([-1.17753923e-03, -1.70919369e-03,  1.34122524e-05, -3.24855321e-04,
        4.36910487e-04, -6.25674420e-04,  1.74534635e-03,  4.69460153e-03,
       -2.41173383e-03, -2.67955561e-03, -1.72447623e-03,  7.37985022e-04,
       -7.67156010e-04,  1.45069326e-03, -1.88542498e-03,  4.61357977e-03,
        1.67730203e-03,  2.38072866e-03,  2.51411184e-03,  5.57206676e-03,
       -7.74304613e-04, -6.22149371e-03, -5.88557659e-04,  2.70487346e-03,
       -3.39341299e-04, -5.92755373e-03, -7.23058413e-03,  2.68102948e-03,
        1.04587764e-03, -2.20739522e-03,  6.23608034e-03,  4.54652242e-03,
       -2.30011801e-03,  8.63077415e-03,  7.68760995e-04,  2.23827307e-03,
       -1.46133409e-04, -2.02301050e-03, -2.56462792e-04,  8.99979917e-04,
        1.83894116e-03,  8.69968459e-04,  1.10336983e-02,  1.68459899e-03,
        1.31351208e-02, -6.41530442e-03, -2.98473395e-03,  2.04847670e-03,
       -9.13565806e-04, -5.32133961e-03])

## (d) Word2Vec

In [6]:
# Creating the word2vec model and setting values for the various parameters

# Initializing the train model. 
num_features = 50   # Word vector dimensionality
min_word_count = 0  # Minimum word count. You can change it also.
num_workers = 4     # Number of parallel threads, can be changed
context = 5         # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words, can be changed
# Initializing the train model
print("Training Word2Vec model....")
model = word2vec.Word2Vec(twitter_corpus)

# To make the model memory efficient
model.init_sims(replace=True)

Training Word2Vec model....


  model.init_sims(replace=True)


## (d) Compare SVD word embeddings with Word2Vec

In [111]:
from sklearn.metrics.pairwise import cosine_similarity

def svd_most_similar(query_word, n=10):
    """ return 'n' most similar words of a query word using the SVD word embeddings similar to word2vec's most_smilar    
        Params:
            query_word (strings): a query word
        Return:
            most_similar (list of strings): the list of 'n' most similar words
    """
    n += 1
    wvec = svdvec[word2Ind[query_word]].reshape([1, 50])
    simscores = cosine_similarity(wvec, svdvec)[0]
    idx = np.argpartition(simscores, -n)[-n:]
    idx = idx[np.argsort(simscores[idx])[::-1]]
    most_similar = [word for word, i in word2Ind.items() if i in idx and word != query_word]

    return most_similar

## SVD vs Word2Vec: "???"

In [114]:
svd_most_similar("delay")

['cantplanthehoneymoonwithoutwifi',
 'dangling',
 'fasten',
 'fufuaufufuu',
 'fufuuafufufubbudfufuubbuufef',
 'grants',
 'ling',
 'terminalthat',
 'unload',
 'upload']

In [115]:
model.wv.most_similar("delay") #this word2vec trained model on tweets

[('delayed', 0.997496485710144),
 ('hour', 0.9972133636474609),
 ('almost', 0.9969688653945923),
 ('late', 0.9969677925109863),
 ('waiting', 0.9969348311424255),
 ('sitting', 0.9968117475509644),
 ('half', 0.9966129064559937),
 ('plane', 0.996375322341919),
 ('vegas', 0.9962120056152344),
 ('going', 0.9960824847221375)]

In [116]:
svd_most_similar("flight")

['aaaaand',
 'alistmember',
 'firstmarathon',
 'image',
 'mar',
 'midday',
 'pouring',
 'seniors',
 'thanksdavid',
 'underonaflight']

In [117]:
model.wv.most_similar("flight")

[('hrs', 0.9984350204467773),
 ('plane', 0.9983459115028381),
 ('lax', 0.9983003735542297),
 ('almost', 0.9981791377067566),
 ('pm', 0.9981695413589478),
 ('vegas', 0.9981691241264343),
 ('connecting', 0.9981654286384583),
 ('late', 0.9981614351272583),
 ('weather', 0.9981505870819092),
 ('airport', 0.9981454014778137)]