In [35]:
from gensim.models import word2vec
import multiprocessing
import os
import nltk
import pandas as pd
import re
import time
pd.options.mode.chained_assignment = None #set it to None to remove SettingWithCopyWarning
#nltk.download('stopwords')

In [36]:
# Stop words used by clean_sentence. Stored as a set because the cleaning step
# tests membership once per word of every question; a list would be scanned
# linearly on each test.
# NOTE(review): words() is called without a language argument, so this
# presumably covers every language in the NLTK corpus, not only English.
# Confirm that is intended.
STOP_WORDS = set(nltk.corpus.stopwords.words())

In [42]:
# Alternative for quick experiments: read the same file but keep a reproducible
# 50,000-row sample (random_state=23) instead of the full set.
#data = pd.read_csv('./data/train.csv').sample(50000, random_state=23)
# Load the full training CSV from the notebook-relative ./data directory.
data = pd.read_csv('./data/train.csv')
# Show (rows, columns) as a quick sanity check of what was loaded.
print(data.shape)

(404290, 6)


#### Define functions for cleaning sentences and dataframes

In [32]:
def clean_sentence(val, stop_words=None):
    """Normalize one sentence: strip punctuation, lowercase, drop stop words.

    Parameters
    ----------
    val : str
        The raw sentence. Non-string input raises TypeError from ``re.sub``.
    stop_words : container of str, optional
        Words to remove (compared after lowercasing). Defaults to the
        notebook-level ``STOP_WORDS``.

    Returns
    -------
    str
        The remaining words joined by single spaces. May be the empty string.
    """
    if stop_words is None:
        stop_words = STOP_WORDS

    # Raw string so that \s and \w reach the regex engine without triggering
    # Python's invalid-escape warning. Removes every character that is neither
    # whitespace nor a word character, plus underscores.
    stripped = re.sub(r'([^\s\w]|_)+', '', val).lower()

    # split() with no argument collapses runs of whitespace, so punctuation
    # that sat between two spaces does not leave an empty-string token behind.
    kept = [word for word in stripped.split() if word not in stop_words]
    return " ".join(kept)


def clean_dataframe(data, columns=('question1', 'question2')):
    """Drop incomplete rows, then clean the text columns with ``clean_sentence``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns named in ``columns``.
    columns : iterable of str, optional
        Text columns to clean. Defaults to the two question columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without any row that had a missing value in any column,
        and with each listed column passed through ``clean_sentence``.
        The input frame is not modified.
    """
    # Explicit copy: the column assignments below then write to a frame this
    # function owns, without depending on SettingWithCopyWarning being silenced.
    cleaned = data.dropna(how="any").copy()

    for col in columns:
        cleaned[col] = cleaned[col].apply(clean_sentence)

    return cleaned

#### Fetch clean data

In [None]:
# Time the cleaning step (dropna + clean_sentence over both question columns).
tic = time.time()
data = clean_dataframe(data)
toc = time.time()
duration = str(round(toc - tic))
# The timed step is cleaning, not reading, so the message says so.
print('Time taken to clean the dataset: ' + duration + ' Second(s)')
data.head(5)

#### Prepare gensim-friendly data, i.e. an iterable list of tokenized sentences

In [None]:
def build_corpus(data, columns=('question1', 'question2')):
    """Build the list-of-token-lists corpus that gensim's Word2Vec consumes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``columns`` hold already-cleaned sentence strings.
    columns : iterable of str, optional
        Columns to read sentences from. Defaults to the two question columns.

    Returns
    -------
    list of list of str
        One word list per sentence: every sentence of the first column in row
        order, followed by every sentence of the next column. A blank sentence
        yields an empty list.
    """
    corpus = []
    for col in columns:
        # Iterate the Series values directly; Series.iteritems() no longer
        # exists in pandas 2.x. split() with no argument avoids empty-string
        # tokens when a sentence contains consecutive spaces.
        corpus.extend(sentence.split() for sentence in data[col])

    return corpus

In [None]:
# Measure how long tokenizing both question columns takes.
tic = time.time()
corpus = build_corpus(data)
toc = time.time()
duration = str(round(toc - tic))
print('Time taken to build the corpus: ' + duration + ' Second(s)')
# Peek at the first two tokenized sentences.
corpus[:2]

### Word2Vec
The Word2Vec model produces a vocabulary in which each word is represented by an n-dimensional numpy array (100 values in this example).

In [27]:
# Train Word2Vec on the tokenized questions with 100-dimensional vectors
# (size=100) and 4 worker threads.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size`. Confirm which gensim version this notebook targets.
# NOTE(review): min_count=200 presumably drops words seen fewer than 200 times,
# so rarer query words will not be in the vocabulary. Verify this is intended.
# NOTE(review): no seed is passed and several workers are used, so results are
# presumably not reproducible run to run. TODO confirm.
quora_model = word2vec.Word2Vec(corpus, size=100, window=20, min_count=200, workers=4)

In [28]:
# Query through the model's KeyedVectors (`.wv`): calling most_similar on the
# model itself is the deprecated path in gensim 3.x and is gone in gensim 4.
# Raises KeyError if 'python' did not survive the min_count cut-off.
quora_model.wv.most_similar('python', topn=5)

KeyError: "word 'Artificial intelligence' not in vocabulary"

In [None]:
def tsne_plot(model):
    """Fit a t-SNE projection of the model's word vectors and plot it.

    Every word in ``model.wv.vocab`` is projected to two dimensions and drawn
    as one scatter point annotated with the word itself.

    Parameters
    ----------
    model
        A trained Word2Vec model exposing ``model.wv.vocab`` and
        ``model.wv[word]`` (the gensim 3.x-style API).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    NOTE(review): ``TSNE`` and ``plt`` are not imported in the visible cells.
    Presumably ``sklearn.manifold.TSNE`` and ``matplotlib.pyplot`` are imported
    elsewhere in the notebook; confirm before running on a fresh kernel.
    """
    labels = []
    tokens = []

    for word in model.wv.vocab:
        # Look the vector up through model.wv, consistent with the vocab access
        # above; indexing the model object directly is the deprecated path.
        tokens.append(model.wv[word])
        labels.append(word)

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # Each row of new_values is the (x, y) position of the word at the same
    # index in labels. Points are still drawn one scatter call at a time.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()