# Word Embedding t-SNE visualization

### Ben Greenawald (bhg5yd)
Based on https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk

from gensim.models import word2vec
from gensim.models import doc2vec

import os
import progressbar
import pickle

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline



### Read in and save the data

In [2]:
# Function to clean sentences
# NOTE(review): stopwords.words() is called without a language argument, which
# presumably returns the stop words of every language NLTK ships rather than
# English only -- confirm this is intended.
STOP_WORDS = nltk.corpus.stopwords.words()

# Hashed copy of STOP_WORDS so each membership test is O(1) instead of a scan
# over the whole list. STOP_WORDS itself is left as a list for other users.
_STOP_WORD_SET = frozenset(STOP_WORDS)

# Runs of characters that are neither whitespace nor word characters, plus
# underscores. Compiled once; the raw string avoids invalid-escape warnings.
_NON_WORD_RE = re.compile(r'([^\s\w]|_)+')

def clean_sentence(val):
    """Strip punctuation, lowercase, and drop stop words from a text.

    Parameters
    ----------
    val : str
        Raw text.

    Returns
    -------
    str
        `val` with every character that is not a letter, digit or whitespace
        removed (underscores are removed too), lowercased, split on single
        spaces, filtered against STOP_WORDS, and re-joined with single spaces.
        Tokens are split on " " only, so newlines stay inside tokens and
        consecutive spaces yield empty tokens that are kept.
    """
    words = _NON_WORD_RE.sub('', val).lower().split(" ")
    # Single pass; removes every occurrence of a stop word, as the original
    # remove()-in-a-loop did, but without rescanning the list each time.
    return " ".join(word for word in words if word not in _STOP_WORD_SET)

In [3]:
# Read in the data
data_path = "C:\\Users\\bgree\\Documents\\capstone\\Eng\\eng_clean\\"

# List the directory once so the pre-sized lists and the loop below always
# agree, even if the folder's contents change while the cell runs.
file_names = os.listdir(data_path)

# Read in all files
sentences = [0] * len(file_names)
files = [0] * len(file_names)
bar = progressbar.ProgressBar()

# Wrap the sized list rather than the enumerate object: enumerate() has no
# len(), which is presumably why the recorded output shows only a running count.
for i, file in enumerate(bar(file_names)):
    # NOTE(review): no explicit encoding, so the platform default is used.
    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join(data_path, file), "r") as cur_file:
        sentences[i] = clean_sentence(cur_file.read())
        files[i] = file

| 5502 Elapsed Time: 0:10:16                                                   


In [4]:
# Collapse whitespace: every run of newlines becomes one space, then every run
# of spaces becomes one space.
newline = re.compile("[\n]+")
multispace = re.compile("[ ]+")

# Slice assignment rewrites the existing list object in place.
sentences[:] = [
    multispace.sub(' ', newline.sub(' ', text)) for text in sentences
]

In [5]:
# Pickle the list so we don't have to read it in again.
# Each with-statement closes its file on exit, so no explicit close() is needed.
with open('C:\\Users\\bgree\\Documents\\capstone\\Eng\\sentences.pkl', 'wb') as f:
    pickle.dump(sentences, f)

with open('C:\\Users\\bgree\\Documents\\capstone\\Eng\\files.pkl', 'wb') as f:
    pickle.dump(files, f)

In [11]:
# Reload the cached sentences and file names written by the pickling cell.
# Only load pickles this notebook produced itself: pickle.load can execute
# arbitrary code from an untrusted file.
# Each with-statement closes its file on exit, so no explicit close() is needed.
with open('C:\\Users\\bgree\\Documents\\capstone\\Eng\\sentences.pkl', 'rb') as f:
    sentences = pickle.load(f)

with open('C:\\Users\\bgree\\Documents\\capstone\\Eng\\files.pkl', 'rb') as f:
    files = pickle.load(f)

### Build the corpus

In [12]:
def build_corpus(data):
    """Tokenize every text in `data` on single spaces.

    Returns a list with one entry per input text, in input order; each entry
    is the list produced by ``text.split(" ")`` (so consecutive spaces yield
    empty-string tokens).
    """
    return [text.split(" ") for text in data]

In [13]:
# One token list per document, in the same order as `sentences`.
corpus = build_corpus(sentences)

### Build the word2vec model

In [14]:
# Train word2vec on the tokenized corpus: 100-dimensional vectors, context
# window of 5, words with fewer than 50 occurrences dropped, 4 worker threads.
# NOTE(review): no seed is passed and several workers are used, so retraining
# presumably will not reproduce the exact similarity scores shown below.
model = word2vec.Word2Vec(corpus, size=100, window=5, min_count=50, workers=4)

In [15]:
# Persist the trained English word2vec model so later sessions can reload it.
model.save("C:\\Users\\bgree\\Documents\\capstone\\Eng\\model")

In [2]:
# Reload the saved English word2vec model from disk instead of retraining.
model = word2vec.Word2Vec.load("C:\\Users\\bgree\\Documents\\capstone\\Eng\\model")

In [3]:
# kill + love - hate
model.wv.most_similar(positive=['kill', 'love'], negative=['hate'])

[('feed', 0.4436623752117157),
 ('save', 0.42832493782043457),
 ('physically', 0.38688328862190247),
 ('death', 0.382870078086853),
 ('drown', 0.371822327375412),
 ('heal', 0.36274629831314087),
 ('restore', 0.359414279460907),
 ('recover', 0.35854101181030273),
 ('starve', 0.35682594776153564),
 ('killed', 0.35319411754608154)]

In [19]:
# doctor - man + woman
model.wv.most_similar(positive=['doctor', 'woman'], negative=['man'])

[('nurse', 0.6744319200515747),
 ('hospital', 0.6417989134788513),
 ('patients', 0.6167763471603394),
 ('doctors', 0.5895438194274902),
 ('clinic', 0.5791116952896118),
 ('shane', 0.5531526803970337),
 ('mom', 0.5406304597854614),
 ('sick', 0.5246722102165222),
 ('surgery', 0.5209794044494629),
 ('teenager', 0.5186388492584229)]

In [20]:
# husband - man + woman
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

[('wife', 0.770444393157959),
 ('wives', 0.5845280289649963),
 ('zainab', 0.5403890013694763),
 ('husbands', 0.539920449256897),
 ('marriage', 0.5272690057754517),
 ('daughter', 0.5203981399536133),
 ('women', 0.5176483392715454),
 ('divorced', 0.4951685070991516),
 ('nurse', 0.49472132325172424),
 ('unmarried', 0.48751023411750793)]

In [10]:
# america - money
model.wv.most_similar(positive=['america'], negative=['money'])

[('doomed', 0.4933154582977295),
 ('nation', 0.47456789016723633),
 ('patriotic', 0.46974021196365356),
 ('continent', 0.45280495285987854),
 ('civilization', 0.43584635853767395),
 ('democracy', 0.4354703724384308),
 ('europe', 0.4346542954444885),
 ('anglo', 0.42719873785972595),
 ('liberalism', 0.4231260418891907),
 ('eastern', 0.41540855169296265)]

In [4]:
# america - hope
model.wv.most_similar(positive=['america'], negative=['hope'])

[('pakistan', 0.5671586990356445),
 ('sweden', 0.5523396730422974),
 ('indian', 0.5497785806655884),
 ('korea', 0.5490356683731079),
 ('african', 0.5363327264785767),
 ('negroes', 0.5339013338088989),
 ('africans', 0.5332790613174438),
 ('indonesia', 0.5225752592086792),
 ('usa', 0.5220274925231934),
 ('korean', 0.5138765573501587)]

### Run the t-SNE

In [21]:
def tsne_plot(model):
    """Project every word vector in `model` to 2-D with t-SNE.

    Despite the name, nothing is drawn: the coordinates are returned so they
    can be saved or plotted elsewhere.

    Parameters
    ----------
    model
        Trained gensim word2vec model; its vocabulary is read from
        ``model.wv.vocab`` and its vectors from ``model.wv``.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary word with columns ``x`` and ``y`` (the t-SNE
        coordinates) and ``words`` (the word itself).
    """
    labels = list(model.wv.vocab)
    # Look vectors up through model.wv; indexing the model directly
    # (model[word]) is deprecated in gensim and emits a warning per run.
    tokens = [model.wv[word] for word in labels]

    # Fixed random_state so repeated runs on the same vectors give the same layout.
    tsne_model = TSNE(n_components=2, random_state=23, verbose=3)
    new_values = tsne_model.fit_transform(tokens)

    d = {"x": [value[0] for value in new_values],
         "y": [value[1] for value in new_values],
         "words": labels}

    return pd.DataFrame(data=d)

In [22]:
# Compute 2-D t-SNE coordinates for every word in the English model's vocabulary.
tsne_data = tsne_plot(model)

  import sys


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 12050 samples in 0.053s...
[t-SNE] Computed neighbors for 12050 samples in 30.726s...
[t-SNE] Computed conditional probabilities for sample 1000 / 12050
[t-SNE] Computed conditional probabilities for sample 2000 / 12050
[t-SNE] Computed conditional probabilities for sample 3000 / 12050
[t-SNE] Computed conditional probabilities for sample 4000 / 12050
[t-SNE] Computed conditional probabilities for sample 5000 / 12050
[t-SNE] Computed conditional probabilities for sample 6000 / 12050
[t-SNE] Computed conditional probabilities for sample 7000 / 12050
[t-SNE] Computed conditional probabilities for sample 8000 / 12050
[t-SNE] Computed conditional probabilities for sample 9000 / 12050
[t-SNE] Computed conditional probabilities for sample 10000 / 12050
[t-SNE] Computed conditional probabilities for sample 11000 / 12050
[t-SNE] Computed conditional probabilities for sample 12000 / 12050
[t-SNE] Computed conditional probabilities for sa

In [23]:
# Export the word coordinates without the index column; the relative path
# means the file lands in the notebook's current working directory.
tsne_data.to_csv("wordTSNE.csv", index=False)

### doc2vec

In [37]:
# Get the document labels: each line of the label file is "<group>,<int label>".
group_to_labels = {}
with open("C:\\Users\\bgree\\Documents\\capstone\\Eng\\eng_group_labels.txt", "r") as label_file:
    for line in label_file:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing
        # on the missing second field.
        if not line.strip():
            continue
        line_split = line.split(",")
        # int() tolerates the trailing newline left on the second field.
        group_to_labels[line_split[0]] = int(line_split[1])

# Get the filenames and labels
# Raw string so "\d" is not treated as an (invalid) string escape.
digit_remover = re.compile(r"[\d]+")

groups_labels = [0] * len(sentences)
binary_labels = [0] * len(sentences)
for i, file in enumerate(files):
    # Group name = text before the first '.', with all digits removed and a
    # single trailing '-' dropped.
    f = file.split('.')[0]
    f = digit_remover.sub('', f)
    # endswith() is safe when the stem is empty, unlike f[-1].
    if f.endswith('-'):
        f = f[:-1]
    # Special case: these files are filed under the plain author name.
    if f == 'AndrewMurray-HolyinChrist':
        f = 'AndrewMurray'
    groups_labels[i] = f
    # Raises KeyError if a file's group is missing from the label file.
    binary_labels[i] = group_to_labels[f]

In [47]:
# Wrap each document's whitespace-split tokens in a TaggedDocument whose only
# tag is the document's position in `sentences`.
docs = [
    doc2vec.TaggedDocument(text.split(), [idx])
    for idx, text in enumerate(sentences)
]

In [61]:
# Train doc2vec on the tagged documents: 100-dimensional vectors, context
# window of 8, words with fewer than 5 occurrences dropped, 4 worker threads.
modelDoc = doc2vec.Doc2Vec(docs, size=100, window=8, min_count=5, workers=4)



In [62]:
# Persist the trained English doc2vec model so later sessions can reload it.
modelDoc.save("C:\\Users\\bgree\\Documents\\capstone\\Eng\\modelDoc")

In [63]:
# Reload the saved doc2vec model with the Doc2Vec loader: the file was written
# by a Doc2Vec instance and the cells below read modelDoc.docvecs, so it should
# not be loaded through Word2Vec.load.
modelDoc = doc2vec.Doc2Vec.load("C:\\Users\\bgree\\Documents\\capstone\\Eng\\modelDoc")

### t-SNE on the doc2vec document vectors

In [81]:
def tsne_doc(model, n_docs=None):
    """Project the document vectors of a doc2vec model to 2-D with t-SNE.

    Parameters
    ----------
    model
        Trained gensim doc2vec model; vectors are read from ``model.docvecs``
        by integer tag.
    n_docs : int, optional
        Number of leading document vectors (tags ``0 .. n_docs - 1``) to
        project. Defaults to ``len(model.docvecs)``, i.e. every document,
        instead of a hard-coded corpus size. Assumes documents were tagged
        with consecutive integers starting at 0.

    Returns
    -------
    pandas.DataFrame
        One row per document, in tag order, with columns ``x`` and ``y``.
    """
    if n_docs is None:
        n_docs = len(model.docvecs)
    docs = [model.docvecs[i] for i in range(n_docs)]

    # Fixed random_state so repeated runs on the same vectors give the same layout.
    tsne_model = TSNE(n_components=2, random_state=23, verbose=3)
    new_values = tsne_model.fit_transform(docs)

    d = {"x": [value[0] for value in new_values],
         "y": [value[1] for value in new_values]
        }

    return pd.DataFrame(data=d)

In [82]:
# Compute 2-D t-SNE coordinates for the English document vectors.
tsne_data2 = tsne_doc(modelDoc)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5503 samples in 0.026s...
[t-SNE] Computed neighbors for 5503 samples in 4.106s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5503
[t-SNE] Computed conditional probabilities for sample 2000 / 5503
[t-SNE] Computed conditional probabilities for sample 3000 / 5503
[t-SNE] Computed conditional probabilities for sample 4000 / 5503
[t-SNE] Computed conditional probabilities for sample 5000 / 5503
[t-SNE] Computed conditional probabilities for sample 5503 / 5503
[t-SNE] Mean sigma: 0.663664
[t-SNE] Computed conditional probabilities in 0.174s
[t-SNE] Iteration 50: error = 85.7676392, gradient norm = 0.0452217 (50 iterations in 12.517s)
[t-SNE] Iteration 100: error = 85.8437500, gradient norm = 0.0302953 (50 iterations in 12.008s)
[t-SNE] Iteration 150: error = 85.8016357, gradient norm = 0.0296729 (50 iterations in 21.506s)
[t-SNE] Iteration 200: error = 85.6228256, gradient norm = 0.0444155 (50 iterations in 16.705s

In [84]:
# Attach the per-document labels as columns. This relies on binary_labels and
# groups_labels being in the same document order as the rows of tsne_data2.
tsne_data2['binary'] = binary_labels
tsne_data2['group'] = groups_labels

In [85]:
# Export the document coordinates and labels to the working directory.
# NOTE(review): unlike the word-level export, index=False is not passed here,
# so the DataFrame index is written as an extra first column -- confirm intended.
tsne_data2.to_csv("docTSNE.csv")

## Repeat for Arabic

In [7]:
# Read in the data
ar_data_path = "C:\\Users\\bgree\\Documents\\capstone\\Models\\arabic-docs\\"

# List the directory once so the pre-sized lists and the loop below always
# agree, even if the folder's contents change while the cell runs.
ar_file_names = os.listdir(ar_data_path)

# Read in all files
ar_sentences = [0] * len(ar_file_names)
ar_files = [0] * len(ar_file_names)
bar = progressbar.ProgressBar()

# Wrap the sized list rather than the enumerate object: enumerate() has no
# len(), which is presumably why the recorded output shows only a running count.
for i, file in enumerate(bar(ar_file_names)):
    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join(ar_data_path, file), "r", encoding='utf-8') as cur_file:
        # Unlike the English pipeline, the text is only lowercased here;
        # clean_sentence() is not applied.
        ar_sentences[i] = cur_file.read().lower()
        ar_files[i] = file

| 0 Elapsed Time: 0:00:11                                                      
| 14857 Elapsed Time: 0:00:53                                                  


In [8]:
# Collapse whitespace: every run of newlines becomes one space, then every run
# of spaces becomes one space.
newline = re.compile("[\n]+")
multispace = re.compile("[ ]+")

# Slice assignment rewrites the existing list object in place.
ar_sentences[:] = [
    multispace.sub(' ', newline.sub(' ', text)) for text in ar_sentences
]

In [10]:
# Pickle the list so we don't have to read it in again.
# Each with-statement closes its file on exit, so no explicit close() is needed.
with open('C:\\Users\\bgree\\Documents\\capstone\\Ar\\sentences.pkl', 'wb') as f:
    pickle.dump(ar_sentences, f)

with open('C:\\Users\\bgree\\Documents\\capstone\\Ar\\files.pkl', 'wb') as f:
    pickle.dump(ar_files, f)

## Run just this block (reloads the cached Arabic sentences and file names)

In [3]:
# Reload the cached Arabic sentences and file names written by the pickling cell.
# Only load pickles this notebook produced itself: pickle.load can execute
# arbitrary code from an untrusted file.
# Each with-statement closes its file on exit, so no explicit close() is needed.
with open('C:\\Users\\bgree\\Documents\\capstone\\Ar\\sentences.pkl', 'rb') as f:
    ar_sentences = pickle.load(f)

with open('C:\\Users\\bgree\\Documents\\capstone\\Ar\\files.pkl', 'rb') as f:
    ar_files = pickle.load(f)

In [12]:
def build_corpus(data):
    """Tokenize every text in `data` on single spaces.

    Returns a list with one entry per input text, in input order; each entry
    is the list produced by ``text.split(" ")`` (so consecutive spaces yield
    empty-string tokens).

    This repeats the definition from the English section so the Arabic
    section can be run without executing the earlier cells.
    """
    return [text.split(" ") for text in data]

In [13]:
# One token list per Arabic document, in the same order as `ar_sentences`.
ar_corpus = build_corpus(ar_sentences)

In [15]:
# Train word2vec on the Arabic corpus with the same settings as the English
# model: 100-dimensional vectors, window of 5, min_count of 50, 4 workers.
ar_model = word2vec.Word2Vec(ar_corpus, size=100, window=5, min_count=50, workers=4)

In [16]:
# Persist the trained Arabic word2vec model so later sessions can reload it.
ar_model.save("C:\\Users\\bgree\\Documents\\capstone\\Ar\\model")

## Run this block (reloads the saved Arabic word2vec model)

In [4]:
# Reload the saved Arabic word2vec model from disk instead of retraining.
ar_model = word2vec.Word2Vec.load("C:\\Users\\bgree\\Documents\\capstone\\Ar\\model")

# Add vector similarity operations here

In [8]:
# Analogy query, English gloss: kill - evil + love
# (the English cell subtracts 'hate', whereas this one subtracts 'evil')
ar_model.wv.most_similar(positive=['قتل', 'حب'], negative=['شر'])

[('عاش', 0.5300116539001465),
 ('ترعرع', 0.5264725685119629),
 ('نشأ', 0.525036096572876),
 ('تربى', 0.5183085203170776),
 ('واستشهد', 0.5158625841140747),
 ('طفولته', 0.5106531977653503),
 ('رباه', 0.49314388632774353),
 ('وترعرع', 0.4907999634742737),
 ('ربى', 0.48872873187065125),
 ('تأثر', 0.48401132225990295)]

In [11]:
# Analogy query, English gloss: husband - man + woman
# NOTE(review): the first positive term looks like the definite plural
# ("the women") rather than the singular "woman" -- confirm the intended word.
ar_model.wv.most_similar(positive=['النساء', 'الزوج'], negative=['رجل'])

[('الوالدين', 0.5931550860404968),
 ('النساء،', 0.5799429416656494),
 ('عليهن', 0.5544040203094482),
 ('السيئات', 0.5504500865936279),
 ('إهمال', 0.5453451871871948),
 ('الظن', 0.5373238921165466),
 ('المسلمات', 0.524681806564331),
 ('الزوجات', 0.5192272663116455),
 ('الطلاق', 0.5082723498344421),
 ('بالنساء', 0.503494143486023)]

In [10]:
# Analogy query, English gloss: america - hope
ar_model.wv.most_similar(positive=['أمريكا'], negative=['أمل'])

[('الإرهاب', 0.49271541833877563),
 ('وبريطانيا', 0.47849082946777344),
 ('الأمريكية', 0.4498194754123688),
 ('العراق،', 0.44678795337677),
 ('العصابات', 0.4376118779182434),
 ('قطر', 0.4372726380825043),
 ('الواليات', 0.43432122468948364),
 ('مجازر', 0.4332858920097351),
 ('ليبيا', 0.42830702662467957),
 ('بالعراق', 0.42714524269104004)]

In [12]:
def tsne_plot(model):
    """Project every word vector in `model` to 2-D with t-SNE.

    Despite the name, nothing is drawn: the coordinates are returned so they
    can be saved or plotted elsewhere.

    Parameters
    ----------
    model
        Trained gensim word2vec model; its vocabulary is read from
        ``model.wv.vocab`` and its vectors from ``model.wv``.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary word with columns ``x`` and ``y`` (the t-SNE
        coordinates) and ``words`` (the word itself).
    """
    labels = list(model.wv.vocab)
    # Look vectors up through model.wv; indexing the model directly
    # (model[word]) is deprecated in gensim and emits a warning per run.
    tokens = [model.wv[word] for word in labels]

    # Fixed random_state so repeated runs on the same vectors give the same layout.
    tsne_model = TSNE(n_components=2, random_state=23, verbose=3)
    new_values = tsne_model.fit_transform(tokens)

    d = {"x": [value[0] for value in new_values],
         "y": [value[1] for value in new_values],
         "words": labels}

    return pd.DataFrame(data=d)

In [13]:
# Compute 2-D t-SNE coordinates for every word in the Arabic model's vocabulary.
# Note this rebinds `tsne_data`, replacing the English word coordinates.
tsne_data = tsne_plot(ar_model)

  import sys


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 23378 samples in 0.137s...
[t-SNE] Computed neighbors for 23378 samples in 113.351s...
[t-SNE] Computed conditional probabilities for sample 1000 / 23378
[t-SNE] Computed conditional probabilities for sample 2000 / 23378
[t-SNE] Computed conditional probabilities for sample 3000 / 23378
[t-SNE] Computed conditional probabilities for sample 4000 / 23378
[t-SNE] Computed conditional probabilities for sample 5000 / 23378
[t-SNE] Computed conditional probabilities for sample 6000 / 23378
[t-SNE] Computed conditional probabilities for sample 7000 / 23378
[t-SNE] Computed conditional probabilities for sample 8000 / 23378
[t-SNE] Computed conditional probabilities for sample 9000 / 23378
[t-SNE] Computed conditional probabilities for sample 10000 / 23378
[t-SNE] Computed conditional probabilities for sample 11000 / 23378
[t-SNE] Computed conditional probabilities for sample 12000 / 23378
[t-SNE] Computed conditional probabilities for s

In [16]:
# Pickle the Arabic word t-SNE DataFrame so it doesn't have to be recomputed.
# The with-statement closes the file on exit, so no explicit close() is needed.
with open('C:\\Users\\bgree\\Documents\\capstone\\Ar\\word_tsne.pkl', 'wb') as f:
    pickle.dump(tsne_data, f)

In [17]:
# Reload the cached Arabic word t-SNE DataFrame.
# Only load pickles this notebook produced itself: pickle.load can execute
# arbitrary code from an untrusted file.
# The with-statement closes the file on exit, so no explicit close() is needed.
with open('C:\\Users\\bgree\\Documents\\capstone\\Ar\\word_tsne.pkl', 'rb') as f:
    word_tsne = pickle.load(f)

In [15]:
# Export the Arabic word coordinates without the index column; UTF-8 is set
# explicitly because the `words` column holds Arabic text.
tsne_data.to_csv("C:\\Users\\bgree\\Documents\\capstone\\Ar\\wordTSNE.csv", index=False, encoding='utf-8')

## doc2vec

In [14]:
# Get the document labels: each line of the label file is "<group>,<int label>".
group_to_labels = {}
# NOTE(review): the label file is opened with the platform default encoding,
# while the documents were read as UTF-8 -- confirm group names are ASCII.
with open("C:\\Users\\bgree\\Documents\\capstone\\Ar\\arabic-groups-labels.txt", "r") as label_file:
    for line in label_file:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing
        # on the missing second field.
        if not line.strip():
            continue
        line_split = line.split(",")
        # int() tolerates the trailing newline left on the second field.
        group_to_labels[line_split[0]] = int(line_split[1])

groups_labels = [0] * len(ar_sentences)
binary_labels = [0] * len(ar_sentences)
for i, file in enumerate(ar_files):
    # Group name = the part of the file name before the '-g-' marker, or
    # before '_g-' when '-g-' does not occur.
    marker = '-g-' if '-g-' in file else '_g-'
    f = file.split(marker)[0]
    groups_labels[i] = f
    # Raises KeyError if a file's group is missing from the label file.
    binary_labels[i] = group_to_labels[f]

In [16]:
# Wrap each Arabic document's whitespace-split tokens in a TaggedDocument
# whose only tag is the document's position in `ar_sentences`.
ar_docs = [
    doc2vec.TaggedDocument(text.split(), [idx])
    for idx, text in enumerate(ar_sentences)
]

In [17]:
# Train doc2vec on the Arabic tagged documents: 100-dimensional vectors,
# window of 8, words with fewer than 5 occurrences dropped, 4 worker threads.
ar_modelDoc = doc2vec.Doc2Vec(ar_docs, size=100, window=8, min_count=5, workers=4)



In [18]:
# Persist the trained Arabic doc2vec model so later sessions can reload it.
ar_modelDoc.save("C:\\Users\\bgree\\Documents\\capstone\\Ar\\modelDoc")

In [19]:
# Reload the saved doc2vec model with the Doc2Vec loader: the file was written
# by a Doc2Vec instance and the cells below read ar_modelDoc.docvecs, so it
# should not be loaded through Word2Vec.load.
ar_modelDoc = doc2vec.Doc2Vec.load("C:\\Users\\bgree\\Documents\\capstone\\Ar\\modelDoc")

In [22]:
def tsne_doc(model, n_docs=None):
    """Project the document vectors of a doc2vec model to 2-D with t-SNE.

    Parameters
    ----------
    model
        Trained gensim doc2vec model; vectors are read from ``model.docvecs``
        by integer tag.
    n_docs : int, optional
        Number of leading document vectors (tags ``0 .. n_docs - 1``) to
        project. Defaults to ``len(model.docvecs)``, i.e. every document,
        instead of a hard-coded corpus size. Assumes documents were tagged
        with consecutive integers starting at 0.

    Returns
    -------
    pandas.DataFrame
        One row per document, in tag order, with columns ``x`` and ``y``.
    """
    if n_docs is None:
        n_docs = len(model.docvecs)
    docs = [model.docvecs[i] for i in range(n_docs)]

    # Fixed random_state so repeated runs on the same vectors give the same layout.
    tsne_model = TSNE(n_components=2, random_state=23, verbose=3)
    new_values = tsne_model.fit_transform(docs)

    d = {"x": [value[0] for value in new_values],
         "y": [value[1] for value in new_values]
        }

    return pd.DataFrame(data=d)

In [23]:
# Compute 2-D t-SNE coordinates for the Arabic document vectors.
# Note this rebinds `tsne_data2`, replacing the English document coordinates.
tsne_data2 = tsne_doc(ar_modelDoc)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 14858 samples in 0.088s...
[t-SNE] Computed neighbors for 14858 samples in 40.356s...
[t-SNE] Computed conditional probabilities for sample 1000 / 14858
[t-SNE] Computed conditional probabilities for sample 2000 / 14858
[t-SNE] Computed conditional probabilities for sample 3000 / 14858
[t-SNE] Computed conditional probabilities for sample 4000 / 14858
[t-SNE] Computed conditional probabilities for sample 5000 / 14858
[t-SNE] Computed conditional probabilities for sample 6000 / 14858
[t-SNE] Computed conditional probabilities for sample 7000 / 14858
[t-SNE] Computed conditional probabilities for sample 8000 / 14858
[t-SNE] Computed conditional probabilities for sample 9000 / 14858
[t-SNE] Computed conditional probabilities for sample 10000 / 14858
[t-SNE] Computed conditional probabilities for sample 11000 / 14858
[t-SNE] Computed conditional probabilities for sample 12000 / 14858
[t-SNE] Computed conditional probabilities for sa

In [24]:
# Attach the per-document labels as columns. This relies on binary_labels and
# groups_labels being in the same document order as the rows of tsne_data2.
tsne_data2['binary'] = binary_labels
tsne_data2['group'] = groups_labels

In [25]:
# Export the Arabic document coordinates and labels.
# NOTE(review): index=False is not passed here, so the DataFrame index is
# written as an extra first column -- confirm intended.
tsne_data2.to_csv("C:\\Users\\bgree\\Documents\\capstone\\Ar\\docTSNE.csv")