## construct word2vec from book

In [1]:
import gensim
# Read the book file: every line becomes one document, i.e. the list of
# whitespace-separated tokens on that line.
with open('./data/lemma_book.txt') as f:
    documents = [line.split() for line in f]

ModuleNotFoundError: No module named 'gensim'

## visualize and validate the word embeddings
- Tuning `iter` (training passes in the constructor) and `epochs` (extra passes in `train`) improves the quality of the word embeddings.
- Use `iter=500`, `epochs=20` for now.
- Trained only on the book data (no external/pre-trained knowledge).

In [42]:
from scipy import spatial
# Word2Vec settings for this run (gensim 3.x keyword names).
w2v_params = dict(
    sg=0,         # 0 selects CBOW in gensim
    size=300,
    window=20,
    min_count=5,
    iter=20,      # this is important, b/c we want overfitting
    workers=4,
)
model = gensim.models.Word2Vec(documents, **w2v_params)
# Keep training on the same corpus for 20 further epochs.
model.train(documents, total_examples=len(documents), epochs=20)
print(model)

def getDistance(w1, w2):
    """Return the absolute cosine similarity of the vectors of two words.

    Despite the name this is not a distance: scipy's ``cosine`` yields the
    cosine distance (1 - similarity) and the result here is ``abs(1 - that)``,
    so a larger value means the two words are more alike.

    w1, w2 -- tokens looked up in the global ``model.wv``.
    """
    cosine_dist = spatial.distance.cosine(model.wv[w1], model.wv[w2])
    similarity = 1 - cosine_dist
    return abs(similarity)

# Sanity check: print getDistance for a couple of hand-picked word pairs.
print('------ 20 iters ----------')
for first, second in (('man', 'woman'), ('cat', 'dog')):
    print(getDistance(first, second))
# print(getDistance('king', 'queen'))

Word2Vec(vocab=1224, size=300, alpha=0.025)
------ 20 iters ----------
0.09285947680473328
0.9575470089912415


In [151]:
from scipy import spatial
# Word2Vec settings for this run (gensim 3.x keyword names).
w2v_params = dict(
    sg=0,         # 0 selects CBOW in gensim
    size=300,
    window=20,
    min_count=2,
    iter=100,     # this is important, b/c we want overfitting
    workers=4,
)
model = gensim.models.Word2Vec(documents, **w2v_params)
# Keep training on the same corpus for 20 further epochs.
model.train(documents, total_examples=len(documents), epochs=20)
print(model)

def getDistance(w1, w2):
    """Return the absolute cosine similarity of the vectors of two words.

    Despite the name this is not a distance: scipy's ``cosine`` yields the
    cosine distance (1 - similarity) and the result here is ``abs(1 - that)``,
    so a larger value means the two words are more alike.

    w1, w2 -- tokens looked up in the global ``model.wv``.
    """
    cosine_dist = spatial.distance.cosine(model.wv[w1], model.wv[w2])
    similarity = 1 - cosine_dist
    return abs(similarity)

# Sanity check: print getDistance for a few hand-picked word pairs.
print('------ 100 iters ----------')
for first, second in (('man', 'woman'), ('cat', 'dog'), ('king', 'queen')):
    print(getDistance(first, second))

Word2Vec(vocab=2828, size=300, alpha=0.025)
------ 100 iters ----------
0.10086068511009216
0.6262087225914001
0.0211331807076931


In [153]:
from scipy import spatial
# Word2Vec settings for this run (gensim 3.x keyword names).
w2v_params = dict(
    sg=0,         # 0 selects CBOW in gensim
    size=300,
    window=20,
    min_count=2,
    iter=300,     # this is important, b/c we want overfitting
    workers=4,
)
model = gensim.models.Word2Vec(documents, **w2v_params)
# Keep training on the same corpus for 20 further epochs.
model.train(documents, total_examples=len(documents), epochs=20)
print(model)

def getDistance(w1, w2):
    """Return the absolute cosine similarity of the vectors of two words.

    Despite the name this is not a distance: scipy's ``cosine`` yields the
    cosine distance (1 - similarity) and the result here is ``abs(1 - that)``,
    so a larger value means the two words are more alike.

    w1, w2 -- tokens looked up in the global ``model.wv``.
    """
    cosine_dist = spatial.distance.cosine(model.wv[w1], model.wv[w2])
    similarity = 1 - cosine_dist
    return abs(similarity)

# Sanity check: print getDistance for a few hand-picked word pairs.
print('------ 300 iters ----------')
for first, second in (('man', 'woman'), ('cat', 'dog'), ('king', 'queen')):
    print(getDistance(first, second))

Word2Vec(vocab=2828, size=300, alpha=0.025)
------ 300 iters ----------
0.17458155751228333
0.447554349899292
0.02189727872610092


In [43]:
from scipy import spatial
# Word2Vec settings for this run (gensim 3.x keyword names).
w2v_params = dict(
    sg=0,         # 0 selects CBOW in gensim
    size=300,
    window=20,
    min_count=5,
    iter=500,     # this is important, b/c we want overfitting
    workers=4,
)
model = gensim.models.Word2Vec(documents, **w2v_params)
# Keep training on the same corpus for 20 further epochs.
model.train(documents, total_examples=len(documents), epochs=20)
print(model)

def getDistance(w1, w2):
    """Return the absolute cosine similarity of the vectors of two words.

    Despite the name this is not a distance: scipy's ``cosine`` yields the
    cosine distance (1 - similarity) and the result here is ``abs(1 - that)``,
    so a larger value means the two words are more alike.

    w1, w2 -- tokens looked up in the global ``model.wv``.
    """
    cosine_dist = spatial.distance.cosine(model.wv[w1], model.wv[w2])
    similarity = 1 - cosine_dist
    return abs(similarity)

# Sanity check: print getDistance for a couple of hand-picked word pairs.
print('------ 500 iters ----------')
for first, second in (('man', 'woman'), ('cat', 'dog')):
    print(getDistance(first, second))
# print(getDistance('king', 'queen'))

Word2Vec(vocab=1224, size=300, alpha=0.025)
------ 500 iters ----------
0.060461245477199554
0.37299448251724243


### save & load model

In [44]:
from gensim.test.utils import common_texts, get_tmpfile
# Save the trained model and reload it from the same file, so the rest of the
# notebook works on the persisted artifact. A single relative path is used for
# both steps; the file lands in the current working directory.
MODEL_PATH = "word2vec.model"
model.save(MODEL_PATH)
model = gensim.models.Word2Vec.load(MODEL_PATH)

## build igraph using word2vec

In [100]:
# TODO: examine the word2vec effectiveness using visualizations

# with open('data/intersection.txt', 'r') as f:
#     inter_set = []
#     for word in f:
#         inter_set.append(word)

def _read_word_list(path):
    """Return the lines of the text file at ``path`` as a list of stripped strings.

    Stripping removes the trailing newline each raw line carries, so entries
    can be compared directly with vocabulary tokens. The number of entries is
    the number of lines in the file.
    """
    with open(path, 'r') as f:
        return [line.strip() for line in f]


book_set = _read_word_list('data/book_words_set.txt')
edu_set = _read_word_list('data/rebecca_words.txt')

# Compare the size of the model vocabulary with the two word lists.
print("word2vec:\t", len(model.wv.vocab))
# print("inter_set:\t", len(inter_set))
print("book_set:\t", len(book_set))
print("edu_set:\t", len(edu_set))

word2vec:	 1224
book_set:	 4751
edu_set:	 228


### construct igraph based on word2vec

In [46]:

from igraph import *
# Vocabulary of the trained model (gensim 3.x `wv.vocab` mapping keyed by token).
token2vec = model.wv.vocab

# Give every token a dense integer id: its position in the vocabulary's
# iteration order. The id doubles as the vertex index in the graph.
id2token = dict(enumerate(token2vec))
token2id = {token: idx for idx, token in id2token.items()}

# `words` was referenced in this notebook without ever being defined. Bind it
# to token2id so that `words.keys()` yields the tokens in vertex-id order.
words = token2id

vertices = list(range(len(token2vec)))
# Complete undirected graph: one edge per unordered pair of distinct vertices.
edges = [(i, j) for i in vertices for j in vertices if i < j]
g = Graph(vertex_attrs={"label": vertices}, edges=edges, directed=False)

# Vertex i carries the token with id i.
g.vs["word"] = [id2token[i] for i in vertices]
# Edge weight = getDistance of its two endpoint tokens.
# NOTE(review): getDistance returns |cosine similarity|, so a larger weight
# means more similar words - confirm that downstream measures expect that.
weights = [getDistance(id2token[i], id2token[j]) for i, j in edges]
g.es["weight"] = weights

### test the validity of the network

In [51]:
# Spot-check: the weight stored on an edge must equal a freshly computed
# getDistance value for the same pair of words.
for first, second in (('man', 'woman'), ('cat', 'dog')):
    assert getDistance(first, second) == g[token2id[first], token2id[second]]

## Centrality Measures of graph

### Strength

In [135]:
# Weighted strength of every vertex of g2.
# NOTE: g2 is built in the "degree" section further down; on a fresh kernel
# that cell has to run before this one.
g2_edge_weights = g2.es['weight']
strengthRank = g2.strength(None, weights=g2_edge_weights)

In [136]:
# Preview the first ten strength values (list position = vertex id).
strengthRank[:10]

[37.450281295969035,
 41.4577521471947,
 38.27075478559709,
 42.38103917868648,
 37.95091226203294,
 41.44975711300867,
 42.29899334170841,
 41.77512261062293,
 44.814972953652614,
 38.73943052738855]

### closeness

In [137]:
# Normalized closeness of every vertex of g2, following edges in any direction.
# NOTE: g2 is built in the "degree" section further down; on a fresh kernel
# that cell has to run before this one.
# NOTE(review): igraph uses `weights` as edge lengths for shortest-path
# measures, while these weights are |cosine similarity| values - confirm intent.
g2_edge_weights = g2.es['weight']
closenessRank = g2.closeness(None, 'all', weights=g2_edge_weights, normalized=True)

In [138]:
# Preview the first ten closeness values (list position = vertex id).
closenessRank[:10]

[1640.1718075756514,
 1560.6994895158048,
 1372.73361439548,
 1725.3999198642887,
 1793.039373291286,
 1999.8577433068444,
 1993.6150039523927,
 1817.171252494204,
 2091.06016232289,
 1765.687344501397]

### betweenness


In [78]:
# Remove edges whose weight is >= max_dist; only edges below it are kept.
# NOTE(review): the weights come from getDistance, which returns
# |cosine similarity|, so this keeps the *least* similar pairs - confirm that
# this is the intended filter.
max_dist = 0.5

kept = [(edge, dist) for edge, dist in zip(edges, weights) if dist < max_dist]
new_edges = [edge for edge, _ in kept]
new_weights = [dist for _, dist in kept]

g1 = Graph(vertex_attrs={"label": vertices}, edges=new_edges, directed=False)
# Label vertex i with the token whose id is i. This replaces `words.keys()`,
# which relied on a `words` mapping that is never defined in the notebook.
g1.vs["word"] = [id2token[v] for v in vertices]
g1.es["weight"] = new_weights

In [139]:
# Undirected, weighted betweenness of every vertex of g2.
# NOTE: g2 is built in the "degree" section further down; on a fresh kernel
# that cell has to run before this one.
# NOTE(review): igraph uses `weights` as edge lengths for shortest-path
# measures, while these weights are |cosine similarity| values - confirm intent.
g2_edge_weights = g2.es['weight']
betweennessRank = g2.betweenness(directed=False, weights=g2_edge_weights)

In [140]:
# Preview the first ten betweenness values (list position = vertex id).
betweennessRank[:10]

[2182.0, 121.0, 0.0, 2092.0, 3435.0, 8678.0, 11481.0, 7251.0, 18909.0, 2885.0]

### eigen_centrality 

In [145]:
# Undirected, weighted eigenvector centrality of every vertex of g2.
# NOTE: g2 is built in the "degree" section further down; on a fresh kernel
# that cell has to run before this one.
g2_edge_weights = g2.es['weight']
eigen_centralityRank = g2.eigenvector_centrality(directed=False, weights=g2_edge_weights)

In [146]:
# Preview the first ten eigenvector-centrality values (list position = vertex id).
eigen_centralityRank[:10]

[0.8059955392207956,
 0.8916044740535753,
 0.8241751917866988,
 0.9085864311307865,
 0.8196731211567353,
 0.8868333184381991,
 0.9069631463712312,
 0.8960718432158904,
 0.95833428996227,
 0.8355183596354967]

### degree

In [93]:
# Remove edges whose weight is >= max_dist; only edges below it are kept.
# NOTE(review): the weights come from getDistance, which returns
# |cosine similarity|, so this keeps the *least* similar pairs - confirm that
# this is the intended filter.
max_dist = 0.1

kept = [(edge, dist) for edge, dist in zip(edges, weights) if dist < max_dist]
new_edges = [edge for edge, _ in kept]
new_weights = [dist for _, dist in kept]

g2 = Graph(vertex_attrs={"label": vertices}, edges=new_edges, directed=False)
# Label vertex i with the token whose id is i. This replaces `words.keys()`,
# which relied on a `words` mapping that is never defined in the notebook.
g2.vs["word"] = [id2token[v] for v in vertices]
g2.es["weight"] = new_weights

In [95]:
# Number of edges incident to each vertex of g2 (unweighted).
degreeRank = g2.degree(mode='all')

In [96]:
# Preview the first ten degree values (list position = vertex id).
degreeRank[:10]

[863, 955, 864, 1034, 868, 1012, 966, 1062, 1098, 869]

### Frequency

In [99]:
# Raw corpus frequency of every graph token.
# Known limitation: only tokens present in token2id (i.e. in the graph) are
# counted; every other word in the book is ignored.
from collections import defaultdict

wordCounter = defaultdict(int)
with open("data/lemma_book.txt") as f:
    corpus_tokens = (tok.strip() for line in f for tok in line.split())
    for tok in corpus_tokens:
        if tok in token2id:
            wordCounter[tok] += 1

# One count per token, in token2id iteration order; tokens never seen get 0.
freqRank = [wordCounter[tok] for tok in token2id]
freqRank[:10]

[9, 147, 9, 419, 67, 686, 109, 1466, 3269, 26]

## generate dataframe for modeling

In [115]:
# Only keep words that are both in the graph and in the rebecca list.

# One word per line; strip the trailing newline so entries match the tokens.
with open('data/rebecca_words.txt', 'r') as f:
    edu_set = [line.strip() for line in f]

# Rank of a word = its 1-based line position in the file.
rebeccaRank = dict(zip(edu_set, range(1, len(edu_set) + 1)))

final_words_set = set(token2id).intersection(edu_set)
print("# of final words:\t", len(final_words_set))
print(rebeccaRank)

# of final words:	 39
{'ball': 1, 'dog': 2, 'spoon': 3, 'foot': 4, 'duck': 5, 'banana': 6, 'shoe': 7, 'cup': 8, 'eat': 9, 'bus': 10, 'flower': 11, 'mouth': 12, 'pencil': 13, 'cookie': 14, 'drum': 15, 'turtle': 16, 'red': 17, 'jump': 18, 'carrot': 19, 'read': 20, 'toe': 21, 'belt': 22, 'fly': 23, 'paint': 24, 'dance': 25, 'whistle': 26, 'kick': 27, 'lamp': 28, 'square': 29, 'fence': 30, 'empty': 31, 'happy': 32, 'fire': 33, 'castle': 34, 'squirrel': 35, 'throw': 36, 'farm': 37, 'penguin': 38, 'gift': 39, 'feather': 40, 'cobweb': 41, 'elbow': 42, 'juggle': 43, 'fountain': 44, 'net': 45, 'shoulder': 46, 'dress': 47, 'roof': 48, 'peek': 49, 'ruler': 50, 'tunnel': 51, 'branch': 52, 'envelope': 53, 'diamond': 54, 'calender': 55, 'buckle': 56, 'sawing': 57, 'panda': 58, 'vest': 59, 'arrow': 60, 'pick': 61, 'target': 62, 'drip': 63, 'knight': 64, 'deliver': 65, 'cactus': 66, 'dentist': 67, 'float': 68, 'claw': 69, 'uniform': 70, 'gigantic': 71, 'furry': 72, 'violin': 73, 'group': 74, 'globe': 

In [147]:
# Create the modeling dataframe: one row per word that is both in the graph
# and in the rebecca list, carrying its rebecca rank and every graph measure.
import pandas as pd

data = []
for word, word_idx in token2id.items():
    if word not in final_words_set:
        continue
    # Every measure list is positional by vertex id, so index it with the
    # token's own id rather than with a separate loop counter.
    data.append([word,
                 rebeccaRank[word],
                 strengthRank[word_idx],
                 closenessRank[word_idx],
                 betweennessRank[word_idx],
                 eigen_centralityRank[word_idx],
                 degreeRank[word_idx],
                 freqRank[word_idx]])

df = pd.DataFrame(data, columns=['word', 'rebec', 'strgth', 'close', 'betw', 'eigen', 'degree','freq'])
print(df.head())

    word  rebec     strgth        close     betw     eigen  degree  freq
0   pick     61  39.271378  1552.453369    778.0  0.845262     881    50
1    eat      9  42.516464  2001.123908  16473.0  0.911681     989   135
2   foot      4  41.647457  1138.184542     31.0  0.892675     950    29
3  catch     89  38.877337  1473.191100    456.0  0.838180     854    29
4    fly     23  40.017928  1833.844072   6132.0  0.861678     900    81


## inspect correlations between Rebecca's rank and our measures

In [None]:
### using all nodes for strength and closeness

In [123]:
# Correlate Rebecca's rank with every measure column (all columns after 'rebec').
# NOTE(review): Series.corr defaults to Pearson; 'rebec' is an ordinal rank,
# so a rank correlation may be the better fit - confirm.
rebecca_ranks = df['rebec']
for measure in df.columns[2:]:
    label = 'rebecca vs. {}'.format(measure)
    print(label, rebecca_ranks.corr(df[measure]))

rebecca vs. strgth 0.3339430480684464
rebecca vs. close -0.1836991936367226
rebecca vs. betw -0.3387505231017212
rebecca vs. eigen 0.3429530589923347
rebecca vs. degree -0.364914025988664
rebecca vs. freq -0.2575779191436691


In [None]:
### using g1 for strength and closeness

In [134]:
# Correlate Rebecca's rank with every measure column (all columns after 'rebec').
# NOTE(review): Series.corr defaults to Pearson; 'rebec' is an ordinal rank,
# so a rank correlation may be the better fit - confirm.
rebecca_ranks = df['rebec']
for measure in df.columns[2:]:
    label = 'rebecca vs. {}'.format(measure)
    print(label, rebecca_ranks.corr(df[measure]))

rebecca vs. strgth 0.34766129701642823
rebecca vs. close -0.1836991936367226
rebecca vs. betw -0.3387505231017212
rebecca vs. eigen 0.3429530589923347
rebecca vs. degree -0.364914025988664
rebecca vs. freq -0.2575779191436691


In [None]:
### using g2 for everything (reason: degree shows strong correlation)

In [148]:
# Correlate Rebecca's rank with every measure column (all columns after 'rebec').
# NOTE(review): Series.corr defaults to Pearson; 'rebec' is an ordinal rank,
# so a rank correlation may be the better fit - confirm.
rebecca_ranks = df['rebec']
for measure in df.columns[2:]:
    label = 'rebecca vs. {}'.format(measure)
    print(label, rebecca_ranks.corr(df[measure]))

rebecca vs. strgth -0.36432207290381824
rebecca vs. close -0.1836991936367226
rebecca vs. betw -0.3387505231017212
rebecca vs. eigen -0.3536867425684928
rebecca vs. degree -0.364914025988664
rebecca vs. freq -0.2575779191436691


In [152]:
# Order the rows by Rebecca's rank. All measures here are the g2 versions
# (chosen because they gave the highest correlation).
df = df.sort_values('rebec')

In [156]:
# Write the table to a CSV file in the working directory, without the index column.
df.to_csv("./ranks_04_04.csv",index=False)