## construct word2vec from book

In [1]:
import gensim
# Read ./data/lemma_book.txt: every line becomes one document,
# tokenized on whitespace.
with open('./data/lemma_book.txt') as book_file:
    documents = [raw_line.split() for raw_line in book_file]

## visualize and validate the word embeddings
- tweaking `iter` and `epochs` improves the quality of the word embeddings
- use `iter=500`, `epochs=20` for now
- trained only on the book data (no external/pre-trained knowledge)

In [2]:
from scipy import spatial
# Hyper-parameters for this run (gensim 3.x argument names).
w2v_params = dict(
    sg=0,
    size=300,
    window=20,
    min_count=2,
    iter=20,  # author's note: this is important, b/c we want overfitting
    workers=4,
)
model = gensim.models.Word2Vec(documents, **w2v_params)
# NOTE(review): in gensim 3.x, passing the corpus to the constructor presumably
# already trains for `iter` epochs, so this call adds 20 more — confirm intended.
model.train(documents, total_examples=len(documents), epochs=20)
print(model)

def getDistance(w1, w2):
    """Return abs(1 - cosine distance) between the embeddings of `w1` and `w2`.

    SciPy's cosine distance is 1 - cos(angle), so the result is the absolute
    cosine similarity: close to 1 for (anti-)parallel vectors, close to 0 for
    orthogonal ones.

    NOTE(review): despite the name, larger values mean MORE similar vectors —
    confirm that callers thresholding on this value expect that.

    `model` is the notebook-global Word2Vec model, looked up at call time.
    """
    vec1, vec2 = model.wv[w1], model.wv[w2]
    cosine_dist = spatial.distance.cosine(vec1, vec2)
    return abs(1 - cosine_dist)

# Spot-check a few word pairs for the 20-iteration model.
print('------ 20 iters ----------')
for first_word, second_word in [('man', 'woman'), ('cat', 'dog')]:
    print(getDistance(first_word, second_word))
# ('king', 'queen') is left out of this run:
# print(getDistance('king', 'queen'))

Word2Vec(vocab=2828, size=300, alpha=0.025)
------ 20 iters ----------
0.6432896256446838
0.9624505639076233


In [3]:
from scipy import spatial
# Same setup as the previous run, with iter raised to 100.
w2v_params = dict(
    sg=0,
    size=300,
    window=20,
    min_count=2,
    iter=100,  # author's note: this is important, b/c we want overfitting
    workers=4,
)
model = gensim.models.Word2Vec(documents, **w2v_params)
# NOTE(review): in gensim 3.x, passing the corpus to the constructor presumably
# already trains for `iter` epochs, so this call adds 20 more — confirm intended.
model.train(documents, total_examples=len(documents), epochs=20)
print(model)

def getDistance(w1, w2):
    """Return abs(1 - cosine distance) between the vectors of `w1` and `w2`.

    Since SciPy's cosine distance equals 1 - cos(angle), this is the absolute
    cosine similarity (near 1: aligned or opposed, near 0: orthogonal).

    NOTE(review): the name says "distance" but higher values mean more
    similar vectors — confirm downstream use.

    Reads the notebook-global `model` at call time, so it always reflects the
    model most recently assigned to that name.
    """
    first_vec = model.wv[w1]
    second_vec = model.wv[w2]
    return abs(1 - spatial.distance.cosine(first_vec, second_vec))

# Spot-check a few word pairs for the 100-iteration model.
print('------ 100 iters ----------')
for first_word, second_word in [('man', 'woman'), ('cat', 'dog'), ('king', 'queen')]:
    print(getDistance(first_word, second_word))

Word2Vec(vocab=2828, size=300, alpha=0.025)
------ 100 iters ----------
0.10735931247472763
0.6275115013122559
0.10306362807750702


In [4]:
from scipy import spatial
# Same setup again, with iter raised to 300.
w2v_params = dict(
    sg=0,
    size=300,
    window=20,
    min_count=2,
    iter=300,  # author's note: this is important, b/c we want overfitting
    workers=4,
)
model = gensim.models.Word2Vec(documents, **w2v_params)
# NOTE(review): in gensim 3.x, passing the corpus to the constructor presumably
# already trains for `iter` epochs, so this call adds 20 more — confirm intended.
model.train(documents, total_examples=len(documents), epochs=20)
print(model)

def getDistance(w1, w2):
    """Return abs(1 - cosine distance) for the embeddings of `w1` and `w2`.

    This equals the absolute cosine similarity, because SciPy defines cosine
    distance as 1 - cos(angle): about 1 for aligned/opposed vectors, about 0
    for orthogonal ones.

    NOTE(review): larger return values mean more similar words, which is the
    opposite of what the name suggests — confirm downstream use.

    The global `model` is resolved when the function is called.
    """
    embedding_pair = (model.wv[w1], model.wv[w2])
    cosine_dist = spatial.distance.cosine(*embedding_pair)
    return abs(1 - cosine_dist)

# Spot-check a few word pairs for the 300-iteration model.
print('------ 300 iters ----------')
for first_word, second_word in [('man', 'woman'), ('cat', 'dog'), ('king', 'queen')]:
    print(getDistance(first_word, second_word))

Word2Vec(vocab=2828, size=300, alpha=0.025)
------ 300 iters ----------
0.1841917634010315
0.44950127601623535
0.10599736869335175


In [16]:
from scipy import spatial
# Final configuration: iter=500, and min_count raised from 2 to 5 compared
# with the earlier runs in this notebook.
w2v_params = dict(
    sg=0,
    size=300,
    window=20,
    min_count=5,
    iter=500,  # author's note: this is important, b/c we want overfitting
    workers=4,
)
model = gensim.models.Word2Vec(documents, **w2v_params)
# NOTE(review): in gensim 3.x, passing the corpus to the constructor presumably
# already trains for `iter` epochs, so this call adds 20 more — confirm intended.
model.train(documents, total_examples=len(documents), epochs=20)
print(model)

def getDistance(w1, w2):
    """Return abs(1 - cosine distance) between the embeddings of two words.

    With SciPy's cosine distance defined as 1 - cos(angle), the returned value
    is the absolute cosine similarity of the two vectors.

    NOTE(review): a value near 1 means the words are very similar and a value
    near 0 means they are unrelated, so this is not a distance in the usual
    sense — confirm that the graph-threshold cells expect this.

    Uses the notebook-global `model`, resolved at call time.
    """
    vec_a, vec_b = model.wv[w1], model.wv[w2]
    similarity = 1 - spatial.distance.cosine(vec_a, vec_b)
    return abs(similarity)

# Spot-check a few word pairs for the 500-iteration model.
print('------ 500 iters ----------')
for first_word, second_word in [('man', 'woman'), ('cat', 'dog')]:
    print(getDistance(first_word, second_word))
# ('king', 'queen') is left out of this run:
# print(getDistance('king', 'queen'))

Word2Vec(vocab=1224, size=300, alpha=0.025)
------ 500 iters ----------
0.06545022875070572
0.3772120177745819


### save & load model

In [17]:
from gensim.test.utils import common_texts, get_tmpfile
# NOTE(review): `path` (from get_tmpfile) is computed but never used below;
# the model is written to and re-read from "word2vec.model" as given, not
# from `path` — confirm which location is intended.
path = get_tmpfile("word2vec.model")

model_file = "word2vec.model"
model.save(model_file)
# Round-trip: replace the in-memory model with the copy loaded from disk.
model = gensim.models.Word2Vec.load(model_file)

## build igraph using word2vec

In [18]:
# TODO: examine the word2vec effectiveness using visualizations

# with open('data/intersection.txt', 'r') as f:
#     inter_set = []
#     for word in f:
#         inter_set.append(word)

def _read_word_list(path):
    """Return one entry per line of `path`, with surrounding whitespace removed.

    Stripping matters: iterating a text file yields lines that still carry
    their trailing newline, so unstripped entries would never match the
    model's vocabulary tokens. Blank lines are kept (as empty strings) so the
    printed counts stay the same as before.
    """
    with open(path, 'r') as word_file:
        return [line.strip() for line in word_file]


book_set = _read_word_list('data/book_words_set.txt')
edu_set = _read_word_list('data/rebecca_words.txt')

# Compare the size of the model vocabulary with the two word lists.
print("word2vec:\t", len(model.wv.vocab))
# print("inter_set:\t", len(inter_set))
print("book_set:\t", len(book_set))
print("edu_set:\t", len(edu_set))

word2vec:	 1224
book_set:	 4751
edu_set:	 228


### construct igraph based on word2vec

In [27]:
from igraph import *
# Vocabulary of the trained model; iterating it yields the words.
token2vec = model.wv.vocab

# Bidirectional vertex-id <-> word lookup, numbered in vocabulary iteration order.
id2token = dict(enumerate(token2vec))
token2id = {word: token_id for token_id, word in id2token.items()}
words = token2id  # alias read by the thresholded-graph cells further down

# Complete undirected graph: one vertex per word, one edge per unordered pair.
vertices = list(range(len(token2vec)))
# Start j at i + 1 so only i < j pairs are generated (same pairs, same order,
# without scanning the discarded half).
edges = [(i, j) for i in vertices for j in range(i + 1, len(vertices))]
g = Graph(vertex_attrs={"label": vertices}, edges=edges, directed=False)

# Pass a real list, one word per vertex in id order. A dict keys view is not a
# sequence, and igraph may then store the whole view on every vertex instead
# of assigning one word to each.
g.vs["word"] = [id2token[i] for i in vertices]

# Edge weight = getDistance() of the two endpoint words, in `edges` order.
weights = [getDistance(id2token[i], id2token[j]) for i, j in edges]
g.es["weight"] = weights

### test the validity of the network

In [28]:
# Spot-check: the weight stored in the graph for a word pair must equal a
# fresh getDistance() call for the same pair.
for left_word, right_word in (('man', 'woman'), ('cat', 'dog')):
    assert getDistance(left_word, right_word) == g[token2id[left_word], token2id[right_word]]

## Centrality Measures of graph

### Strength

In [48]:
# The centrality cells in this section read `g2`, but the cell that builds it
# sits further down (under "degree"), so a top-to-bottom run failed here with
# a NameError. Build the same thresholded graph before its first use; keep
# the threshold in sync with that later cell.
g2_max_dist = 0.1
g2_kept = [(edge, dist) for edge, dist in zip(edges, weights) if dist < g2_max_dist]
g2 = Graph(vertex_attrs={"label": vertices},
           edges=[edge for edge, dist in g2_kept],
           directed=False)
g2.vs["word"] = list(words)
g2.es["weight"] = [dist for edge, dist in g2_kept]

# Weighted strength of every vertex of g2 (vertices=None selects all).
strengthRank = g2.strength(None, weights=g2.es['weight'])
strengthRank[:10]

[38.75003078268492,
 42.48841784886463,
 38.917647697952816,
 42.114721707424906,
 35.994142794017534,
 43.51131318032276,
 42.20643922920863,
 40.37464757099042,
 44.98500001744469,
 37.89474887335746]

### closeness

In [49]:
# Weighted, normalized closeness of every vertex of g2.
g2_edge_weights = g2.es['weight']
closenessRank = g2.closeness(None, 'all', weights=g2_edge_weights, normalized=True)
# Preview the first ten values as the cell output.
closenessRank[:10]

[1808.958927083432,
 1749.6477087074572,
 1915.9969874838953,
 1621.9277408499327,
 1773.914299832439,
 1484.6054957951096,
 1341.230963977371,
 1902.5358331617228,
 1674.4189310987413,
 1420.9826123979083]

### betweenness


In [29]:
# Keep only the edges whose getDistance() value is below 0.5.
max_dist = 0.5

kept_pairs = [(edge, dist) for edge, dist in zip(edges, weights) if dist < max_dist]
new_edges = [edge for edge, dist in kept_pairs]
new_weights = [dist for edge, dist in kept_pairs]

# Same vertex set as the full graph, restricted to the kept edges.
g1 = Graph(vertex_attrs={"label": vertices}, edges=new_edges, directed=False)
# Pass a real list: a dict keys view is not a sequence, and igraph may then
# store the whole view on every vertex instead of one word per vertex.
g1.vs["word"] = list(words)
g1.es["weight"] = new_weights

In [50]:
# Weighted betweenness of every vertex of g2, treating edges as undirected.
g2_edge_weights = g2.es['weight']
betweennessRank = g2.betweenness(directed=False, weights=g2_edge_weights)
# Preview the first ten values as the cell output.
betweennessRank[:10]

[1500.0, 2343.0, 11210.0, 1579.0, 6948.0, 580.0, 429.0, 15632.0, 6533.0, 160.0]

### eigen_centrality 

In [51]:
# Weighted eigenvector centrality of every vertex of g2 (undirected).
g2_edge_weights = g2.es['weight']
eigen_centralityRank = g2.eigenvector_centrality(directed=False, weights=g2_edge_weights)
# Preview the first ten values as the cell output.
eigen_centralityRank[:10]

[0.8308252745300805,
 0.9072724801467226,
 0.8328147082121062,
 0.8985276867293528,
 0.7696499887426691,
 0.9284151293181844,
 0.9009343249756069,
 0.8602218363028828,
 0.956036647686703,
 0.8131211224894905]

### degree

In [31]:
# Keep only the edges whose getDistance() value is below 0.1.
max_dist = 0.1

kept_pairs = [(edge, dist) for edge, dist in zip(edges, weights) if dist < max_dist]
new_edges = [edge for edge, dist in kept_pairs]
new_weights = [dist for edge, dist in kept_pairs]

# Same vertex set as the full graph, restricted to the kept edges.
g2 = Graph(vertex_attrs={"label": vertices}, edges=new_edges, directed=False)
# Pass a real list: a dict keys view is not a sequence, and igraph may then
# store the whole view on every vertex instead of one word per vertex.
g2.vs["word"] = list(words)
g2.es["weight"] = new_weights

In [32]:
# Degree of every vertex of g2; mode='all' is passed explicitly
# (g2 itself is built with directed=False).
degreeRank = g2.degree(mode='all')
# Preview the first ten values as the cell output.
degreeRank[:10]

[878, 957, 874, 1023, 848, 1020, 965, 1058, 1098, 869]

### Frequency

In [33]:
# problem with frequency (only counts words in the graph)
from collections import defaultdict

# Count occurrences in the book, restricted to tokens that are graph vertices.
wordCounter = defaultdict(int)
with open("data/lemma_book.txt") as book_file:
    for book_line in book_file:
        # split() already yields whitespace-free tokens
        for token in book_line.split():
            if token in token2id:
                wordCounter[token] += 1

# One count per vertex, in token2id iteration order.
freqRank = [wordCounter[token] for token in token2id]
freqRank[:10]

[9, 147, 9, 419, 67, 686, 109, 1466, 3269, 26]

## generate dataframe for modeling

In [44]:
# only pick words both in graph and rebecca list

# One entry per line of the file, with surrounding whitespace removed.
with open('data/rebecca_words.txt', 'r') as word_file:
    edu_set = [entry.strip() for entry in word_file]

# 1-based rank = position of the word in the file.
rebeccaRank = {entry: position for position, entry in enumerate(edu_set, start=1)}

# Words that are both graph vertices and on the list.
final_words_set = set(token2id) & set(edu_set)
print("# of final words:\t", len(final_words_set))
print(rebeccaRank)

# of final words:	 39
{'ball': 1, 'dog': 2, 'spoon': 3, 'foot': 4, 'duck': 5, 'banana': 6, 'shoe': 7, 'cup': 8, 'eat': 9, 'bus': 10, 'flower': 11, 'mouth': 12, 'pencil': 13, 'cookie': 14, 'drum': 15, 'turtle': 16, 'red': 17, 'jump': 18, 'carrot': 19, 'read': 20, 'toe': 21, 'belt': 22, 'fly': 23, 'paint': 24, 'dance': 25, 'whistle': 26, 'kick': 27, 'lamp': 28, 'square': 29, 'fence': 30, 'empty': 31, 'happy': 32, 'fire': 33, 'castle': 34, 'squirrel': 35, 'throw': 36, 'farm': 37, 'penguin': 38, 'gift': 39, 'feather': 40, 'cobweb': 41, 'elbow': 42, 'juggle': 43, 'fountain': 44, 'net': 45, 'shoulder': 46, 'dress': 47, 'roof': 48, 'peek': 49, 'ruler': 50, 'tunnel': 51, 'branch': 52, 'envelope': 53, 'diamond': 54, 'calender': 55, 'buckle': 56, 'sawing': 57, 'panda': 58, 'vest': 59, 'arrow': 60, 'pick': 61, 'target': 62, 'drip': 63, 'knight': 64, 'deliver': 65, 'cactus': 66, 'dentist': 67, 'float': 68, 'claw': 69, 'uniform': 70, 'gigantic': 71, 'furry': 72, 'violin': 73, 'group': 74, 'globe': 

In [52]:
# create dataframe
import pandas as pd

# One row per word that is both a graph vertex and on Rebecca's list.
# Every rank list holds one value per vertex in id order, so index them by the
# word's vertex id explicitly instead of relying on the enumeration position
# happening to coincide with it.
data = []
for word, word_idx in token2id.items():
    if word in final_words_set:
        data.append([word,
                     rebeccaRank[word],
                     strengthRank[word_idx],
                     closenessRank[word_idx],
                     betweennessRank[word_idx],
                     eigen_centralityRank[word_idx],
                     degreeRank[word_idx],
                     freqRank[word_idx]])

df = pd.DataFrame(data, columns=['word', 'rebec', 'strgth', 'close', 'betw', 'eigen', 'degree','freq'])
print(df.head())

    word  rebec     strgth        close    betw     eigen  degree  freq
0   pick     61  39.275792  1749.764991  1104.0  0.839877     887    50
1    eat      9  42.850509  1609.885049  1653.0  0.913919     986   135
2   foot      4  42.100066  1426.893151     3.0  0.897943     952    29
3  catch     89  38.164909  1684.754610  1868.0  0.817990     852    29
4    fly     23  39.287387  1304.106602     0.0  0.840051     885    81


## inspect correlations between Rebecca's rank and our measures

### using all nodes 
- for strength and closeness (g)

### using max_dist = 0.5 
- for betweenness and eigen centrality (g1)

### using max_dist = 0.1 
- for degree (g2)

In [39]:
# Correlate Rebecca's rank with every measure column (all columns after
# 'word' and 'rebec').
measure_columns = df.columns[2:]
for measure in measure_columns:
    label = 'rebecca vs. {}'.format(measure)
    print(label, df['rebec'].corr(df[measure]))

rebecca vs. strgth 0.3135913621782361
rebecca vs. close 0.10808335379126607
rebecca vs. betw 0.013219015565545184
rebecca vs. eigen 0.3295114256127249
rebecca vs. degree -0.35072034004458963
rebecca vs. freq -0.2575779191436691


In [40]:
# Order the rows by Rebecca's rank, preview them, and write the table to CSV.
# (author's note: using g2-version for everything (highest correlation))
df = df.sort_values(by='rebec')
print(df.head())
output_csv = "./ranks_04_06_continuous_close_strgth.csv"
df.to_csv(output_csv, index=False)

     word  rebec     strgth        close    betw     eigen  degree  freq
35   ball      1  62.704245  1665.439269  4942.0  0.571568    1079    10
8     dog      2  95.111616  1519.493029   877.0  0.871343     875    66
11  spoon      3  93.671600  1419.554591   225.0  0.846802     883     7
2    foot      4  83.597866  1426.893151     3.0  0.780135     952    29
7    duck      5  96.382156  1561.317217   174.0  0.903327     877    17


### using max_dist = 0.5 
- for strength and closeness, betweenness and eigen centrality (g1)

### using max_dist = 0.1 
- for degree (g2)

In [46]:
# Correlate Rebecca's rank with every measure column (all columns after
# 'word' and 'rebec').
rebecca_column = df['rebec']
for measure in df.columns[2:]:
    print('rebecca vs. {}'.format(measure), rebecca_column.corr(df[measure]))

rebecca vs. strgth 0.32776575680353076
rebecca vs. close 0.10808335379126607
rebecca vs. betw 0.013219015565545184
rebecca vs. eigen 0.3295114256127249
rebecca vs. degree -0.35072034004458963
rebecca vs. freq -0.2575779191436691


In [47]:
# Order the rows by Rebecca's rank, preview them, and write the table to CSV.
# (author's note: using g2-version for everything (highest correlation))
df = df.sort_values(by='rebec')
print(df.head())
output_csv = "./ranks_04_06_maxdistance_0_5.csv"
df.to_csv(output_csv, index=False)

     word  rebec     strgth        close    betw     eigen  degree  freq
35   ball      1  62.704245  1665.439269  4942.0  0.571568    1079    10
8     dog      2  92.688268  1519.493029   877.0  0.871343     875    66
11  spoon      3  90.065604  1419.554591   225.0  0.846802     883     7
2    foot      4  83.597866  1426.893151     3.0  0.780135     952    29
7    duck      5  95.874417  1561.317217   174.0  0.903327     877    17


### using max_dist = 0.1 for everything
 - strength and closeness, betweenness and eigen centrality, degree (g2)

In [53]:
# Correlate Rebecca's rank with every measure column (all columns after
# 'word' and 'rebec').
for measure in list(df.columns)[2:]:
    correlation = df['rebec'].corr(df[measure])
    print('rebecca vs. {}'.format(measure), correlation)

rebecca vs. strgth -0.40733964984980287
rebecca vs. close 0.10808335379126607
rebecca vs. betw 0.013219015565545184
rebecca vs. eigen -0.4075250778740258
rebecca vs. degree -0.35072034004458963
rebecca vs. freq -0.2575779191436691


In [54]:
# Order the rows by Rebecca's rank, preview them, and write the table to CSV.
# (author's note: using g2-version for everything (highest correlation))
df = df.sort_values(by='rebec')
print(df.head())
output_csv = "./ranks_04_06_maxdistance_0_1.csv"
df.to_csv(output_csv, index=False)

     word  rebec     strgth        close    betw     eigen  degree  freq
35   ball      1  44.491207  1665.439269  4942.0  0.947106    1079    10
8     dog      2  38.267111  1519.493029   877.0  0.817279     875    66
11  spoon      3  38.745796  1419.554591   225.0  0.830752     883     7
2    foot      4  42.100066  1426.893151     3.0  0.897943     952    29
7    duck      5  39.796608  1561.317217   174.0  0.851659     877    17
