In [2]:
from preprocess import * 

# Data directory and the corpus / lemma file paths that live under it.
DIR = 'fb_data'
BOOK_FILE = DIR + '/cbt.txt'
BOOK_LEMMA_FILE = DIR + '/cbt_lemma.txt'
PPVT_LEMMA_FILE = DIR + '/PPVT_lemma.csv'

## Preprocessing

In [3]:
# Read the word list from the PPVT lemma CSV (presumably in rank order,
# going by the variable name — TODO confirm in preprocess.py).
ranked_words= read_rebecca_lemma(PPVT_LEMMA_FILE)
# Lemmatize the book corpus. BOOK_LEMMA_FILE is presumably where the lemmatized
# text is written or cached — TODO confirm. pos_words is indexed by a
# part-of-speech key ('n', 'v') in the visualization cells further down.
book_words, pos_words = lemmatize_book(BOOK_FILE, BOOK_LEMMA_FILE)
# NOTE(review): looks like this writes word-set files under DIR — confirm.
generate_wordset_files(book_words, ranked_words, DIR)

170
43439
228


## Training

In [36]:
from model import *

# Load the training documents; read_docs() takes no arguments here, so the
# data source is decided inside model.py.
documents = read_docs()
# Train the Word2Vec model that the following cells query and visualize.
# NOTE(review): size/min_count/iters/window are presumably forwarded to
# gensim's Word2Vec (vector size, minimum word frequency, epochs, context
# window) — confirm in model.train_word2vec.
model = train_word2vec(documents, size=300, min_count=5, iters=500, window=20)

In [37]:
# Print the corpus frequency summary, then spot-check a few word pairs
# against the trained model.
freq_counts(documents)
for first, second in (('man', 'woman'), ('cat', 'dog'), ('coffee', 'girl')):
    print(first, second, getDistance(first, second, model))

freq  1 180076
freq  2 60524
freq  3 37660
freq  4 28916
freq  5 23983
man woman 0.01287195086479187
cat dog 0.1432747095823288
coffee girl 0.22811923921108246


In [None]:
## intersection with Word2Vec Google + our corpus

In [32]:
# Count how many of the ranked words are present in the trained model's
# vocabulary. The vocabulary set is built once up front; previously it was
# rebuilt inside the comprehension filter for every candidate word.
model_vocab = set(model.wv.vocab)
len([w for w in ranked_words if w in model_vocab])

128

In [34]:
# Count how many of the ranked words also occur in the lemmatized book.
# A set gives constant-time membership checks.
book_set = set(book_words)
sum(1 for word in ranked_words if word in book_set)

170

## Visualization

In [41]:
# NOUNS: build and save one graph per distance threshold, 0.01 through 0.20.
for i in range(0, 20, 1):
	max_dist = 0.01*(i+1)

	g = build_pos_graph(model, pos_words['n'], sampling_size=10, max_dist=max_dist)

	# Plot options handed to plot() as keyword arguments.
	visual_style = {
		"vertex_label_color": "#0088ff",
		"vertex_label_size": 15,
		"vertex_size": 25,
		"vertex_label": g.vs["label"],
		"layout": g.layout("kk"),
		"bbox": (300, 300),
		"margin": 20,
	}
	# Format the threshold to two decimals for the file name: 0.01*7 evaluates
	# to 0.07000000000000001, which previously leaked into the output path.
	filename = "./output/noun{:.2f}.pdf".format(max_dist)
	plot(g, filename, **visual_style)

In [44]:
# VERBS: build and save one graph per distance threshold, 0.01 through 0.20.
for i in range(0, 20, 1):
	max_dist = 0.01*(i+1)

	g = build_pos_graph(model, pos_words['v'], sampling_size=10, max_dist=max_dist)

	# Plot options handed to plot() as keyword arguments.
	visual_style = {
		"vertex_label_color": "#0088ff",
		"vertex_label_size": 15,
		"vertex_size": 20,
		"vertex_label": g.vs["label"],
		"layout": g.layout("kk"),
		"bbox": (300, 300),
		"margin": 20,
	}
	# Format the threshold to two decimals for the file name: 0.01*7 evaluates
	# to 0.07000000000000001, which previously leaked into the output path.
	filename = "./output/verb{:.2f}.pdf".format(max_dist)
	plot(g, filename, **visual_style)

## Rank words

In [2]:
from model import *

# Reload the training documents for the ranking experiment.
documents = read_docs()

# Option 1 (active): train from scratch, no pretrained vectors.
# NOTE(review): the positional arguments presumably map to size=300,
# min_count=5, iters=200, mirroring the keyword call in the training cell —
# confirm against model.train_word2vec. This rebinds `model`, replacing the
# model trained earlier in the notebook.
model = train_word2vec(documents, 300, 5, 200)
# Option 2: use the pretrained model as-is.
# model = load_pretrained_word2vec(documents, 300, 5, 100, retrain=False)
# Option 3: start from the pretrained model and retrain on our documents.
# model = load_pretrained_word2vec(documents, 300, 5, 30, retrain=True)

# Sanity checks: frequency summary plus a few pairwise distances.
freq_counts(documents)
print('man', 'woman', getDistance('man', 'woman', model))
print('cat', 'dog', getDistance('cat', 'dog', model))
print('sun', 'kid', getDistance('sun', 'kid', model))

# Build the word graph from the model. build_graph returns the graph, an
# (edges, vertices) pair, and the two id<->token mappings.
g, (edges, vertices), (id2token, token2id) = build_graph(model, sampling=True)

# Compute the per-word graph measures into a DataFrame.
df = compute_measures_df(g, edges, vertices, id2token, token2id)

# Correlate the 'rebec' column with every column from the third onward.
# In the table printed below, the first two columns are 'word' and 'rebec'.
# Series.corr defaults to Pearson correlation.
for col in df.columns[2:]:  
    print('rebecca vs. {}'.format(col), df['rebec'].corr(df[col]))

# Save the table to CSV without the index column.
df.to_csv("./ranks_04_16_continuous_close_strgth.csv",index=False)

freq  1 10183
freq  2 2828
freq  3 1809
freq  4 1454
freq  5 1224
man woman 0.11811476200819016
cat dog 0.38874325156211853
sun kid 0.12517696619033813
word2vec:	 1224
book_set:	 4751
edu_set:	 228
surrounding words 1125
intersertion words 28
     word  rebec     strgth       close   betw     eigen  degree  freq
23   ball      1   5.983983  219.075029  230.0  0.447835     116    10
8     dog      2   9.965104  168.385991   42.0  0.814703      89    66
27  spoon      3  10.953181  242.473746  153.0  0.749478      89     7
5    foot      4   9.342622  237.922658  491.0  0.756830      98    29
18   duck      5  10.762405  167.400740    0.0  0.865990      88    17
rebecca vs. strgth 0.2411645579712757
rebecca vs. close 0.03563099953891488
rebecca vs. betw -0.09712636853686307
rebecca vs. eigen 0.313182398913958
rebecca vs. degree -0.2909009215348
rebecca vs. freq -0.30977605925871654
