In [1]:
from preprocess import * 

############## ONLY NEED TO CHANGE THE FOLLOWING TO USE preprocess module##########
DIR = 'data' 
BOOK_FILE = '{}/book.txt'.format(DIR)
BOOK_LEMMA_FILE = '{}/lemmatize_book.txt'.format(DIR)
PPVT_LEMMA_FILE = '{}/PPVT lemma.csv'.format(DIR)

## Preprocessing

In [2]:
ranked_words= read_rebecca_lemma(PPVT_LEMMA_FILE)
book_words, pos_words = lemmatize_book(BOOK_FILE, BOOK_LEMMA_FILE)
generate_wordset_files(book_words, ranked_words, DIR)

85
4751
228


## Training
    - Good to see increase in distance for these 3 pairs of examples

In [117]:
import gensim
from collections import defaultdict
from scipy import spatial
from gensim.models import KeyedVectors
from igraph import *
import random
import pandas as pd
from collections import Counter
"""
task
1. `model.py`: construct simple pipeline to build igraph, form df
2. `preprocess.py`: build several graphs (V,N) in ipython book
3. `visualize.py`: visualize network by sampling
"""

DIR = 'data'
# paths to the 3 generated files from preprocess.py
BOOK_WORD_SET_FILE = '{}/book_words_set.txt'.format(DIR)
REBECCA_WORD_FILE = '{}/rebecca_words.txt'.format(DIR)
LEMMA_BOOK_FILE = "{}/lemmatize_book.txt".format(DIR)

def read_file_to_list(filename):
	words = []
	with open(filename, 'r') as f:
	    for word in f:
	        words.append(word.strip())
	return words

def getSimilarity(w1, w2, model):

    return abs(1-spatial.distance.cosine(model.wv[w1], model.wv[w2]))

def load_pretrained_word2vec(documents, size, min_count, iters, retrain=False):
	GOOGLE_W2V_FILE = './model/GoogleNews-vectors-negative300.bin'
	model = KeyedVectors.load_word2vec_format(GOOGLE_W2V_FILE, binary=True)  
	if not retrain:
		return model
	model_2 = gensim.models.Word2Vec(
	            documents,
	            sg=0,
	            size=size,
	            window=20,
	            min_count=min_count,
	            iter=iters,
	            workers=4)
	total_examples = model_2.corpus_count
	model_2.build_vocab([list(model.vocab.keys())], update=True)
	model_2.intersect_word2vec_format(GOOGLE_W2V_FILE, binary=True, lockf=1.0)
	model_2.train(documents, total_examples=total_examples, epochs=model_2.epochs)
	return model_2

def train_word2vec(documents, size=300, min_count=5, iters=100, window=20):
	model = gensim.models.Word2Vec(
        documents,
        sg=0,
        size=size,
        window=window,
        min_count=min_count,
        iter=iters,
        workers=4)
	model.train(documents, total_examples=len(documents), epochs=model.epochs)
	return model

def build_graph(model, sampling_size=0):

	book_set = read_file_to_list(BOOK_WORD_SET_FILE)
	edu_set = read_file_to_list(REBECCA_WORD_FILE)

	print("word2vec:\t", len(model.wv.vocab))
	print("book_set:\t", len(book_set))
	print("edu_set:\t", len(edu_set))
# 	surrounding_words = set(model.wv.vocab.keys()).intersection(book_set)
# 	surrounding_words = set(surrounding_words).difference(edu_set)
# 	print('surrounding words', len(surrounding_words))
	intersection_words = set(model.wv.vocab.keys()).intersection(edu_set)
	print('intersertion words', len(intersection_words))

	if sampling_size:
		token2vec = {}
		rand_words = random.sample(intersection_words, sampling_size)
		for word in rand_words:
			token2vec[word] = model.wv[word]
		for word in intersection_words:
			token2vec[word] = model.wv[word]
	else:
		token2vec = model.wv.vocab # bad

	idx = 0
	id2token = {}
	token2id = {}

	for word in token2vec:
	    id2token[idx] = word
	    token2id[word] = idx
	    idx += 1

	vertices = [idx for idx in range(len(token2vec))]
	edges = [(i, j) for i in vertices for j in vertices if i < j]
	g = Graph(vertex_attrs={"label":vertices}, edges=edges, directed=False)
	g.es["sim"] = [getSimilarity(id2token[i], id2token[j], model) for i,j in edges]
	g.es["dist"] = np.array(1-np.array(g.es['sim'])).tolist()
	# test validity
	# assert(getSimilarity('man', 'woman', model) == g[token2id['man'], token2id['woman']] )
	# assert(getSimilarity('cat', 'dog', model) == g[token2id['cat'], token2id['dog']] )

	return g, (edges, vertices), (id2token, token2id)

def filter_graph(weights, edges, vertices, id2token, min_sim_strength=1.0):

	new_edges, new_weights = [], [] # weights: similarity
	new_distances = []
	for edge, w in zip(edges, weights):
	    if w >= min_sim_strength:
	        new_edges.append(edge)
	        new_weights.append(w)
	        new_distances.append(1-w)
	g = Graph(vertex_attrs={"label":vertices}, edges=new_edges, directed=False)

	g.es["sim"] = new_weights
	g.es["dist"] = new_distances
	g.vs["label"] = [id2token[idx] for idx in vertices]

	return g

def compute_measures_df(g, edges, vertices, id2token, token2id):
	"""
	compute centrality measures to output df
	"""

	# g -> strength, closeness (continuous)
	strengthRank = g.strength(None,  weights=g.es['sim'])
	closenessRank = g.closeness(None, 'all', weights=g.es['dist'], normalized=True)

	# g1 -> betweenness, eigen_centrality
	g1 = filter_graph(g.es["sim"], edges, vertices, id2token, min_sim_strength=0.5)

	betweennessRank = g1.betweenness(directed=False, weights=g1.es['dist'])
	eigen_centralityRank = g1.eigenvector_centrality(directed=False, weights=g1.es['sim'])

	# g2 -> degree
	g2 = filter_graph(g.es["sim"], edges, vertices, id2token, min_sim_strength=0.5)

	degreeRank = g2.degree(mode='all')

	# frequency
	wordCounter = defaultdict(int)
	with open(LEMMA_BOOK_FILE) as f:
	    for line in f:
	        for w in line.split():
	            w = w.strip()
	            if w in token2id:
	                wordCounter[w] += 1
	freqRank = [wordCounter[w] for w in token2id]

	# rebecca
	with open(REBECCA_WORD_FILE, 'r') as f:
	    edu_list = []
	    for word in f:
	        edu_list.append(word.strip())
	        
	rebeccaRank = {w:i+1 for i, w in enumerate(edu_list)}
	final_words_set = set(token2id.keys()).intersection(edu_list)

	# create df
	data = []
	words_inorder = [id2token[idx] for idx in range(len(token2id))]
	for i, word in enumerate(words_inorder):
	    if word in final_words_set:
	        data.append([word,
	                    rebeccaRank[word],
	                    strengthRank[i], 
	                    closenessRank[i], 
	                    betweennessRank[i], 
	                    eigen_centralityRank[i],
	                    degreeRank[i],
	                    freqRank[i]])


	df = pd.DataFrame(data, columns=['word', 'ppvt', 'strgth', 'close', 'betw', 'eigen', 'degree', 'freq'])
	df = df.sort_values(by=['ppvt'])
	print(df.head())
	return df

def build_pos_graph(model, pos_word_set, sampling_size=0, max_dist=1):
	"""
	build semantic graph without referring Rebecca's word list;
	only to explore noun, verb's centrality in vocab
	"""

	book_set = read_file_to_list(BOOK_WORD_SET_FILE)
	edu_set = read_file_to_list(REBECCA_WORD_FILE)
	valid_pos_words = set(model.wv.vocab.keys()).intersection(edu_set)
	valid_pos_words = set(valid_pos_words).intersection(pos_word_set)
	# print('pos_word_set', len(pos_word_set))

	token2vec = {}
	if sampling_size > 0:
		rand_words = random.sample(valid_pos_words, sampling_size)
		for word in rand_words:
			token2vec[word] = model.wv[word]			
	else:
		token2vec = valid_pos_words

	idx = 0
	id2token = {}
	token2id = {}

	for word in token2vec:
	    id2token[idx] = word
	    token2id[word] = idx
	    idx += 1

	vertices = [idx for idx in range(len(token2vec))]
	edges = [(i, j) for i in vertices for j in vertices if i < j]
	weights = [getSimilarity(id2token[i], id2token[j], model) for i, j in edges]

	g = filter_graph(weights, edges, vertices, id2token, max_dist)

	return g, (edges, vertices), (id2token, token2id)

def read_docs():
	with open(LEMMA_BOOK_FILE) as f:
	    documents = []
	    for line in f:
	        documents.append(line.split())
	return documents

def freq_counts(documents):
    allwords = []
    for doc in documents:
        allwords += doc
    
    ct = Counter(allwords)
    for i in range(5):
        print('freq ', i+1, len([w for w in ct if ct[w] >= i+1]))

In [118]:
# build graph     
g, (edges, vertices), (id2token, token2id) = build_graph(model, sampling_size=10)

# compute measures
df = compute_measures_df(g, edges, vertices, id2token, token2id)

# see correlations
for col in df.columns[2:]:  
    print('rebecca vs. {}'.format(col), df['ppvt'].corr(df[col].rank(ascending=False)))

word2vec:	 10183
book_set:	 4751
edu_set:	 228
intersertion words 75
     word  ppvt     strgth     close  betw         eigen  degree  freq
16   ball     1   6.749831  1.100369   0.0  6.885495e-09       0    10
3     dog     2  18.277181  1.356707   4.0  5.739247e-01      14    66
70  spoon     3  17.148174  1.337213   2.0  5.864365e-01      13     7
35   foot     4  19.469876  1.386460   8.0  8.249493e-01      19    29
53   duck     5  19.297002  1.376698   2.0  7.142619e-01      17    17
rebecca vs. strgth 0.34670707555383795
rebecca vs. close 0.3449171577657438
rebecca vs. betw 0.344892356335694
rebecca vs. eigen 0.30977558006665695
rebecca vs. degree 0.3400108732925529
rebecca vs. freq 0.5152850749928397


In [75]:
# from model import *

documents = read_docs()
model = train_word2vec(documents, size=100, min_count=0, iters=40, window=10)

In [76]:
freq_counts(documents)
print('table', 'girl', getSimilarity('table', 'girl', model))
print('man', 'woman', getSimilarity('man', 'woman', model))
print('girl', 'boy', getSimilarity('girl', 'boy', model))
print('sun', 'moon', getSimilarity('sun', 'moon', model))
print('cat', 'dog', getSimilarity('cat', 'dog', model))

freq  1 10183
freq  2 2828
freq  3 1809
freq  4 1454
freq  5 1224
table girl 0.07762456685304642
man woman 0.4199064075946808
girl boy 0.5913309454917908
sun moon 0.7748672366142273
cat dog 0.8432342410087585


In [64]:
pmodel = load_pretrained_word2vec(documents, 100, 0, 40, retrain=False)

In [65]:
freq_counts(documents)
print('table', 'girl', getSimilarity('table', 'girl', pmodel))
print('man', 'woman', getSimilarity('man', 'woman', pmodel))
print('girl', 'boy', getSimilarity('girl', 'boy', pmodel))
print('sun', 'moon', getSimilarity('sun', 'moon', pmodel))
print('cat', 'dog', getSimilarity('cat', 'dog', pmodel))

freq  1 10183
freq  2 2828
freq  3 1809
freq  4 1454
freq  5 1224
table girl 0.03306927904486656
man woman 0.7664012312889099
girl boy 0.8543271422386169
sun moon 0.42628341913223267
cat dog 0.760945737361908




In [28]:
from sklearn.metrics.pairwise import cosine_similarity

from numpy import dot
from numpy.linalg import norm
a = model.wv['cat']
b = model.wv['dog']
cos_sim = dot(a, b)/(norm(a)*norm(b))
cos_sim

0.43008167

## Overlapping in Vocab
    - therefore, we do not use google pre-trained word embeddings

In [120]:
vocab_overlapping_googleW2V = len([w for w in ranked_words if w in set(model.wv.vocab)])
print("Intersection b/w Google W2V, FB data, and PPVT is {}".format(vocab_overlapping_googleW2V))

book_set = set(book_words)
vocab_overlapping_fb_data = len([w for w in ranked_words if w in book_set])
print("Intersection b/w FB data, and PPVT is {}".format(vocab_overlapping_fb_data))

Intersection b/w Google W2V, FB data, and PPVT is 75
Intersection b/w FB data, and PPVT is 85


## Visualization
    - nouns
    - verbs

In [121]:
def normalize_list(l, lower, upper):
	max_min_range = max(l)-min(l)
# 	print('times', (upper-lower)/max_min_range)
	return [lower + (upper - lower) / max_min_range * x for x in l]

for i in range(0, 20, 1):
	max_dist = 0.01*(i+1)
	g, (edges, vertices), (id2token, token2id) = build_pos_graph(model, pos_words['n'], sampling_size=10, max_dist=max_dist)
	wordCounter = defaultdict(int)
	with open(LEMMA_BOOK_FILE) as f:
	    for line in f:
	        for w in line.split():
	            w = w.strip()
	            if w in token2id:
	                wordCounter[w] += 1
	rank = [wordCounter[w] for w in token2id]
	vertexSize = normalize_list(rank, 10, 40)
# 	vertexLabelSize = normalize_list(rank, 20, 40)
	visual_style = {}
	visual_style["vertex_color"] = "#94a1b6"
	visual_style["vertex_label_color"] = "#f20606"
	visual_style["vertex_label_size"] = 15 # 15
	visual_style["vertex_size"] = vertexSize # 15
	visual_style["vertex_label"] = g.vs["label"]
	layout = g.layout("kk")
	visual_style["layout"] = layout
	visual_style["bbox"] = (300, 300)
	visual_style["margin"] = 20
	filename = "./output/noun{}.pdf".format(max_dist)
	plot(g, filename, **visual_style)

In [122]:
def normalize_list(l, lower, upper):
	max_min_range = max(l)-min(l)
# 	print('times', (upper-lower)/max_min_range)
	return [lower + (upper - lower) / max_min_range * x for x in l]

for i in range(0, 20, 1):
	max_dist = 0.01*(i+1)
	g, (edges, vertices), (id2token, token2id) = build_pos_graph(model, pos_words['v'], sampling_size=10, max_dist=max_dist)
	wordCounter = defaultdict(int)
	with open(LEMMA_BOOK_FILE) as f:
	    for line in f:
	        for w in line.split():
	            w = w.strip()
	            if w in token2id:
	                wordCounter[w] += 1
	rank = [wordCounter[w] for w in token2id]
	vertexSize = normalize_list(rank, 10, 40)
# 	vertexLabelSize = normalize_list(rank, 20, 40)
	visual_style = {}
	visual_style["vertex_color"] = "#94a1b6"
	visual_style["vertex_label_color"] = "#f20606"
	visual_style["vertex_label_size"] = 15 # 15
	visual_style["vertex_size"] = vertexSize # 15
	visual_style["vertex_label"] = g.vs["label"]
	layout = g.layout("kk")
	visual_style["layout"] = layout
	visual_style["bbox"] = (300, 300)
	visual_style["margin"] = 20
	filename = "./output/noun{}.pdf".format(max_dist)
	plot(g, filename, **visual_style)

## rank words

In [124]:
# from model import *

# documents = read_docs()

# # no pretrained
# model = train_word2vec(documents, 300, 5, 200)
# # use pretrained model
# # model = load_pretrained_word2vec(documents, 300, 5, 100, retrain=False)
# # use pretrained + retrain
# # model = load_pretrained_word2vec(documents, 300, 5, 30, retrain=True)

# freq_counts(documents)
# print('man', 'woman', getSimilarity('man', 'woman', model))
# print('cat', 'dog', getSimilarity('cat', 'dog', model))
# print('sun', 'kid', getSimilarity('sun', 'kid', model))        
        
# build graph     
g, (edges, vertices), (id2token, token2id) = build_graph(model, sampling_size=10)

# compute measures
df = compute_measures_df(g, edges, vertices, id2token, token2id)

# see correlations
for col in df.columns[2:]:  
    print('rebecca vs. {}'.format(col), df['ppvt'].corr(df[col].rank(ascending=False)))
    
# save to csv
df.to_csv("./output/rankings_0623_old_book.csv",index=False)

word2vec:	 10183
book_set:	 4751
edu_set:	 228
intersertion words 75
     word  ppvt     strgth     close  betw     eigen  degree  freq
9    ball     1   6.749831  1.100369   0.0  0.049723       0    10
52    dog     2  18.277181  1.356707   4.0  0.573873      14    66
70  spoon     3  17.148174  1.337213   2.0  0.586251      13     7
35   foot     4  19.469876  1.386460   8.0  0.824783      19    29
51   duck     5  19.297002  1.376698   2.0  0.714131      17    17
rebecca vs. strgth 0.34670707555383795
rebecca vs. close 0.3449171577657438
rebecca vs. betw 0.344892356335694
rebecca vs. eigen 0.30977558006665695
rebecca vs. degree 0.3400108732925529
rebecca vs. freq 0.5152850749928397
