In [1]:
from preprocess import * 

############## ONLY NEED TO CHANGE THE FOLLOWING TO USE preprocess module##########
DIR = 'fb_data' 
BOOK_FILE = '{}/cbt.txt'.format(DIR)
BOOK_LEMMA_FILE = '{}/cbt_lemma.txt'.format(DIR)
PPVT_LEMMA_FILE = '{}/PPVT_lemma.csv'.format(DIR)

## Preprocessing

In [2]:
ranked_words= read_rebecca_lemma(PPVT_LEMMA_FILE)
book_words, pos_words = lemmatize_book(BOOK_FILE, BOOK_LEMMA_FILE)
generate_wordset_files(book_words, ranked_words, DIR)

170
43439
228


## Training
    - Good to see increase in distance for these 3 pairs of examples

In [27]:
import gensim
from collections import defaultdict
from scipy import spatial
from gensim.models import KeyedVectors
from igraph import *
import random
import pandas as pd
from collections import Counter
"""
task
1. `model.py`: construct simple pipeline to build igraph, form df
2. `preprocess.py`: build several graphs (V,N) in ipython book
3. `visualize.py`: visualize network by sampling
"""

DIR = 'fb_data'
# paths to the 3 generated files from preprocess.py
BOOK_WORD_SET_FILE = '{}/book_words_set.txt'.format(DIR)
REBECCA_WORD_FILE = '{}/rebecca_words.txt'.format(DIR)
LEMMA_BOOK_FILE = "{}/cbt_lemma.txt".format(DIR)

def read_file_to_list(filename):
	words = []
	with open(filename, 'r') as f:
	    for word in f:
	        words.append(word.strip())
	return words

def getSimilarity(w1, w2, model):
    return abs(1-spatial.distance.cosine(model.wv[w1], model.wv[w2]))

def load_pretrained_word2vec(documents, size, min_count, iters, retrain=False):
	GOOGLE_W2V_FILE = './model/GoogleNews-vectors-negative300.bin'
	model = KeyedVectors.load_word2vec_format(GOOGLE_W2V_FILE, binary=True)  
	if not retrain:
		return model
	model_2 = gensim.models.Word2Vec(
	            documents,
	            sg=0,
	            size=size,
	            window=20,
	            min_count=min_count,
	            iter=iters,
	            workers=4)
	total_examples = model_2.corpus_count
	model_2.build_vocab([list(model.vocab.keys())], update=True)
	model_2.intersect_word2vec_format(GOOGLE_W2V_FILE, binary=True, lockf=1.0)
	model_2.train(documents, total_examples=total_examples, epochs=model_2.epochs)
	return model_2

def train_word2vec(documents, size=300, min_count=5, iters=100, window=20):
	model = gensim.models.Word2Vec(
        documents,
        sg=0,
        size=size,
        window=window,
        min_count=min_count,
        iter=iters,
        workers=4)
	model.train(documents, total_examples=len(documents), epochs=model.epochs)
	return model

def build_graph(model, sampling_size=0):

	book_set = read_file_to_list(BOOK_WORD_SET_FILE)
	edu_set = read_file_to_list(REBECCA_WORD_FILE)

	print("word2vec:\t", len(model.wv.vocab))
	print("book_set:\t", len(book_set))
	print("edu_set:\t", len(edu_set))
# 	surrounding_words = set(model.wv.vocab.keys()).intersection(book_set)
# 	surrounding_words = set(surrounding_words).difference(edu_set)
# 	print('surrounding words', len(surrounding_words))
	intersection_words = set(model.wv.vocab.keys()).intersection(edu_set)
	print('intersertion words', len(intersection_words))

	if sampling_size:
		token2vec = {}
		rand_words = random.sample(intersection_words, sampling_size)
		for word in rand_words:
			token2vec[word] = model.wv[word]
		for word in intersection_words:
			token2vec[word] = model.wv[word]
	else:
		token2vec = model.wv.vocab # bad

	idx = 0
	id2token = {}
	token2id = {}

	for word in token2vec:
	    id2token[idx] = word
	    token2id[word] = idx
	    idx += 1

	vertices = [idx for idx in range(len(token2vec))]
	edges = [(i, j) for i in vertices for j in vertices if i < j]
	g = Graph(vertex_attrs={"label":vertices}, edges=edges, directed=False)
	g.es["sim"] = [getSimilarity(id2token[i], id2token[j], model) for i,j in edges]
	g.es["dist"] = np.array(1-np.array(g.es['sim'])).tolist()
	# test validity
	# assert(getSimilarity('man', 'woman', model) == g[token2id['man'], token2id['woman']] )
	# assert(getSimilarity('cat', 'dog', model) == g[token2id['cat'], token2id['dog']] )

	return g, (edges, vertices), (id2token, token2id)

def filter_graph(weights, edges, vertices, id2token,  threshold=1.0):

	new_edges, new_weights = [], [] # weights: similarity
	new_distances = []
	for edge, w in zip(edges, weights):
	    if w >= threshold:
	        new_edges.append(edge)
	        new_weights.append(w)
	        new_distances.append(1-w)
	g0 = Graph(vertex_attrs={"label":vertices}, edges=new_edges, directed=False)

	g0.es["sim"] = new_weights
	g0.es["dist"] = new_distances
	g0.vs["label"] = [id2token[idx] for idx in vertices]

	return g0

def compute_measures_df(g, edges, vertices, id2token, token2id):
	"""
	compute centrality measures to output df
	"""

	# g -> strength, closeness (continuous)
	strengthRank = g.strength(None,  weights=g.es['sim'])
	closenessRank = g.closeness(None, 'all', weights=g.es['dist'], normalized=True)

	# g1 -> betweenness, eigen_centrality
	g1 = filter_graph(g.es["sim"], edges, vertices, id2token, threshold=0.5)
    
	eigen_centralityRank = g.eigenvector_centrality(directed=False, weights=g.es['sim'])

	betweennessRank = g.betweenness(vertices=None, 
                                     directed=False, 
                                     weights=g.es['dist'])

	# g2 -> degree
	g2 = filter_graph(g.es["sim"], edges, vertices, id2token, threshold=0.1)

	degreeRank = g2.degree(None, mode='all')
	print(degreeRank)

	# frequency
	wordCounter = defaultdict(int)
	with open(LEMMA_BOOK_FILE) as f:
	    for line in f:
	        for w in line.split():
	            w = w.strip()
	            if w in token2id:
	                wordCounter[w] += 1
	freqRank = [wordCounter[w] for w in token2id]

	# rebecca
	with open(REBECCA_WORD_FILE, 'r') as f:
	    edu_list = []
	    for word in f:
	        edu_list.append(word.strip())
	        
	rebeccaRank = {w:i+1 for i, w in enumerate(edu_list)}
	final_words_set = set(token2id.keys()).intersection(edu_list)

	print(len(strengthRank))
	print(len(betweennessRank))
	print(len(closenessRank))
	print(len(eigen_centralityRank))
	print(len(degreeRank))

	# create df
	data = []
	words_inorder = [id2token[idx] for idx in range(len(token2id))]
	for i, word in enumerate(words_inorder):
	    if word in final_words_set:
	        data.append([word,
	                    rebeccaRank[word],
	                    strengthRank[i], 
	                    closenessRank[i], 
	                    betweennessRank[i], 
	                    eigen_centralityRank[i],
	                    degreeRank[i],
	                    freqRank[i]])


	df = pd.DataFrame(data, columns=['word', 'ppvt', 'strgth', 
                                     'close', 'betw', 'eigen', 'degree', 'freq'])
	df = df.sort_values(by=['ppvt'])
	print(df.head())
	return df

def build_pos_graph(model, pos_word_set, sampling_size=0, max_dist=1):
	"""
	build semantic graph without referring Rebecca's word list;
	only to explore noun, verb's centrality in vocab
	"""

	book_set = read_file_to_list(BOOK_WORD_SET_FILE)
	edu_set = read_file_to_list(REBECCA_WORD_FILE)
	valid_pos_words = set(model.wv.vocab.keys()).intersection(edu_set)
	valid_pos_words = set(valid_pos_words).intersection(pos_word_set)
	# print('pos_word_set', len(pos_word_set))

	token2vec = {}
	if sampling_size > 0:
		rand_words = random.sample(valid_pos_words, sampling_size)
		for word in rand_words:
			token2vec[word] = model.wv[word]			
	else:
		token2vec = valid_pos_words

	idx = 0
	id2token = {}
	token2id = {}

	for word in token2vec:
	    id2token[idx] = word
	    token2id[word] = idx
	    idx += 1

	vertices = [idx for idx in range(len(token2vec))]
	edges = [(i, j) for i in vertices for j in vertices if i < j]
	weights = [getSimilarity(id2token[i], id2token[j], model) for i, j in edges]

	g = filter_graph(weights, edges, vertices, id2token, max_dist)

	return g, (edges, vertices), (id2token, token2id)

def read_docs():
	with open(LEMMA_BOOK_FILE) as f:
	    documents = []
	    for line in f:
	        documents.append(line.split())
	return documents

def freq_counts(documents):
    allwords = []
    for doc in documents:
        allwords += doc
    
    ct = Counter(allwords)
    for i in range(5):
        print('freq ', i+1, len([w for w in ct if ct[w] >= i+1]))

In [37]:
# from model import *

documents = read_docs()
model = train_word2vec(documents, size=100, min_count=0, iters=50, window=20)

In [38]:
freq_counts(documents)
print('table', 'girl', getSimilarity('table', 'girl', model))
print('man', 'woman', getSimilarity('man', 'woman', model))
print('girl', 'boy', getSimilarity('girl', 'boy', model))
print('sun', 'moon', getSimilarity('sun', 'moon', model))
print('cat', 'dog', getSimilarity('cat', 'dog', model))

freq  1 180076
freq  2 60524
freq  3 37660
freq  4 28916
freq  5 23983
table girl 0.27819833159446716
man woman 0.7985084056854248
girl boy 0.4241322875022888
sun moon 0.7583851218223572
cat dog 0.022278811782598495


## Overlapping in Vocab
    - therefore, we do not use google pre-trained word embeddings

In [23]:
vocab_overlapping_googleW2V = len([w for w in ranked_words if w in set(model.wv.vocab)])
print("Intersection b/w Google W2V, FB data, and PPVT is {}".format(vocab_overlapping_googleW2V))

book_set = set(book_words)
vocab_overlapping_fb_data = len([w for w in ranked_words if w in book_set])
print("Intersection b/w FB data, and PPVT is {}".format(vocab_overlapping_fb_data))

Intersection b/w Google W2V, FB data, and PPVT is 165
Intersection b/w FB data, and PPVT is 170


## Visualization
    - nouns
    - verbs

In [13]:
def normalize_list(l, lower, upper):
	max_min_range = max(l)-min(l)
# 	print('times', (upper-lower)/max_min_range)
	return [lower + (upper - lower) / max_min_range * x for x in l]

for i in range(0, 20, 1):
	max_dist = 0.01*(i+1)
	g, (edges, vertices), (id2token, token2id) = build_pos_graph(model, pos_words['n'], sampling_size=10, max_dist=max_dist)
	wordCounter = defaultdict(int)
	with open(LEMMA_BOOK_FILE) as f:
	    for line in f:
	        for w in line.split():
	            w = w.strip()
	            if w in token2id:
	                wordCounter[w] += 1
	rank = [wordCounter[w] for w in token2id]
	vertexSize = normalize_list(rank, 10, 40)
# 	vertexLabelSize = normalize_list(rank, 20, 40)
	visual_style = {}
	visual_style["vertex_color"] = "#94a1b6"
	visual_style["vertex_label_color"] = "#f20606"
	visual_style["vertex_label_size"] = 15 # 15
	visual_style["vertex_size"] = vertexSize # 15
	visual_style["vertex_label"] = g.vs["label"]
	layout = g.layout("kk")
	visual_style["layout"] = layout
	visual_style["bbox"] = (300, 300)
	visual_style["margin"] = 20
	filename = "./output/noun{}.pdf".format(max_dist)
	plot(g, filename, **visual_style)

In [14]:
def normalize_list(l, lower, upper):
	max_min_range = max(l)-min(l)
# 	print('times', (upper-lower)/max_min_range)
	return [lower + (upper - lower) / max_min_range * x for x in l]

for i in range(0, 20, 1):
	max_dist = 0.01*(i+1)
	g, (edges, vertices), (id2token, token2id) = build_pos_graph(model, pos_words['v'], sampling_size=10, max_dist=max_dist)
	wordCounter = defaultdict(int)
	with open(LEMMA_BOOK_FILE) as f:
	    for line in f:
	        for w in line.split():
	            w = w.strip()
	            if w in token2id:
	                wordCounter[w] += 1
	rank = [wordCounter[w] for w in token2id]
	vertexSize = normalize_list(rank, 10, 40)
# 	vertexLabelSize = normalize_list(rank, 20, 40)
	visual_style = {}
	visual_style["vertex_color"] = "#94a1b6"
	visual_style["vertex_label_color"] = "#f20606"
	visual_style["vertex_label_size"] = 15 # 15
	visual_style["vertex_size"] = vertexSize # 15
	visual_style["vertex_label"] = g.vs["label"]
	layout = g.layout("kk")
	visual_style["layout"] = layout
	visual_style["bbox"] = (300, 300)
	visual_style["margin"] = 20
	filename = "./output/noun{}.pdf".format(max_dist)
	plot(g, filename, **visual_style)

## rank words

In [39]:
# from model import *

# documents = read_docs()

# # no pretrained
# model = train_word2vec(documents, 300, 5, 200)
# # use pretrained model
# # model = load_pretrained_word2vec(documents, 300, 5, 100, retrain=False)
# # use pretrained + retrain
# # model = load_pretrained_word2vec(documents, 300, 5, 30, retrain=True)

# freq_counts(documents)
# print('man', 'woman', getDistance('man', 'woman', model))
# print('cat', 'dog', getDistance('cat', 'dog', model))
# print('sun', 'kid', getDistance('sun', 'kid', model))        
        
# build graph     
g, (edges, vertices), (id2token, token2id) = build_graph(model, sampling_size=10)

# compute measures
df = compute_measures_df(g, edges, vertices, id2token, token2id)

# see correlations
for col in df.columns[2:]:  
    print('rebecca vs. {}'.format(col), df['ppvt'].corr(df[col].rank(ascending=False)))    
    
for mname in ['strgth', 'close', 'betw', 'eigen', 'degree', 'freq']:
    cname = mname + '-rank'
    df[cname] = df[mname].rank(ascending=False)    

# save to csv
df.to_csv("./output/rankings_0625_170.csv",index=False)

word2vec:	 180076
book_set:	 43439
edu_set:	 228
intersertion words 165
[59, 55, 65, 67, 48, 41, 54, 75, 58, 49, 43, 48, 41, 71, 80, 43, 43, 51, 48, 75, 73, 46, 65, 48, 73, 55, 54, 53, 42, 81, 66, 34, 58, 67, 54, 44, 50, 62, 57, 68, 46, 59, 49, 71, 62, 73, 36, 70, 59, 55, 75, 42, 62, 49, 70, 53, 42, 48, 56, 41, 61, 59, 42, 44, 52, 44, 73, 54, 76, 49, 72, 40, 61, 48, 37, 49, 42, 81, 48, 42, 49, 73, 49, 64, 76, 50, 59, 49, 38, 46, 52, 44, 75, 80, 51, 43, 55, 70, 68, 47, 76, 46, 73, 49, 43, 42, 61, 33, 57, 52, 54, 63, 51, 54, 45, 48, 44, 59, 82, 63, 50, 61, 44, 49, 78, 66, 61, 66, 44, 46, 69, 51, 59, 57, 55, 42, 45, 47, 70, 74, 58, 54, 76, 47, 77, 70, 60, 38, 51, 47, 73, 41, 58, 56, 53, 51, 43, 54, 65, 59, 58, 66, 55, 71, 64]
165
165
165
165
165
      word  ppvt     strgth     close  betw     eigen  degree  freq
47    ball     1  27.373324  1.230778   6.0  0.786280      70   391
105    dog     2  12.242857  1.080674   0.0  0.166297      42  1327
75   spoon     3  13.525252  1.089972   0.0

In [40]:
df.head()

Unnamed: 0,word,ppvt,strgth,close,betw,eigen,degree,freq,strgth-rank,close-rank,betw-rank,eigen-rank,degree-rank,freq-rank
47,ball,1,27.373324,1.230778,6.0,0.78628,70,391,23.0,19.0,19.5,21.0,30.0,21.0
105,dog,2,12.242857,1.080674,0.0,0.166297,42,1327,134.0,135.0,94.0,131.0,150.5,7.0
75,spoon,3,13.525252,1.089972,0.0,0.235678,49,87,72.0,72.0,94.0,58.0,109.5,53.0
139,foot,4,31.64591,1.252969,17.0,0.941987,74,2389,7.0,9.0,12.5,6.0,16.0,2.0
93,duck,5,31.847777,1.26139,27.0,0.939853,80,344,4.0,3.0,10.0,7.0,4.5,26.0
