# snap2vec

Using Stanford social network datasets (SNAP) + word2vec

In [1]:
# you'll probably get a lot of warnings here -- sorry 
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import json
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
from sklearn.mixture import GaussianMixture as GMM
from sklearn.decomposition import PCA
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
import hdbscan
% matplotlib inline
output_notebook()



## 1. Preprocessing

We want to write both the full graph and sparsified versions of it to adjacency-list files. We then want to test word2vec on the various sparsified graphs and see whether it can predict links that occur only in the full adjacency list.

*Write Full graph*

In [101]:
! python preprocess_snaps.py twitter_combined.txt 1 directed

Constructing adjacency list
Writing to file


*Write 90% of edges*

In [102]:
! python preprocess_snaps.py twitter_combined.txt .9 directed

Constructing adjacency list
Writing to file


*Write 80% of edges*

In [103]:
! python preprocess_snaps.py twitter_combined.txt .8 directed

Constructing adjacency list
Writing to file


*Write 70% of edges*

In [104]:
! python preprocess_snaps.py twitter_combined.txt .7 directed

Constructing adjacency list
Writing to file


In [116]:
def adjListFromFile(fpath):
    '''
        Build an adjacency-list dictionary from a whitespace-delimited file.

        The first line of the file is always skipped (presumably a header
        line -- confirm against the script that writes these files). Every
        other non-blank line is read as ``node neighbor1 neighbor2 ...``.

        Parameters
        ----------
        fpath : str
            Path of the adjacency-list file to read.

        Returns
        -------
        dict
            Maps each node id (str) to the list of neighbor ids (str) found
            on its line. If a node id appears on several lines, the last
            line wins. A node with no neighbors maps to an empty list.
    '''
    adjList = {}
    with open(fpath, 'r') as f:
        # skip the first line; the default keeps an empty file from raising
        next(f, None)
        for line in f:
            # split() with no argument collapses runs of whitespace, so doubled
            # or trailing spaces cannot produce empty-string node ids
            tokens = line.split()
            if not tokens:
                # blank line: recording it would create a bogus '' node
                continue
            adjList[tokens[0]] = tokens[1:]
    return adjList

In [106]:
# Load the adjacency lists: the full graph plus two sparsified versions.
# NOTE(review): these files are presumably the outputs of the
# preprocess_snaps.py runs above (edge fraction encoded in the name) -- confirm.
fullGraph = adjListFromFile('twitter_combine1_lst.txt')
partialGraph = adjListFromFile('twitter_combine0.9_lst.txt')  # 0.9 file
partialGraphS = adjListFromFile('twitter_combine0.7_lst.txt')  # 0.7 file ("S" presumably = sparser)

In [107]:
# sanity check 
print(len(fullGraph), len(partialGraphS), len(partialGraph))

70097 67904 69505


In [108]:
def testNodes(partialGraph, fullGraph):
    '''
        another quick sanity check: function that 
        ensures every node in the partial graph
        is also in the full graph 
        
        (this wasn't the case before -- there was a bug
        where spaces weren't written but i fixed hehe)
    '''
    num_errors = 0 
    for node_id in partialGraph.keys(): 
        if node_id not in fullGraph.keys(): 
            num_errors += 1 
    return num_errors

# each sparsified graph must contain no node that the full graph lacks
assert testNodes(partialGraph, fullGraph) == 0
assert testNodes(partialGraphS, fullGraph) == 0

In [136]:
! python documentize.py twitter_combine0.9_lst.txt twitter.9vec.txt .01 100000

PROCESSING GRAPH
WRITING TO FILE


In [137]:
! python documentize.py twitter_combine0.7_lst.txt twitter.7vec.txt .01 100000

PROCESSING GRAPH
WRITING TO FILE


## 2. word2vec

In [147]:
%%time
model = Word2Vec(LineSentence('twitter.7vec.txt'), size=100, window=5, min_count=10, workers=4)

CPU times: user 14.9 s, sys: 183 ms, total: 15.1 s
Wall time: 8.67 s


In [150]:
def testPrediction(fullGraph, partialGraph, model):
    numExisting, numFound, numWrong = 0, 0, 0
    for i, user_id in enumerate(model.vocab.keys()):
        if i % 1000 == 0: 
            print("On word %s out of %s" % (i, len(model.vocab)))
        most_similar = [t[0] for t in model.most_similar(user_id)[:5]]
        for similar_user in most_similar: 
            if similar_user in partialGraph[user_id]:
                numExisting += 1
            elif similar_user in fullGraph[user_id]: 
                numFound += 1 
            else: 
                numWrong += 1 
    return numExisting, numFound, numWrong

In [151]:
testPrediction(fullGraph, partialGraphS, model)

On word 0 out of 16698
On word 1000 out of 16698
On word 2000 out of 16698
On word 3000 out of 16698
On word 4000 out of 16698
On word 5000 out of 16698
On word 6000 out of 16698
On word 7000 out of 16698
On word 8000 out of 16698
On word 9000 out of 16698
On word 10000 out of 16698
On word 11000 out of 16698
On word 12000 out of 16698
On word 13000 out of 16698
On word 14000 out of 16698
On word 15000 out of 16698
On word 16000 out of 16698


(18264, 3752, 61474)

## 3. Comparison with LinkPred

In [154]:
import linkpred

In [155]:
# twitter_combined.edgelist is a copy of the edge file (presumably
# twitter_combined.txt -- confirm) renamed with the .edgelist extension
# so that linkpred knows how to read it.
G = linkpred.read_network("twitter_combined.edgelist")

In [158]:
simrank = linkpred.predictors.SimRank(G, excluded=G.edges())