# import library

In [1]:
%matplotlib inline
import nltk
import numpy as np
import tensorflow as tf
from tensorflow.python.client import timeline
import networkx as nx
from collections import defaultdict,namedtuple,Counter
from glob import glob
import sys
import os
import math
import random
from six.moves import xrange
if sys.version_info[0] >= 3:
    unicode = str

import logging
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,train_test_split
from gensim.models.word2vec import Word2Vec

random.seed(0)
np.random.seed(0)

2017-02-21 10:03:39,272 'pattern' package not found; tag filters are not available for English


# load data

In [2]:
CORA = namedtuple('CORA', 'words tags')

datasets = []
labels = defaultdict(list)
with open("cora.content") as f:
    for line in f:
        line = line.split()
        ID = line[0]
        labels[line[-1]].append(ID)
        words = []
        for i,w in enumerate(line[1:-1]):
            if w == "1":
                words.append(str(i))
        datasets.append(
            CORA(
                words,
                [ID]
            )
        )

logging.info("done... %s papers loaded" % (len(datasets)))
logging.info("%s labels" % (len(labels)))

2017-02-21 10:03:39,919 done... 2708 papers loaded
2017-02-21 10:03:39,922 7 labels


# pretrain doc2vec

In [3]:
import random
from gensim.models.doc2vec import Doc2Vec
#model = Doc2Vec(dbow_words=1,iter=5,batch_words=100,negative=20,min_count=0,sample=0.001,dm=0)
model = Doc2Vec(alpha=0.025, window=10, min_count=10, min_alpha=0.025, size=100)
model.build_vocab(datasets)

# decrease alpha
for i in range(10):
    random.shuffle(datasets)
    model.alpha = 0.025-0.002*i
    model.min_alpha = model.alpha
    model.train(datasets)

2017-02-21 10:03:39,940 collecting all words and their counts
2017-02-21 10:03:39,941 PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-02-21 10:03:39,962 collected 1432 word types and 2708 unique tags from a corpus of 2708 examples and 49216 words
2017-02-21 10:03:39,963 Loading a fresh vocabulary
2017-02-21 10:03:39,967 min_count=10 retains 968 unique words (67% of original 1432, drops 464)
2017-02-21 10:03:39,968 min_count=10 leaves 46251 word corpus (93% of original 49216, drops 2965)
2017-02-21 10:03:39,971 deleting the raw counts dictionary of 1432 items
2017-02-21 10:03:39,972 sample=0.001 downsamples 76 most-common words
2017-02-21 10:03:39,973 downsampling leaves estimated 39952 word corpus (86.4% of prior 46251)
2017-02-21 10:03:39,974 estimated required memory for 968 words and 100 dimensions: 2883200 bytes
2017-02-21 10:03:39,978 resetting layer weights
2017-02-21 10:03:40,035 training model with 3 workers on 968 vocabulary and 100 features, using 

# predict

In [4]:
# classify with 50% data as training dataset
X = []
Y = []
with open('doc2vec.embd','w') as f:
    f.write("%s %s\n"%(len(datasets),100))
    for y,key in enumerate(labels.keys()):
        for index,paper in enumerate(labels[key]):
            f.write(paper+" "+" ".join([str(x) for x in model.docvecs[paper]])+"\n")
            X.append(model.docvecs[paper])
            Y.append(y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=0)
clf = SVC(kernel='rbf',C=1.5).fit(X_train,y_train)
print(clf.score(X_test, y_test))

# classify with 10-fold
parameters = {
    "kernel":["rbf"],
    "C" :[1.5]
             }
tunedclf = GridSearchCV(clf,parameters,cv=10,n_jobs=24)
tunedclf.fit(X,Y)
print("scores %s" % tunedclf.best_score_)

0.620384047267
scores 0.670605612999


# build graph

In [5]:
G = defaultdict(dict)

for data in datasets:
    for n in model.docvecs.most_similar(data.tags,topn=2):
        G[data.tags[0]][n[0]] = None
        G[n[0]][data.tags[0]] = None

with open('cora.cites') as f:
    for line in f:
        line = line.rstrip().split("\t")
        try:
            G[line[0]][line[1]] = None
            G[line[1]][line[0]] = None
        except:
            print(line)

neighbors = []

# default parameters for deepwalk
# 10 iterations
for i in range(10):
    for node in G:
        path = [node]
        # 40 walks per node
        while len(path) < 40:
            cur = path[-1]
            path.append(random.choice(list(G[cur].keys())))
        neighbors.append(path)

2017-02-21 10:03:52,934 precomputing L2-norms of doc weight vectors


In [6]:
from gensim.models.word2vec import Word2Vec
p2v = Word2Vec(size=100, window=5, min_count=0)
p2v.build_vocab(neighbors)
p2v.intersect_word2vec_format('doc2vec.embd')
p2v.train(neighbors)
#model = Word2Vec.load_word2vec_format('p2v.emb')

2017-02-21 10:03:55,778 collecting all words and their counts
2017-02-21 10:03:55,780 PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-02-21 10:03:55,844 PROGRESS: at sentence #10000, processed 400000 words, keeping 2708 word types
2017-02-21 10:03:55,909 PROGRESS: at sentence #20000, processed 800000 words, keeping 2708 word types
2017-02-21 10:03:55,958 collected 2708 word types from a corpus of 1083200 raw words and 27080 sentences
2017-02-21 10:03:55,959 Loading a fresh vocabulary
2017-02-21 10:03:55,970 min_count=0 retains 2708 unique words (100% of original 2708, drops 0)
2017-02-21 10:03:55,971 min_count=0 leaves 1083200 word corpus (100% of original 1083200, drops 0)
2017-02-21 10:03:55,980 deleting the raw counts dictionary of 2708 items
2017-02-21 10:03:55,981 sample=0.001 downsamples 4 most-common words
2017-02-21 10:03:55,982 downsampling leaves estimated 1075390 word corpus (99.3% of prior 1083200)
2017-02-21 10:03:55,983 estimated required memory for

5377107

# predict

In [7]:
X = []
Y = []
for y,key in enumerate(labels.keys()):
    for index,paper in enumerate(labels[key]):
        X.append(p2v[paper])
        Y.append(y)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=42)
clf = SVC(kernel='rbf',C=1.5).fit(X_train,y_train)
print(clf.score(X_test, y_test))

clf = SVC()
parameters = {
    "kernel":["rbf"],
    "C" :[1,10,100]
             }
tunedclf = GridSearchCV(clf,parameters,cv=10,n_jobs=24)
tunedclf.fit(X,Y)
logging.info("scores %s" % tunedclf.best_score_)

0.822008862629


2017-02-21 10:04:04,687 scores 0.820901033973


# predict neighbor

In [8]:
G = defaultdict(dict)

for data in datasets:
    for n in model.docvecs.most_similar(data.tags,topn=2):
        G[data.tags[0]][n[0]] = None
        G[n[0]][data.tags[0]] = None

with open('cora.cites') as f:
    for line in f:
        line = line.rstrip().split("\t")
        try:
            G[line[0]][line[1]] = None
            G[line[1]][line[0]] = None
        except:
            print(line)

        #f.write("%s,%s\n" % (data.tags[0],n[0]))
neighbors = []

for node in G:
    for neighbor in node:
        neighbors.append([node,neighbor])
print(len(neighbors))

15943


# method in the paper

In [9]:
p2v = Word2Vec(size=100, window=5, min_count=0)
p2v.build_vocab(neighbors)
p2v.intersect_word2vec_format('doc2vec.embd')
p2v.train(neighbors)
#model = Word2Vec.load_word2vec_format('p2v.emb')

2017-02-21 10:04:05,145 collecting all words and their counts
2017-02-21 10:04:05,146 PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-02-21 10:04:05,153 PROGRESS: at sentence #10000, processed 20000 words, keeping 1711 word types
2017-02-21 10:04:05,158 collected 2718 word types from a corpus of 31886 raw words and 15943 sentences
2017-02-21 10:04:05,159 Loading a fresh vocabulary
2017-02-21 10:04:05,167 min_count=0 retains 2718 unique words (100% of original 2718, drops 0)
2017-02-21 10:04:05,168 min_count=0 leaves 31886 word corpus (100% of original 31886, drops 0)
2017-02-21 10:04:05,176 deleting the raw counts dictionary of 2718 items
2017-02-21 10:04:05,177 sample=0.001 downsamples 10 most-common words
2017-02-21 10:04:05,178 downsampling leaves estimated 18484 word corpus (58.0% of prior 31886)
2017-02-21 10:04:05,179 estimated required memory for 2718 words and 100 dimensions: 3533400 bytes
2017-02-21 10:04:05,187 resetting layer weights
2017-02-21 10:04:0

92376

In [10]:
X = []
Y = []
for y,key in enumerate(labels.keys()):
    for index,paper in enumerate(labels[key]):
        X.append(p2v[paper])
        Y.append(y)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=42)
clf = SVC(kernel='rbf',C=1.5).fit(X_train,y_train)
print(clf.score(X_test, y_test))

clf = SVC()
parameters = {
    "kernel":["rbf"],
    "C" :[1,10,100]
             }
tunedclf = GridSearchCV(clf,parameters,cv=10,n_jobs=24)
tunedclf.fit(X,Y)
print("scores %s" % tunedclf.best_score_)

0.599704579025
scores 0.706425406204
