In [1]:
# Build the project's extension modules in place via setup.py's build_ext command.
# The training log further down reports a cython module being initialised, so this
# is presumably the step that compiles it -- confirm against setup.py.
!python3 setup.py build_ext --inplace

running build_ext


In [2]:
# Configure the root logger: "<timestamp> <message>" records at INFO level.
# No filename or handler is passed to basicConfig, so records go to its default
# stream handler (the console / cell output), not to a file.
import logging
logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO)

# Disabled alternative kept for reference: strip the existing handlers from the
# root logger and write the log to "log.log" instead of the console.
# formatter = logging.Formatter('%(asctime)s %(message)s')

# filelogger = logging.getLogger()

# for h in filelogger.handlers:
#     filelogger.removeHandler(h)

# filelogger.setLevel(logging.INFO)
# fh = logging.FileHandler("log.log",mode='w')
# fh.setLevel(logging.DEBUG)
# fh.setFormatter(formatter)
# filelogger.addHandler(fh)

In [3]:
# scikit-learn pieces used by the evaluation cell at the end of the notebook.
from sklearn.metrics import classification_report

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; the same helpers now live in `sklearn.model_selection`.  Try the current
# location first and fall back to the old one so the notebook runs on both
# recent and legacy installs.
try:
    from sklearn.model_selection import cross_val_predict,train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_predict,train_test_split
from sklearn.linear_model import LogisticRegression as lg
from sklearn.preprocessing import normalize,StandardScaler



In [4]:
import os

import numpy as np

# Seed NumPy's global random number generator so reruns draw the same numbers.
np.random.seed(0)

# --- run configuration -------------------------------------------------------
workers = 1             # passed to paper2vec(...) and to every model.train(...) call
alg = 'p2v_weighted'    # algorithm tag; the code below branches on it and on substrings of it
w2v_ratio = 1           # passed to paper2vec(...) as w2v_ratio
d2v_ratio = 1           # passed to paper2vec(...) as d2v_ratio
n2v_ratio = 0.2         # passed to paper2vec(...) as n2v_ratio
l2 = 1                  # passed to paper2vec(...) as l2
n2v_p = 0.7             # passed to paper2vec(...) as n2v_p
total_samples = 2e7     # sample budget; the default training branch multiplies it by
                        # the number of *_ratio values that are > 0
word_window = 10        # passed to paper2vec(...) as w2v_window
output = './cora_enrich.emb'   # file the paper embeddings are written to

# Dataset directory.  The original location stays the default so existing runs are
# unchanged; set the P2V_DATA_PATH environment variable to point the notebook at a
# different copy of the data without editing this cell.
datapath = os.environ.get("P2V_DATA_PATH", "/home/zhang18f/datasets/Cora_enrich/")


# Build the project's data iterator over `datapath`; it is handed to the
# paper2vec constructor below as its first argument.
from P2VDataIterator import DataIterator
dataset = DataIterator(data_path=datapath)

# The project module `model` is imported under the alias `paper2vec`; the notebook
# name `model` is bound later in this cell to the object paper2vec.paper2vec(...) returns.
import model as paper2vec
import numpy as np  # NOTE(review): duplicate -- numpy is already imported at the top of this cell
from collections import defaultdict  # NOTE(review): not referenced in the visible cells



if "unweighted" in alg:
    def file_len(fname):
        """Return the word count of `fname` as printed by ``wc -w``.

        Raises IOError carrying wc's stderr output when wc exits non-zero.
        """
        import subprocess
        wc = subprocess.Popen(
            ['wc', '-w', fname],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout_data, stderr_data = wc.communicate()
        if wc.returncode == 0:
            # The count is the first whitespace-separated field of wc's output.
            return int(stdout_data.strip().split()[0])
        raise IOError(stderr_data)

    len_w = file_len("%s/texts.txt" % datapath)
    len_n = file_len("%s/links.txt" % datapath)

    # Every ratio that is enabled (> 0) is replaced by a derived value;
    # every disabled one is pinned to 0.
    if w2v_ratio > 0:
        w2v_ratio = word_window
    else:
        w2v_ratio = 0

    if d2v_ratio > 0:
        d2v_ratio = 1
    else:
        d2v_ratio = 0

    if n2v_ratio > 0:
        n2v_ratio = len_n/len_w
    else:
        n2v_ratio = 0

# Any algorithm tag containing "p2v" (case-insensitive) needs at least one of
# d2v_ratio / n2v_ratio to be positive.
if "p2v" in alg.lower():
    if d2v_ratio <= 0 and n2v_ratio <= 0:
        raise ValueError('p2v need d2v_ratio or n2v_ratio > 0')

# Flags derived from the algorithm tag, worked out before the constructor call.
is_sg_or_dbow = "pv_dbow" in alg or "sg" in alg
# shuffle is 0 only for the sg / pv_dbow tags with at most 5e8 samples, else 1.
if is_sg_or_dbow and total_samples <= 5e8:
    shuffle_flag = 0
else:
    shuffle_flag = 1
tfidf_flag = 1 if "tfidf" in alg else 0
lde_flag = 1 if "LDE" in alg else 0

# Keyword arguments for the paper2vec constructor: values from the configuration
# above plus the fixed hyper-parameters used for this run.
model_kwargs = dict(
    w2v_ratio=w2v_ratio,
    d2v_ratio=d2v_ratio,
    n2v_ratio=n2v_ratio,
    workers=workers,
    w2v_window=word_window,
    alpha=0.025,
    min_alpha=0.0001,
    w2v_min_count=0,
    negative=5,
    noise_distribution=0.75,
    w2v_subsampling=0,
    d2v_subsampling=0,
    n2v_subsampling=0,
    n2v_p=n2v_p,
    l2=l2,
    batch_size=int(1e6),
    total_samples=total_samples,
    shuffle=shuffle_flag,
    tfidf=tfidf_flag,
    LDE=lde_flag,
)

model = paper2vec.paper2vec(dataset, **model_kwargs)

def _train(samples, **ratio_overrides):
    """Set the given ratio attributes on `model`, then run one training pass.

    The overrides are assigned onto `model` in the order they are passed,
    after which `model.train` is called with the notebook-level `workers`,
    a report_delay of 1 and `samples` as total_samples.
    """
    for attr_name, attr_value in ratio_overrides.items():
        setattr(model, attr_name, attr_value)
    model.train(
        workers=workers,
        report_delay=1,
        total_samples=samples,
    )


if alg == "sg+pv_dbow":
    # Two consecutive passes: w2v_ratio=1 / d2v_ratio=0 first, then the reverse.
    # NOTE(review): n2v_ratio is left at its configured value in this branch,
    # unlike the LDE branches below -- confirm that is intended.
    _train(total_samples, w2v_ratio=1, d2v_ratio=0)
    _train(total_samples, w2v_ratio=0, d2v_ratio=1)
elif alg == "LDE_doc":
    _train(total_samples, w2v_ratio=1, d2v_ratio=0, n2v_ratio=0)
elif alg == "LDE":
    # Two ratios are switched on here, and the sample budget is doubled.
    _train(total_samples * 2, w2v_ratio=1, d2v_ratio=0, n2v_ratio=1)
elif alg == "LDE_link":
    print("LDE_link")
    _train(total_samples, w2v_ratio=0, d2v_ratio=0, n2v_ratio=1)
else:
    # Keep the model's ratios as constructed and scale the sample budget by
    # the number of configured ratios that are positive.
    ratio = sum(1 for r in (w2v_ratio, d2v_ratio, n2v_ratio) if r > 0)
    _train(total_samples * ratio)

# Dump the paper embeddings as text: a "<rows> <cols>" header line followed by
# one "<paper id> <v0> <v1> ..." line per row of model.paper_embeddings.
with open(output, 'w') as emb_file:
    vectors = model.paper_embeddings
    n_papers = vectors.shape[0]
    emb_file.write("%s %s\n" % (n_papers, vectors.shape[1]))
    for pid in range(n_papers):
        values = " ".join(map(str, vectors[pid]))
        emb_file.write("%s %s\n" % (model.id2paper[pid], values))

2019-04-16 22:17:27,010 initializing cython module
2019-04-16 22:17:27,031 cython module initialized
2019-04-16 22:17:27,034 use P2V
2019-04-16 22:17:27,034 use TF-ICF
2019-04-16 22:17:27,035 initilize the model
2019-04-16 22:17:27,036 counting frequence
2019-04-16 22:17:28,088 done
2019-04-16 22:17:28,089 loading data
2019-04-16 22:17:29,337 data contains 2708 papers, 25955 words
2019-04-16 22:17:29,384 pre-processing negative sampling for w2v
2019-04-16 22:17:29,394 pre-processing negative sampling for d2v
2019-04-16 22:17:29,403 pre-processing negative sampling for n2v
2019-04-16 22:17:29,405 init embeddings
2019-04-16 22:17:29,418 done
2019-04-16 22:17:29,420 building word draw table
2019-04-16 22:17:29,421 building node draw table
2019-04-16 22:17:29,422 starting trainig threads with 60000000 samples
2019-04-16 22:17:32,112 progress: 1.67%, 1M samples trained, current loss 0.5798, current speed 0.37M/s, overall speed 0.37M/s, ETA: 158s
2019-04-16 22:17:34,475 progress: 3.33%, 2M s

2019-04-16 22:19:43,949 progress: 91.67%, 55M samples trained, current loss 0.3212, current speed 0.42M/s, overall speed 0.41M/s, ETA: 12s
2019-04-16 22:19:46,316 progress: 93.33%, 56M samples trained, current loss 0.3211, current speed 0.42M/s, overall speed 0.41M/s, ETA: 9s
2019-04-16 22:19:48,683 progress: 95.00%, 57M samples trained, current loss 0.3210, current speed 0.42M/s, overall speed 0.41M/s, ETA: 7s
2019-04-16 22:19:51,084 progress: 96.67%, 58M samples trained, current loss 0.3208, current speed 0.42M/s, overall speed 0.41M/s, ETA: 4s
2019-04-16 22:19:53,931 progress: 98.33%, 59M samples trained, current loss 0.3204, current speed 0.35M/s, overall speed 0.41M/s, ETA: 2s
2019-04-16 22:19:57,372 progress: 100.00%, 60M samples trained, current loss 0.3200, current speed 0.29M/s, overall speed 0.41M/s, ETA: 0s
2019-04-16 22:19:57,373 60000000 samples trained in 147 seconds


In [5]:
# Assemble the evaluation set.  idxs.txt and labels.txt are read in lock-step:
# line k of the first is a paper id, line k of the second its label.
X = []
Y = []
labels = {}  # label string -> integer class id, numbered in order of first appearance

# Both files are opened in a `with` block so the handles are closed as soon as
# the data has been read (they were previously opened and never closed).
with open('%s/idxs.txt' % datapath) as idx_file, \
        open('%s/labels.txt' % datapath) as label_file:
    # zip stops at the shorter file, so surplus lines in either one are ignored.
    for idx,label in zip(idx_file,label_file):
        idx = idx.rstrip()
        # model.paper(idx) presumably returns the embedding of that paper -- confirm in model.
        X.append(model.paper(idx))
        label = label.rstrip()
        # setdefault assigns the next free integer id the first time a label is seen.
        Y.append(labels.setdefault(label, len(labels)))


# X = normalize(X)
# X = StandardScaler().fit_transform(X)

# Hold out 20% of the papers, fit a logistic regression on the remaining 80%,
# and report per-class precision / recall / F1 on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(X,Y,train_size=0.8,random_state=0)
Y_predict = lg(random_state=0).fit(X_train,y_train).predict(X_test)

print(classification_report(y_test,Y_predict,digits=4))

             precision    recall  f1-score   support

          0     0.9000    0.8438    0.8710        32
          1     0.8914    0.9231    0.9070       169
          2     0.9057    0.8421    0.8727        57
          3     0.9255    0.9775    0.9508        89
          4     0.8082    0.8551    0.8310        69
          5     0.9487    0.8810    0.9136        42
          6     0.9744    0.9048    0.9383        84

avg / total     0.9057    0.9041    0.9041       542

