In [None]:
import os
import sys
import pandas as pd
import numpy as np
# Show full cell contents instead of truncating long strings. ``None`` is the
# supported "no limit" value; negative widths were deprecated in pandas 1.0
# and are rejected by newer releases.
pd.set_option('display.max_colwidth', None)
import matplotlib.pyplot as plt
from networkx.readwrite import json_graph
import json
import scipy

In [None]:
from similarities import Similarities

In [None]:
# Prepend the "data" and "utils" directories (two levels above the current
# working directory) to the module search path, then import DataLoader.
# "utils" is inserted last, so it ends up ahead of "data" on sys.path.
import sys
for local_dir in ("data", "utils"):
    sys.path.insert(0, os.path.join(os.getcwd(), "..", "..", local_dir))
from DataLoader import DataLoader

In [None]:
# Get test data with abstracts and citations ready for evaluation
d = DataLoader()
d.validation_data_with_abstracts_citations()

# One (chapter, title, abstract, citations) tuple per validation row.
query_test = list(zip(
    d.data.chapter,
    d.data.chapter_title,
    d.data.chapter_abstract,
    d.data.chapter_citations
))

# Ground truth in the [conferences, confidences] layout: each row has exactly
# one true conference series, with confidence 1.
conferences_truth = [[series] for series in d.data.conferenceseries]
confidences_truth = [[1] for _ in conferences_truth]

truth = [conferences_truth, confidences_truth]

In [None]:
# Chapter ids of the validation rows, in the same order as query_test / truth.
val_ids = list(d.data["chapter"])
len(val_ids)

In [None]:
# Rebind ``d`` to a fresh loader and call its training-data method; later
# cells read ``d.data`` from this object, so the validation loader created
# earlier is no longer reachable through ``d`` after this cell.
# NOTE(review): presumably this fills ``d.data`` with the training rows - confirm in DataLoader.
d = DataLoader()
d.training_data_with_abstracts_citations()

In [None]:
# Chapter ids taken from the ``chapter`` column of the currently loaded data.
train_ids = list(d.data["chapter"])
# Display how many ids were collected.
len(train_ids)

In [None]:
# Location of the stored embedding matrix and of the text file listing the
# node id for each row of that matrix (one id per line).
emb_dir = "../../../data/processed/graphsage/AVG_2L/graphsage_mean_small_0.000010"
emb_file = emb_dir + "/embeddings.npy"
emb_ids_file = emb_dir + "/embeddings_ids.txt"

In [None]:
# Load the embedding array from the .npy file and print its shape as a sanity check.
embeddings = np.load(emb_file)
print(embeddings.shape)

In [None]:
# Map embeddings to node ids: line number in the ids file -> row in ``embeddings``
with open(emb_ids_file) as ids_handle:
    emb_ids = {row.strip(): position for position, row in enumerate(ids_handle)}

# Pick the embedding rows of the training chapters, in ``train_ids`` order.
emb = embeddings[[emb_ids[node_id] for node_id in train_ids]]
len(emb)

In [None]:
# Pick the embedding rows of the validation chapters, in ``val_ids`` order.
# The id -> row mapping ``emb_ids`` built in the previous cell is reused rather
# than re-reading the ids file and rebuilding an identical dictionary.
val_emb = embeddings[[emb_ids[node_id] for node_id in val_ids]]
len(val_emb)

In [None]:
# Build the similarity helper from the training embeddings and their ids;
# row k of ``emb`` belongs to ``train_ids[k]`` (that is how ``emb`` was indexed).
sim = Similarities(emb, train_ids)

In [None]:
from tqdm import tqdm
topn = len(val_emb)
recs = 10

# For every validation embedding, ask the similarity helper for its ``topn``
# nearest training items, with a progress bar over the validation vectors.
similarities = [
    sim.similar_by_vector(vector, topn=topn)
    for vector in tqdm(val_emb, total=topn)
]
len(similarities)

In [None]:
def top_distinct_conferences(neighbours, conference_of, limit):
    """Collect the first ``limit`` distinct conferences among ``neighbours``.

    Parameters
    ----------
    neighbours : sequence of (chapter_id, score) pairs
        Neighbours in the order they should be considered.
        NOTE(review): assumed to be sorted best-first - confirm against
        ``Similarities.similar_by_vector``.
    conference_of : mapping
        Chapter id -> conference series.
    limit : int
        Maximum number of distinct conferences to return.

    Returns
    -------
    (list, list)
        Conferences in order of first appearance, and the score of the
        neighbour that introduced each one. Both lists are index-aligned.
    """
    ranked, scores = [], []
    seen = set()  # membership test only; ``ranked`` carries the order
    for neighbour in neighbours:
        if len(ranked) >= limit:
            break  # enough recommendations, no need to scan the rest
        conference = conference_of[neighbour[0]]
        if conference not in seen:
            seen.add(conference)
            ranked.append(conference)
            scores.append(neighbour[1])
    return ranked, scores


# Chapter id -> conference series, built once instead of filtering the whole
# frame for every neighbour. The first row wins for a duplicated chapter id.
chapter_to_conference = (
    d.data.drop_duplicates(subset="chapter")
    .set_index("chapter")["conferenceseries"]
    .to_dict()
)

conferences = []
confidences = []
with tqdm(total=len(similarities)) as pbar:
    for similarity in similarities:
        # Keep the conferences in ranked order so that they stay aligned with
        # their confidences; converting a set to a list would scramble them.
        confer, confid = top_distinct_conferences(similarity, chapter_to_conference, recs)
        conferences.append(confer)
        confidences.append(confid)
        pbar.update(1)
results = [conferences, confidences]

In [None]:
# Put the sibling "evaluations" directory (one level above the working
# directory) first on the module search path, then import EvaluationContainer.
sys.path.insert(0, os.path.join(os.getcwd(), "..", "evaluations"))
from EvaluationContainer import EvaluationContainer

In [None]:
# Spot check: the recommended conferences for the first validation item.
results[0][0]

In [None]:
# Evaluate the recommendations against the ground truth; both arguments use
# the [conferences, confidences] layout built in the cells above.
evaluation = EvaluationContainer()
evaluation.evaluate(results,truth)

### New nodes

#### Preprocess data

In [None]:
# Outline of the pipeline for unseen nodes. Only step 2 has code in this cell;
# the other steps are a plan and are not implemented here (TODO).
# Step 1: Load training data from file
# Step 2: Load test data from DataLoader
d_test = DataLoader()
df_test = d_test.test_data_with_abstracts_citations().data
# Step 3: Pass df_test, G_train to preprocess_data.test()
# Step 4: Retrieve new graph, id_map, features
# Step 5: Get new embeddings
# Step 6: Map test ids to embeddings
# Step 7: Call classifier or similarities
# Step 8: Get predictions

In [None]:
# Load labels (change for conferences)
# NOTE(review): ``class_map_file`` is not defined in any cell visible in this
# notebook; it must be set to the class-map JSON path before this cell runs.
# The file is opened in a ``with`` block so the handle is closed after parsing.
with open(class_map_file) as class_map_handle:
    class_map = json.load(class_map_handle)
# The class map is keyed by the string form of the node id.
labels_train = [class_map[str(node_id)] for node_id in train_ids]
len(labels_train)

type(labels_train)

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# Encode the raw training labels as integer class indices.
labels = encoder.fit_transform(labels_train)


## Execute script

In [None]:
import subprocess 
import sys

In [None]:
# Argument vector for the unsupervised training script, run on the toy PPI
# example data with the "graphsage_mean" model.
cmd = [
    "python", "unsupervised_train.py",
    "--train_prefix", "example_data/toy-ppi",
    "--model", "graphsage_mean",
]

In [None]:
def execute(command):
    """Run ``command`` as a subprocess and echo its output while it runs.

    Parameters
    ----------
    command : list, tuple or str
        An argument vector (executed directly, without a shell) or a single
        command string (executed through the shell).

    Returns
    -------
    str
        Everything the process wrote to stdout and stderr (merged).

    Raises
    ------
    subprocess.CalledProcessError
        If the process exits with a non-zero code; the captured output is
        attached to the exception.
    """
    # An argv list must not go through the shell: with shell=True on POSIX
    # only the first element is executed and the remaining items become
    # arguments of the shell itself.
    use_shell = not isinstance(command, (list, tuple))
    # Text mode makes readline() return str, so the EOF sentinel '' compares
    # correctly and the lines can be written to sys.stdout.
    process = subprocess.Popen(command, shell=use_shell, stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT, universal_newlines=True)

    captured = []
    try:
        # readline() returns '' only at end of stream, which ends the loop.
        for nextline in iter(process.stdout.readline, ''):
            captured.append(nextline)
            sys.stdout.write(nextline)
            sys.stdout.flush()
    finally:
        process.stdout.close()
        exitCode = process.wait()

    output = ''.join(captured)
    if exitCode != 0:
        raise subprocess.CalledProcessError(exitCode, command, output)
    print("Finished with exit code {}.".format(str(exitCode)))
    return output

In [None]:
# Run the training command assembled above through execute().
out=execute(cmd)