In [None]:
import os
import pickle
import gzip
import copy
import torch
from torch import nn
import torch.nn.functional as F
import random, time
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
from scipy.stats import rankdata
import networkx as nx
import pandas as pd
from collections import defaultdict,Counter
from datetime import datetime, date
from itertools import combinations
from preprocess_utils import *
from features_utils import *
from train_model_utils import *
 

## read pairs and solutions data (both)

In [None]:
# Input parquet files for the two evaluation sets. Per the file names: pairs
# unconnected in 2019 that are connected / still unconnected in 2022.
store_folder = "data_pair_solution"
pair_solution_data1 = os.path.join(store_folder, "unconnected_2019_pair_solution_connected_2022.parquet")
pair_solution_data2 = os.path.join(store_folder, "unconnected_2019_pair_solution_unconnected_2022.parquet")

# First set: load, then keep only the two vertex columns and the citation column.
time_start = time.time()
eval_pair_solution1 = pd.read_parquet(pair_solution_data1).loc[:, ['v1', 'v2', 'citation']]
print(f"Done, read pair_solution_yes: {len(eval_pair_solution1)}; elapsed_time: {time.time() - time_start}")

# Second set: loaded as-is.
# NOTE(review): no column selection is applied here, unlike the first set --
# presumably this file already holds exactly v1/v2/citation; confirm, otherwise
# the concat below will introduce NaN-filled columns.
time_start = time.time()
eval_pair_solution2 = pd.read_parquet(pair_solution_data2)
print(f"Done, read pair_solution_not: {len(eval_pair_solution2)}; elapsed_time: {time.time() - time_start}")

# Stack both sets row-wise; the original row labels of each frame are kept.
time_start = time.time()
full_eval_pair_result = pd.concat(
    [eval_pair_solution1, eval_pair_solution2]
)
print(f"Done, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}")

#### fix random seed

In [None]:
# Settings used by the feature helpers further down.
day_origin = date(1990, 1, 1)
vertex_degree_cutoff = 1
min_edges = 1
years_delta = 3
year_start = 2022 - years_delta  # 2019

# Seed Python's `random`, torch and NumPy (in that order) with one shared seed.
rnd_seed = 42
for _seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    _seed_rng(rnd_seed)

### randomly sample 10M pairs

In [None]:
# Upper bound on the number of evaluation pairs; fewer are used if the
# combined frame is smaller than that.
edges_used = 10**7
num_row = int(min(edges_used, len(full_eval_pair_result)))

time_start = time.time()
# Shuffle every row with the fixed seed, then keep the leading `num_row` rows.
shuffled = full_eval_pair_result.sample(frac=1, random_state=rnd_seed)
eval_data_pair_solution = shuffled.iloc[:num_row]

print(f"Done, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}")

## store unconnected pairs and citation, time information

In [None]:

# Output folder for the evaluation artefacts. `exist_ok=True` creates it when
# missing and is a no-op when it is already there, without the separate
# existence check (which is racy and redundant).
store_eval_folder = "data_eval"
os.makedirs(store_eval_folder, exist_ok=True)
print(f"store files in {store_eval_folder}.....")

# Persist the sampled pairs (with their solution columns) as gzip-compressed parquet.
time_start = time.time()
store_name = os.path.join(store_eval_folder, "data_eval_pair_solution.parquet")

eval_data_pair_solution.to_parquet(store_name, compression='gzip')
print(f"eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}")


#### prepare properties

In [None]:
# Load the full dynamic concept graph from parquet and report how long it took.
time_start = time.time()
data_folder = "data_concept_graph"
graph_file = os.path.join(data_folder, "full_dynamic_graph.parquet")
full_dynamic_graph = pd.read_parquet(
    graph_file
)
print(f"{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}")

In [None]:
# Same settings as the earlier configuration cell, repeated so that this part
# of the notebook can be run on its own.
day_origin = date(1990, 1, 1)
vertex_degree_cutoff = 1
min_edges = 1
years_delta = 3
year_start = 2022 - years_delta  # 2019

In [None]:
# For each of the three most recent years (year_start, year_start-1,
# year_start-2) collect: the adjacency matrix, the per-node neighbour
# structure derived from it, and the per-node column sums of the matrix.
start_time = time.time()
adj_mat_sparse = []
node_neighbor_list = []
num_neighbor_list = []
for yy in [year_start, year_start-1, year_start-2]:
    data_file = os.path.join("data_for_features", f"adjacency_matrix_{yy}.gz")
    # Pass the loop year `yy` (not `year_start`): the file is named after `yy`,
    # and passing `year_start` would request the same year three times.
    # NOTE(review): if get_adjacency_matrix reuses an existing `data_file`,
    # files written by an earlier run for year_start-1 / year_start-2 may hold
    # the wrong snapshot -- confirm and regenerate them if so.
    adj_mat = get_adjacency_matrix(full_dynamic_graph, yy, data_file)
    adj_mat_sparse.append(adj_mat)

    curr_node_neighbor = get_node_neighbor(adj_mat)
    node_neighbor_list.append(curr_node_neighbor)

    # Column sums of the adjacency matrix, flattened to a 1-D array.
    curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array 
    num_neighbor_list.append(curr_num_neighbor)

print(f"{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}")

In [None]:
# Compute node-level features from the list of yearly adjacency matrices.
start_time = time.time()
vertex_features = get_all_node_feature(
    adj_mat_sparse,
    year_start,
    "data_for_features",
)
print(f"{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}")


In [None]:
# Read the per-year node citation tables (year_start, year_start-1,
# year_start-2) as plain arrays, then derive node citation features from them.
start_time = time.time()
vc_feature_list = []
for yy in [year_start - years_back for years_back in range(3)]:
    data_file = os.path.join("data_for_features", f"concept_node_citation_data_{yy}.parquet")
    vc_df = pd.read_parquet(data_file)
    vc_feature = vc_df.values
    vc_feature_list += [vc_feature]

vertex_cfeatures = get_all_node_cfeature(vc_feature_list)
print(f"{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 

In [None]:
# Bundle the per-node inputs built in the cells above into a single list.
node_parameter = [
    vc_feature_list,
    node_neighbor_list,
    num_neighbor_list,
    vertex_features,
    vertex_cfeatures,
]

In [None]:

# Build the feature matrix for the sampled evaluation pairs.
logs_file_name = 'logs_eval_data_infos'
time_start = time.time()

# The first two columns of the sampled frame are the vertex pair (v1, v2).
eval_pair_solution = eval_data_pair_solution.values
unconnected_vertex_pairs = eval_pair_solution[:, 0:2]

# Pair-level features and pair-level citation features.
pair_features, pair_cfeatures = get_all_pair_features(
    vc_feature_list,
    node_neighbor_list,
    num_neighbor_list,
    unconnected_vertex_pairs,
    logs_file_name,
)

# Combine node-level and pair-level feature groups into one matrix per pair.
all_features = [
    vertex_features,
    vertex_cfeatures,
    pair_features,
    pair_cfeatures,
]
eval_data_features = get_all_feature(all_features, unconnected_vertex_pairs, logs_file_name)

print(f"finish; {len(eval_data_features)}; time: {time.time()-time_start}")

In [None]:
# Wrap the computed features in a DataFrame and write them out as
# gzip-compressed parquet inside the evaluation folder.
time_start = time.time()

store_name = os.path.join(store_eval_folder, "eval_data_pair_feature.parquet")
data_eval_2022 = pd.DataFrame(data=eval_data_features)
data_eval_2022.to_parquet(
    store_name,
    compression='gzip',
)

print(f"data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}")

### random shuffle and choose 10M

In [None]:
### random shuffle

# Cap the evaluation set at 10M pairs (or fewer if less data is available).
edges_used = 10**7
num_rows = int(min(edges_used, len(full_eval_pair_result)))

time_start = time.time()
# Shuffle every row with the fixed seed, then keep the leading `num_rows` rows.
shuffled = full_eval_pair_result.sample(frac=1, random_state=rnd_seed)
# Use `num_rows` computed above; the previously referenced `num_row_no` is
# not defined anywhere and raised NameError.
eval_data_pair_solution = shuffled.head(num_rows)

print(f"Done, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}")


time_start = time.time()
eval_pair_solution = eval_data_pair_solution.values

## Remove any ordering bias between v1 and v2: for roughly half of the rows
## (picked with NumPy's global RNG) exchange the first two columns.
mask = np.random.rand(len(eval_pair_solution)) < 0.5
# Mask indexing yields a copy, so reading the two columns reversed and
# writing them back is a safe in-place swap for the selected rows.
eval_pair_solution[mask, :2] = eval_pair_solution[mask, :2][:, ::-1]

print(f"Done, array : {len(eval_pair_solution)}; elapsed_time: {time.time() - time_start}")
