## Prepare unconnected pairs and their solutions (years_delta=3)

In [1]:
import os
import sys
import json
import pickle
import gzip
from datetime import datetime, date
import random, time
import numpy as np
from scipy import sparse
import networkx as nx
from collections import defaultdict,Counter
import itertools
import copy
from itertools import combinations
import pandas as pd

### read full graph
#### columns: ['v1', 'v2', 'time', 'ct', 'c2023', 'c2022', 'c2021', 'c2020', 'c2019', 'c2018', 'c2017', 'c2016', 'c2015', 'c2014', 'c2013', 'c2012']

In [2]:
# Load the full dynamic graph from its parquet file and report how long the read took.
graph_folder = "data_concept_graph"
graph_file = os.path.join(graph_folder, "full_dynamic_graph.parquet")

time_start = time.time()
full_graph = pd.read_parquet(graph_file)
print(
    "Done, read full_graph: "
    f"{len(full_graph)}; elapsed_time: {time.time() - time_start} seconds"
)

Done, read full_graph: 193977096; elapsed_time: 31.906290769577026 seconds


In [3]:
# Sanity check: is v1 strictly smaller than v2 on every row of the graph?
# The result is only reported, not enforced.
time_start = time.time()
is_smaller = (full_graph['v1'] < full_graph['v2']).to_numpy().all()
print(f"is_smaller: {is_smaller}; elapsed_time: {time.time()-time_start}\n")

is_smaller: True; elapsed_time: 0.44752979278564453



### get the unconnected-pairs years_delta=3

In [3]:
NUM_OF_VERTICES = 37960  # number of concepts

vertex_degree_cutoff = 1
min_edges = 1
years_delta = 3

# Year-end cutoffs are expressed as a number of days counted from this origin date.
day_origin = date(1990, 1, 1)


def _days_to_year_end(year):
    """Return the number of days between ``day_origin`` and 31 December of ``year``."""
    return (date(year, 12, 31) - day_origin).days


day_2016, day_2019, day_2022 = (_days_to_year_end(year) for year in (2016, 2019, 2022))
print(f"day_2016: {day_2016}; day_2019: {day_2019}; day_2022: {day_2022}\n")

day_2016: 9861; day_2019: 10956; day_2022: 12052



#### get full_graph up to 2016,2019,2022

In [4]:
print(f"full_dynamic_graph: {len(full_graph)}")

# Cumulative snapshots: each frame keeps the rows whose 'time' value is
# less than or equal to the corresponding year-end cutoff day.
time_start = time.time()
full_graph_2016 = full_graph.loc[full_graph['time'].le(day_2016)]
print(f"full_graph_2016: {len(full_graph_2016)}; elapsed_time: {time.time()-time_start}")


time_start = time.time()
full_graph_2019 = full_graph.loc[full_graph['time'].le(day_2019)]
print(f"full_graph_2019: {len(full_graph_2019)}; elapsed_time: {time.time()-time_start}")


time_start = time.time()
full_graph_2022 = full_graph.loc[full_graph['time'].le(day_2022)]
print(f"full_graph_2022: {len(full_graph_2022)}; elapsed_time: {time.time()-time_start}")

full_dynamic_graph: 193977096
full_graph_2016: 127557712; elapsed_time: 10.194115400314331
full_graph_2019: 154251800; elapsed_time: 11.304606437683105
full_graph_2022: 191487049; elapsed_time: 13.10411262512207


#### get all the vertex pairs

In [5]:
# Reduce each snapshot to its distinct (v1, v2) pairs; repeated rows for the
# same pair collapse to a single row.
time_start = time.time()
pairs_2016 = full_graph_2016.loc[:, ['v1', 'v2']].drop_duplicates()
print(f"pairs_2016: {len(pairs_2016)}; elapsed_time: {time.time()-time_start}")

time_start = time.time()
pairs_2019 = full_graph_2019.loc[:, ['v1', 'v2']].drop_duplicates()
print(f"pairs_2019: {len(pairs_2019)}; elapsed_time: {time.time()-time_start}")

time_start = time.time()
pairs_2022 = full_graph_2022.loc[:, ['v1', 'v2']].drop_duplicates()
print(f"pairs_2022: {len(pairs_2022)}; elapsed_time: {time.time()-time_start}")

pairs_2016: 20146168; elapsed_time: 12.15381407737732
pairs_2019: 22841349; elapsed_time: 14.504824161529541
pairs_2022: 26010946; elapsed_time: 19.0612576007843


#### get all-combine-pairs while degree >= vertex_degree_cutoff

In [6]:
# Degree of a vertex = number of distinct pairs it appears in (as v1 or v2).
# np.unique over the flattened pair columns returns the sorted vertex ids
# together with those occurrence counts.
time_start = time.time()
all_nodes_2016, degrees_2016 = np.unique(pairs_2016.to_numpy().ravel(), return_counts=True)
# Keep the vertices whose degree is at least the cutoff (the comparison is >=).
large_degree_mask = np.greater_equal(degrees_2016, vertex_degree_cutoff)
vertex_large_degs_2016 = all_nodes_2016[large_degree_mask]
print(f"vertex_used_2016: {len(vertex_large_degs_2016)}; elapsed_time: {time.time()-time_start}")


time_start = time.time()
all_nodes_2019, degrees_2019 = np.unique(pairs_2019.to_numpy().ravel(), return_counts=True)
large_degree_mask = np.greater_equal(degrees_2019, vertex_degree_cutoff)
vertex_large_degs_2019 = all_nodes_2019[large_degree_mask]
print(f"vertex_used_2019: {len(vertex_large_degs_2019)}; elapsed_time: {time.time()-time_start}")


time_start = time.time()
all_nodes_2022, degrees_2022 = np.unique(pairs_2022.to_numpy().ravel(), return_counts=True)
large_degree_mask = np.greater_equal(degrees_2022, vertex_degree_cutoff)
vertex_large_degs_2022 = all_nodes_2022[large_degree_mask]
print(f"vertex_used_2022: {len(vertex_large_degs_2022)}; elapsed_time: {time.time()-time_start}")

vertex_used_2016: 37662; elapsed_time: 2.568633794784546
vertex_used_2019: 37901; elapsed_time: 2.9276671409606934
vertex_used_2022: 37955; elapsed_time: 3.2913098335266113


#### get all combinations of the used vertices

In [7]:
 
def _all_vertex_pairs(vertices):
    """Return every unordered pair of ``vertices`` as an (n*(n-1)/2, 2) array.

    ``np.triu_indices(n, k=1)`` yields (row, column) index arrays of the strict
    upper triangle, so row < column on every entry: column 0 of the result holds
    ``vertices[row]`` and column 1 holds ``vertices[column]``. The two index
    arrays are as long as the result itself; keeping them local to this function
    releases them on return instead of leaving them bound as notebook globals.

    Parameters
    ----------
    vertices : np.ndarray
        1-D array of vertex ids.

    Returns
    -------
    np.ndarray
        2-column array with one row per pair, in row-major upper-triangle order.
    """
    row_idx, col_idx = np.triu_indices(len(vertices), k=1)
    return np.column_stack((vertices[row_idx], vertices[col_idx]))


time_start = time.time()
combine_pairs_2016 = _all_vertex_pairs(vertex_large_degs_2016)
print(f"all combine_pairs_2016: {len(combine_pairs_2016)}; elapsed_time: {time.time()-time_start}")

time_start = time.time()
combine_pairs_2019 = _all_vertex_pairs(vertex_large_degs_2019)
print(f"all combine_pairs_2019: {len(combine_pairs_2019)}; elapsed_time: {time.time()-time_start}")


time_start = time.time()
combine_pairs_2022 = _all_vertex_pairs(vertex_large_degs_2022)
print(f"all combine_pairs_2022: {len(combine_pairs_2022)}; elapsed_time: {time.time()-time_start}")

all combine_pairs_2016: 709194291; elapsed_time: 13.8586585521698
all combine_pairs_2019: 718223950; elapsed_time: 13.997780561447144
all combine_pairs_2022: 720272035; elapsed_time: 14.053169250488281


In [8]:
# Wrap each 2-column candidate-pair array in a DataFrame with columns v1 / v2
# so it can be joined against the observed pairs.
time_start = time.time()
all_combine_pairs_2016 = pd.DataFrame(data=combine_pairs_2016, columns=['v1', 'v2'])
print(f"Convert combine_pairs_2016: {len(all_combine_pairs_2016)}, elapsed_time: {time.time()-time_start}")

time_start = time.time()
all_combine_pairs_2019 = pd.DataFrame(data=combine_pairs_2019, columns=['v1', 'v2'])
print(f"Convert combine_pairs_2019: {len(all_combine_pairs_2019)}, elapsed_time: {time.time()-time_start}")

time_start = time.time()
all_combine_pairs_2022 = pd.DataFrame(data=combine_pairs_2022, columns=['v1', 'v2'])
print(f"Convert combine_pairs_2022: {len(all_combine_pairs_2022)}, elapsed_time: {time.time()-time_start}")

Convert combine_pairs_2016: 709194291, elapsed_time: 0.002460002899169922
Convert combine_pairs_2019: 718223950, elapsed_time: 0.0006887912750244141
Convert combine_pairs_2022: 720272035, elapsed_time: 0.000698089599609375


### prepare unconnected_pairs

#### unconnected pairs in 2016

In [10]:
time_start = time.time()

# Anti-join: keep the candidate pairs that do not occur in pairs_2016.
# how='left' is sufficient because only the 'left_only' rows are kept; an outer
# join would additionally sort the keys and materialise right-only rows that the
# filter below throws away again.
# The frame is rebound step by step (rather than chained) so each large
# intermediate can be released before the next one is built.
unconnected_pairs_2016 = pd.merge(all_combine_pairs_2016, pairs_2016, on=['v1', 'v2'], how='left', indicator=True)
unconnected_pairs_2016 = unconnected_pairs_2016[unconnected_pairs_2016['_merge'] == 'left_only']
unconnected_pairs_2016 = unconnected_pairs_2016.drop(columns=['_merge'])

print(f"unconnected_pairs_2016: {len(unconnected_pairs_2016)}; elapsed_time: {time.time()-time_start}")

unconnected_pairs_2016: 689048123; elapsed_time: 201.25491285324097


#### check unconnected pairs in 2016 for 2019 (unconnected+citation and connected+citation)

In [13]:
### Pairs that are in unconnected_pairs_2016 and NOT in pairs_2019:
### they stay unconnected through 2019.

time_start = time.time()

# Anti-join via how='left': only 'left_only' rows are kept, so the right-only
# rows an outer join would add (and the key sort it performs) are wasted work.
# Note: the index labels of the result are positions in the merged frame and
# carry no meaning of their own.
unconnected_pair_2016_2019 = pd.merge(unconnected_pairs_2016, pairs_2019, on=['v1', 'v2'], how='left', indicator=True)
unconnected_pair_2016_2019 = unconnected_pair_2016_2019[unconnected_pair_2016_2019['_merge'] == 'left_only']
unconnected_pair_2016_2019 = unconnected_pair_2016_2019.drop(columns=['_merge'])

print(f"unconnected_pair_2016_2019: {len(unconnected_pair_2016_2019)}; elapsed_time: {time.time()-time_start}")

### Pairs that are in unconnected_pairs_2016 and ALSO in pairs_2019:
### they become connected by 2019.
time_start = time.time()
# An inner join only ever produces matched rows, so no indicator column or
# 'both' filter is needed.
connected_pair_2016_2019 = pd.merge(pairs_2019, unconnected_pairs_2016, on=['v1', 'v2'], how='inner')

# The two results must partition the 2016 unconnected pairs.
assert len(connected_pair_2016_2019) + len(unconnected_pair_2016_2019) == len(unconnected_pairs_2016), \
    "connected + unconnected pairs do not add up to unconnected_pairs_2016"

print(f"connected_pair_2016_2019: {len(connected_pair_2016_2019)}; elapsed_time: {time.time()-time_start}\n")
print(f"train 2016-2019: total- {len(unconnected_pairs_2016)}; connected-- {len(connected_pair_2016_2019)}; unconnected--{len(unconnected_pair_2016_2019)}")

unconnected_pair_2016_2019: 686362822; elapsed_time: 235.92881035804749
connected_pair_2016_2019: 2685301; elapsed_time: 180.5428500175476

train 2016-2019: total- 689048123; connected-- 2685301; unconnected--686362822


#### unconnected pairs in 2019

In [14]:
time_start = time.time()

# Anti-join: keep the candidate pairs that do not occur in pairs_2019.
# how='left' is sufficient because only the 'left_only' rows are kept; an outer
# join would additionally sort the keys and materialise right-only rows that the
# filter below throws away again.
unconnected_pairs_2019 = pd.merge(all_combine_pairs_2019, pairs_2019, on=['v1', 'v2'], how='left', indicator=True)
unconnected_pairs_2019 = unconnected_pairs_2019[unconnected_pairs_2019['_merge'] == 'left_only']
unconnected_pairs_2019 = unconnected_pairs_2019.drop(columns=['_merge'])

print(f"unconnected_pairs_2019: {len(unconnected_pairs_2019)}; elapsed_time: {time.time()-time_start}")

unconnected_pairs_2019: 695382601; elapsed_time: 196.07343530654907


#### check unconnected pairs in 2019 for 2022 (unconnected+citation and connected+citation)

In [15]:
# Pairs that are in unconnected_pairs_2019 and NOT in pairs_2022:
# they stay unconnected through 2022.
time_start = time.time()
# Anti-join via how='left': only 'left_only' rows are kept, so the right-only
# rows an outer join would add (and the key sort it performs) are wasted work.
# Note: the index labels of the result are positions in the merged frame and
# carry no meaning of their own.
unconnected_pair_2019_2022 = pd.merge(unconnected_pairs_2019, pairs_2022, on=['v1', 'v2'], how='left', indicator=True)
unconnected_pair_2019_2022 = unconnected_pair_2019_2022[unconnected_pair_2019_2022['_merge'] == 'left_only']
unconnected_pair_2019_2022 = unconnected_pair_2019_2022.drop(columns=['_merge'])
print(f"unconnected_pair_2019_2022: {len(unconnected_pair_2019_2022)}; elapsed_time: {time.time()-time_start}")


# Pairs that are in unconnected_pairs_2019 and ALSO in pairs_2022:
# they become connected by 2022.
time_start = time.time()
# An inner join only ever produces matched rows, so no indicator column or
# 'both' filter is needed.
connected_pair_2019_2022 = pd.merge(pairs_2022, unconnected_pairs_2019, on=['v1', 'v2'], how='inner')

# The two results must partition the 2019 unconnected pairs.
assert len(connected_pair_2019_2022) + len(unconnected_pair_2019_2022) == len(unconnected_pairs_2019), \
    "connected + unconnected pairs do not add up to unconnected_pairs_2019"

print(f"connected_pair_2019_2022, {len(connected_pair_2019_2022)}; elapsed_time: {time.time()-time_start}")
print(f"eval 2019-2022: total- {len(unconnected_pairs_2019)}; connected-- {len(connected_pair_2019_2022)}; unconnected--{len(unconnected_pair_2019_2022)}")

unconnected_pair_2019_2022: 692215385; elapsed_time: 239.53186178207397
connected_pair_2019_2022, 3167216; elapsed_time: 183.01924514770508
evalu 2019-2022: total- 695382601; connected-- 3167216; unconnected--692215385


#### unconnected pair in 2022 (no future eval)

In [9]:
time_start = time.time()

# Anti-join: keep the candidate pairs that do not occur in pairs_2022.
# how='left' is sufficient because only the 'left_only' rows are kept; an outer
# join would additionally sort the keys and materialise right-only rows that the
# filter below throws away again.
unconnected_pairs_2022 = pd.merge(all_combine_pairs_2022, pairs_2022, on=['v1', 'v2'], how='left', indicator=True)
unconnected_pairs_2022 = unconnected_pairs_2022[unconnected_pairs_2022['_merge'] == 'left_only']
unconnected_pairs_2022 = unconnected_pairs_2022.drop(columns=['_merge'])

print(f"unconnected_pairs_2022: {len(unconnected_pairs_2022)}; elapsed_time: {time.time()-time_start}")


store_folder = "data_pair_solution"
# exist_ok=True creates the folder when missing and is a no-op when it is
# already there, without a separate existence check.
os.makedirs(store_folder, exist_ok=True)
print(f"store files in {store_folder}.....")

### pairs that are unconnected in 2022 (stored as-is; this cell attaches no solution to them)

time_start = time.time()
store_name = os.path.join(store_folder, "unconnected_pair_2022.parquet")
unconnected_pairs_2022.to_parquet(store_name, compression='gzip')
print(f"unconnected_pairs_2022: {len(unconnected_pairs_2022)}; elapsed_time: {time.time() - time_start}")


unconnected_pairs_2022: 694261089; elapsed_time: 201.59090304374695
store files in data_pair_solution.....
unconnected_pairs_2022: 694261089; elapsed_time: 428.5876655578613


### unconnected pair and solution (citation information); train

In [16]:
# Pairs that became connected by 2019: attach every matching row of
# full_graph_2019, so a pair can appear on several rows here.
time_start = time.time()
pair_solution_connected_2019 = pd.merge(connected_pair_2016_2019, full_graph_2019, on=['v1', 'v2'], how='inner')
print(f"2016 connected in 2019  : {len(pair_solution_connected_2019)}; elapsed_time: {time.time()-time_start}")

# Pairs that stayed unconnected: their solution is a constant citation of 0.
time_start = time.time()
# This is an alias, not a copy, so the insert below also adds the column to
# unconnected_pair_2016_2019.
pair_solution_unconnected_2019 = unconnected_pair_2016_2019
# DataFrame.insert raises ValueError when the column already exists; the guard
# keeps the cell re-runnable without restarting the kernel.
if 'citation' not in pair_solution_unconnected_2019.columns:
    pair_solution_unconnected_2019.insert(2, 'citation', 0)
print(f"2016 unconnected in 2019: {len(pair_solution_unconnected_2019)}; elapsed_time: {time.time()-time_start}")

2016 connected in 2019  : 3093044; elapsed_time: 34.91400408744812
2016 unconnected in 2019: 686362822; elapsed_time: 1.2425153255462646


In [18]:
# Pairs that became connected by 2022: attach every matching row of
# full_graph_2022, so a pair can appear on several rows here.
time_start = time.time()
pair_solution_connected_2022 = pd.merge(connected_pair_2019_2022, full_graph_2022, on=['v1', 'v2'], how='inner')
print(f"2019 connected in 2022  : {len(pair_solution_connected_2022)}; elapsed_time: {time.time()-time_start}")

# Pairs that stayed unconnected: their solution is a constant citation of 0.
time_start = time.time()
# This is an alias, not a copy, so the insert below also adds the column to
# unconnected_pair_2019_2022.
pair_solution_unconnected_2022 = unconnected_pair_2019_2022
# DataFrame.insert raises ValueError when the column already exists; the guard
# keeps the cell re-runnable without restarting the kernel.
if 'citation' not in pair_solution_unconnected_2022.columns:
    pair_solution_unconnected_2022.insert(2, 'citation', 0)
print(f"2019 unconnected in 2022: {len(pair_solution_unconnected_2022)}; elapsed_time: {time.time()-time_start}")


2019 connected in 2022  : 3700606; elapsed_time: 44.165722370147705
2019 unconnected in 2022: 692215385; elapsed_time: 1.2526342868804932


#### store original cases

In [21]:
store_folder = "data_pair_solution"
if not os.path.exists(store_folder):
    os.makedirs(store_folder)
print(f"store files in {store_folder}.....")

# One entry per output file:
#   (file name, frame to write, label for the log line, text appended to the log line)
# The first two entries are the pairs that became connected (2019, 2022);
# the last two are the pairs that stayed unconnected (2019, 2022).
parquet_outputs = [
    ("unconnected_2016_pair_solution_connected_2019_full.parquet",
     pair_solution_connected_2019, "pair_solution_connected_2019 full", ""),
    ("unconnected_2019_pair_solution_connected_2022_full.parquet",
     pair_solution_connected_2022, "pair_solution_connected_2022 full", "\n"),
    ("unconnected_2016_pair_solution_unconnected_2019.parquet",
     pair_solution_unconnected_2019, "pair_solution_unconnected_2019", ""),
    ("unconnected_2019_pair_solution_unconnected_2022.parquet",
     pair_solution_unconnected_2022, "pair_solution_unconnected_2022", ""),
]

for file_name, frame, label, log_suffix in parquet_outputs:
    time_start = time.time()
    store_name = os.path.join(store_folder, file_name)
    frame.to_parquet(store_name, compression='gzip')
    print(f"{label}: {len(frame)}; elapsed_time: {time.time() - time_start}{log_suffix}")

# Drop the extra references so the list does not keep the large frames alive.
del parquet_outputs, frame

store files in data_pair_solution.....
pair_solution_connected_2019 full: 3093044; elapsed_time: 4.522568225860596
pair_solution_connected_2022 full: 3700606; elapsed_time: 4.545568227767944

pair_solution_unconnected_2019: 686362822; elapsed_time: 427.5400719642639
pair_solution_unconnected_2022: 692215385; elapsed_time: 432.4645767211914


#### merge repeated pairs 

In [35]:
time_start = time.time()

# Collapse repeated (v1, v2) rows into one row per pair:
#   citation   - sum of the c2017, c2018 and c2019 columns over all rows of the pair
#   num        - number of rows that were merged for the pair
#   citation_m - citation divided by num
grouped_data_df = pair_solution_connected_2019.assign(
    citation=pair_solution_connected_2019[['c2019', 'c2018', 'c2017']].sum(axis=1)
)
dynamic_grouped_data = (
    grouped_data_df
    .groupby(['v1', 'v2'])
    .agg(citation=('citation', 'sum'), num=('citation', 'size'))
    .reset_index()
)
dynamic_grouped_data['citation_m'] = dynamic_grouped_data['citation'].div(dynamic_grouped_data['num'])
print(f"elapsed_time: {time.time() - time_start}")

time_start = time.time()
store_folder = "data_pair_solution"
store_name = os.path.join(store_folder, "unconnected_2016_pair_solution_connected_2019.parquet")
dynamic_grouped_data.to_parquet(store_name, compression='gzip')
print(f"unconnected_2016_pair_solution_connected_2019: {len(dynamic_grouped_data)}; elapsed_time: {time.time() - time_start}")

elapsed_time: 1.885589361190796
unconnected_2016_pair_solution_connected_2019: 2685301; elapsed_time: 1.5675153732299805


In [36]:
time_start = time.time()

# Collapse repeated (v1, v2) rows into one row per pair:
#   citation   - sum of the c2020, c2021 and c2022 columns over all rows of the pair
#   num        - number of rows that were merged for the pair
#   citation_m - citation divided by num
grouped_data_df = pair_solution_connected_2022.assign(
    citation=pair_solution_connected_2022[['c2022', 'c2021', 'c2020']].sum(axis=1)
)
dynamic_grouped_data = (
    grouped_data_df
    .groupby(['v1', 'v2'])
    .agg(citation=('citation', 'sum'), num=('citation', 'size'))
    .reset_index()
)
dynamic_grouped_data['citation_m'] = dynamic_grouped_data['citation'].div(dynamic_grouped_data['num'])
print(f"elapsed_time: {time.time() - time_start}")

time_start = time.time()
store_folder = "data_pair_solution"
store_name = os.path.join(store_folder, "unconnected_2019_pair_solution_connected_2022.parquet")
dynamic_grouped_data.to_parquet(store_name, compression='gzip')
print(f"unconnected_2019_pair_solution_connected_2022_processed: {len(dynamic_grouped_data)}; elapsed_time: {time.time() - time_start}")

elapsed_time: 2.318059206008911
unconnected_2019_pair_solution_connected_2022_processed: 3167216; elapsed_time: 1.850799560546875
