In [None]:
import os
from datetime import datetime, date
import random, time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch import nn
from scipy import sparse
from collections import defaultdict
import pandas as pd
import networkx as nx
import copy
import gzip
import pickle
from scipy.stats import rankdata
import time

### Single-concept citation features

In [None]:
# --- Load the per-concept dynamic table and the list of domain concepts ---
time_start = time.time()
data_folder = "data_concept_graph"

# Parquet table with one record per concept occurrence (time stamp plus citation counts).
dynamic_concept_file = os.path.join(data_folder, "full_dynamic_concept.parquet")
full_concepts_dynamic_data = pd.read_parquet(dynamic_concept_file)

# full_domain_concepts.txt holds one concept per line; surrounding whitespace is stripped.
concepts_files = os.path.join(data_folder, "full_domain_concepts.txt")
with open(concepts_files, "r") as file:
    full_concepts = list(map(str.strip, file))

print(f"Done, elapsed_time: {time.time() - time_start}\n full_concepts_dynamic_data: {len(full_concepts_dynamic_data)};\n full_concept: {len(full_concepts)}")


In [None]:
# Settings shared by the feature-building cells below.
NUM_OF_VERTICES = len(full_concepts)  # one vertex id per entry of the concept list
years_delta = 3            # how many yearly 'c<year>' columns (ending at the cutoff year) are summed into 'ct_delta'
vertex_degree_cutoff = 1   # NOTE(review): not referenced in the cells visible here - confirm where it is consumed
min_edges = 1              # NOTE(review): not referenced in the cells visible here - confirm where it is consumed

In [None]:

# Build per-concept citation features for every cutoff year and write one
# parquet file per year into `store_folder`.
#
# For each cutoff year yy the output holds, per vertex id 'v1':
#   c<yy>      - sum of the 'c<yy>' column over the concept's rows
#   ct_<yy>    - sum of ('ct' minus all yearly columns after yy)
#                (presumably the citation total as of the end of yy - confirm)
#   ct_delta   - sum of the last `years_delta` yearly columns ending at yy
#   num        - number of rows for that concept dated on or before 31 Dec yy
#   *_m        - the three sums above divided by `num`
# Vertices without any row get 0 in every feature column.
years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Last year for which a per-year citation column 'c<year>' is subtracted.
# NOTE(review): 2023 was hard-coded inline before - confirm it matches the parquet schema.
last_citation_year = 2023

# 'time' values are compared against day counts measured from this date.
day_origin = date(1990, 1, 1)
all_concepts_df = pd.DataFrame({'v1': range(0, NUM_OF_VERTICES)})

store_folder = "data_for_features"
# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(store_folder, exist_ok=True)

start_time = time.time()
for yy in years:
    print(f'Year: {yy}')
    day_curr = (date(yy, 12, 31) - day_origin).days
    columns_to_subtract = [f'c{i}' for i in range(last_citation_year, yy, -1)]
    print(columns_to_subtract)
    cols_to_sum = [f'c{i}' for i in range(yy, yy - years_delta, -1)]
    print(cols_to_sum)

    # Keep only rows dated on or before the end of the cutoff year.
    dynamic_concepts = full_concepts_dynamic_data[full_concepts_dynamic_data['time'] <= day_curr]
    dynamic_concepts_df = dynamic_concepts.copy()

    # Remove citations attributed to years after the cutoff from the total.
    dynamic_concepts_df[f'ct_{yy}'] = dynamic_concepts_df['ct'] - dynamic_concepts_df[columns_to_subtract].sum(axis=1)

    # Citations accumulated over the trailing `years_delta`-year window.
    dynamic_concepts_df['ct_delta'] = dynamic_concepts_df[cols_to_sum].sum(axis=1)

    dynamic_concepts_df = dynamic_concepts_df[['v1', f'c{yy}', f'ct_{yy}', 'ct_delta']]

    # Sum the features per concept and count rows per concept. Using size()
    # directly avoids aggregating the group key and renaming it afterwards.
    per_concept = dynamic_concepts_df.groupby('v1')
    dynamic_concepts_grouped = per_concept[[f'c{yy}', f'ct_{yy}', 'ct_delta']].sum()
    dynamic_concepts_grouped['num'] = per_concept.size()
    dynamic_concepts_grouped = dynamic_concepts_grouped.reset_index()

    # Per-row means of the three sums.
    dynamic_concepts_grouped[f'c{yy}_m'] = dynamic_concepts_grouped[f'c{yy}'] / dynamic_concepts_grouped['num']
    dynamic_concepts_grouped[f'ct_{yy}_m'] = dynamic_concepts_grouped[f'ct_{yy}'] / dynamic_concepts_grouped['num']
    dynamic_concepts_grouped['ct_delta_m'] = dynamic_concepts_grouped['ct_delta'] / dynamic_concepts_grouped['num']

    # Left-merge onto the full vertex list so every vertex gets a row, fill
    # the missing ones with 0, and actually keep the sorted result (the
    # original called sort_values() and discarded its return value).
    # ignore_index keeps a plain 0..N-1 index so the parquet layout is unchanged.
    dynamic_concepts_data = (
        pd.merge(all_concepts_df, dynamic_concepts_grouped, on='v1', how='left')
        .fillna(0)
        .sort_values(by='v1', ignore_index=True)
    )

    data_file = os.path.join(store_folder, f"concept_node_citation_data_{yy}.parquet")
    dynamic_concepts_data.to_parquet(data_file, compression='gzip')
    print(f"in {yy}; time: {time.time()-start_time}\n")
    start_time = time.time()


### Concept-pair citation features

In [None]:
# --- Load the dynamic edge table ---
time_start = time.time()
data_folder = "data_concept_graph"

# Parquet table of the dynamic graph; the feature cell below reads its
# 'v1', 'v2', 'time', 'ct' and 'c<year>' columns.
graph_file = os.path.join(data_folder, "full_dynamic_graph.parquet")
full_edge_dynamic_data = pd.read_parquet(graph_file)

print(f"Done, elapsed_time: {time.time() - time_start}\n full_edge_dynamic_data: {len(full_edge_dynamic_data)};\n")


In [None]:

# Build per-pair citation features for every cutoff year and write one
# parquet file per year into `store_folder`.
#
# For each cutoff year yy the output holds, per ('v1', 'v2') pair:
#   c<yy>      - sum of the 'c<yy>' column over the pair's rows
#   ct_<yy>    - sum of ('ct' minus all yearly columns after yy)
#                (presumably the citation total as of the end of yy - confirm)
#   ct_delta   - sum of the last `years_delta` yearly columns ending at yy
#   num        - number of rows for that pair dated on or before 31 Dec yy
#   *_m        - the three sums above divided by `num`
# Only pairs that have at least one row appear in the output.
years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Last year for which a per-year citation column 'c<year>' is subtracted.
# NOTE(review): 2023 was hard-coded inline before - confirm it matches the parquet schema.
last_citation_year = 2023

# 'time' values are compared against day counts measured from this date.
day_origin = date(1990, 1, 1)

store_folder = "data_for_features"
# Make sure the output directory exists so this cell does not depend on the
# single-concept cell having been run first.
os.makedirs(store_folder, exist_ok=True)

start_time = time.time()
for yy in years:
    print(f'Year: {yy}')
    day_curr = (date(yy, 12, 31) - day_origin).days
    columns_to_subtract = [f'c{i}' for i in range(last_citation_year, yy, -1)]
    print(columns_to_subtract)
    cols_to_sum = [f'c{i}' for i in range(yy, yy - years_delta, -1)]
    print(cols_to_sum)

    # Keep only rows dated on or before the end of the cutoff year.
    dynamic_pairs = full_edge_dynamic_data[full_edge_dynamic_data['time'] <= day_curr]
    dynamic_pairs_df = dynamic_pairs.copy()

    # Remove citations attributed to years after the cutoff from the total.
    dynamic_pairs_df[f'ct_{yy}'] = dynamic_pairs_df['ct'] - dynamic_pairs_df[columns_to_subtract].sum(axis=1)

    # Citations accumulated over the trailing `years_delta`-year window.
    dynamic_pairs_df['ct_delta'] = dynamic_pairs_df[cols_to_sum].sum(axis=1)

    dynamic_pairs_df = dynamic_pairs_df[['v1', 'v2', f'c{yy}', f'ct_{yy}', 'ct_delta']]

    # Sum the features per pair and count rows per pair. Using size()
    # directly avoids aggregating a group key and renaming it afterwards.
    per_pair = dynamic_pairs_df.groupby(['v1', 'v2'])
    dynamic_pairs_grouped = per_pair[[f'c{yy}', f'ct_{yy}', 'ct_delta']].sum()
    dynamic_pairs_grouped['num'] = per_pair.size()
    dynamic_pairs_grouped = dynamic_pairs_grouped.reset_index()

    # Per-row means of the three sums.
    dynamic_pairs_grouped[f'c{yy}_m'] = dynamic_pairs_grouped[f'c{yy}'] / dynamic_pairs_grouped['num']
    dynamic_pairs_grouped[f'ct_{yy}_m'] = dynamic_pairs_grouped[f'ct_{yy}'] / dynamic_pairs_grouped['num']
    dynamic_pairs_grouped['ct_delta_m'] = dynamic_pairs_grouped['ct_delta'] / dynamic_pairs_grouped['num']

    data_file = os.path.join(store_folder, f"concept_pair_citation_data_{yy}.parquet")
    dynamic_pairs_grouped.to_parquet(data_file, compression='gzip')
    print(f"in {yy}; time: {time.time()-start_time}\n")
    start_time = time.time()
    