In [82]:
import networkx as nx
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from itertools import combinations

In [3]:
### Some data processing methods

def retain_prolific(author_list: list):
    """Return the prolific authors (ids below 100) found in ``author_list``.

    The relative order of the ids in ``author_list`` is preserved and the
    input list is not modified.
    """
    prolific = []
    for author_id in author_list:
        if author_id < 100:
            prolific.append(author_id)
    return prolific

def get_coauthor(author_list: list):
    """Return the non-prolific coauthors (ids of 100 and above) in ``author_list``.

    The relative order of the ids in ``author_list`` is preserved and the
    input list is not modified.
    """
    coauthors = []
    for author_id in author_list:
        if author_id >= 100:
            coauthors.append(author_id)
    return coauthors

In [14]:
# Locations of the raw train / test JSON files (declared once, used below)
train_filename = './data/train.json'
test_filename = './data/test.json'

# read train json file
with open(train_filename, 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
# read test json file
with open(test_filename, 'r', encoding='utf-8') as f:
    raw_test = json.load(f)

# get a copy -- NOTE: .copy() is shallow, so any nested containers are still
# shared with raw_train / raw_test
train = raw_train.copy()
test = raw_test.copy()

data_df = pd.DataFrame.from_dict(train)
# Derive both author subsets straight from the 'authors' column; a column-wise
# Series.apply avoids building a row object for every record.
data_df['prolific'] = data_df['authors'].apply(retain_prolific)
data_df['coauthors'] = data_df['authors'].apply(get_coauthor)
test_df = pd.DataFrame.from_dict(test)

In [15]:
# Hold out 5% of the labelled data for validation (fixed seed)
train_df, validation_df = train_test_split(data_df, test_size=0.05, random_state=42)

# Number of prolific authors on each training paper
length = train_df['prolific'].str.len()
# number of papers without any prolific author
count_empty = (length == 0).sum()
# number of papers with prolific authors
count_non_emtpy = (length > 0).sum()
print(count_empty)
print(count_non_emtpy)

# Reuse the per-paper counts computed above instead of recomputing them
train_without = train_df[length == 0]
train_with = train_df[length > 0]
# Balance the two groups: keep every paper with a prolific author and draw the
# same number of papers without one. The draw is seeded so that re-running the
# notebook yields the same training set (the split above is seeded as well).
train_df = pd.concat([train_with, train_without.sample(len(train_with), random_state=42)])

17409
7094


In [52]:
# Pull the training columns out as plain Python lists (one entry per paper)
authors = train_df['authors'].tolist()
train_prolific = train_df['prolific'].tolist()
train_coauthor = train_df['coauthors'].tolist()



In [80]:
# Collect every unordered author pair from papers that have at least one
# prolific author AND at least one non-prolific coauthor.
tuple_list = []
for paper_authors, prolific_ids, coauthor_ids in zip(authors, train_prolific, train_coauthor):
    if prolific_ids and coauthor_ids:
        # sorted() builds a new list, so the author lists held in `authors`
        # are left untouched. combinations() over a sorted sequence already
        # yields each pair in ascending order, so no per-pair sort is needed.
        tuple_list.extend(combinations(sorted(paper_authors), 2))

# Keep only the pairs that involve at least one prolific author (id < 100)
prolific_relation = [pair for pair in tuple_list if pair[0] < 100 or pair[1] < 100]

# Count occurrences of each pair in a single pass; keys appear in
# first-occurrence order.
counts = {}
for pair in prolific_relation:
    counts[pair] = counts.get(pair, 0) + 1


In [83]:
# One node per integer id 0..21244.
# NOTE(review): 21245 is presumably the total number of author ids -- confirm.
# The ids themselves are passed (not a list wrapping a range object), so that
# every id becomes its own node rather than the range being added as one node.
unique_nodes = list(range(0, 21245))
g = nx.Graph()
g.add_nodes_from(unique_nodes)

In [105]:
# Insert one edge per counted pair, storing the pair's count as the edge's
# 'weight' attribute.
g.add_weighted_edges_from(
    (pair[0], pair[1], occurrences) for pair, occurrences in counts.items()
)

In [108]:

def if_connected(u, v):
    """Return True if the module-level graph ``g`` has an edge between ``u`` and ``v``.

    ``Graph.has_edge`` is a direct adjacency lookup rather than a scan over
    the neighbour iterator, and it returns False (instead of raising) when
    either node is not present in the graph.
    """
    return g.has_edge(u, v)

In [110]:
# Spot check: is there an edge between nodes 99 and 3000 in the graph?
if_connected(99, 3000)

False