In [1]:
# all required imports
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from geopy.distance import geodesic
import json
import os
import sys
import math
import random
import datetime
import scipy as sp
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
#reading the training data and test data

file_name = 'dblp-ref-'
num_files = 4
train_data = []

for i in range(num_files):
    train_data.append(pd.read_json("./data/dblp-ref/dblp-ref-" + str(i) + ".json", lines=2))
    train_data[i].head(3)

In [5]:
#train_data.fillna(0,inplace=True)
author_data = {}
author_id = {
    'start': 1,
    'curr': 1
}

assigned_ids = {}

def create_author_data(train_data, author_data, author_id, assigned_ids):
    for i in range(0, len(train_data)):#len(train_data)):
        authors = train_data.authors[i]
        
        try:
            citations = train_data.n_citation[i]/len(authors)
        except:
            continue

        for author in authors:
            names = author.split(' ')
            unique_name = names[0] + "_" + names[len(names)-1]
            if unique_name not in author_data:
                author_data[unique_name] = {
                    'num_citations': citations,
                    'paper_count': 1,
                    'name': unique_name,
                    'author_id': author_id['curr'],
                    'co_authors': {},
                    'citations': [train_data.n_citation[i]]
                }
                assigned_ids[unique_name] = author_id['curr']
                author_id['curr'] += 1
                
            else:
                author_data[unique_name]['num_citations'] += citations
                author_data[unique_name]['paper_count'] += 1
                author_data[unique_name]['citations'].append(train_data.n_citation[i])
                
            for co_author in authors:
                co_author_names = co_author.split(' ')
                co_author_unique_name = co_author_names[0] + "_" + co_author_names[len(co_author_names)-1]
                if co_author_unique_name != unique_name:
                    author_data[unique_name]['co_authors'][co_author_unique_name] = 1
                        
            
            
# call for each data file
for i in range(num_files):
    create_author_data(train_data[i], author_data, author_id, assigned_ids)

# add average citations
for data in author_data:
    author_data[data]['average_citations'] = author_data[data]['num_citations'] / author_data[data]['paper_count']
    
# adding h-index
def get_h_index(citations):
    return ([0] + [i + 1 for i, c in enumerate(sorted(citations, reverse = True)) if c >= i + 1])[-1]

data_to_df = []
for data in author_data:
    each_author = author_data[data]
    co_authors = each_author['co_authors']
    co_author_ids = []
    co_author_avg_citations = 0
    for co_author in co_authors:
        co_author_avg_citations += author_data[co_author]['average_citations']
        co_author_ids.append(assigned_ids[co_author])
    each_author['co_authors'] = co_author_ids
    each_author['co_author_avg_citations'] = co_author_avg_citations/len(co_author_ids) if len(co_author_ids) != 0 else 0
    data_to_df.append(each_author)
    
df = pd.DataFrame.from_dict(data_to_df, orient='columns')

df['h_index'] = df.apply(lambda x: get_h_index(x.citations), axis=1)
        
#df['average_citations'] = df['num_citations'] / df['paper_count']

df.head(100)

Unnamed: 0,author_id,average_citations,citations,co_author_avg_citations,co_authors,name,num_citations,paper_count,h_index
0,1,4.828571,"[0, 50, 50, 0, 2]",4.397172,"[2, 3, 4, 5, 6, 7, 8, 40149, 463789, 463790, 4...",Makoto_Satoh,24.142857,5,2
1,2,0.000000,[0],4.877965,"[1, 3, 4, 5, 6, 7, 8]",Ryo_Muramatsu,0.000000,1,0
2,3,6.684375,"[0, 50, 0, 50, 0, 0, 50, 6, 50, 2, 50, 50, 1, ...",3.616932,"[1, 2, 4, 5, 6, 7, 8, 73800, 30142, 198087, 49...",Mizue_Kayama,213.900000,32,13
3,4,4.667500,"[0, 50, 0, 0, 0, 10, 1, 0, 50, 50, 50, 50, 1, ...",2.655301,"[1, 2, 3, 5, 6, 7, 8, 73800, 201615, 334037, 4...",Kazunori_Itoh,93.350000,20,8
4,5,2.310000,"[0, 0, 50, 1, 2, 0, 0, 50, 0, 50]",2.876817,"[1, 2, 3, 4, 6, 7, 8, 201615, 490012, 509990, ...",Masami_Hashimoto,23.100000,10,3
5,6,2.524359,"[0, 0, 50, 1, 0, 1, 2, 1, 0, 2, 0, 0, 0, 50, 2...",3.594990,"[1, 2, 3, 4, 5, 7, 8, 50209, 324798, 180194, 2...",Makoto_Otani,98.450000,39,9
6,7,6.583333,"[0, 0, 0, 0, 10, 50, 50, 50, 50, 50]",3.823610,"[1, 2, 3, 4, 5, 6, 8, 201615, 334037, 509990, ...",Michio_Shimizu,65.833333,10,6
7,8,6.547619,"[0, 0, 0, 50, 50, 50, 50]",4.010573,"[1, 2, 3, 4, 5, 6, 7, 509990, 509991, 419420, ...",Masahiko_Sugimoto,45.833333,7,4
8,9,7.996032,"[50, 0, 50, 10]",5.124213,"[10, 380963, 380964, 380965, 337260, 178003, 7...",Gareth_Beale,31.984127,4,3
9,10,12.055335,"[50, 50, 52, 50, 50, 253, 10, 50, 0]",7.604391,"[9, 323945, 446287, 446288, 109746, 569693, 56...",Graeme_Earl,108.498016,9,8


In [6]:
len(df)

1635106

In [7]:
df.to_csv('./data/authors.csv', sep=',')

In [8]:
authors_df = pd.read_csv('./data/authors.csv')

In [9]:
authors_df.head()

Unnamed: 0.1,Unnamed: 0,author_id,average_citations,citations,co_author_avg_citations,co_authors,name,num_citations,paper_count,h_index
0,0,1,4.828571,"[0, 50, 50, 0, 2]",4.397172,"[2, 3, 4, 5, 6, 7, 8, 40149, 463789, 463790, 4...",Makoto_Satoh,24.142857,5,2
1,1,2,0.0,[0],4.877965,"[1, 3, 4, 5, 6, 7, 8]",Ryo_Muramatsu,0.0,1,0
2,2,3,6.684375,"[0, 50, 0, 50, 0, 0, 50, 6, 50, 2, 50, 50, 1, ...",3.616932,"[1, 2, 4, 5, 6, 7, 8, 73800, 30142, 198087, 49...",Mizue_Kayama,213.9,32,13
3,3,4,4.6675,"[0, 50, 0, 0, 0, 10, 1, 0, 50, 50, 50, 50, 1, ...",2.655301,"[1, 2, 3, 5, 6, 7, 8, 73800, 201615, 334037, 4...",Kazunori_Itoh,93.35,20,8
4,4,5,2.31,"[0, 0, 50, 1, 2, 0, 0, 50, 0, 50]",2.876817,"[1, 2, 3, 4, 6, 7, 8, 201615, 490012, 509990, ...",Masami_Hashimoto,23.1,10,3


In [17]:
authors_df = authors_df.sort_values(by='h_index', ascending=False)
authors_df_sorted_h_index = authors_df[:2000]
authors_df_sorted_h_index.to_csv('./data/authors_sorted_by_h_index.csv', sep=',')
authors_df_sorted_h_index

Unnamed: 0.1,Unnamed: 0,author_id,average_citations,citations,co_author_avg_citations,co_authors,name,num_citations,paper_count,h_index
45286,45286,45287,90.959808,"[47, 27, 9, 57, 15, 50, 388, 8, 134, 23, 50, 8...",23.527023,"[45286, 66718, 66719, 67047, 67048, 67049, 787...",Anil_Jain,49936.934754,549,144
40254,40254,40255,40.253176,"[0, 92, 50, 11, 2, 50, 50, 257, 95, 173, 50, 5...",17.147145,"[42167, 11017, 44161, 16038, 64503, 64504, 645...",Jiawei_Han,29666.590508,737,144
737,737,738,27.901193,"[43, 131, 2251, 21, 197, 70, 40, 33, 14, 50, 1...",14.345400,"[2492, 9499, 9500, 13689, 13690, 24400, 16425,...",Philip_Yu,27231.563889,976,132
7014,7014,7015,72.169629,"[3, 50, 50, 0, 263, 29, 25, 50, 50, 0, 157, 76...",45.547509,"[5776, 46602, 96985, 31335, 81469, 100220, 100...",Andrew_Zisserman,25981.066598,360,123
15849,15849,15850,29.570117,"[0, 64, 7, 50, 50, 50, 0, 0, 111, 52, 0, 50, 1...",18.364136,"[15849, 27280, 27281, 45764, 82995, 88292, 640...",Thomas_Huang,24454.486760,827,122
51522,51522,51523,77.375410,"[50, 9, 35, 2, 9, 50, 229, 28, 50, 1, 23, 0, 6...",36.990219,"[51524, 7910, 55012, 31553, 55013, 26096, 8558...",Scott_Shenker,20813.985300,269,122
29353,29353,29354,56.858964,"[44, 101, 0, 39, 24, 50, 50, 5, 4, 8, 0, 865, ...",41.367042,"[13989, 29355, 46601, 24848, 56891, 28727, 568...",Hector_Garcia-Molina,22743.585762,400,117
78465,78465,78466,79.501952,"[3, 0, 0, 95, 0, 17, 14, 35, 56, 16, 8, 50, 27...",28.065848,"[52563, 80489, 94259, 4836, 94260, 4822, 94261...",Michael_Jordan,26553.652020,334,113
25820,25820,25821,86.767361,"[0, 11, 80, 60, 2, 52, 81, 8, 50, 166, 3, 156,...",27.207981,"[25815, 25816, 25817, 25818, 25819, 25820, 668...",Sebastian_Thrun,23253.652814,268,113
22284,22284,22285,44.048882,"[50, 9, 50, 19, 14, 33, 3, 11, 50, 50, 50, 50,...",22.405255,"[22283, 22284, 38001, 38002, 38003, 41734, 537...",Christos_Faloutsos,22553.027524,512,112


In [18]:
authors_df = authors_df.sort_values(by='num_citations', ascending=False)
authors_df_sorted_num_citations = authors_df[:2000]
authors_df_sorted_num_citations.to_csv('./data/authors_df_sorted_by_num_citations.csv', sep=',')
authors_df_sorted_num_citations

Unnamed: 0.1,Unnamed: 0,author_id,average_citations,citations,co_author_avg_citations,co_authors,name,num_citations,paper_count,h_index
3232,3232,3233,334.035057,"[387, 5, 185, 11, 65, 34, 94, 261, 113, 779, 7...",28.621615,"[3234, 3235, 19817, 47221, 20073, 2125, 21097,...",David_Goldberg,90523.500325,271,75
25448,25448,25449,414.398282,"[7, 1, 30, 0, 50, 8, 0, 50, 3, 13, 5, 4, 0, 17...",20.702528,"[25450, 7959, 168384, 95735, 13413, 74045, 459...",David_Lowe,70447.707937,170,51
45286,45286,45287,90.959808,"[47, 27, 9, 57, 15, 50, 388, 8, 134, 23, 50, 8...",23.527023,"[45286, 66718, 66719, 67047, 67048, 67049, 787...",Anil_Jain,49936.934754,549,144
357731,357731,357732,5250.000000,"[106, 0, 192, 219, 182, 443, 596, 16833, 28679]",0.000000,[],Leo_Breiman,47250.000000,9,8
50736,50736,50737,226.166717,"[1, 84, 36, 52, 50, 37, 239, 67, 417, 50, 58, ...",36.654893,"[69355, 69356, 30488, 69357, 61143, 35264, 166...",Jon_Kleinberg,42519.342857,188,82
253708,253708,253709,548.880044,"[8, 3, 248, 77, 1811, 797, 54, 3, 86, 56, 17, ...",47.268506,"[253710, 4811, 163293, 39395, 512028, 151367, ...",David_Donoho,41714.883333,76,50
91121,91121,91122,414.818041,"[50, 36, 3, 50, 351, 60, 50, 90, 50, 6, 13, 0,...",25.467982,"[5675, 23149, 13439, 57496, 6086, 219568, 3582...",Lotfi_Zadeh,41481.804113,100,50
30762,30762,30763,547.148259,"[62, 50, 50, 13, 19, 6, 1, 50, 25, 27, 13, 50,...",40.480554,"[30762, 30764, 194687, 267117, 80628, 286356, ...",Stéphane_Mallat,36658.933333,67,42
31071,31071,31072,843.012684,"[133, 48, 0, 0, 2, 0, 0, 90, 178, 655, 137, 0,...",43.736192,"[1806, 31070, 31071, 18157, 31073, 31074, 6147...",Fred_Davis,36249.545405,43,29
7400,7400,7401,182.568682,"[127, 107, 5, 297, 1862, 0, 50, 518, 156, 60, ...",53.239908,"[7402, 7403, 62378, 65776, 65777, 17087, 25761...",Geoffrey_Hinton,34688.049567,190,90
