In [1]:
# all required imports
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from geopy.distance import geodesic
import json
import os
import sys
import math
import random
import datetime
import scipy as sp
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the DBLP reference dump, which is split across several numbered JSON files.

file_name = 'dblp-ref-'
num_files = 4
train_data = []

for i in range(num_files):
    # lines=True tells pandas to parse the file as one JSON object per line.
    train_data.append(pd.read_json("./dblp-ref/" + file_name + str(i) + ".json", lines=True))

# Preview the first file. Only a bare expression on the last line of a cell is
# rendered, so the preview lives outside the loop.
train_data[0].head(3)

In [3]:
# Shared accumulators that create_author_data fills in across all data files.

# Per-author statistics, keyed by the author's "First_Last" key.
author_data = dict()

# Id counter: 'curr' is the next id to hand out.
# NOTE(review): 'start' is not read anywhere in the visible code.
author_id = dict(start=1, curr=1)

# Lookup from author key to the numeric id assigned to that author.
assigned_ids = dict()

def _author_key(full_name):
    """Collapse a full author name into a ``First_Last`` key (middle parts dropped)."""
    parts = full_name.split(' ')
    return parts[0] + "_" + parts[-1]


def create_author_data(train_data, author_data, author_id, assigned_ids, max_rows=1000):
    """Accumulate per-author citation statistics from one frame of papers.

    All three accumulator arguments are mutated in place so that the function
    can be called once per data file and the results add up.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Papers, one per row. Must have an ``authors`` column (a list of name
        strings per paper) and should have an ``n_citation`` column.
    author_data : dict
        Maps an author key to a record with ``num_citations``, ``paper_count``,
        ``name``, ``author_id`` and ``co_authors`` (a dict used as a set of
        co-author keys).
    author_id : dict
        Id counter; ``author_id['curr']`` is the next id to assign and is
        incremented for every new author.
    assigned_ids : dict
        Maps an author key to the numeric id assigned to it.
    max_rows : int or None, optional
        Only the first ``max_rows`` rows are processed (default 1000, the
        original hard-coded limit). Pass ``None`` to process every row.

    Returns
    -------
    None
    """
    # Without citation counts there is nothing to accumulate. The original
    # reached the same outcome by swallowing the error on every single row.
    if 'n_citation' not in train_data:
        return

    # head() stops at the end of the frame, so short frames no longer raise.
    rows = train_data if max_rows is None else train_data.head(max_rows)

    for authors, citations in zip(rows.authors, rows.n_citation):
        # Papers with no author information come through as NaN/None; skip them.
        if not isinstance(authors, (list, tuple)):
            continue

        # Compute every key once per paper instead of once per author pair.
        keys = [_author_key(author) for author in authors]

        for key in keys:
            record = author_data.get(key)
            if record is None:
                record = author_data[key] = {
                    'num_citations': citations,
                    'paper_count': 1,
                    'name': key,
                    'author_id': author_id['curr'],
                    'co_authors': {}
                }
                assigned_ids[key] = author_id['curr']
                author_id['curr'] += 1
            else:
                record['num_citations'] += citations
                record['paper_count'] += 1

            # Everyone else on the paper (by key) is a co-author of this author.
            for other in keys:
                if other != key:
                    record['co_authors'][other] = 1
                        
            
            
# Fold every loaded data file into the shared author accumulators.
for part in range(num_files):
    create_author_data(
        train_data=train_data[part],
        author_data=author_data,
        author_id=author_id,
        assigned_ids=assigned_ids,
    )

# Derive each author's mean citations per paper from the accumulated totals.
for data, stats in author_data.items():
    stats['average_citations'] = stats['num_citations'] / stats['paper_count']

# Flatten the per-author records into rows for a DataFrame. Each record is
# updated in place: its co-author mapping is replaced by the list of co-author
# ids, and the mean of the co-authors' average citations is stored next to it.
data_to_df = []
for each_author in author_data.values():
    co_author_keys = list(each_author['co_authors'])
    co_author_ids = [assigned_ids[key] for key in co_author_keys]
    co_author_total = sum(author_data[key]['average_citations'] for key in co_author_keys)

    each_author['co_authors'] = co_author_ids
    if co_author_ids:
        each_author['co_author_avg_citations'] = co_author_total / len(co_author_ids)
    else:
        # An author with no co-authors gets 0 rather than a division by zero.
        each_author['co_author_avg_citations'] = 0
    data_to_df.append(each_author)

df = pd.DataFrame.from_dict(data_to_df, orient='columns')

# Show the first 100 authors as the cell's output.
df.head(100)

Unnamed: 0,author_id,average_citations,co_author_avg_citations,co_authors,name,num_citations,paper_count
0,1,0.000000,0.000000,"[2, 3, 4, 5, 6, 7, 8]",Makoto_Satoh,0,1
1,2,0.000000,0.000000,"[1, 3, 4, 5, 6, 7, 8]",Ryo_Muramatsu,0,1
2,3,0.000000,0.000000,"[1, 2, 4, 5, 6, 7, 8]",Mizue_Kayama,0,1
3,4,0.000000,0.000000,"[1, 2, 3, 5, 6, 7, 8]",Kazunori_Itoh,0,1
4,5,0.000000,0.000000,"[1, 2, 3, 4, 6, 7, 8]",Masami_Hashimoto,0,1
5,6,0.000000,0.000000,"[1, 2, 3, 4, 5, 7, 8]",Makoto_Otani,0,1
6,7,0.000000,0.000000,"[1, 2, 3, 4, 5, 6, 8]",Michio_Shimizu,0,1
7,8,0.000000,0.000000,"[1, 2, 3, 4, 5, 6, 7]",Masahiko_Sugimoto,0,1
8,9,50.000000,50.000000,[10],Gareth_Beale,50,1
9,10,50.000000,50.000000,[9],Graeme_Earl,50,1
