In [1]:
# all required imports
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from geopy.distance import geodesic
import json
import os
import sys
import math
import random
import datetime
import scipy as sp
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the training data.
# `lines` is a boolean switch telling pandas the file holds one JSON object per
# line; the previous `lines=2` only worked because 2 is truthy and did NOT mean
# "read two lines".
# NOTE(review): the old commented-out `skiprows=` argument belongs to
# `pd.read_csv`, not `pd.read_json`. To iterate on a subset, `nrows=` should
# work on recent pandas versions -- confirm against the installed version.
train_data = pd.read_json("./dblp-ref/dblp-ref-0.json", lines=True)

train_data.head(10)

Unnamed: 0,abstract,authors,id,n_citation,references,title,venue,year
0,The purpose of this study is to develop a lear...,"[Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...",00127ee2-cb05-48ce-bc49-9de556b93346,0,"[51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...",Preliminary Design of a Network Protocol Learn...,international conference on human-computer int...,2013
1,This paper describes the design and implementa...,"[Gareth Beale, Graeme Earl]",001c58d3-26ad-46b3-ab3a-c1e557d16821,50,"[10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...",A methodology for the physically accurate visu...,visual analytics science and technology,2011
2,This article applied GARCH model instead AR or...,"[Altaf Hossain, Faisal Zaman, Mohammed Nasser,...",001c8744-73c4-4b04-9364-22d31a10dbf1,50,"[2d84c0f2-e656-4ce7-b018-90eda1c132fe, a083a1b...","Comparison of GARCH, Neural Network and Suppor...",pattern recognition and machine intelligence,2009
3,,"[Jea-Bum Park, Byungmok Kim, Jian Shen, Sun-Yo...",00338203-9eb3-40c5-9f31-cbac73a519ec,0,"[8c78e4b0-632b-4293-b491-85b1976675e6, 9cdc54f...",Development of Remote Monitoring and Control D...,,2011
4,,"[Giovanna Guerrini, Isabella Merlo]",0040b022-1472-4f70-a753-74832df65266,2,,Reasonig about Set-Oriented Methods in Object ...,,1998
5,,"[Rafael Álvarez, Leandro Tortosa, José-Francis...",005ce28f-ed77-4e97-afdc-a296137186a1,0,,COMPARING GNG3D AND QUADRIC ERROR METRICS METH...,international conference on computer graphics ...,2009
6,,"[Jovan Dj. Golic, Guglielmo Morgari]",00638a94-23bf-4fa6-b5ce-40d799c65da7,2,,Vectorial fast correlation attacks.,,2004
7,,"[Guzin Ulutas, Mustafa Ulutas, Vasif V. Nabiyev]",00701b05-684f-45f9-b281-425abfec482c,0,"[5626736c-e434-4e2d-8405-54940fab88ab, 8e87e87...",Improved Secret Image Sharing Method By Encodi...,international symposium on computer and inform...,2011
8,,"[Pranay Chaudhuri, Hussein Thompson]",00745041-3636-4d18-bbec-783c4278c40d,0,,A Self-Stabilizing Algorithm for Finding the C...,parallel and distributed processing techniques...,2003
9,,"[Dominik Szajerman, Adam Jurczyński]",00964544-cbe2-4da9-bb5a-03333160eb34,0,"[3fcd7cdc-20e6-4ea3-a41c-db126fcc5cfe, bf3a11c...",Fur Visualisation for Computer Game Engines an...,international conference on computer vision an...,2014


In [3]:
# Count the missing values in every column of the loaded frame.
train_data.isnull().sum()

abstract      246354
authors            0
id                 0
n_citation         0
references    138520
title              0
venue              0
year               0
dtype: int64

In [16]:
#train_data.fillna(0,inplace=True)

# Mutable accumulators that create_author_data() fills in place.
# author_data:  unique author key -> per-author record
# assigned_ids: unique author key -> numeric author id
author_data = dict()
assigned_ids = dict()

# Id counter: 'curr' is the next id to hand out, 'start' is the first id used.
author_id = dict(start=1, curr=1)

def _author_key(author):
    """Return the '<first token>_<last token>' key for a space-separated author name."""
    parts = author.split(' ')
    return parts[0] + "_" + parts[-1]


def create_author_data(author_data, author_id, assigned_ids, papers=None, max_rows=1000):
    """Accumulate per-author citation statistics and co-author links, in place.

    Parameters
    ----------
    author_data : dict
        Maps an author key to its record (``num_citations``, ``paper_count``,
        ``name``, ``author_id``, ``co_authors``). New authors are added and
        existing ones updated, so the same dict can be passed for several files.
    author_id : dict
        Id counter; ``author_id['curr']`` is the next id to assign and is
        incremented for every newly seen author.
    assigned_ids : dict
        Maps an author key to the numeric id assigned to it.
    papers : mapping or DataFrame, optional
        Anything exposing ``papers['authors']`` (iterable of author-name lists)
        and ``papers['n_citation']`` (iterable of citation counts), aligned by
        position. Defaults to the notebook-level ``train_data`` frame.
    max_rows : int or None, optional
        Only the first ``max_rows`` papers are processed (default 1000, the
        limit that used to be hard-coded). Pass ``None`` to process everything.

    Returns
    -------
    None
        All results are written into the three dicts passed in.
    """
    if papers is None:
        # Backward-compatible default: the frame loaded earlier in the notebook.
        papers = train_data

    rows = zip(papers['authors'], papers['n_citation'])
    for row_number, (authors, citations) in enumerate(rows):
        # Stop at the cap instead of indexing a fixed range, so inputs with
        # fewer than `max_rows` papers no longer raise KeyError.
        if max_rows is not None and row_number >= max_rows:
            break

        # A missing author list (None / NaN) cannot be iterated; skip the paper.
        if authors is None or isinstance(authors, float):
            continue

        # Compute every key once per paper rather than once per author pair.
        unique_names = [_author_key(author) for author in authors]

        for unique_name in unique_names:
            if unique_name not in author_data:
                author_data[unique_name] = {
                    'num_citations': citations,
                    'paper_count': 1,
                    'name': unique_name,
                    'author_id': author_id['curr'],
                    'co_authors': {}
                }
                assigned_ids[unique_name] = author_id['curr']
                author_id['curr'] += 1
            else:
                author_data[unique_name]['num_citations'] += citations
                author_data[unique_name]['paper_count'] += 1

            # Link every other author of this paper; the dict is used as an
            # insertion-ordered set, so the value is always 1.
            co_authors = author_data[unique_name]['co_authors']
            for co_author_unique_name in unique_names:
                if co_author_unique_name != unique_name:
                    co_authors[co_author_unique_name] = 1
                        
            
            
# call for each data file
create_author_data(author_data, author_id, assigned_ids)

# add average citations
for record in author_data.values():
    record['average_citations'] = record['num_citations'] / record['paper_count']

# Build one flat row per author for the DataFrame.
# The row is a copy whose 'co_authors' holds numeric ids; the record inside
# author_data keeps its name-keyed dict, so create_author_data() can still be
# called again on the same accumulators (it writes co_authors[name] = 1).
data_to_df = []
for each_author in author_data.values():
    co_author_names = each_author['co_authors']
    co_author_ids = [assigned_ids[name] for name in co_author_names]

    # Mean of the co-authors' own average citations; 0 for solo authors.
    if co_author_ids:
        each_author['co_author_avg_citations'] = (
            sum(author_data[name]['average_citations'] for name in co_author_names)
            / len(co_author_ids)
        )
    else:
        each_author['co_author_avg_citations'] = 0

    data_to_df.append(dict(each_author, co_authors=co_author_ids))

# A list of row dicts goes straight to the constructor (from_dict is meant for dicts).
df = pd.DataFrame(data_to_df)

df.head(100)

Unnamed: 0,author_id,average_citations,co_author_avg_citations,co_authors,name,num_citations,paper_count
0,1,0.0,0.0,"[2, 3, 4, 5, 6, 7, 8]",Makoto_Satoh,0,1
1,2,0.0,0.0,"[1, 3, 4, 5, 6, 7, 8]",Ryo_Muramatsu,0,1
2,3,0.0,0.0,"[1, 2, 4, 5, 6, 7, 8]",Mizue_Kayama,0,1
3,4,0.0,0.0,"[1, 2, 3, 5, 6, 7, 8]",Kazunori_Itoh,0,1
4,5,0.0,0.0,"[1, 2, 3, 4, 6, 7, 8]",Masami_Hashimoto,0,1
5,6,0.0,0.0,"[1, 2, 3, 4, 5, 7, 8]",Makoto_Otani,0,1
6,7,0.0,0.0,"[1, 2, 3, 4, 5, 6, 8]",Michio_Shimizu,0,1
7,8,0.0,0.0,"[1, 2, 3, 4, 5, 6, 7]",Masahiko_Sugimoto,0,1
8,9,50.0,50.0,[10],Gareth_Beale,50,1
9,10,50.0,50.0,[9],Graeme_Earl,50,1


In [4]:
# Scratch check: rebinding a parameter inside a function does not change the
# variable the caller passed in.
x = 1

def fun(y):
    """Rebind the local name ``y`` to 2; returns None and has no effect on the caller."""
    y = 2

fun(x)
x  # still 1, as the cell output shows

1