In [1]:
!pip install gdown
!pip install ipython-autotime

Collecting ipython-autotime
  Downloading https://files.pythonhosted.org/packages/b4/c9/b413a24f759641bc27ef98c144b590023c8038dfb8a3f09e713e9dff12c1/ipython_autotime-0.3.1-py2.py3-none-any.whl
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1


In [2]:
%load_ext autotime

time: 134 µs (started: 2021-02-08 08:24:04 +00:00)


In [3]:
!gdown "https://drive.google.com/uc?id=1Qbi954Bwx-PplM8F_7TrB_blcqcB-bF2"

Downloading...
From: https://drive.google.com/uc?id=1Qbi954Bwx-PplM8F_7TrB_blcqcB-bF2
To: /content/train.csv
  0% 0.00/305k [00:00<?, ?B/s]100% 305k/305k [00:00<00:00, 9.74MB/s]
time: 1.72 s (started: 2021-02-08 08:24:04 +00:00)


In [4]:
!gdown "https://drive.google.com/uc?id=1hWycEy8rQ8e_krGyUhqGQiMBxZtD9SWy"

Downloading...
From: https://drive.google.com/uc?id=1hWycEy8rQ8e_krGyUhqGQiMBxZtD9SWy
To: /content/test.csv
0.00B [00:00, ?B/s]2.47MB [00:00, 72.0MB/s]
time: 1.92 s (started: 2021-02-08 08:24:05 +00:00)


In [5]:
!gdown "https://drive.google.com/uc?id=1j7h-SUy2gMJBds6eQalw4CFZC3O7Pr05"

Downloading...
From: https://drive.google.com/uc?id=1j7h-SUy2gMJBds6eQalw4CFZC3O7Pr05
To: /content/collaboration_network.edgelist
38.8MB [00:00, 82.9MB/s]
time: 3.03 s (started: 2021-02-08 08:24:07 +00:00)


In [6]:
!gdown "https://drive.google.com/uc?id=1AEn8P631y-pq6szgAbTm538oM6waf-5E"

Downloading...
From: https://drive.google.com/uc?id=1AEn8P631y-pq6szgAbTm538oM6waf-5E
To: /content/author_embedding.csv
683MB [00:07, 96.9MB/s]
time: 11.3 s (started: 2021-02-08 08:24:10 +00:00)


In [7]:
!gdown "https://drive.google.com/uc?id=1TD3CkcmbvhIdXkCAWaBpY416qDen2vfD"

Downloading...
From: https://drive.google.com/uc?id=1TD3CkcmbvhIdXkCAWaBpY416qDen2vfD
To: /content/author_papers.txt
23.5MB [00:00, 74.2MB/s]
time: 2.63 s (started: 2021-02-08 08:24:22 +00:00)


In [26]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path

# read training data
df_train = pd.read_csv('train.csv', dtype={'authorID': np.int64, 'h_index': np.float32})
n_train = df_train.shape[0]

# read test data
df_test = pd.read_csv('test.csv', dtype={'authorID': np.int64})
n_test = df_test.shape[0]

# load the graph
G = nx.read_edgelist('collaboration_network.edgelist',
                     delimiter=' ', nodetype=int)
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

# read embeddings of abstracts
embeddings = pd.read_csv("author_embedding.csv", header=None)
embeddings = embeddings.rename(columns={0: "authorID"})

Number of nodes: 231239
Number of edges: 1777338
time: 25.1 s (started: 2021-02-08 08:33:22 +00:00)


In [9]:
# create the training matrix. each author is represented by the average of
# the embeddings of the abstracts of his/her top-cited papers
df_train = df_train.merge(embeddings, on="authorID")

# create the test matrix. each author is represented by the average of
# the embeddings of the abstracts of his/her top-cited papers
df_test = df_test.merge(embeddings, on="authorID")
# X_test = df_test.iloc[:, 2:]

time: 1.55 s (started: 2021-02-08 08:24:49 +00:00)


In [10]:
df_train.head()

Unnamed: 0,authorID,h_index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256
0,2124557175,1.0,0.011884,-1.474495,0.475431,0.761245,0.372421,-1.278481,-0.931237,1.012764,0.9334,-0.704856,0.784397,0.733632,0.355529,0.964007,-0.583914,0.528366,0.748525,-0.077608,-0.362534,-0.276943,-0.097285,0.996792,-0.295493,-0.154917,-1.121127,-0.191539,-0.231411,0.707991,0.629793,0.029784,-0.878563,0.795566,1.567196,-0.475378,0.933597,-1.019691,-2.078399,-2.758443,...,-0.770467,-0.262722,0.525519,-0.149412,0.081803,0.792591,-0.403451,1.036691,-0.161972,0.606072,-0.390188,-0.82473,-1.142385,-0.291382,0.361122,1.474065,0.956201,-1.467333,0.920381,-1.030914,0.729073,-0.788855,1.265254,0.663009,0.330089,-0.491955,0.476225,0.294243,0.626619,1.195963,-0.105072,-0.827668,1.672597,-0.207077,0.251072,-0.398389,-0.767606,-1.399617,1.284111,-0.491038
1,1964267543,4.0,-0.941447,-0.146221,0.836747,-1.257331,0.451242,-0.215515,1.080448,0.463444,0.77768,0.176316,0.198668,0.007866,-1.00062,0.669209,-1.043053,-0.7161,-1.087103,-0.398879,-0.412347,-0.13121,-0.018862,0.47087,-0.384256,0.655413,0.143723,-0.680893,-0.815258,-0.946246,1.248203,0.198083,0.02787,0.370975,1.354574,0.538946,0.098952,-0.863707,1.104675,-1.323895,...,-0.092297,-0.209134,0.004957,0.039698,-0.171137,1.284027,0.097372,-0.212618,-0.391834,0.932204,0.499848,0.302687,-0.084279,-0.913222,-0.497197,0.516038,0.327819,0.866613,0.955164,-0.950108,0.098256,-0.863164,-0.01709,1.380026,0.73366,-1.058517,-0.57427,1.18054,1.16436,-0.346797,0.074777,-0.273686,0.060186,-0.355505,-1.12772,-0.436617,-0.269324,-0.520198,0.506366,-1.664988
2,2762410113,1.0,-1.530304,-0.346675,2.442384,0.68517,-0.414881,0.211662,-0.186427,-0.041663,1.51534,-0.620786,-1.284518,-1.208413,1.806724,0.616266,-0.753918,-2.051487,-1.376353,-0.350286,-0.610686,1.38918,-1.478816,-0.998558,-0.709088,1.09928,-1.34387,-0.567973,0.071759,0.309125,0.251986,0.591007,-1.296995,-2.060488,0.725539,2.024396,-1.356195,-0.434862,-1.394312,-1.466734,...,0.120297,1.083913,1.10554,0.087702,-0.161807,2.624068,0.437089,-0.645089,-2.246255,-0.436635,-1.224813,-3.100993,0.344722,0.687891,1.905243,0.453318,-3.150965,-0.488399,1.311814,0.139054,-0.096975,1.414337,0.665574,0.247286,1.305022,-0.39528,0.761511,1.220916,0.875253,0.742394,2.560457,0.332845,-1.83093,-1.256303,0.626105,-0.81052,1.613209,-0.178558,-0.286176,-0.743096
3,2225319093,1.0,-0.129008,-1.32724,-0.727648,1.136473,1.130315,-0.423311,0.181678,0.554091,-0.231255,0.727548,1.04171,-1.605366,0.27974,1.608455,-0.793432,-0.45365,0.992398,-0.810396,-1.265649,1.338686,0.173262,1.255848,0.590473,0.433425,-0.622283,0.178423,0.530372,0.811803,0.558662,0.628117,-0.000144,0.221142,1.293048,-0.666839,-1.141459,0.604393,-0.780155,-0.641698,...,0.36315,0.718474,0.613515,0.176576,-0.286288,0.925159,1.12346,0.626039,-1.143193,-1.353026,-0.129152,0.212153,-0.500736,-0.750023,-0.527533,0.240602,-0.883445,-0.978181,1.466637,-0.512852,1.538082,-0.341608,-0.200128,1.389997,-0.316449,-0.352446,0.831606,-0.839991,0.894545,0.289335,-0.066281,0.664719,1.126354,-0.566391,-0.579283,0.662119,-1.454489,-0.973226,-0.192451,1.583816
4,2122039117,43.0,-0.255069,0.285396,1.018723,-0.32508,1.013789,0.177253,0.592353,0.208593,0.033127,0.876228,0.915318,-0.261559,0.037244,-0.650182,-1.359849,-1.203008,0.560354,0.230027,-0.4171,-1.098673,-0.320773,-1.119455,-1.02759,-0.154131,0.03953,-0.452954,1.034962,0.9293,-1.014582,0.154546,0.565082,-0.440206,1.652724,-0.244671,-0.7589,0.884478,-0.659663,0.059617,...,0.532843,0.629661,0.250447,0.302851,0.241603,0.215309,0.632135,-0.964263,0.569097,0.424411,-0.076652,0.376441,0.487996,-0.41563,0.00995,-0.250206,0.040206,0.257938,0.26625,-0.277946,0.31629,-0.267702,0.038262,-0.67302,0.932779,-0.513544,0.638799,-0.150237,0.636681,-0.302591,0.968621,-0.020236,-0.072942,-0.310042,0.304724,-0.539385,-0.354146,-0.629376,-1.457717,0.290012


time: 128 ms (started: 2021-02-08 08:24:50 +00:00)


In [11]:
import ast
import json 

f = open("author_papers.txt", "r")
print("Read author_papers.txt")

# loads the inverted abstracts and stores them as id-abstracts in a dictionary dic and in a folder fw
dic = {}
for l in f:
    if(l == "\n"):
        continue
    author, paper = l.split(":")
    paper = ast.literal_eval(paper)
    dic[author] = paper
with open('author.json', 'w') as outfile:
    json.dump(dic, outfile)

Read author_papers.txt
time: 7.28 s (started: 2021-02-08 08:24:50 +00:00)


In [12]:
df_author_paper = pd.read_json("author.json", orient="index", dtype=str)
df_author_paper.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2908387141,2540479521,,,,,,,,,
2908425732,2553344037,,,,,,,,,
2908436250,2907086791,,,,,,,,,
2908499439,2081432213,2070621672.0,2079679191.0,32110345.0,2013975658.0,2748643138.0,605803309.0,1641369840.0,,
2908506980,2148994087,,,,,,,,,


time: 3.42 s (started: 2021-02-08 08:24:58 +00:00)


In [13]:
from tqdm.notebook import tqdm
tqdm.pandas()

df_author_paper["paper_per_author"] = df_author_paper.progress_apply(lambda x: np.sum(x != "nan"), axis=1)
df_author_paper.head()

HBox(children=(FloatProgress(value=0.0, max=231239.0), HTML(value='')))




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,paper_per_author
1036332,1510273386,1827736641.0,1588673897.0,2252711322.0,2123653597.0,2278195862.0,1487185586.0,1948934242.0,2279426801.0,2209934846.0,10
1101850,133459021,179719743.0,2111787673.0,2126488676.0,31838995.0,2141985222.0,2164030175.0,2150222226.0,2083630376.0,2016084258.0,10
1336878,2122092249,2132109814.0,2100271871.0,2065672539.0,2036413831.0,2131236363.0,1529204628.0,2011783148.0,1716250762.0,657471641.0,10
1515524,2141827797,2127085795.0,2013547785.0,2138529788.0,1994863898.0,1978221130.0,1678723351.0,2101100654.0,2314661240.0,1981200361.0,10
1606427,1907724546,,,,,,,,,,1


time: 1min 2s (started: 2021-02-08 08:25:01 +00:00)


In [14]:
df_author_paper_slim = df_author_paper[["paper_per_author"]]
df_author_paper_slim.reset_index(inplace=True)
df_author_paper_slim.rename(columns={"index": "authorID"}, inplace=True)
df_author_paper_slim

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,authorID,paper_per_author
0,1036332,10
1,1101850,10
2,1336878,10
3,1515524,10
4,1606427,1
...,...,...
231234,2908387141,1
231235,2908425732,1
231236,2908436250,1
231237,2908499439,8


time: 30.5 ms (started: 2021-02-08 08:26:04 +00:00)


In [15]:
author_num_papers_train = df_author_paper_slim[df_author_paper_slim["authorID"].isin(df_train["authorID"])]
author_num_papers_train

Unnamed: 0,authorID,paper_per_author
13,7248981,10
14,7318656,10
19,8845325,4
20,9723467,10
28,12499814,10
...,...,...
231212,2907983075,10
231215,2908006952,1
231219,2908065712,1
231222,2908154030,2


time: 40 ms (started: 2021-02-08 08:26:04 +00:00)


In [16]:
author_num_papers_test = df_author_paper_slim[df_author_paper_slim["authorID"].isin(df_test["authorID"])]
author_num_papers_test

Unnamed: 0,authorID,paper_per_author
0,1036332,10
1,1101850,10
2,1336878,10
3,1515524,10
4,1606427,1
...,...,...
231234,2908387141,1
231235,2908425732,1
231236,2908436250,1
231237,2908499439,8


time: 45.6 ms (started: 2021-02-08 08:26:04 +00:00)


In [17]:
# computes structural features for each node
core_number = nx.core_number(G)
onion_number = nx.onion_layers(G)
avg_neighbor_degree = nx.average_neighbor_degree(G)
degree_centrality = nx.degree_centrality(G)
clustering = nx.clustering(G)
print("Features computed")

Features computed
time: 4min 44s (started: 2021-02-08 08:26:04 +00:00)


In [79]:
# create the training matrix. each node is represented as a vector of 3 features:
# (1) its degree, (2) its core number and (3) the average degree of its neighbors
# X_train_graph = np.zeros((n_train, 12))
X_train_graph = np.zeros((n_train, 8))
y_train_graph = np.zeros(n_train)
for i, row in df_train.iterrows():
    node = row['authorID']
    X_train_graph[i, 0] = G.degree(node)
    X_train_graph[i, 1] = core_number[node]
    X_train_graph[i, 2] = avg_neighbor_degree[node]
    X_train_graph[i, 3] = onion_number[node]
    X_train_graph[i, 4] = degree_centrality[node]
    X_train_graph[i, 5] = clustering[node]
    X_train_graph[i, 6] = row['authorID']
    #for author in G.neighbors(node):
    k_neigbors = df_author_paper_slim[df_author_paper_slim['authorID'].isin(list(G.neighbors(node)))]
    X_train_graph[i, 7] = k_neigbors['paper_per_author'].mean()
    # X_train_graph[i, 8] = k_neigbors['paper_per_author'].std()
    # X_train_graph[i, 9] = k_neigbors['paper_per_author'].median()
    # X_train_graph[i, 10] = k_neigbors['paper_per_author'].quantile(0.25)
    # X_train_graph[i, 11] = k_neigbors['paper_per_author'].quantile(0.75)
    y_train_graph[i] = row['h_index']

time: 2min (started: 2021-02-08 09:49:19 +00:00)


In [None]:
# X_test_graph = np.zeros((n_test, 12))
X_test_graph = np.zeros((n_test, 8))
for i, row in df_test.iterrows():
    node = row['authorID']
    X_test_graph[i, 0] = G.degree(node)
    X_test_graph[i, 1] = core_number[node]
    X_test_graph[i, 2] = avg_neighbor_degree[node]
    X_test_graph[i, 3] = onion_number[node]
    X_test_graph[i, 4] = degree_centrality[node]
    X_test_graph[i, 5] = clustering[node]
    X_test_graph[i, 6] = row['authorID']
    k_neigbors = df_author_paper_slim[df_author_paper_slim['authorID'].isin(list(G.neighbors(node)))]
    X_test_graph[i, 7] = k_neigbors['paper_per_author'].mean()
    # X_test_graph[i, 8] = k_neigbors['paper_per_author'].std()
    # X_test_graph[i, 9] = k_neigbors['paper_per_author'].median()
    # X_test_graph[i, 10] = k_neigbors['paper_per_author'].quantile(0.25)
    # X_test_graph[i, 11] = k_neigbors['paper_per_author'].quantile(0.75)

In [None]:
print(X_train_graph.shape)
print(y_train_graph.shape)
print(X_test_graph.shape)

In [None]:
## Merge text features and graph
# columns=["degree", "core_number", "avg_neighbors", "onion_number", "degree_centrality", "clustering", "authorID",'K_mean','K_std','K_median','K_q1','K_q3']
columns=["degree", "core_number", "avg_neighbors", "onion_number", "degree_centrality", "clustering", "authorID",'K_mean']
X_train_graph_df = pd.DataFrame(
    X_train_graph, 
    columns=columns
)
X_test_graph_df = pd.DataFrame(
    X_test_graph,
    columns=columns
)

In [None]:
X_train = X_train_graph_df.merge(df_train, on="authorID")
X_test = X_test_graph_df.merge(df_test, on="authorID")

In [None]:
X_train = X_train.merge(author_num_papers_train, on="authorID")
X_test = X_test.merge(author_num_papers_test, on="authorID")

In [None]:
X_test

In [None]:
# Removing h_index and author_id
y_train = X_train["h_index"]
X_train.drop(columns=["authorID", "h_index"], inplace=True)
print(X_train.head())

X_test.drop(columns=["authorID", "h_index_pred"], inplace=True)
print(X_test.head())

# A la francaise 

In [None]:
# scale for 0 mean and 1 std per col
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

In [None]:
# train a regression model and make predictions
model = xgb.XGBRegressor()
model.fit(X_train_final, y_train_final)

In [None]:
y_pred = model.predict(X_test_final)
print(f"Loss: {mean_absolute_error(y_test_final, y_pred)}")

In [None]:
# Save to file
y_pred_save = model.predict(X_test_scaled)
df_test['h_index_pred'].update(pd.Series(np.round_(y_pred_save, decimals=3)))
df_test.loc[:, ["authorID", "h_index_pred"]].to_csv(
    'predictions.csv', index=False
)

In [None]:
df_test.loc[:, ["authorID", "h_index_pred"]]

# finetuning 

---



In [None]:
## Hyper parameter search
train_dmatrix = xgb.DMatrix(data=X_train_final, label=y_train_final)

time: 39.3 ms (started: 2021-02-07 11:03:47 +00:00)


## Early stopping for num of boosting rounds

In [None]:
# Creata the parameter dictionary for each tree: params
params = {
    "objective":"reg:squarederror", 
    "max_depth": 5,
    "n_estimators": 200
}

# Perform cross-validation with early-stopping: cv_results
cv_results = xgb.cv(
    dtrain=train_dmatrix, 
    nfold=4, 
    params=params, 
    metrics="mae", 
    early_stopping_rounds=10, 
    num_boost_round=100,
    seed=123
)

cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,6.512381,0.03353,6.530529,0.122788
1,5.230396,0.030879,5.292411,0.131111
2,4.681756,0.02785,4.796288,0.139419
3,4.440768,0.027814,4.615366,0.142417
4,4.314823,0.029789,4.557977,0.140311
5,4.239606,0.027406,4.545356,0.127317
6,4.182877,0.026721,4.539812,0.114327
7,4.144245,0.02454,4.537821,0.108271
8,4.10438,0.019467,4.541241,0.105299
9,4.059746,0.018686,4.538721,0.10368


time: 55.8 s (started: 2021-02-07 11:07:13 +00:00)


## Tunig eta (learning rate)

In [None]:
# Create list of eta values and empty list to store final round rmse per xgboost model
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []

# Systematicallyvary the eta
for curr_val in eta_vals:
    params['eta'] = curr_val
    
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain=train_dmatrix, params=params, nfold=4,
                        early_stopping_rounds=5, num_boost_round=10, metrics='mae', seed=123, 
                       as_pandas=True)
    
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results['test-mae-mean'].tail().values[-1])
    
# Print the result DataFrame
print(pd.DataFrame(list(zip(eta_vals, best_rmse)), columns=['eta', 'best_mae']))

     eta  best_mae
0  0.001  8.974660
1  0.010  8.204939
2  0.100  4.830492
time: 36.8 s (started: 2021-02-06 20:14:08 +00:00)


## Tunig tree depth

In [None]:
# Create the parameter dictionary
params = {"objective":"reg:squarederror"}

# Create list of max_depth values
max_depths = [2, 5, 10, 20, 50, 100, 500, 1000]
best_rmse = []

for curr_val in max_depths:
    params['max_depth'] = curr_val
    
    # Perform cross-validation
    cv_results = xgb.cv(dtrain=train_dmatrix, params=params, nfold=4, 
                       early_stopping_rounds=10, num_boost_round=50, metrics='mae', seed=123,
                        as_pandas=True)
    
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results['test-mae-mean'].tail().values[-1])
    
# Print the result DataFrame
print(pd.DataFrame(list(zip(max_depths, best_rmse)), columns=['max_depth', 'best_mae']))

   max_depth  best_mae
0          2  5.020174
1          5  4.499397
2         10  4.505210
3         20  4.602783
4         50  4.619110
5        100  4.619110
6        500  4.619110
7       1000  4.619110
time: 12min 55s (started: 2021-02-06 20:14:45 +00:00)


## Tuning colsample_bytree

In [None]:
# Create the parameter dictionary
params={"objective":"reg:squarederror", "max_depth":3}

# Create list of hyperparameter values: colsample_bytree_vals
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []

# Systematically vary the hyperparameter value 
for curr_val in colsample_bytree_vals:
    params['colsample_bytree'] = curr_val
    
    # Perform cross-validation
    cv_results = xgb.cv(dtrain=train_dmatrix, params=params, nfold=4,
                 num_boost_round=10, early_stopping_rounds=5,
                 metrics="mae", as_pandas=True, seed=123)
    
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-mae-mean"].tail().values[-1])

# Print the resultant DataFrame
print(pd.DataFrame(list(zip(colsample_bytree_vals, best_rmse)), 
                   columns=["colsample_bytree","best_mae"]))

   colsample_bytree  best_mae
0               0.1  5.618477
1               0.5  4.950673
2               0.8  4.782896
3               1.0  4.796019
time: 29.4 s (started: 2021-02-06 20:27:41 +00:00)


In [None]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'colsample_bytree': [0.8],
    'n_estimators': [200],
    'max_depth': [5],
    'eta': [0.1, 0.2, 0.5],
    'early_stopping_rounds': [10], 
    'num_boost_round': [10]
}

# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()

# Perform grid search: grid_mse
grid_mse = GridSearchCV(param_grid=gbm_param_grid, estimator=gbm, 
                        scoring='neg_mean_absolute_error', cv=2, verbose=1, n_jobs=-1)

# Fit grid_mse to the data
grid_mse.fit(X_train_scaled, y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest MAE found: ", np.abs(grid_mse.best_score_))

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  4.4min finished


Best parameters found:  {'colsample_bytree': 0.8, 'early_stopping_rounds': 10, 'eta': 0.1, 'max_depth': 5, 'n_estimators': 200, 'num_boost_round': 10}
Lowest MAE found:  4.370246171951294
time: 6min 9s (started: 2021-02-06 20:41:17 +00:00)


Best parameters found:  {'colsample_bytree': 0.8, 'early_stopping_rounds': 10, 'eta': 0.1, 'max_depth': 5, 'n_estimators': 100, 'num_boost_round': 10}

4.37

## Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Create the parameter grid: here it corresponds to a range of values it can take from (distribution sort of)
gbm_param_grid = {
    'n_estimators': range(1, 1000),
    'max_depth': range(2, 12),
    'colsample_bytree': [0.1*i for i in range(1, 10)],
    'eta': [0.001*i for i in range(1, 10)] + [0.01*i for i in range(1, 10)] + [0.1*i for i in range(1, 10)],
}

# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()

# Perform random search: randomized_mse
randomized_mae = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=gbm, 
                                    scoring='neg_mean_absolute_error', n_iter=100, cv=2, 
                                   verbose=1, n_jobs=-1)

# Fit randomized_mse to the data
randomized_mae.fit(X_train, y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mae.best_params_)
print("Lowest MAE found: ", np.abs(randomized_mae.best_score_))

In [None]:
# Save to file
model = xgb.XGBRegressor(
    n_estimators= 12, 
    max_depth= 7, 
    eta= 0.06, 
    colsample_bytree=0.8,
)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
df_test['h_index_pred'].update(pd.Series(np.round_(y_pred, decimals=3)))
df_test.loc[:, ["authorID", "h_index_pred"]].to_csv(
    'predictions.csv', index=False
)