In [None]:
!pip install gdown
!pip install ipython-autotime
!pip install networkx



In [None]:
# Load the ipython-autotime extension: every later cell reports a "time: ..." line.
%load_ext autotime

time: 188 µs (started: 2021-02-27 10:36:18 +00:00)


In [None]:
# Download the training table from Google Drive (saved as train.csv per the gdown output).
!gdown "https://drive.google.com/uc?id=1Qbi954Bwx-PplM8F_7TrB_blcqcB-bF2"

Downloading...
From: https://drive.google.com/uc?id=1Qbi954Bwx-PplM8F_7TrB_blcqcB-bF2
To: /content/train.csv
  0% 0.00/305k [00:00<?, ?B/s]100% 305k/305k [00:00<00:00, 83.6MB/s]
time: 2.53 s (started: 2021-02-27 10:36:18 +00:00)


In [None]:
# Download the test table from Google Drive (saved as test.csv per the gdown output).
!gdown "https://drive.google.com/uc?id=1hWycEy8rQ8e_krGyUhqGQiMBxZtD9SWy"

Downloading...
From: https://drive.google.com/uc?id=1hWycEy8rQ8e_krGyUhqGQiMBxZtD9SWy
To: /content/test.csv
0.00B [00:00, ?B/s]2.47MB [00:00, 79.1MB/s]
time: 920 ms (started: 2021-02-27 10:36:21 +00:00)


In [None]:
# Download the weighted collaboration graph from Google Drive
# (saved as collaboration_network_weighted.edgelist per the gdown output).
!gdown "https://drive.google.com/uc?id=1IATZNB9SCDWkPhU1okCKwf_NMFSC_LSn"

Downloading...
From: https://drive.google.com/uc?id=1IATZNB9SCDWkPhU1okCKwf_NMFSC_LSn
To: /content/collaboration_network_weighted.edgelist
113MB [00:00, 147MB/s]
time: 2.04 s (started: 2021-02-27 10:36:22 +00:00)


In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from pathlib import Path

# Training table: author ids with their h-index target.
df_train = pd.read_csv(
    'train.csv',
    dtype={'authorID': np.int64, 'h_index': np.float32},
)
n_train = len(df_train)

# Test table: the authors whose h-index has to be predicted.
df_test = pd.read_csv('test.csv', dtype={'authorID': np.int64})
n_test = len(df_test)

# Collaboration graph; node labels are parsed as ints so they match `authorID`.
G = nx.read_edgelist(
    'collaboration_network_weighted.edgelist', delimiter=' ', nodetype=int
)
n_nodes, n_edges = G.number_of_nodes(), G.number_of_edges()
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

Number of nodes: 231239
Number of edges: 2982240
time: 50.4 s (started: 2021-02-27 10:36:24 +00:00)


In [None]:
# Collect every edge whose weight is exactly 1; the next cell removes them from G.
removed_edges = [
    (src, dst)
    for src, dst, attrs in G.edges(data=True)
    if attrs["weight"] == 1.0
]

time: 3.32 s (started: 2021-02-27 10:37:14 +00:00)


In [None]:
# Drop the weight-1 edges collected in the previous cell (mutates G in place),
# then display how many edges remain.
G.remove_edges_from(removed_edges)
G.number_of_edges()

1284664

time: 2.67 s (started: 2021-02-27 10:37:18 +00:00)


In [None]:
import random
from random import sample

# Seed the global RNG once so that this and every later `sample(...)` call picks
# the same nodes on each Restart & Run All; otherwise the benchmark subgraphs
# (and therefore the timings and selected methods) change from run to run.
random.seed(42)

# Small induced subgraph used for the first, cheap benchmark pass.
N = 100
random_nodes = G.subgraph(sample(list(G.nodes()), N))

time: 18.4 ms (started: 2021-02-27 10:37:20 +00:00)


In [None]:
import inspect

# Candidate feature extractors: every plain Python function exposed on
# `networkx.algorithms`, in the alphabetical order `dir()` would give.
# `inspect.isfunction` replaces the string comparison on the type name and
# `getmembers` avoids calling `getattr` twice per attribute.
methods = [
    candidate
    for _name, candidate in inspect.getmembers(nx.algorithms, inspect.isfunction)
]
len(methods)

390

time: 7.97 ms (started: 2021-02-27 10:37:20 +00:00)


In [None]:
def _produce_name(method_address):
    """Return a short printable name for a callable.

    Plain Python functions yield the same text as before (the name shown in
    ``<function NAME at 0x...>``). Reading ``__qualname__``/``__name__``
    directly also gives the right answer for builtins and other callables
    whose ``str()`` does not follow that layout.

    Parameters
    ----------
    method_address : object
        Usually a function taken from ``networkx.algorithms``.

    Returns
    -------
    str
        The qualified name if available, otherwise the second
        space-separated token of ``str(method_address)`` (or the whole
        string when it contains no space).
    """
    for attr in ("__qualname__", "__name__"):
        name = getattr(method_address, attr, None)
        if isinstance(name, str) and name:
            return name
    # Fallback: keep the original repr-parsing behavior, but never raise
    # IndexError when the text has no space in it.
    parts = str(method_address).split(" ")
    return parts[1] if len(parts) > 1 else parts[0]

# Maps each accepted method (the function object) to its run time in seconds.
benchmark = {}

time: 2.64 ms (started: 2021-02-27 10:37:20 +00:00)


In [None]:
from tqdm.notebook import tqdm
from time import time
import numbers

# Names of methods to time even when they raise on the sample graph.
# Empty here, so failing methods are simply skipped.
allowed_methods = []

# First benchmark pass: call every candidate with the sample subgraph as its
# only argument and keep those that return a plain dict of node -> number.
for method in tqdm(methods):
    method_repr = _produce_name(method)
    print(method_repr, end="")
    start = time()
    try:
        res = method(random_nodes)
        # Stop the clock right after the call so the checks below are not timed.
        elapsed = time() - start
        is_node_scores = (
            type(res) is dict
            and len(res) > 0  # an empty result has no first key to inspect
            and next(iter(res)) in random_nodes.nodes()
            and isinstance(next(iter(res.values())), numbers.Number)
        )
        if is_node_scores:
            benchmark[method] = elapsed
            print(": Added")
        else:
            print(": Not added")
    except Exception as e:
        # Most candidates need extra arguments or a different graph type;
        # report why they were skipped and carry on with the next one.
        print(": Not added")
        print(str(e) + "\n")
        if method_repr in allowed_methods:
            benchmark[method] = time() - start

HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))

adamic_adar_index: Not added
all_node_cuts: Not added
all_pairs_bellman_ford_path: Not added
all_pairs_bellman_ford_path_length: Not added
all_pairs_dijkstra: Not added
all_pairs_dijkstra_path: Not added
all_pairs_dijkstra_path_length: Not added
all_pairs_lowest_common_ancestor: Not added
not implemented for undirected type

all_pairs_node_connectivity: Not added
all_pairs_shortest_path: Not added
all_pairs_shortest_path_length: Not added
all_shortest_paths: Not added
all_shortest_paths() missing 2 required positional arguments: 'source' and 'target'

all_simple_edge_paths: Not added
all_simple_edge_paths() missing 2 required positional arguments: 'source' and 'target'

all_simple_paths: Not added
all_simple_paths() missing 2 required positional arguments: 'source' and 'target'

all_topological_sorts: Not added
not implemented for undirected type

all_triads: Not added
not implemented for undirected type

all_triplets: Not added
not implemented for undirected type

ancestors: Not added

  B = (expA - scipy.linalg.expm(A)) / expA


: Not added
complement: Not added
complete_bipartite_graph: Not added
complete_bipartite_graph() missing 1 required positional argument: 'n2'

complete_to_chordal_graph: Not added
compose: Not added
compose() missing 1 required positional argument: 'H'

compose_all: Not added
'int' object has no attribute 'is_multigraph'

condensation: Not added
not implemented for undirected type

conductance: Not added
conductance() missing 1 required positional argument: 'S'

connected_components: Not added
connected_double_edge_swap: Not added
Graph not connected

constraint: Added
contracted_edge: Not added
contracted_edge() missing 1 required positional argument: 'edge'

contracted_nodes: Not added
contracted_nodes() missing 2 required positional arguments: 'u' and 'v'

core_number: Added
cost_of_flow: Not added
cost_of_flow() missing 1 required positional argument: 'flowDict'

could_be_isomorphic: Not added
could_be_isomorphic() missing 1 required positional argument: 'G2'

current_flow_betweenn

  x = x / x.max()


: Not added
is_attracting_component: Not added
not implemented for undirected type

is_biconnected: Not added
is_bipartite: Not added
is_branching: Not added
not implemented for undirected type

is_chordal: Not added
is_connected: Not added
is_digraphical: Not added
is_digraphical() missing 1 required positional argument: 'out_sequence'

is_directed_acyclic_graph: Not added
is_distance_regular: Not added
is_dominating_set: Not added
is_dominating_set() missing 1 required positional argument: 'nbunch'

is_edge_cover: Not added
is_edge_cover() missing 1 required positional argument: 'cover'

is_eulerian: Not added
is_forest: Not added
is_graphical: Not added
is_isolate: Not added
is_isolate() missing 1 required positional argument: 'n'

is_isomorphic: Not added
is_isomorphic() missing 1 required positional argument: 'G2'

is_k_edge_connected: Not added
is_k_edge_connected() missing 1 required positional argument: 'k'

is_k_regular: Not added
is_k_regular() missing 1 required positional a

  adjacency_matrix /= adjacency_matrix.sum(axis=0)


wiener_index: Not added
within_inter_cluster: Not added

time: 11.4 s (started: 2021-02-27 10:37:20 +00:00)


In [None]:
# Number of methods that were accepted by the first benchmark pass.
len(benchmark)

29

time: 4.14 ms (started: 2021-02-27 10:37:32 +00:00)


In [None]:
# (method, seconds) pairs from the first pass, fastest first.
quick_methods = sorted(benchmark.items(), key=lambda pair: pair[1])
quick_methods

[(<function networkx.algorithms.centrality.degree_alg.degree_centrality>,
  0.0018117427825927734),
 (<function networkx.algorithms.structuralholes.constraint>,
  0.0019478797912597656),
 (<function networkx.algorithms.structuralholes.effective_size>,
  0.0021333694458007812),
 (<function networkx.algorithms.cluster.clustering>, 0.002807140350341797),
 (<function networkx.algorithms.centrality.eigenvector.eigenvector_centrality>,
  0.0032052993774414062),
 (<function networkx.algorithms.core.core_number>, 0.0034215450286865234),
 (<function networkx.algorithms.link_analysis.pagerank_alg.pagerank>,
  0.003622770309448242),
 (<function networkx.algorithms.cluster.triangles>, 0.003744363784790039),
 (<function networkx.algorithms.coloring.greedy_coloring.greedy_color>,
  0.0039539337158203125),
 (<function networkx.algorithms.centrality.load.newman_betweenness_centrality>,
  0.00399327278137207),
 (<function networkx.algorithms.centrality.katz.katz_centrality>,
  0.003996610641479492),
 (

time: 8.13 ms (started: 2021-02-27 10:37:32 +00:00)


In [None]:
# [('effective_size', 0.00012493133544921875),
#  ('degree_centrality', 0.00014328956604003906),
#  ('constraint', 0.00015497207641601562),
#  ('triangles', 0.00016260147094726562),
#  ('number_of_cliques', 0.00018095970153808594),
#  ('greedy_color', 0.0001876354217529297),
#  ('newman_betweenness_centrality', 0.00018858909606933594),
#  ('core_number', 0.0002040863037109375),
#  ('harmonic_centrality', 0.0002067089080810547),
#  ('clustering', 0.0002186298370361328),
#  ('square_clustering', 0.00023293495178222656),
#  ('eigenvector_centrality', 0.0002429485321044922),
#  ('katz_centrality', 0.00025582313537597656),
#  ('betweenness_centrality_source', 0.000308990478515625),
#  ('closeness_centrality', 0.00031447410583496094),
#  ('percolation_centrality', 0.00032329559326171875),
#  ('average_neighbor_degree', 0.0003249645233154297),
#  ('subgraph_centrality', 0.00032520294189453125),
#  ('betweenness_centrality', 0.0003304481506347656),
#  ('onion_layers', 0.0003974437713623047),
#  ('node_clique_number', 0.0004062652587890625),
#  ('pagerank', 0.0005104541778564453),
#  ('pagerank_numpy', 0.0007638931274414062),
#  ('closeness_vitality', 0.0008549690246582031),
#  ('subgraph_centrality_exp', 0.0009100437164306641),
#  ('katz_centrality_numpy', 0.0010249614715576172),
#  ('pagerank_scipy', 0.001619100570678711),
#  ('eigenvector_centrality_numpy', 0.0018410682678222656),
#  ('communicability_betweenness_centrality', 0.0054891109466552734)]

time: 5.99 ms (started: 2021-02-27 10:37:32 +00:00)


In [None]:
# Ten times more nodes than the first pass, to see how each method scales.
N = 1000
random_nodes = G.subgraph(sample(list(G), N))

time: 64.8 ms (started: 2021-02-27 10:37:32 +00:00)


In [None]:
# Run times (seconds) of the second benchmark pass, keyed by method; filled by the loop below.
benchmark_fast = {}

time: 2.45 ms (started: 2021-02-27 10:37:32 +00:00)


In [None]:
# Methods excluded from the second pass, by name.
filtered_methods = ["percolation_centrality", "communicability_betweenness_centrality"]
methods_fast = [x[0] for x in quick_methods if _produce_name(x[0]) not in filtered_methods]

# Second benchmark pass on the larger sample; same acceptance rule as the first
# pass (result must be a plain dict of node -> number).
for method in tqdm(methods_fast):
    method_repr = _produce_name(method)
    print(method_repr, end="")
    start = time()
    try:
        res = method(random_nodes)
        # Stop the clock right after the call so the checks below are not timed.
        elapsed = time() - start
        is_node_scores = (
            type(res) is dict
            and len(res) > 0  # an empty result has no first key to inspect
            and next(iter(res)) in random_nodes.nodes()
            and isinstance(next(iter(res.values())), numbers.Number)
        )
        if is_node_scores:
            benchmark_fast[method] = elapsed
            print(": Added")
        else:
            print(": Not added")
    except Exception as e:
        # Report the failure and continue with the next method.
        print(": Not added")
        print(str(e) + "\n")

HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))

degree_centrality: Added
constraint: Added
effective_size: Added
clustering: Added
eigenvector_centrality: Added
core_number: Added
pagerank: Added
triangles: Added
greedy_color: Added
newman_betweenness_centrality: Added
katz_centrality: Added
node_clique_number: Added
onion_layers: Added
average_neighbor_degree: Added
katz_centrality_numpy: Added
square_clustering: Added
closeness_centrality: Added
betweenness_centrality: Added
pagerank_scipy: Added
betweenness_centrality_source: Added
harmonic_centrality: Added
number_of_cliques: Added
subgraph_centrality_exp: Added
subgraph_centrality: Added
eigenvector_centrality_numpy: Added
pagerank_numpy: Added
closeness_vitality: Added

time: 12.8 s (started: 2021-02-27 10:37:32 +00:00)


In [None]:
# Second-pass timings, fastest first, plus name-keyed copies of both rankings for display.
fast_methods = sorted(benchmark_fast.items(), key=lambda pair: pair[1])
str_fast_methods = [(_produce_name(fn), secs) for fn, secs in fast_methods]
str_quick_methods = [(_produce_name(fn), secs) for fn, secs in quick_methods]
str_fast_methods

[('degree_centrality', 0.026424169540405273),
 ('greedy_color', 0.03774547576904297),
 ('clustering', 0.044959068298339844),
 ('pagerank_scipy', 0.04802584648132324),
 ('triangles', 0.04928445816040039),
 ('core_number', 0.04981493949890137),
 ('average_neighbor_degree', 0.05359339714050293),
 ('newman_betweenness_centrality', 0.05432844161987305),
 ('onion_layers', 0.06090807914733887),
 ('eigenvector_centrality_numpy', 0.06296896934509277),
 ('node_clique_number', 0.0753183364868164),
 ('pagerank', 0.0848534107208252),
 ('number_of_cliques', 0.09130644798278809),
 ('katz_centrality_numpy', 0.09653091430664062),
 ('eigenvector_centrality', 0.15868782997131348),
 ('square_clustering', 0.18868017196655273),
 ('katz_centrality', 0.2028517723083496),
 ('constraint', 0.25261902809143066),
 ('subgraph_centrality', 0.2731895446777344),
 ('harmonic_centrality', 0.3767530918121338),
 ('effective_size', 0.6420180797576904),
 ('closeness_centrality', 0.7068426609039307),
 ('subgraph_centrality_e

time: 8.85 ms (started: 2021-02-27 10:37:45 +00:00)


In [None]:
# Compare the two passes. For every method of the second pass compute
#   * the second-pass time divided by 10x the first-pass time
#     (about 1.0 means the run time grew like the 10x larger sample), and
#   * how many places the method moved between the two speed rankings,
# then sort by (ratio, rank shift).
quick_methods_ = [x[0] for x in quick_methods]
quick_rank = {method: rank for rank, method in enumerate(quick_methods_)}
quick_time = dict(quick_methods)

index_changed = sorted(
    (
        (method, tau / (10 * quick_time[method]), abs(i - quick_rank[method]))
        for i, (method, tau) in enumerate(fast_methods)
        if method in quick_rank
    ),
    key=lambda entry: entry[1:],
)
[(_produce_name(method), ratio, shift) for method, ratio, shift in index_changed]

[('eigenvector_centrality_numpy', 0.4685061997764887, 15),
 ('pagerank_scipy', 0.5745436394751854, 15),
 ('average_neighbor_degree', 0.8109784255718305, 7),
 ('number_of_cliques', 0.8584971642493667, 9),
 ('greedy_color', 0.954630969609262, 7),
 ('onion_layers', 1.1624817983254458, 4),
 ('katz_centrality_numpy', 1.3032478192294075, 1),
 ('triangles', 1.316230499840815, 3),
 ('newman_betweenness_centrality', 1.3604991342766732, 2),
 ('core_number', 1.455919448122082, 0),
 ('degree_centrality', 1.4584945387550994, 0),
 ('clustering', 1.6015967385765246, 1),
 ('node_clique_number', 1.864534025851384, 1),
 ('pagerank', 2.3422244159262915, 5),
 ('subgraph_centrality', 2.4674619923338645, 5),
 ('square_clustering', 2.51344089436575, 0),
 ('harmonic_centrality', 4.218299030992232, 1),
 ('eigenvector_centrality', 4.9507958940791434, 10),
 ('katz_centrality', 5.075595060550021, 6),
 ('pagerank_numpy', 5.226591571453062, 3),
 ('subgraph_centrality_exp', 7.259475916059585, 0),
 ('closeness_centra

time: 16.6 ms (started: 2021-02-27 10:37:45 +00:00)


In [None]:
# Methods excluded by name from the final feature set.
forbidden_methods = [
    "dfs_predecessors",
    "harmonic_centrality",
    "pagerank",
    "pagerank_numpy",
    "katz_centrality",
    "subgraph_centrality",
    "subgraph_centrality_exp",
    "effective_size",
    "closeness_vitality",
    "betweenness_centrality_source",
    "betweenness_centrality",
    "eigenvector_centrality",
    "closeness_centrality",
    "number_of_cliques",
    "constraint",
    "square_clustering",
    "katz_centrality_numpy",
    "node_clique_number",
]
# benchmark square clustering, constraint

# Keep the remaining methods in the order of `index_changed`.
final_methods = []
for method, _ratio, _shift in index_changed:
    if _produce_name(method) in forbidden_methods:
        continue
    final_methods.append(method)
final_methods

[<function networkx.algorithms.centrality.eigenvector.eigenvector_centrality_numpy>,
 <function networkx.algorithms.link_analysis.pagerank_alg.pagerank_scipy>,
 <function networkx.algorithms.assortativity.neighbor_degree.average_neighbor_degree>,
 <function networkx.algorithms.coloring.greedy_coloring.greedy_color>,
 <function networkx.algorithms.core.onion_layers>,
 <function networkx.algorithms.cluster.triangles>,
 <function networkx.algorithms.centrality.load.newman_betweenness_centrality>,
 <function networkx.algorithms.core.core_number>,
 <function networkx.algorithms.centrality.degree_alg.degree_centrality>,
 <function networkx.algorithms.cluster.clustering>,
 <function networkx.algorithms.clique.node_clique_number>]

time: 13.9 ms (started: 2021-02-27 10:37:45 +00:00)


In [None]:
# Run every selected method on a fresh M-node sample and keep the results,
# printing how long each one took.
M = 2000
random_nodes = G.subgraph(sample(list(G.nodes()), M))

graph_features = []
for feature_fn in tqdm(final_methods):
    started = time()
    print(f"Method {_produce_name(feature_fn)}", end="")
    features = feature_fn(random_nodes)
    graph_features.append(features)
    elapsed_min = (time() - started) / 60
    print(f": Took {elapsed_min:.3f} minutes.")

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

Method eigenvector_centrality_numpy: Took 0.001 minutes.
Method pagerank_scipy: Took 0.002 minutes.
Method average_neighbor_degree: Took 0.003 minutes.
Method greedy_color: Took 0.001 minutes.
Method onion_layers: Took 0.002 minutes.
Method triangles: Took 0.002 minutes.
Method newman_betweenness_centrality: Took 0.006 minutes.
Method core_number: Took 0.002 minutes.
Method degree_centrality: Took 0.001 minutes.
Method clustering: Took 0.002 minutes.
Method node_clique_number: Took 0.006 minutes.

time: 1.84 s (started: 2021-02-27 10:50:05 +00:00)


Method eigenvector_centrality_numpy: Took 0.032 minutes.
Method pagerank_scipy: Took 0.022 minutes.
Method average_neighbor_degree: Took 0.180 minutes.
Method greedy_color: Took 0.013 minutes.
Method onion_layers: Took 0.027 minutes.
Method triangles: Took 0.185 minutes.
Method newman_betweenness_centrality: Took 1.664 minutes.
Method core_number: Took 0.014 minutes.
Method degree_centrality: Took 0.006 minutes.
Method clustering: Took 0.182 minutes.
Method node_clique_number: Took 2.123 minutes

In [None]:
# Method pagerank_scipy: Took 0.017 minutes. # x10
# Method square_clustering: Took 5.440 minutes. # almost x1000
# Method eigenvector_centrality_numpy: Took 0.017 minutes. # x10
# Method greedy_color: Took 0.011 minutes. # x10
# Method clustering: Took 0.074 minutes. # x30
# Method node_clique_number: Took 0.947 minutes. # almost x100
# Method number_of_cliques: Took 0.839 minutes. # almost x100
# Method harmonic_centrality: Took 5.198 minutes.# almost x300

In [None]:
import numpy as np

# `graph_features` is a Python list holding one result per selected method
# (dicts, per the benchmark filter), so NumPy has to store it as a pickled
# object array; reading it back needs `np.load(..., allow_pickle=True)`.
np.save("graph_features.npy", graph_features)

In [None]:
# Compute structural features on the whole graph G (after the weight-1 edges
# were removed). Each result is indexed by node id in the cells below.
core_number = nx.core_number(G)
onion_number = nx.onion_layers(G)
avg_neighbor_degree = nx.average_neighbor_degree(G)
degree_centrality = nx.degree_centrality(G)
clustering = nx.clustering(G)
print("Features computed")

Features computed
time: 3min 55s (started: 2021-02-08 10:17:17 +00:00)


In [None]:
# Build the training matrix: one row per training author, 7 columns:
#   0 degree, 1 core number, 2 average neighbor degree, 3 onion layer,
#   4 degree centrality, 5 clustering coefficient, 6 authorID (merge key).
# Reading the ids as a list of Python ints avoids `iterrows`, which is slow
# and upcasts every row (authorID included) to float.
train_nodes = df_train['authorID'].tolist()
X_train_graph = np.column_stack([
    [G.degree(node) for node in train_nodes],
    [core_number[node] for node in train_nodes],
    [avg_neighbor_degree[node] for node in train_nodes],
    [onion_number[node] for node in train_nodes],
    [degree_centrality[node] for node in train_nodes],
    [clustering[node] for node in train_nodes],
    train_nodes,
]).astype(np.float64)
y_train_graph = df_train['h_index'].to_numpy(dtype=np.float64)

assert X_train_graph.shape == (n_train, 7)
assert y_train_graph.shape == (n_train,)

time: 1.96 s (started: 2021-02-08 10:21:13 +00:00)


In [None]:
# Build the test matrix with the same 7 columns as the training matrix:
#   0 degree, 1 core number, 2 average neighbor degree, 3 onion layer,
#   4 degree centrality, 5 clustering coefficient, 6 authorID (merge key).
# Reading the ids as a list of Python ints avoids `iterrows`, which is slow
# and upcasts every row (authorID included) to float.
test_nodes = df_test['authorID'].tolist()
X_test_graph = np.column_stack([
    [G.degree(node) for node in test_nodes],
    [core_number[node] for node in test_nodes],
    [avg_neighbor_degree[node] for node in test_nodes],
    [onion_number[node] for node in test_nodes],
    [degree_centrality[node] for node in test_nodes],
    [clustering[node] for node in test_nodes],
    test_nodes,
]).astype(np.float64)

assert X_test_graph.shape == (n_test, 7)

time: 17 s (started: 2021-02-08 10:21:15 +00:00)


In [None]:
# Sanity check: shapes of the train features, train targets and test features.
for arr in (X_train_graph, y_train_graph, X_test_graph):
    print(arr.shape)

(23124, 7)
(23124,)
(208115, 7)
time: 3.04 ms (started: 2021-02-08 10:21:32 +00:00)


In [None]:
## Merge text features and graph
# Column names for the 7 graph features, in the order the matrices were filled.
columns = [
    "degree",
    "core_number",
    "avg_neighbors",
    "onion_number",
    "degree_centrality",
    "clustering",
    "authorID",
]

# Wrap both feature matrices in DataFrames so they can be merged on authorID.
X_train_graph_df = pd.DataFrame(X_train_graph, columns=columns)
X_test_graph_df = pd.DataFrame(X_test_graph, columns=columns)

time: 9.31 ms (started: 2021-02-08 10:21:32 +00:00)


In [None]:
# Join the graph features with the remaining columns of the train/test tables
# (inner join on authorID).
X_train = pd.merge(X_train_graph_df, df_train, on="authorID")
X_test = pd.merge(X_test_graph_df, df_test, on="authorID")

time: 842 ms (started: 2021-02-08 10:21:32 +00:00)


In [None]:
# NOTE(review): `author_num_papers_train` / `author_num_papers_test` are not
# defined in the part of the notebook reviewed here, so this cell fails on a
# fresh kernel unless an earlier cell creates them — TODO confirm where they
# come from (presumably they carry the `paper_per_author` column).
X_train = X_train.merge(author_num_papers_train, on="authorID")
X_test = X_test.merge(author_num_papers_test, on="authorID")

time: 521 ms (started: 2021-02-08 10:21:33 +00:00)


In [None]:
# Display the merged test feature table.
X_test

Unnamed: 0,degree,core_number,avg_neighbors,onion_number,degree_centrality,clustering,authorID,h_index_pred,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,...,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,paper_per_author
0,16.0,5.0,6.375000,32.0,0.000069,0.166667,1.036332e+06,,-0.155256,0.260346,0.225399,-0.146141,-0.121848,-0.695546,-0.325143,-0.045406,-0.354348,0.259526,0.325150,0.182431,0.310491,0.853520,-0.285898,-0.610025,0.403819,-0.104862,-0.163646,0.216603,-0.176129,0.132749,-0.192938,-0.184535,-0.413796,-0.322188,0.368108,0.138464,0.091466,0.159419,0.027703,1.044323,...,0.795523,0.113700,-0.179638,0.659689,0.257327,-0.153000,0.409383,-0.152094,0.488123,-1.055033,-0.259180,0.070267,-0.021628,0.131026,-0.094192,-0.092993,0.959038,0.729267,-0.429529,-0.210977,-0.623851,-0.017816,0.032449,-0.233710,-0.747849,0.348640,-0.815551,-0.423639,0.734288,-0.025860,1.528527,0.612259,0.219557,0.235557,-0.369207,0.457921,-0.640803,-0.851161,-0.584914,10
1,2.0,2.0,40.000000,6.0,0.000009,1.000000,1.101850e+06,,-1.042691,0.059498,0.525740,0.494251,-0.762287,-1.254231,-0.517212,0.687099,-0.449063,-0.262348,-0.141930,-0.039613,0.416994,0.384269,-0.469035,0.630117,0.691178,-0.250343,0.783037,0.121113,-0.392788,0.345869,0.049573,-0.499544,0.328888,-0.293167,1.105296,-0.036969,0.830073,-0.063945,0.016628,0.216300,...,-0.140437,-0.123532,-0.021170,0.210708,-0.381251,-1.051324,-0.658570,0.316129,0.121643,-0.878486,-0.935552,0.201553,-1.082995,-0.479846,-0.164699,-1.181147,0.222082,-0.648029,-0.353730,0.741503,-0.207803,0.937993,-0.188198,-0.488327,0.070131,0.316409,-0.509987,0.193508,-1.030599,0.951501,0.534993,0.422611,-0.763438,0.028723,0.199459,-1.386970,0.096496,0.029991,-0.762944,10
2,107.0,13.0,19.906542,133.0,0.000463,0.094340,1.336878e+06,,-0.689841,0.183287,0.197919,0.786861,-0.121455,-0.288074,0.228335,0.735772,-0.479101,-0.099449,0.370037,0.483667,0.852816,0.948093,0.286872,0.527104,0.091453,1.168023,1.587776,0.155022,0.019015,0.405455,0.718611,0.343587,-0.734515,-0.557624,0.911792,1.751038,0.368258,-0.865857,0.979897,-0.004977,...,0.154542,-0.051720,-0.659493,0.787488,-0.690468,0.251092,0.288879,0.176045,0.173905,-1.187423,-0.211064,-0.275924,-0.560891,-0.062985,0.624421,-0.731269,0.484625,-0.545446,-0.558864,0.531976,-0.186134,0.535591,-0.809059,1.010357,0.916632,0.126666,-0.500891,0.680915,-1.154584,0.314562,0.136433,0.197267,-0.412624,-0.702108,0.089873,-0.828727,-0.607820,1.061965,-0.418734,10
3,3.0,3.0,10.666667,12.0,0.000013,1.000000,1.515524e+06,,-0.157730,0.515563,1.100034,0.223041,0.458837,-0.204708,-0.234124,0.699294,0.029797,0.216013,0.089011,-0.230966,0.215486,0.421006,-0.312003,0.795908,0.870546,0.127354,0.752644,-0.427536,0.244944,0.036968,-0.082542,-0.483761,0.373416,-0.643507,0.712348,0.847397,0.688299,-0.342872,-0.776788,0.167132,...,0.247150,-1.047600,-0.844680,-0.195852,0.602862,0.026254,0.336919,0.409069,0.660320,-0.416484,-0.786045,0.138951,-0.601038,0.198226,-0.160834,-0.480783,-0.386439,0.081284,-0.307784,-0.052124,-0.461364,-0.530466,0.388740,0.716832,-0.112452,-0.184019,0.364115,-0.140035,0.323978,0.657082,-0.056838,0.206461,0.208076,0.422688,1.059371,-0.431169,0.042994,-0.190372,-0.686180,10
4,2.0,2.0,2.500000,6.0,0.000009,1.000000,1.606427e+06,,-2.586875,0.497515,-0.283361,-0.349463,-1.154963,0.656123,-2.278294,-0.205188,-0.687929,-0.021536,-1.486544,1.728148,-1.645367,0.949120,-0.358345,0.561480,0.640793,0.520943,0.098057,1.205942,-1.410678,0.741824,0.712074,0.790805,-0.429487,-0.235012,1.471173,0.449798,-1.023981,-0.683958,-1.629534,0.252126,...,-1.059800,-1.531920,0.090296,-0.612620,-0.821650,-0.096015,0.929706,-1.320925,1.494835,-0.674506,1.692135,-0.384102,-0.744986,-0.488114,-0.388717,-0.466860,-0.375351,2.112656,0.279075,-0.528707,-1.265026,1.828347,0.781639,-0.662664,0.013650,2.515458,-1.307551,0.738019,0.156143,-0.624748,1.255702,-0.631254,-1.496553,-0.496400,-1.349975,-0.252682,1.306690,-0.413302,-0.836433,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208110,2.0,2.0,3.000000,6.0,0.000009,1.000000,2.908387e+09,,-0.149186,0.059576,1.266709,-0.381713,0.889350,0.852032,0.713128,-1.553209,0.996669,1.482064,1.854617,-3.017689,0.755202,1.265741,-0.248379,1.216149,3.175436,-0.179571,-1.421328,0.275496,-2.354647,0.164358,-1.162982,-0.728985,0.248338,-2.743318,-1.584232,0.016289,0.427671,-0.754608,2.323058,0.948590,...,0.870787,-1.212942,-0.502236,0.033678,-0.210403,0.978463,0.964199,1.367468,-0.206939,-0.452424,1.158892,0.043928,-0.052703,-2.091047,0.871733,1.290289,1.658933,-1.846488,0.589930,-1.336555,0.279556,-1.120334,1.233501,0.742955,-2.313985,0.623281,0.207383,0.497897,0.366071,-0.568100,-2.638939,-0.701911,1.631383,2.078212,-1.487375,1.699247,-0.109325,-0.352533,-0.162939,1
208111,4.0,4.0,21.500000,21.0,0.000017,1.000000,2.908426e+09,,-0.081930,-0.698139,0.098003,-2.008395,1.345156,-0.274243,1.113226,1.276448,-0.292127,0.145079,-0.649811,-0.893578,0.271506,1.737947,-3.179465,2.792611,-1.152994,-0.056920,-0.542416,-1.103027,2.005044,0.600051,-0.539209,2.525267,-1.207601,-2.751825,-0.744246,0.317674,1.667089,-0.248875,0.294096,-1.315753,...,1.322071,-0.111963,1.433176,0.725125,2.560617,-0.293633,-0.981468,-0.867888,-0.743421,0.403697,0.095773,2.420750,-1.760718,1.591131,-0.811064,-2.563954,0.847572,0.983452,-0.955982,-0.679260,-0.849609,-0.521846,-3.000236,-0.301128,-1.369122,1.334446,-1.163863,-1.628017,-1.815075,-0.275536,-0.435272,1.404804,0.861267,1.456644,0.113690,-1.471684,-1.062008,0.648366,-3.153230,1
208112,10.0,10.0,33.600000,103.0,0.000043,1.000000,2.908436e+09,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
208113,1.0,1.0,3.000000,1.0,0.000004,0.000000,2.908499e+09,,0.286835,-0.839971,0.611096,0.327960,1.078321,-1.270630,0.221250,0.929837,-0.498350,0.047874,0.703157,0.911930,0.914742,0.782331,0.533920,0.715461,0.042399,-0.090814,0.447618,-0.213802,0.931521,-0.392314,0.731735,-1.230608,-0.722140,-0.004463,0.250383,1.729914,0.368008,1.809704,-1.107211,0.128283,...,0.561672,-1.042341,0.151909,1.307286,-0.210972,-0.474297,-0.464220,-0.980512,1.628541,-0.506402,-0.818498,-1.849933,-1.167783,0.066413,0.095741,-0.186292,-0.143909,0.627508,-0.485306,-0.450537,-1.709520,-0.498035,-0.075538,0.813568,-0.542703,0.626973,-0.296874,-0.180164,-1.648410,-0.302270,0.361147,-0.566138,0.568060,1.360779,1.172025,-1.396534,0.714365,0.201030,-0.125780,8


time: 501 ms (started: 2021-02-08 10:21:34 +00:00)


In [None]:
# Separate the target from the features, then drop the columns that must not
# be fed to the model (the id and the target / prediction placeholder).
y_train = X_train["h_index"]
X_train = X_train.drop(columns=["authorID", "h_index"])
print(X_train.head())

X_test = X_test.drop(columns=["authorID", "h_index_pred"])
print(X_test.head())

   degree  core_number  avg_neighbors  ...       255       256  paper_per_author
0     3.0          3.0      13.333333  ...  1.284111 -0.491038                 4
1     5.0          5.0      23.200000  ...  0.506366 -1.664988                10
2     5.0          5.0       6.000000  ... -0.286176 -0.743096                 1
3     3.0          3.0      40.333333  ... -0.192451  1.583816                 1
4     4.0          2.0       7.250000  ... -1.457717  0.290012                10

[5 rows x 263 columns]
   degree  core_number  avg_neighbors  ...       255       256  paper_per_author
0    16.0          5.0       6.375000  ... -0.851161 -0.584914                10
1     2.0          2.0      40.000000  ...  0.029991 -0.762944                10
2   107.0         13.0      19.906542  ...  1.061965 -0.418734                10
3     3.0          3.0      10.666667  ... -0.190372 -0.686180                10
4     2.0          2.0       2.500000  ... -0.413302 -0.836433                 1

[5 

In [None]:
# Standardize every column (zero mean, unit variance) with statistics from X_train.
# NOTE(review): the scaler is fitted on all training rows before the hold-out
# split below, so the hold-out rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 80/20 split of the labelled data; `X_test_final` / `y_test_final` are the
# hold-out part of the TRAINING set, not the real test set.
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_train_scaled, y_train, random_state=42, test_size=0.2
)

time: 861 ms (started: 2021-02-08 10:21:35 +00:00)


In [None]:
# Train an XGBoost regressor with default hyper-parameters on the 80% split
# (predictions are made in the following cells).
model = xgb.XGBRegressor(n_jobs=-1)
model.fit(X_train_final, y_train_final)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

time: 32.1 s (started: 2021-02-08 10:21:36 +00:00)


In [None]:
# Mean absolute error on the hold-out 20% of the training data.
y_pred = model.predict(X_test_final)
holdout_mae = mean_absolute_error(y_test_final, y_pred)
print(f"Loss: {holdout_mae}")

Loss: 4.655028820037842
time: 45.5 ms (started: 2021-02-08 10:22:08 +00:00)


In [None]:
# Save to file
y_pred_save = model.predict(X_test_scaled)

# NOTE(review): predictions are written back by position, which assumes the
# rows of X_test_scaled are still in df_test order — TODO confirm (authorID
# was dropped from X_test before scaling). Fail loudly if the row counts
# differ instead of silently filling only part of the column.
if len(y_pred_save) != len(df_test):
    raise ValueError(
        f"Got {len(y_pred_save)} predictions for {len(df_test)} test rows; "
        "the feature merges dropped or duplicated authors."
    )

# Round in float64: rounding the float32 predictions first and upcasting
# afterwards turns e.g. 20.137 into 20.136999..., which defeats the rounding.
# `np.round` replaces the alias `np.round_`, which NumPy 2.0 removed, and the
# direct column assignment replaces `df['col'].update(...)`, which needs the
# column to exist already and mutates through a chained selection.
df_test['h_index_pred'] = np.round(
    np.asarray(y_pred_save, dtype=np.float64), decimals=3
)
df_test.loc[:, ["authorID", "h_index_pred"]].to_csv(
    'predictions.csv', index=False
)

time: 2.17 s (started: 2021-02-08 10:22:08 +00:00)


In [None]:
# Display the two columns that were written to predictions.csv.
df_test.loc[:, ["authorID", "h_index_pred"]]

Unnamed: 0,authorID,h_index_pred
0,1036332,20.136999
1,1101850,10.493000
2,1336878,32.626999
3,1515524,8.954000
4,1606427,1.015000
...,...,...
208110,2908387141,4.431000
208111,2908425732,1.765000
208112,2908436250,1.621000
208113,2908499439,5.597000


time: 19.5 ms (started: 2021-02-07 11:03:47 +00:00)


In [None]:
## Hyper parameter search
# Wrap the 80% training split in a DMatrix; it is the `dtrain` input of the `xgb.cv` calls below.
train_dmatrix = xgb.DMatrix(data=X_train_final, label=y_train_final)

time: 39.3 ms (started: 2021-02-07 11:03:47 +00:00)


## Early stopping for num of boosting rounds

In [None]:
# Booster parameters for each tree. The number of boosting rounds is set by
# `num_boost_round` in the `xgb.cv` call; `n_estimators` belongs to the
# scikit-learn wrapper only and is ignored by the native API, so it is not
# listed here.
params = {
    "objective": "reg:squarederror",
    "max_depth": 5,
}

# 4-fold cross-validation on MAE, stopping once the test MAE has not improved
# for 10 consecutive rounds.
cv_results = xgb.cv(
    dtrain=train_dmatrix,
    nfold=4,
    params=params,
    metrics="mae",
    early_stopping_rounds=10,
    num_boost_round=100,
    seed=123
)

cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,6.512381,0.03353,6.530529,0.122788
1,5.230396,0.030879,5.292411,0.131111
2,4.681756,0.02785,4.796288,0.139419
3,4.440768,0.027814,4.615366,0.142417
4,4.314823,0.029789,4.557977,0.140311
5,4.239606,0.027406,4.545356,0.127317
6,4.182877,0.026721,4.539812,0.114327
7,4.144245,0.02454,4.537821,0.108271
8,4.10438,0.019467,4.541241,0.105299
9,4.059746,0.018686,4.538721,0.10368


time: 55.8 s (started: 2021-02-07 11:07:13 +00:00)


## Tuning eta (learning rate)

In [None]:
# Learning rates to compare. `best_rmse` keeps its historical name but stores
# the final-round cross-validated MAE of each run.
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []

# Systematically vary eta; all other entries of `params` come from the
# previous cell and are left untouched.
for curr_val in eta_vals:
    params['eta'] = curr_val

    # 4-fold cross-validation with early stopping
    cv_results = xgb.cv(
        params=params,
        dtrain=train_dmatrix,
        num_boost_round=10,
        nfold=4,
        metrics='mae',
        early_stopping_rounds=5,
        as_pandas=True,
        seed=123,
    )

    # Test MAE of the last reported round
    best_rmse.append(cv_results['test-mae-mean'].iloc[-1])

# Print one row per eta value
print(pd.DataFrame({'eta': eta_vals, 'best_mae': best_rmse}))

     eta  best_mae
0  0.001  8.974660
1  0.010  8.204939
2  0.100  4.830492
time: 36.8 s (started: 2021-02-06 20:14:08 +00:00)


## Tuning tree depth

In [None]:
# Start from a fresh parameter dictionary
params = {"objective": "reg:squarederror"}

# Tree depths to compare. `best_rmse` keeps its historical name but stores
# the final-round cross-validated MAE of each run.
max_depths = [2, 5, 10, 20, 50, 100, 500, 1000]
best_rmse = []

for curr_val in max_depths:
    params['max_depth'] = curr_val

    # 4-fold cross-validation with early stopping
    cv_results = xgb.cv(
        params=params,
        dtrain=train_dmatrix,
        num_boost_round=50,
        nfold=4,
        metrics='mae',
        early_stopping_rounds=10,
        as_pandas=True,
        seed=123,
    )

    # Test MAE of the last reported round
    best_rmse.append(cv_results['test-mae-mean'].iloc[-1])

# Print one row per depth
print(pd.DataFrame({'max_depth': max_depths, 'best_mae': best_rmse}))

   max_depth  best_mae
0          2  5.020174
1          5  4.499397
2         10  4.505210
3         20  4.602783
4         50  4.619110
5        100  4.619110
6        500  4.619110
7       1000  4.619110
time: 12min 55s (started: 2021-02-06 20:14:45 +00:00)


## Tuning colsample_bytree

In [None]:
# Start from a fresh parameter dictionary
params = {"objective": "reg:squarederror", "max_depth": 3}

# Column-subsampling ratios to compare. `best_rmse` keeps its historical name
# but stores the final-round cross-validated MAE of each run.
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []

# Systematically vary the hyperparameter value
for curr_val in colsample_bytree_vals:
    params['colsample_bytree'] = curr_val

    # 4-fold cross-validation with early stopping
    cv_results = xgb.cv(
        params=params,
        dtrain=train_dmatrix,
        num_boost_round=10,
        nfold=4,
        metrics="mae",
        early_stopping_rounds=5,
        as_pandas=True,
        seed=123,
    )

    # Test MAE of the last reported round
    best_rmse.append(cv_results["test-mae-mean"].iloc[-1])

# Print one row per ratio
print(pd.DataFrame({"colsample_bytree": colsample_bytree_vals, "best_mae": best_rmse}))

   colsample_bytree  best_mae
0               0.1  5.618477
1               0.5  4.950673
2               0.8  4.782896
3               1.0  4.796019
time: 29.4 s (started: 2021-02-06 20:27:41 +00:00)


In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for the scikit-learn XGBoost wrapper.
# Only the learning rate varies; the other values are fixed.
#  - `learning_rate` is the wrapper's documented name for the booster's `eta`;
#    passing `eta` only works through the wrapper's unsupported **kwargs.
#  - `num_boost_round` and `early_stopping_rounds` are deliberately NOT in the
#    grid: the wrapper takes the number of rounds from `n_estimators`, and
#    early stopping needs an `eval_set`, which GridSearchCV does not pass to
#    `fit`. As grid entries they are ignored or rejected, depending on the
#    XGBoost version.
gbm_param_grid = {
    'colsample_bytree': [0.8],
    'n_estimators': [200],
    'max_depth': [5],
    'learning_rate': [0.1, 0.2, 0.5],
}

# Estimator to tune
gbm = xgb.XGBRegressor()

# Exhaustive search with 2-fold CV. The scorer is *negative* MAE because
# scikit-learn always maximises the score.
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring='neg_mean_absolute_error',
    cv=2,
    verbose=1,
    n_jobs=-1,
)

# Fit on the scaled training features
grid_mse.fit(X_train_scaled, y_train)

# Report the best candidate; flip the sign to show MAE as a positive number
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest MAE found: ", np.abs(grid_mse.best_score_))

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  4.4min finished


Best parameters found:  {'colsample_bytree': 0.8, 'early_stopping_rounds': 10, 'eta': 0.1, 'max_depth': 5, 'n_estimators': 200, 'num_boost_round': 10}
Lowest MAE found:  4.370246171951294
time: 6min 9s (started: 2021-02-06 20:41:17 +00:00)


Best parameters found:  {'colsample_bytree': 0.8, 'early_stopping_rounds': 10, 'eta': 0.1, 'max_depth': 5, 'n_estimators': 100, 'num_boost_round': 10}

Lowest MAE found: 4.37

## Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter; RandomizedSearchCV samples
# n_iter combinations from them.
#  - `learning_rate` is the wrapper's documented name for the booster's `eta`.
#  - Fractions are built by division (i / 10) rather than 0.1 * i so the
#    candidates are clean decimals (0.3, not 0.30000000000000004).
gbm_param_grid = {
    'n_estimators': range(1, 1000),
    'max_depth': range(2, 12),
    'colsample_bytree': [i / 10 for i in range(1, 10)],
    'learning_rate': (
        [i / 1000 for i in range(1, 10)]
        + [i / 100 for i in range(1, 10)]
        + [i / 10 for i in range(1, 10)]
    ),
}

# Estimator to tune
gbm = xgb.XGBRegressor()

# Random search: 100 sampled candidates, 2-fold CV, scored by negative MAE.
# random_state pins the sampled candidates so a re-run reproduces the result
# (123 matches the seed used for xgb.cv elsewhere in this notebook).
randomized_mae = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=gbm_param_grid,
    scoring='neg_mean_absolute_error',
    n_iter=100,
    cv=2,
    verbose=1,
    n_jobs=-1,
    random_state=123,
)

# Fit on the scaled features, the same matrix the grid search and the final
# model are trained on.
randomized_mae.fit(X_train_scaled, y_train)

# Report the best candidate; flip the sign to show MAE as a positive number
print("Best parameters found: ", randomized_mae.best_params_)
print("Lowest MAE found: ", np.abs(randomized_mae.best_score_))

In [None]:
# Train the final model on the scaled training set, predict the test set and
# write the predictions to predictions.csv.
# NOTE(review): 12 boosting rounds at learning rate 0.06 is far fewer than the
# grid-search setting above (200 rounds at 0.1) -- confirm this is intended.
model = xgb.XGBRegressor(
    n_estimators=12,
    max_depth=7,
    learning_rate=0.06,  # documented wrapper name for the booster's `eta`
    colsample_bytree=0.8,
)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Predictions are written positionally, so they must line up one-to-one with
# the rows of df_test (assumes X_test_scaled was built in df_test row order).
if len(y_pred) != len(df_test):
    raise ValueError(
        f"Got {len(y_pred)} predictions for {len(df_test)} test rows"
    )

# Assign the column directly instead of calling Series.update on
# df_test['h_index_pred']: that chained in-place write does not reach df_test
# under pandas Copy-on-Write and aligns by index label, not position.
# Rounding is done in float64 so the 3-decimal values are written as-is
# (np.round replaces np.round_, which was removed in NumPy 2.0).
df_test['h_index_pred'] = np.round(np.asarray(y_pred, dtype=np.float64), 3)

df_test.loc[:, ["authorID", "h_index_pred"]].to_csv(
    'predictions.csv', index=False
)