In [1]:
#!pip install catboost
#!pip install spektral
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb
import catboost as cat


from utils.GCN_model import GCN_model
from utils.Stacking_regressor import Stacking_regressor
from utils.Averaging_Models import AveragingModels
from utils.embedding_PCA import remove_embedding, pca_node_embedding, pca_author_embedding

path = "./data/"

# Useful Functions

In [4]:
def generate_data(X, y, df, features, all_data, doc2vec, Bert):
    auth_freq_abs = np.load(path+'auth_freq_abs.npy',allow_pickle='TRUE').item()
    auth_freq_crps = np.load(path+'auth_freq_crps.npy',allow_pickle='TRUE').item()
    if all_data == True:
        author_Id_train = list(df.authorID)
        loop = pd.DataFrame(G.nodes()).iterrows()
    else:
        loop = df.iterrows()
    for i, row in tqdm(loop):
        if all_data == True:
            node = row[0]
        else:
            node = row['authorID']      
        X[i, 0:f_n] = features.loc[features.index == node,:].values
        if len(auth_freq_abs[str(int(node))]) != 0:
            X[i, f_n:f_n+1] = np.max(auth_freq_abs[str(int(node))])
            X[i, f_n+1:f_n+2] = np.mean(auth_freq_abs[str(int(node))])
            X[i, f_n+2:f_n+3] = np.max(auth_freq_crps[str(int(node))])
            X[i, f_n+3:f_n+4] = np.mean(auth_freq_crps[str(int(node))])
        X[i, f_n+4:f_n+4+nod_em_n] = df_node_emb_pca.loc[df_node_emb_pca.index == node,:].values
        if (doc2vec == True) & (Bert == False):
            X[i, f_n+4+nod_em_n:f_n+4+nod_em_n+Doc2vec_n] = auth_doc2vec_pca.loc[auth_doc2vec_pca.index == node,:].values
        elif (doc2vec == False) & (Bert == True):
            X[i, f_n+4+nod_em_n:f_n+4+nod_em_n+Bert_n] = df_auth_emb_Bert.loc[df_auth_emb_Bert.index == node,:].values
        elif (doc2vec == True) & (Bert == True):
            X[i, f_n+4+nod_em_n:f_n+4+nod_em_n+Doc2vec_n] = auth_doc2vec_pca.loc[auth_doc2vec_pca.index == node,:].values
            X[i, f_n+4+nod_em_n+Doc2vec_n:] = df_auth_emb_Bert.loc[df_auth_emb_Bert.index == node,:].values
        if all_data == True:
            if node in author_Id_train:
                y[i] = np.log(df.h_index[df.authorID == node].values[0] + 1)
            else:
                y[i] == -100
        else:
            y[i] = np.log(row['h_index']+1)
    return X, y

In [None]:
def base_models():
    lgb_reg = lgb.LGBMRegressor(boosting_type='dart',  n_estimators=2500,num_leaves=27, max_depth=-1, learning_rate=0.2)
    cat_reg = cat.CatBoostRegressor(boosting_type='Ordered',n_estimators=1500, max_depth=5, learning_rate=0.1, metric_period=100, verbose=False)
    xg_reg = xgb.XGBRegressor(objective= "reg:squarederror", n_estimators = 1500, colsample_bytree = 0.7, learning_rate = 0.1,
                            max_depth = 5, alpha = 9, random_state = 7, ree_method="approx")
    return lgb_reg, cat_reg, xg_reg

In [None]:
def error(model, X_train, X_test, y_train, y_test):
    model_name = type(model).__name__
    print("---Performing "+model_name+"---")
    model.fit(X_train, y_train, verbose=False)
    return mae(np.expm1(y_test), np.round(np.expm1(model.predict(X_test))))

In [5]:
# Read the graph
G = nx.read_edgelist(path + "collaboration_network.edgelist", delimiter=' ', nodetype=int)
# Read training data
df_train = pd.read_csv(path + "train.csv", dtype={'authorID': np.int64, 'h_index': np.float32})
n_train = df_train.shape[0]
# Read test data
df_test = pd.read_csv(path + "test.csv", dtype={'authorID': np.int64})
n_test = df_test.shape[0]

In [None]:
# Node Embedding with Deep Walk
df_node_emb = pd.read_csv(path + "node_embd_DW_Weighted_256.csv", index_col=0)  # Weithed Node Embedding
#df_node_emb = pd.read_csv(path+'node_embd_DW.csv', index_col=0) # Unweithed Node Embedding

# Author Emebedding with Doc2Vec
df_auth_emb_Doc2vec = pd.read_csv(path + "author_embedding.csv",header = None, index_col=0)

# Author Emebedding with SBERT
df_auth_emb_Bert = pd.read_csv(path + "df_auth_emb_Bert.csv",index_col=0)

In [7]:
features_df = pd.read_csv(path + "df_features.csv",index_col='author_id')
features_df.drop(columns = ['auth_lang_n'], inplace = True)
f_n = features_df.shape[1]

# Graph Convolutional Network (GCN):

In [4]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb+') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

## Data preparation:

In [6]:
id_train, id_valid, h_train, h_valid = train_test_split(df_train.authorID, df_train.h_index, test_size=0.33, random_state=7)

#Create mask for train, validation and test

train_mask = np.in1d(G.nodes(), id_train)
print(np.sum(train_mask))

valid_mask = np.in1d(G.nodes(), id_valid)
print(np.sum(valid_mask))

test_mask = np.in1d(G.nodes(), df_test.authorID)
print(np.sum(test_mask))

15493
7631
208115


In [12]:
df_node_emb_pca = pca_node_embedding(25, df_node_emb)
auth_doc2vec_pca  = pca_author_embedding(64, df_auth_emb_Doc2vec)

0.28119939295253166
0.41239559157361055


In [13]:
# Load X, y and Adjencency matrix A
dim_0 = len(list(G.nodes()))
f_n = features_df.shape[1]
Doc2vec_n = auth_doc2vec_pca.shape[1]
nod_em_n = df_node_emb_pca.shape[1]
Bert_n = df_auth_emb_Bert.shape[1]

## ----You can load the data in the next cell instead of running generate data function----##
X = np.zeros((dim_0,f_n+nod_em_n+Bert_n+Doc2vec_n+4))
y = np.zeros(dim_0)
X, y = generate_data(X, y, df_train, features_df,  all_data=True, doc2vec = True, Bert = True) # Doc2Vec + DW + BERT for the whole data

231239it [34:56, 110.30it/s]


In [9]:
# save_obj(X, path + "X_final_data")
# save_obj(y, path + "y_final_data")
X = load_obj(path + "X_final_data")
y = load_obj(path + "y_final_data")
A = load_obj(path + "A_data")

In [14]:
y[train_mask] #get only y_train which means h-index for train dataset only

array([1.09861229, 2.07944154, 1.09861229, ..., 0.69314718, 1.09861229,
       1.38629436])

In [10]:
gcn = GCN_model(X, y, A)
gcn.build() #compile model

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 880)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 231239)]     0                                            
__________________________________________________________________________________________________
gcn_conv (GCNConv)              (None, 512)          451072      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 512)          0           gcn_conv[0][0]               

In [11]:
gcn.fit(train_mask, valid_mask, n_epochs = 2000) #fit model on train data (train mask) by using validation data (valid_mask)

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

In [None]:
y_pred = gcn.predict(valid_mask) #predict on test or validation data
print(mae(np.expm1(y[valid_mask]), np.round(np.expm1(y_pred))))#validation mae score

# Doc2Vec + Deep_Walk

In [None]:
df_train_0, df_auth_emb_Doc2vec_1 = remove_embedding(df_train, df_auth_emb_Doc2vec)
df_node_emb_pca = pca_node_embedding(20, df_node_emb)
auth_doc2vec_pca  = pca_author_embedding(100, df_auth_emb_Doc2vec_1)

0.24767599269232463
0.5440698584320326


In [None]:
dim_0 = df_train_0.shape[0]
f_n = features_df.shape[1]
Doc2vec_n = auth_doc2vec_pca.shape[1]
nod_em_n = df_node_emb_pca.shape[1]
X_train = np.zeros((dim_0,f_n+4+nod_em_n+Doc2vec_n))
y_train = np.zeros(dim_0)
X_train, y_train = generate_data(X_train, y_train, df_train_0, features_df,  all_data=False, doc2vec = True, Bert = False)
X_t, X_v, y_t, y_v = train_test_split(X_train, y_train, test_size=0.33, random_state=7)

22872it [01:12, 315.07it/s]


In [None]:
lgb_reg, _ , _ = base_models()
X_t, X_v, y_t, y_v = train_test_split(X_train, y_train, test_size=0.33, random_state=7)
print('mae for lighgbm : ', error(lgb_reg, X_t, X_v, y_t, y_v))

---Performing LGBMRegressor---
mae for lighgbm :  3.325781664016958


# Bert + Deep_Walk

In [None]:
dim_0 = df_train.shape[0]
df_node_emb_pca = pca_node_embedding(20, df_node_emb)
nod_em_n = df_node_emb_pca.shape[1]
X_train = np.zeros((dim_0,f_n+4+nod_em_n+Bert_n))
y_train = np.zeros(dim_0)
X_train, y_train = generate_data(X_train, y_train, df_train, features_df,  all_data=False, doc2vec = False, Bert = True)

0.24767599269232485


23124it [01:18, 296.02it/s]


In [None]:
lgb_reg, _ , _ = base_models()
X_t, X_v, y_t, y_v = train_test_split(X_train, y_train, test_size=0.33, random_state=7)
print('mae for lighgbm : ', error(lgb_reg, X_t, X_v, y_t, y_v))

---Performing LGBMRegressor---
mae for lighgbm :  3.1643297077709343


# Doc2vec + Bert + Deep_Walk

In [None]:
df_node_emb_pca = pca_node_embedding(25, df_node_emb)
auth_doc2vec_pca  = pca_author_embedding(64, df_auth_emb_Doc2vec)

0.28119939295253166
0.4123955915736107


In [None]:
dim_0 = df_train.shape[0]
f_n = features_df.shape[1]
Doc2vec_n = auth_doc2vec_pca.shape[1]
nod_em_n = df_node_emb_pca.shape[1]
Bert_n = df_auth_emb_Bert.shape[1]
X_train = np.zeros((dim_0,f_n+4+nod_em_n+Bert_n+Doc2vec_n))
y_train = np.zeros(dim_0)
X_train, y_train = generate_data(X_train, y_train, df_train, features_df,  all_data=False, doc2vec = True, Bert = True)

23124it [01:55, 200.07it/s]


## Models

In [None]:
lgb_reg, cat_reg, xg_reg = base_models()
X_t, X_v, y_t, y_v = train_test_split(X_train, y_train, test_size=0.33, random_state=7)

In [None]:
print('mae for catboost : ', error(cat_reg, X_t, X_v, y_t, y_v))
print('mae for lighgbm : ', error(lgb_reg, X_t, X_v, y_t, y_v))
print('mae for xgboost : ', error(xg_reg, X_t, X_v, y_t, y_v))

---Performing CatBoostRegressor---
mae for catboost :  3.279124623247281
---Performing LGBMRegressor---
mae for lighgbm :  3.1163674485650636
---Performing XGBRegressor---
mae for xgboost :  3.2552745380684054


In [None]:
%%capture
lgb_reg, cat_reg, xg_reg = base_models()
Averaging_models = AveragingModels(X_t, y_t, [lgb_reg, cat_reg, xg_reg])
Averaging_models.fit(X_t, y_t)

In [None]:
error_avg = mae(np.expm1(y_v), np.round(np.expm1(Averaging_models.predict(X_v))))
print('mae for Averaging_models : ', error_avg)

mae for Averaging_models :  3.1611846415935


In [None]:
%%capture

regressor = Stacking_regressor(lgb_reg, cat_reg, xg_reg)
regressor.fit(X_t, y_t)

In [None]:
error_stacking = mae(np.expm1(y_v), np.round(np.expm1(regressor.predict(X_v))))
print('mae for Stacking_regressor : ', error_stacking)

mae for Stacking_regressor :  3.114401782204167
