In [2]:
#!pip install catboost
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import xgboost as xgb
from sklearn.metrics import mean_absolute_error as mae
import lightgbm as lgb
import catboost as cat

from utils.Stacking_regressor import Stacking_regressor
from utils.embedding_PCA import pca_node_embedding, pca_author_embedding
path = "/data/"

In [3]:
def base_models():
    lgb_reg = lgb.LGBMRegressor(boosting_type='dart',  n_estimators=2500,num_leaves=27, max_depth=-1, learning_rate=0.2)
    cat_reg = cat.CatBoostRegressor(boosting_type='Ordered',n_estimators=1500, max_depth=5, learning_rate=0.1)
    xg_reg = xgb.XGBRegressor(objective= "reg:squarederror", n_estimators = 1500, colsample_bytree = 0.7, learning_rate = 0.1,
                            max_depth = 5, alpha = 9, random_state = 7, ree_method="approx")
    return lgb_reg, cat_reg, xg_reg

In [4]:
# read training data
df_train = pd.read_csv(path + "train.csv", dtype={'authorID': np.int64, 'h_index': np.float32})
n_train = df_train.shape[0]

# read test data
df_test = pd.read_csv(path + "test.csv", dtype={'authorID': np.int64})
n_test = df_test.shape[0]

In [5]:
# Data Features
features_df = pd.read_csv(path + "df_features.csv",index_col='author_id')
features_df.drop(columns = ['auth_lang_n'], inplace = True)
features_df.shape

(231239, 19)

In [6]:
#Node Embedding
df_node_emb = pd.read_csv(path + "node_embd_DW_Weighted_256.csv", index_col=0)  # Weithed Node Embedding
#df_node_emb = pd.read_csv(path + "node_embd_DW.csv", index_col=0) # Unweithed Node Embedding

# Author Emebeding
df_auth_emb_Doc2vec = pd.read_csv(path + "author_embedding.csv",header = None, index_col=0) # Author Embedding with Doc2Vec

 #Author Emebeding
df_auth_emb_Bert = pd.read_csv(path + "df_auth_emb_Bert.csv",index_col=0)# Author Embedding with SBERT

In [7]:
#PCA on embeddings
df_node_emb_pca = pca_node_embedding(25, df_node_emb)
auth_doc2vec_pca  = pca_author_embedding(64, df_auth_emb_Doc2vec)

0.28119939295253193
0.4123955915736105


In [8]:
auth_freq_abs = np.load(path + "auth_freq_abs.npy",allow_pickle='TRUE').item()
auth_freq_crps = np.load(path + "auth_freq_crps.npy",allow_pickle='TRUE').item()

In [9]:
dim_0 = df_train.shape[0]
f_n = features_df.shape[1]
Doc2vec_n = auth_doc2vec_pca.shape[1]
nod_em_n =  df_node_emb_pca.shape[1]
Bert_n = df_auth_emb_Bert.shape[1]
X_train = np.zeros((dim_0,f_n+4+nod_em_n+Bert_n+Doc2vec_n))
y_train = np.zeros(dim_0)
for i,row in tqdm(df_train.iterrows()):
    node = row['authorID']
    X_train[i, 0:f_n] = features_df.loc[features_df.index == node,:].values
    if len(auth_freq_abs[str(int(node))]) != 0:
        X_train[i, f_n:f_n+1] = np.max(auth_freq_abs[str(int(node))])
        X_train[i, f_n+1:f_n+2] = np.mean(auth_freq_abs[str(int(node))])
        X_train[i, f_n+2:f_n+3] = np.max(auth_freq_crps[str(int(node))])
        X_train[i, f_n+3:f_n+4] = np.mean(auth_freq_crps[str(int(node))])
    X_train[i, f_n+4:f_n+4+nod_em_n] = df_node_emb_pca.loc[df_node_emb_pca.index == node,:].values
    X_train[i, f_n+4+nod_em_n:f_n+nod_em_n+4+Doc2vec_n] = auth_doc2vec_pca.loc[auth_doc2vec_pca.index == node,:].values
    X_train[i, f_n+4+nod_em_n+Doc2vec_n:] = df_auth_emb_Bert.loc[df_auth_emb_Bert.index == node,:].values
    y_train[i] = np.log(row['h_index']+1)

23124it [01:34, 244.88it/s]


In [None]:
X_test = np.zeros((n_test,f_n+4+nod_em_n+Bert_n+Doc2vec_n))
for i,row in tqdm(df_test.iterrows()):
    node = row['authorID']
    X_test[i, 0:f_n] = features_df.loc[features_df.index == node,:].values
    if len(auth_freq_abs[str(int(node))]) != 0:
        X_test[i, f_n:f_n+1] = np.max(auth_freq_abs[str(int(node))])
        X_test[i, f_n+1:f_n+2] = np.round(np.mean(auth_freq_abs[str(int(node))]))
        X_test[i, f_n+2:f_n+3] = np.round(np.max(auth_freq_crps[str(int(node))]))
        X_test[i, f_n+3:f_n+4] = np.mean(auth_freq_crps[str(int(node))])
    X_test[i, f_n+4:f_n+4+nod_em_n] = df_node_emb_pca.loc[df_node_emb_pca.index == node,:].values
    X_test[i, f_n+4+nod_em_n:f_n+nod_em_n+4+Doc2vec_n] = auth_doc2vec_pca.loc[auth_doc2vec_pca.index == node,:].values
    X_test[i, f_n+4+nod_em_n+Doc2vec_n:] = df_auth_emb_Bert.loc[df_auth_emb_Bert.index == node,:].values

208115it [15:49, 219.10it/s]


In [None]:
lgb_reg, cat_reg, xg_reg = base_models()
regressor = Stacking_regressor(lgb_reg, cat_reg, xg_reg)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
df_test['h_index_pred'].update(pd.Series(np.rint(np.expm1(y_pred))))
df_test.loc[:,["authorID","h_index_pred"]].to_csv('best_kaggle_submission.csv', index=False)