In [12]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cgb
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.linear_model import Lasso
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Read all features.
# SECURITY: unpickling can execute arbitrary code embedded in the file, so
# only load './data/features' when it comes from a trusted source.
features = pd.read_pickle('./data/features')

# The features are encoded as follows:
#   0         - Degree Number
#   1         - Core Number
#   2         - Page Rank
#   3         - Avg. H-Index
#   4         - Top Cited Paper (Dummy)
#   5 - 104   - Word2Vec Embeddings
#   105 - 154 - Graph Embeddings

# Read the training data; 'author' is parsed as int64 and 'hindex' as float32.
df_train = pd.read_csv('./data/train.csv', dtype={'author': np.int64, 'hindex': np.float32})

In [3]:
# Regression target: the 'hindex' column of the training table.
y_train = df_train.loc[:, 'hindex']

In [5]:
# Use every row and every column of the feature table as model input.
# NOTE(review): presumably the rows of `features` are in the same order as
# the rows of train.csv so that they line up with y_train -- confirm.
X_train = features.iloc[:, :]

In [7]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.25,
)

In [8]:
# LightGBM: fit on the training split, then predict the hold-out split.
# Note that `reg` and `y_pred1` are reused by every model cell below.
reg = lgb.LGBMRegressor(
    num_leaves=150,
    learning_rate=0.12,
    n_estimators=500,
)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

In [9]:
# Mean squared error of the predictions currently held in y_pred1.
# NOTE(review): y_pred1 is overwritten by every model cell, so this is only
# the LightGBM score when run directly after the LightGBM cell.
print(
    "MSE for LGBM: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

MSE for LGBM:  50.386366599275945


In [13]:
# Lasso
reg = Lasso(alpha=0.1)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

In [14]:
# Mean squared error of the predictions currently held in y_pred1.
# NOTE(review): y_pred1 is overwritten by every model cell, so this is only
# the Lasso score when run directly after the Lasso cell.
print(
    "MSE for Lasso: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

MSE for Lasso:  72.82442164910213


In [15]:
# CatBoost
reg = cgb.CatBoostRegressor(n_estimators = 500,learning_rate = 0.12)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

0:	learn: 11.9590383	total: 127ms	remaining: 1m 3s
1:	learn: 11.4088532	total: 193ms	remaining: 48s
2:	learn: 10.9285118	total: 254ms	remaining: 42.1s
3:	learn: 10.5226916	total: 312ms	remaining: 38.7s
4:	learn: 10.1844311	total: 375ms	remaining: 37.2s
5:	learn: 9.9130606	total: 434ms	remaining: 35.7s
6:	learn: 9.6681669	total: 492ms	remaining: 34.7s
7:	learn: 9.4574274	total: 557ms	remaining: 34.2s
8:	learn: 9.2847169	total: 615ms	remaining: 33.6s
9:	learn: 9.1143358	total: 685ms	remaining: 33.6s
10:	learn: 8.9840305	total: 750ms	remaining: 33.4s
11:	learn: 8.8629861	total: 817ms	remaining: 33.2s
12:	learn: 8.7599581	total: 881ms	remaining: 33s
13:	learn: 8.6665338	total: 946ms	remaining: 32.9s
14:	learn: 8.5798008	total: 1.01s	remaining: 32.7s
15:	learn: 8.5136126	total: 1.08s	remaining: 32.6s
16:	learn: 8.4530233	total: 1.14s	remaining: 32.3s
17:	learn: 8.3992051	total: 1.2s	remaining: 32.1s
18:	learn: 8.3490432	total: 1.27s	remaining: 32.2s
19:	learn: 8.3043300	total: 1.33s	remaini

In [16]:
# Mean squared error of the predictions currently held in y_pred1.
# NOTE(review): y_pred1 is overwritten by every model cell, so this is only
# the CatBoost score when run directly after the CatBoost cell.
print(
    "MSE for Catboost: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

MSE for Catboost:  50.9428367637957


In [0]:
# XGBoost
reg = xgb.XGBRegressor(n_estimators = 500,learning_rate = 0.12)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

In [0]:
# Mean squared error of the predictions currently held in y_pred1.
# NOTE(review): y_pred1 is overwritten by every model cell, so this is only
# the XGBoost score when run directly after the XGBoost cell.
print(
    "MSE for XGBoost: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

In [17]:
# Gradient Boosting
reg = gbr(n_estimators = 500,learning_rate = 0.12)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

KeyboardInterrupt: KeyboardInterrupt: 

In [0]:
# Mean squared error of the predictions currently held in y_pred1.
# NOTE(review): y_pred1 is overwritten by every model cell; if the Gradient
# Boosting cell did not run to completion, the number printed here belongs
# to whichever model last assigned y_pred1.
print(
    "MSE for Gradient Boosting: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)