In [36]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [37]:
# Load the precomputed per-author feature matrix.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load './data/features' if it was produced by a trusted pipeline.
features = pd.read_pickle('./data/features')

# Feature columns are encoded as follows:
#   0         - Degree Number
#   1         - Core Number
#   2         - Page Rank
#   3         - Avg. H-Index
#   4         - Top Cited Paper (Dummy)
#   5 - 104   - Word2Vec Embeddings
#   105 - 154 - Graph Embeddings

# Load the training data (author id and h-index target) with explicit dtypes.
# NOTE(review): later cells pass slices of `features` together with
# df_train['hindex'] to train_test_split, which pairs rows by position —
# confirm both tables share the same row order.
df_train = pd.read_csv('./data/train.csv', dtype={'author': np.int64, 'hindex': np.float32})

In [38]:
# Regression target: the 'hindex' column of the training table.
y_train = df_train.loc[:, 'hindex']

# Base Model

In [6]:
# Baseline design matrix: only the first two feature columns
# (degree number and core number per the feature encoding notes).
X_train = features.iloc[:, 0:2]

In [7]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
(X_train1, X_test1,
 y_train1, y_test1) = train_test_split(X_train, y_train, random_state=42, test_size=0.25)

In [8]:
# Fit a LightGBM regressor on the training split, then predict the held-out rows.
reg = lgb.LGBMRegressor(
    num_leaves=150,
    learning_rate=0.12,
    n_estimators=500,
)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

In [9]:
# Report the mean squared error of the baseline model on the held-out split.
print(
    "MSE for baseline model: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

MSE for baseline model:  105.38829642483441


# Pagerank

In [10]:
# Baseline columns (first two) plus the PageRank column (label 2).
X_train = pd.concat(
    [features.iloc[:, 0:2], features.loc[:, 2]],
    axis="columns",
)

In [11]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
(X_train1, X_test1,
 y_train1, y_test1) = train_test_split(X_train, y_train, random_state=42, test_size=0.25)

In [12]:
# Fit a LightGBM regressor on the training split, then predict the held-out rows.
reg = lgb.LGBMRegressor(
    num_leaves=150,
    learning_rate=0.12,
    n_estimators=500,
)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

In [13]:
# Report the held-out mean squared error with PageRank added to the baseline.
print(
    "MSE for pagerank added: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

MSE for pagerank added:  103.30533662927053


# Average H-Index

In [44]:
# Baseline columns (first two) plus the average h-index column (label 3).
X_train = pd.concat(
    [features.iloc[:, 0:2], features.loc[:, 3]],
    axis="columns",
)

In [45]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
(X_train1, X_test1,
 y_train1, y_test1) = train_test_split(X_train, y_train, random_state=42, test_size=0.25)

In [46]:
# Fit a LightGBM regressor on the training split, then predict the held-out rows.
reg = lgb.LGBMRegressor(
    num_leaves=150,
    learning_rate=0.12,
    n_estimators=500,
)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

In [47]:
# Report the held-out mean squared error with average h-index added to the baseline.
print(
    "MSE for average h-index added: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

MSE for average h-index added:  86.74902864269235


# Top cited paper (Dummy)

In [39]:
# Baseline columns (first two) plus the top-cited-paper dummy column (label 4).
X_train = pd.concat(
    [features.iloc[:, 0:2], features.loc[:, 4]],
    axis="columns",
)

In [40]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
(X_train1, X_test1,
 y_train1, y_test1) = train_test_split(X_train, y_train, random_state=42, test_size=0.25)

In [41]:
# Fit a LightGBM regressor on the training split, then predict the held-out rows.
reg = lgb.LGBMRegressor(
    num_leaves=150,
    learning_rate=0.12,
    n_estimators=500,
)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

In [42]:
# Report the held-out mean squared error with the top-cited-paper dummy added.
print(
    "MSE for top cited paper (Dummy) added: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

MSE for top cited paper (Dummy) added:  97.35953757539404


# Word2Vec Embeddings

In [26]:
# Baseline columns (first two) plus the Word2Vec embedding columns
# (labels 5 through 104; .loc label slices include both endpoints).
X_train = pd.concat(
    [features.iloc[:, 0:2], features.loc[:, 5:104]],
    axis="columns",
)

In [27]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
(X_train1, X_test1,
 y_train1, y_test1) = train_test_split(X_train, y_train, random_state=42, test_size=0.25)

In [28]:
# Fit a LightGBM regressor on the training split, then predict the held-out rows.
reg = lgb.LGBMRegressor(
    num_leaves=150,
    learning_rate=0.12,
    n_estimators=500,
)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

In [29]:
# Report the held-out mean squared error with Word2Vec embeddings added.
print(
    "MSE for Word2Vec Embeddings added: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

MSE for Word2Vec Embeddings added:  55.73936707760211


# Graph Embeddings

In [30]:
# Baseline columns (first two) plus the graph embedding columns
# (labels 105 through 154; .loc label slices include both endpoints).
X_train = pd.concat(
    [features.iloc[:, 0:2], features.loc[:, 105:154]],
    axis="columns",
)

In [31]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
(X_train1, X_test1,
 y_train1, y_test1) = train_test_split(X_train, y_train, random_state=42, test_size=0.25)

In [32]:
# Fit a LightGBM regressor on the training split, then predict the held-out rows.
reg = lgb.LGBMRegressor(
    num_leaves=150,
    learning_rate=0.12,
    n_estimators=500,
)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)

In [33]:
# Report the held-out mean squared error with graph embeddings added.
print(
    "MSE for graph embeddings added: ",
    mean_squared_error(y_true=y_test1, y_pred=y_pred1),
)

MSE for graph embeddings added:  90.70049364241065
