In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import xgboost as xgb
# import lightgbm as lgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [14]:
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)

from utils.utils import get_absolute_path, load_data, save_csv, data_prep_X, gen_col_name, data_pre_Y
from utils.constants import *

# from src.data_input import *

In [15]:
# Import data

df_train = pd.read_csv(get_absolute_path('X_train.csv', 'data'))
y_train = pd.read_csv(get_absolute_path('y_train.csv', 'data'))
df_test = pd.read_csv(get_absolute_path('X_test.csv', 'data'))
y_test = pd.read_csv(get_absolute_path('y_test.csv', 'data'))



In [16]:
df_train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11
0,0.001131,0.884615,0.00112,0.001113,0.677632,0.841463,0.765152,0.787402,0.29375,0.298077,0.276163
1,0.00117,0.871795,0.001159,0.001152,0.703947,0.829268,0.772727,0.795276,0.29375,0.301282,0.276163
2,0.001326,0.884615,0.001198,0.00125,0.677632,0.853659,0.75,0.755906,0.3,0.298077,0.287791
3,0.014094,0.858974,0.001238,0.003926,0.697368,0.829268,0.772727,0.771654,0.296875,0.294872,0.27907
4,0.088109,0.858974,0.010766,0.029297,0.684211,0.853659,0.765152,0.755906,0.296875,0.291667,0.281977


In [17]:
df_test.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11
0,0.0023,0.910256,0.001984,0.002051,0.578947,0.865854,0.651515,0.661417,0.571875,0.573718,0.543605
1,0.243665,0.923077,0.002534,0.110547,0.572368,0.902439,0.575758,0.503937,0.553125,0.541667,0.537791
2,0.208577,0.884615,0.007957,0.095508,0.565789,0.890244,0.583333,0.535433,0.55625,0.544872,0.552326
3,0.391813,0.935897,0.0611,0.228516,0.519737,0.926829,0.530303,0.519685,0.5375,0.528846,0.526163
4,0.497076,0.961538,0.243615,0.353516,0.486842,0.926829,0.530303,0.519685,0.53125,0.528846,0.514535


In [18]:
y_train.head()

Unnamed: 0,measurement
0,0.648148
1,0.648148
2,0.648148
3,0.638889
4,0.648148


# Data

To make valid comparison across different methods, we split the original `df_train` into new train and validation data sets.

In [19]:
# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(df_train, y_train, test_size=0.2, random_state=42)

# XGBoost

In [20]:


# Converting the data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Defining XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100
}

# Training the XGBoost model
model_xgb = xgb.train(params, dtrain, num_boost_round=100, evals=[(dval, 'validation')], early_stopping_rounds=10)

# Making predictions
y_pred_xgb = model_xgb.predict(dval)

# Calculating RMSE
rmse_xgb = mean_squared_error(y_val, y_pred_xgb, squared=False)
print("XGBoost RMSE:", rmse_xgb)

Parameters: { "n_estimators" } are not used.

[0]	validation-rmse:0.14949
[1]	validation-rmse:0.13466
[2]	validation-rmse:0.12135
[3]	validation-rmse:0.10933
[4]	validation-rmse:0.09851
[5]	validation-rmse:0.08884
[6]	validation-rmse:0.08011


[7]	validation-rmse:0.07230
[8]	validation-rmse:0.06529
[9]	validation-rmse:0.05897
[10]	validation-rmse:0.05332
[11]	validation-rmse:0.04825
[12]	validation-rmse:0.04371
[13]	validation-rmse:0.03968
[14]	validation-rmse:0.03606
[15]	validation-rmse:0.03283
[16]	validation-rmse:0.02996
[17]	validation-rmse:0.02742
[18]	validation-rmse:0.02515
[19]	validation-rmse:0.02316
[20]	validation-rmse:0.02139
[21]	validation-rmse:0.01986
[22]	validation-rmse:0.01851
[23]	validation-rmse:0.01732
[24]	validation-rmse:0.01630
[25]	validation-rmse:0.01539
[26]	validation-rmse:0.01464
[27]	validation-rmse:0.01400
[28]	validation-rmse:0.01345
[29]	validation-rmse:0.01298
[30]	validation-rmse:0.01258
[31]	validation-rmse:0.01221
[32]	validation-rmse:0.01187
[33]	validation-rmse:0.01160
[34]	validation-rmse:0.01135
[35]	validation-rmse:0.01114
[36]	validation-rmse:0.01096
[37]	validation-rmse:0.01078
[38]	validation-rmse:0.01065
[39]	validation-rmse:0.01053
[40]	validation-rmse:0.01043
[41]	validation-r

In [21]:
# Get feature importance scores
feature_importance = model_xgb.get_score(importance_type='weight')

# Sort feature importance scores in descending order
sorted_feature_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

# Print feature importance scores
print("Feature Importance:")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")

Feature Importance:
X6: 153.0
X1: 121.0
X5: 117.0
X3: 81.0
X11: 68.0
X4: 52.0
X8: 37.0
X2: 32.0
X10: 17.0
X9: 16.0
X7: 4.0


# Random Forest

In [22]:
# Creating and training the RandomForest model
model_rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_rf.fit(X_train, y_train)

# Making predictions
y_pred_rf = model_rf.predict(X_val)

# Calculating RMSE
rmse_rf = mean_squared_error(y_val, y_pred_rf, squared=False)
print("RandomForest RMSE:", rmse_rf)


  model_rf.fit(X_train, y_train)


RandomForest RMSE: 0.009088903489056428


In [23]:
# Get feature importances
feature_importance = model_rf.feature_importances_

# Create a dictionary mapping feature names to their importances
feature_importance_dict = dict(zip(X_train.columns, feature_importance))

# Sort feature importances in descending order
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print feature importances
print("Feature Importances:")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")

Feature Importances:
X6: 0.8026470550005369
X5: 0.05383681502442475
X1: 0.04016139579030007
X3: 0.028803528839215956
X11: 0.019147877931631422
X4: 0.012233199832258396
X8: 0.009565646208314347
X7: 0.00909731298736705
X9: 0.008590941091664396
X2: 0.008074156693922663
X10: 0.007842070600364171


# LightBGM

In [24]:


# # Creating a LightGBM dataset
# lgb_train = lgb.Dataset(X_train, label=y_train)
# lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

# # Defining LightGBM parameters
# params = {
#     'objective': 'regression',
#     'metric': 'rmse',
#     'boosting_type': 'gbdt',
#     'max_depth': 3,
#     'learning_rate': 0.1,
#     'num_leaves': 31,
#     'n_estimators': 100
# }

# # Training the LightGBM model
# model_lgb = lgb.train(params, lgb_train, valid_sets=[lgb_val], early_stopping_rounds=10)

# # Making predictions
# y_pred_lgb = model_lgb.predict(X_val)

# # Calculating RMSE
# rmse_lgb = mean_squared_error(y_val, y_pred_lgb, squared=False)
# print("LightGBM RMSE:", rmse_lgb)
