# Model Training Jupyter Notebook

In [55]:
# Importing Packages
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder

In [56]:
# Importing functions from modules
from features import *

In [57]:
# Load the raw train/test log tables and the training score table from data/.
# header=0 makes the first CSV row supply the column names.
df_train = pd.read_csv('data/train_logs.csv', header=0)
df_test = pd.read_csv('data/test_logs.csv', header=0)
df_train_scores = pd.read_csv('data/train_scores.csv')


In [58]:
# Run the same feature extraction over both log tables (train first, then test).
# NOTE(review): iki_features presumably arrives via `from features import *` — confirm.
features_train, features_test = iki_features(df_train), iki_features(df_test)

In [59]:
# Join the engineered features with the training scores on the shared 'id' key.
# merge() defaults to an inner join, so only ids present in both frames are kept.
df_train_merged = features_train.merge(df_train_scores, on='id')

# Split into predictors and target: 'id' is only an identifier and 'score' is
# the label, so neither belongs in the feature matrix.
X_train = df_train_merged.drop(['id', 'score'], axis=1)
y_train = df_train_merged['score']

# Encode the half-point score scale (0.5 .. 6) as class indices 0 .. 11 and keep
# the inverse lookup so predicted classes can be turned back into scores.
score_mapping = {0.5: 0, 1: 1, 1.5: 2, 2: 3, 2.5: 4, 3: 5, 3.5: 6, 4: 7, 4.5: 8, 5: 9, 5.5: 10, 6: 11}
reverse_mapping = {v: k for k, v in score_mapping.items()}
y_train_mapped = y_train.map(score_mapping)

# Series.map() yields NaN for every value that is not a key of the dict. Stop
# here and name the offending scores instead of passing NaN targets on to the
# model cell.
unmapped_mask = y_train_mapped.isna()
if unmapped_mask.any():
    raise ValueError(
        "Scores missing from score_mapping: "
        f"{y_train[unmapped_mask].unique().tolist()}"
    )

# The test features have no score column, so only the identifier is removed.
X_test = features_test.drop('id', axis=1)


In [60]:
# NOTE: head() is not the last expression of the cell, so by default Jupyter
# does not render it; only the print below produces output.
df_train_merged.head()

# Check the Python type of one target value (the entry labelled 4).
sample_score = y_train[4]
print(type(sample_score))

<class 'numpy.float64'>


In [62]:
# Fit a histogram-based gradient boosting classifier on the integer-encoded
# scores. random_state is fixed so repeated runs give the same model.
model = HistGradientBoostingClassifier(
    max_iter=100,
    max_leaf_nodes=31,
    early_stopping='auto',
    scoring='loss',
    random_state=42,
    verbose=1,  # emits the per-iteration log shown below the cell
)
model.fit(X_train, y_train_mapped)


Binning 0.000 GB of training data: 0.005 s
Fitting gradient boosted rounds:
[1/100] 12 trees, 336 leaves (28 on avg), max depth = 12, in 0.015s
[2/100] 12 trees, 372 leaves (31 on avg), max depth = 15, in 0.017s
[3/100] 12 trees, 372 leaves (31 on avg), max depth = 16, in 0.016s
[4/100] 12 trees, 372 leaves (31 on avg), max depth = 14, in 0.016s
[5/100] 12 trees, 372 leaves (31 on avg), max depth = 15, in 0.016s
[6/100] 12 trees, 372 leaves (31 on avg), max depth = 16, in 0.017s
[7/100] 12 trees, 372 leaves (31 on avg), max depth = 13, in 0.016s
[8/100] 12 trees, 372 leaves (31 on avg), max depth = 19, in 0.017s
[9/100] 12 trees, 372 leaves (31 on avg), max depth = 19, in 0.016s
[10/100] 12 trees, 372 leaves (31 on avg), max depth = 20, in 0.016s
[11/100] 12 trees, 372 leaves (31 on avg), max depth = 17, in 0.014s
[12/100] 12 trees, 372 leaves (31 on avg), max depth = 20, in 0.015s
[13/100] 12 trees, 372 leaves (31 on avg), max depth = 22, in 0.015s
[14/100] 12 trees, 372 leaves (31 on

In [63]:
# Predict class indices for the test features, then translate each index back
# to its original score through reverse_mapping.
y_pred_mapped = model.predict(X_test)
predicted_classes = pd.Series(y_pred_mapped)
y_pred = predicted_classes.map(reverse_mapping)


In [64]:
# Show the predicted scores (already mapped back from class indices) as plain text.
print(y_pred)

0    3.5
1    3.0
2    4.0
dtype: float64
