In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt
import xgboost as xgb
import time
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
from tabulate import tabulate 


# Half-point scores 0.5, 1.0, ..., 6.0 are encoded as the consecutive integer
# class labels 0, 1, ..., 11 (score = (label + 1) / 2).
fractional_to_integer_mapping = {
    (label + 1) / 2: label
    for label in range(12)
}


train_data = pd.read_csv('/kaggle/input/finaldf/final_df.csv')


# Encode the 'score' column as integer class labels; this encoded column is
# the target the classifier is trained on further down.
# Series.map yields NaN for any value that is not a key of the mapping.
train_data['score'] = train_data['score'].map(fractional_to_integer_mapping)
train_data.head()

In [None]:
# Hold out 20% of the rows for validation with a fixed random_state.
# The feature matrix is every column except the 'score' target and 'id'.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data.drop(columns=['score', 'id']),
    train_data['score'],
    random_state=42,
    test_size=0.2,
)
X_train.head()  # NOTE: not rendered; only the cell's last expression is displayed
y_train.head()

In [None]:
# Try out some values to get the best hyper parameters
def train_and_evaluate(X_train, y_train, X_valid, y_valid, max_depth, learning_rate, n_estimators):
    """Fit one XGBoost classifier and return its validation RMSE in score units.

    The classifier is built with the 'multi:softmax' objective, 12 classes and
    seed 42, plus the three hyperparameters passed in. Predicted and true
    integer labels are translated back to fractional scores through the
    inverse of the module-level ``fractional_to_integer_mapping`` before the
    error is computed.

    Args:
        X_train: training features passed to ``fit``.
        y_train: training labels passed to ``fit``.
        X_valid: validation features passed to ``predict``.
        y_valid: iterable of true validation labels (integer class labels).
        max_depth: forwarded to ``XGBClassifier``.
        learning_rate: forwarded to ``XGBClassifier``.
        n_estimators: forwarded to ``XGBClassifier``.

    Returns:
        Root mean squared error between true and predicted fractional scores.

    Raises:
        KeyError: if a label is not a value of ``fractional_to_integer_mapping``.
    """
    classifier = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=12,
        seed=42,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
    )
    classifier.fit(X_train, y_train)
    predicted_labels = classifier.predict(X_valid)

    # Invert the score -> label table so labels can be turned back into scores.
    label_to_score = {}
    for score, label in fractional_to_integer_mapping.items():
        label_to_score[label] = score

    # Predictions first, then ground truth, both expressed as fractional scores.
    predicted_scores = [label_to_score[int(label)] for label in predicted_labels]
    true_scores = [label_to_score[int(label)] for label in y_valid]

    mse = mean_squared_error(true_scores, predicted_scores)
    return np.sqrt(mse)

# Exhaustive grid search over max_depth, learning_rate and n_estimators.
# Every combination is trained from scratch and scored on the validation
# split; the combination with the lowest RMSE seen so far is remembered.
best_hyperparams = {'max_depth': 0, 'learning_rate': 0, 'n_estimators': 0}
lowest_rmse = float('inf')

# Ten learning rates: 0.01, 0.05, ..., 0.37.
# np.linspace states the number of grid points explicitly. np.arange with a
# fractional step derives the length from a floating-point division, so
# whether the 0.41 stop value sneaks into the grid hinges on rounding error.
learning_rates = np.linspace(0.01, 0.37, num=10)

for depth in range(6, 17):
    for rate in learning_rates:
        for n_estimators in range(50, 151, 10):
            rmse = train_and_evaluate(X_train, y_train, X_valid, y_valid, max_depth=depth, learning_rate=rate, n_estimators=n_estimators)

            # Strict '<' keeps the earliest combination when RMSEs tie.
            if rmse < lowest_rmse:
                lowest_rmse = rmse
                best_hyperparams.update(
                    max_depth=depth,
                    learning_rate=float(rate),  # plain float rather than a NumPy scalar
                    n_estimators=n_estimators,
                )

            print(f"Max Depth: {depth}, Learning Rate: {rate:.2f}, n_estimators: {n_estimators}, RMSE: {rmse:.4f}, Lowest RMSE: {lowest_rmse:.4f}")

# Summarise the winner so it does not have to be dug out of the per-run log.
print(f"Best hyperparameters: {best_hyperparams}, Lowest RMSE: {lowest_rmse:.4f}")