In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt
import xgboost as xgb
import time
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
from tabulate import tabulate 



# Read the CSV stored under the Kaggle input directory into a DataFrame.
FINAL_DF_PATH = '/kaggle/input/finaldf/final_df.csv'
train_data = pd.read_csv(FINAL_DF_PATH)

# Last expression of the cell: renders a preview of the first rows.
train_data.head()

In [None]:
# Split the data into training and validation sets: 20% of the rows are held
# out for validation, and the fixed random_state keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data.drop(['score', 'id'], axis=1),  # features: every column except 'score' and 'id'
    train_data['score'],                       # target values
    test_size=0.2,
    random_state=42
)

# A notebook cell only renders its final bare expression, so a standalone
# `X_train.head()` followed by `y_train.head()` silently drops the first
# preview. Show both explicitly; `display` is injected by the IPython/Jupyter
# kernel, so no import is needed inside a notebook.
display(X_train.head(), y_train.head())

In [None]:
# Try a grid of values to find the best hyperparameters
def train_and_evaluate(X_train, y_train, X_valid, y_valid, max_depth, learning_rate, n_estimators):
    """Fit an XGBoost regressor and return its RMSE on the validation data.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_valid: Validation features passed to ``predict``.
        y_valid: Validation targets the predictions are scored against.
        max_depth: Forwarded to ``XGBRegressor`` as ``max_depth``.
        learning_rate: Forwarded to ``XGBRegressor`` as ``learning_rate``.
        n_estimators: Forwarded to ``XGBRegressor`` as ``n_estimators``.

    Returns:
        The square root of the mean squared error between ``y_valid`` and the
        model's predictions for ``X_valid``.
    """
    # Fixed objective and seed; the three tunable settings come from the caller.
    regressor_settings = {
        'objective': 'reg:squarederror',
        'seed': 42,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'n_estimators': n_estimators,
    }
    regressor = xgb.XGBRegressor(**regressor_settings)
    regressor.fit(X_train, y_train)

    validation_mse = mean_squared_error(y_valid, regressor.predict(X_valid))
    return np.sqrt(validation_mse)

# Running record of the best configuration seen so far. It starts with an
# infinite RMSE and placeholder zeros, so the first evaluated combination
# always replaces it.
lowest_rmse = float('inf')
best_hyperparams = {'max_depth': 0, 'learning_rate': 0, 'n_estimators': 0}

# Exhaustive search over 12 depths (5..16), 14 learning rates (0.01..0.40 in
# steps of 0.03) and 34 tree counts (50..149 in steps of 3); one model is fit
# and scored on the validation split per combination.
# NOTE(review): each (depth, rate) pair is refit from scratch for every
# n_estimators value. Fitting once at the largest tree count and scoring
# prefixes of it would presumably give the same RMSEs much faster - confirm
# against the installed xgboost version before changing.
for depth in range(5, 17):
    for rate in np.arange(0.01, 0.41, 0.03):
        for n_estimators in range(50, 151, 3):
            rmse = train_and_evaluate(
                X_train, y_train, X_valid, y_valid,
                max_depth=depth,
                learning_rate=rate,
                n_estimators=n_estimators,
            )

            # Only a strictly lower RMSE replaces the incumbent; ties keep the
            # earlier combination.
            if rmse < lowest_rmse:
                lowest_rmse = rmse
                best_hyperparams.update(
                    max_depth=depth,
                    learning_rate=rate,
                    n_estimators=n_estimators,
                )

            # Progress line for every combination, including the best RMSE so far.
            print(f"Max Depth: {depth}, Learning Rate: {rate:.2f}, n_estimators: {n_estimators}, RMSE: {rmse:.4f}, Lowest RMSE: {lowest_rmse:.4f}")