In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error, make_scorer

In [2]:
SEED = 42

### Read data

In [3]:
df = pd.read_csv('../data/study_habits.csv')

In [4]:
df.shape

(2000, 8)

In [5]:
df.head()

Unnamed: 0,Student_ID,Study_Hours_Per_Day,Extracurricular_Hours_Per_Day,Sleep_Hours_Per_Day,Social_Hours_Per_Day,Physical_Activity_Hours_Per_Day,GPA,Stress_Level
0,1,6.9,3.8,8.7,2.8,1.8,2.99,Moderate
1,2,5.3,3.5,8.0,4.2,3.0,2.75,Low
2,3,5.1,3.9,9.2,1.2,4.6,2.67,Low
3,4,6.5,2.1,7.2,1.7,6.5,2.88,Moderate
4,5,8.1,0.6,6.5,2.2,6.6,3.51,High


### Feature engineering

In [6]:
# Drop Student_ID column
del df['Student_ID']

In [7]:
# Column names to lower case
df.columns = df.columns.str.lower()

In [8]:
# Separate features and target variable
X = df.copy()
del X['gpa']
y = df['gpa']

In [9]:
numerical_features = list(X.select_dtypes(exclude=["object"]).columns)
numerical_features

['study_hours_per_day',
 'extracurricular_hours_per_day',
 'sleep_hours_per_day',
 'social_hours_per_day',
 'physical_activity_hours_per_day']

In [10]:
categorical_features = list(X.select_dtypes(include=["object"]).columns)
categorical_features

['stress_level']

In [11]:
# Scale numeric feature
num_scaler = StandardScaler()

In [12]:
# Apply one-hot encoding for categorical features
cat_encoder = OneHotEncoder(handle_unknown='error')

In [13]:
# Combine transformation steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_scaler, numerical_features),
        ('cat', cat_encoder, categorical_features),
    ],
    remainder='passthrough' # Keep other columns as they are (if any)
)

In [14]:
X = preprocessor.fit_transform(X)
feature_names = [f.lower() for f in preprocessor.get_feature_names_out()]
X = pd.DataFrame(X, columns=feature_names)

In [15]:
X

Unnamed: 0,num__study_hours_per_day,num__extracurricular_hours_per_day,num__sleep_hours_per_day,num__social_hours_per_day,num__physical_activity_hours_per_day,cat__stress_level_high,cat__stress_level_low,cat__stress_level_moderate
0,-0.404487,1.566246,0.820734,0.056543,-1.005896,0.0,0.0,1.0
1,-1.528451,1.306633,0.341473,0.885882,-0.528470,0.0,1.0,0.0
2,-1.668947,1.652784,1.163063,-0.891273,0.108097,0.0,1.0,0.0
3,-0.685478,0.095105,-0.206253,-0.595080,0.864021,0.0,0.0,1.0
4,0.438487,-1.202961,-0.685514,-0.298888,0.903806,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1995,-0.685478,-1.549111,-0.069322,-0.358126,1.381232,0.0,0.0,1.0
1996,-0.825974,0.700869,0.889200,-0.713557,0.108097,0.0,0.0,1.0
1997,-0.896221,-1.722187,-0.890911,-1.128226,2.574795,0.0,0.0,1.0
1998,0.438487,-1.116423,0.067610,0.471212,-0.090830,1.0,0.0,0.0


### Train/test split

In [16]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train_val.shape, X_test.shape, y_train_val.shape, y_test.shape

((1600, 8), (400, 8), (1600,), (400,))

### Set up cross-validation framework

In [17]:
def run_cross_validation(model, X, y):
    """Run k-fold cross validation with 5-folds, use RMSE for scoring, return RSMEs mean value"""
    scores = cross_val_score(model, X, y, scoring=make_scorer(root_mean_squared_error), cv=5, n_jobs=-1, verbose=3)
    return np.mean(scores), np.std(scores)

In [18]:
models_evaluation = []

In [19]:
def log_model_performance(model, **kwargs):
    """Save model name, parameters and performance metrics"""
    models_evaluation.append({
        'model_name': model.__class__.__name__,
        'parameters': model.get_params(),
        **kwargs
    })

In [20]:
def evaluate_model(model):
    """Orchestrate model evaluation:
        - run cross-validation with 5-folds
        - run final training on full dataset
        - evaluate final model performance
        - store model performance metrics
    """
    # Model cross-validation
    rmse_mean, rmse_std = run_cross_validation(model, X_train_val, y_train_val)

    # Final training on full data set
    model.fit(X_train_val, y_train_val)

    # Evaluate final model
    y_pred = model.predict(X_test)
    rmse_test = root_mean_squared_error(y_test, y_pred)

    # Log model performance
    log_model_performance(model, rmse_mean=rmse_mean, rmse_std=rmse_std, rmse_test=rmse_test)

### Train baseline model - Linear Regression

In [21]:
model = LinearRegression()
evaluate_model(model)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.8s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.8s finished


In [22]:
# Explore model coefficients
for k, v in zip(['intercept'] + feature_names, [model.intercept_] + list(model.coef_)):
    print(f'{k}: {round(v, 3)}')

intercept: 3.116
num__study_hours_per_day: 0.188
num__extracurricular_hours_per_day: -0.035
num__sleep_hours_per_day: -0.032
num__social_hours_per_day: -0.033
num__physical_activity_hours_per_day: -0.05
cat__stress_level_high: 0.006
cat__stress_level_low: 0.011
cat__stress_level_moderate: -0.017


### Linear regression with regularization

In [23]:
for r in [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 100]:
    model = Ridge(alpha=r)
    evaluate_model(model)

r=0.0001


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_

r=0.001
r=0.01
r=0.1
r=1
r=5


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s


r=10
r=100


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.0s finished


### Random forest

In [25]:
rf_params = {
    'criterion': 'squared_error',
    'random_state': SEED,
    'n_jobs': 8,
    'verbosity': 1,
}

In [24]:
%%capture
for n_estimators in [5, 10, 20, 50, 100, 150, 200]:
    for max_depth in [2, 5, 10, 15, 20, 50, 100]:
        for min_samples_leaf in [1, 2, 5, 10, 15, 30]:

            rf_params['n_estimators'] = n_estimators
            rf_params['max_depth'] = max_depth
            rf_params['min_samples_leaf'] = min_samples_leaf

            model = RandomForestRegressor(**rf_params)
            evaluate_model(model)

### XGBoost

In [26]:
xgb_params = {
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': SEED,
    'verbosity': 1,
}

In [27]:
%%capture
for n_estimators in [10, 20, 50, 100, 150, 200]:
    for learning_rate in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
        for booster in ['gbtree', 'gblinear', 'dart']:

            xgb_params['n_estimators'] = n_estimators
            xgb_params['learning_rate'] = learning_rate
            xgb_params['booster'] = booster

            model = xgb.XGBRegressor(**xgb_params)

            evaluate_model(model)

[CV] END ................................ score: (test=0.199) total time=   0.0s
[CV] END ................................ score: (test=0.199) total time=   0.0s
[CV] END ................................ score: (test=0.201) total time=   0.0s
[CV] END ................................ score: (test=0.199) total time=   0.0s
[CV] END ................................ score: (test=0.199) total time=   0.0s
[CV] END ................................ score: (test=0.203) total time=   0.0s
[CV] END ................................ score: (test=0.203) total time=   0.1s
[CV] END ................................ score: (test=0.217) total time=   0.0s
[CV] END ................................ score: (test=0.203) total time=   0.0s
[CV] END ................................ score: (test=0.203) total time=   0.0s
[CV] END ................................ score: (test=0.205) total time=   0.0s
[CV] END ................................ score: (test=0.223) total time=   0.0s
[CV] END ...................

### Select best model

In [48]:
pd.set_option('display.float_format', '{:.15f}'.format)

In [56]:
models_evaluation_df = (
    pd.DataFrame(models_evaluation)
    .sort_values(by='rmse_test', inplace=False, ascending=True)  # model with min RMSE on test data
)

In [57]:
models_evaluation_df.head(5)

Unnamed: 0,model_name,parameters,rmse_mean,rmse_std,rmse_test
0,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.20212763154624,0.006154429166672,0.205571087635788
1,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.202127665333918,0.0061544444023,0.205571103212251
2,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.202127654823117,0.006154450591738,0.20557111268538
3,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.20212766007942,0.006154448451745,0.205571119462857
4,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.202127652187381,0.006154453086291,0.205571121023709


In [60]:
models_evaluation_df.iloc[0].to_dict()['model_name']

'XGBRegressor'

In [63]:
{
    k: v for k, v in models_evaluation_df.iloc[0].to_dict()['parameters'].items()
    if k in ['n_estimators', 'learning_rate', 'booster']
}

{'booster': 'gblinear', 'learning_rate': 1, 'n_estimators': 20}