In [82]:
import pandas as pd

In [83]:
# Load the raw dataset, then split it positionally: every column except the
# first and the last becomes a feature, and the last column is the target.
# NOTE(review): column 0 is skipped -- presumably a row identifier; confirm
# against the CSV header.
df = pd.read_csv('./data/student_habits_performance.csv')
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

In [84]:
# Partition the feature columns by dtype: object/category columns are treated
# as categorical, every remaining column as numerical.
_non_numeric_dtypes = ['object', 'category']
categorical_cols = X.select_dtypes(include=_non_numeric_dtypes).columns
numerical_cols = X.select_dtypes(exclude=_non_numeric_dtypes).columns



In [85]:
# Show both column groups, one Index per line, as a sanity check of the split.
print(categorical_cols, numerical_cols, sep='\n')

Index(['gender', 'part_time_job', 'diet_quality', 'parental_education_level',
       'internet_quality', 'extracurricular_participation'],
      dtype='object')
Index(['age', 'study_hours_per_day', 'social_media_hours', 'netflix_hours',
       'attendance_percentage', 'sleep_hours', 'exercise_frequency',
       'mental_health_rating'],
      dtype='object')


In [86]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
_split_options = {'test_size': 0.2, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

In [87]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [88]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Preprocessing: standardise the numerical features and one-hot encode the
# categorical ones (dropping the first level of each feature).
# handle_unknown='ignore' stops transform() from raising when a validation
# fold or the test split contains a category that was absent from the data
# the encoder was fitted on; such values are encoded as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numerical_cols),
        (
            'categorical',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            categorical_cols,
        ),
    ],
    remainder='passthrough',
)

# One dict per candidate model; GridSearchCV explores each dict independently.
# NOTE: the pipeline step is called 'classifier' although every candidate is a
# regressor; the name is kept so parameter keys and the printed pipeline stay
# unchanged.
grid_params = [
    {
        'classifier': [LinearRegression()],
    },
    {
        'classifier': [DecisionTreeRegressor(random_state=0)],
        'classifier__criterion': ['squared_error', 'absolute_error'],
    },
    # gamma is only used by the non-linear kernels, so the linear kernel gets
    # its own grid; crossing it with gamma would just refit the same model
    # once per gamma value.
    {
        'classifier': [SVR()],
        'classifier__kernel': ['linear'],
    },
    {
        'classifier': [SVR()],
        'classifier__kernel': ['rbf'],
        'classifier__gamma': ['scale', 0.01, 0.1, 1],
    },
    {
        'classifier': [RandomForestRegressor(random_state=0)],
        'classifier__criterion': ['squared_error', 'absolute_error'],
    },
    {
        # Seeded explicitly, like the other tree-based candidates.
        'classifier': [XGBRegressor(random_state=0)],
    },
]

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression()),  # placeholder, replaced by the grid
])

# 10-fold cross-validated search over all candidates, scored by R^2 and run on
# all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid_params,
    n_jobs=-1,
    cv=10,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

In [90]:
# Pull the winning pipeline, its parameter setting and its mean
# cross-validated score out of the finished search, then show the pipeline
# and the score.
best_model, best_params, best_score = (
    grid_search.best_estimator_,
    grid_search.best_params_,
    grid_search.best_score_,
)
print(best_model, best_score, sep='\n')


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical', StandardScaler(),
                                                  Index(['age', 'study_hours_per_day', 'social_media_hours', 'netflix_hours',
       'attendance_percentage', 'sleep_hours', 'exercise_frequency',
       'mental_health_rating'],
      dtype='object')),
                                                 ('categorical',
                                                  OneHotEncoder(drop='first'),
                                                  Index(['gender', 'part_time_job', 'diet_quality', 'parental_education_level',
       'internet_quality', 'extracurricular_participation'],
      dtype='object'))])),
                ('classifier', LinearRegression())])
0.8857106085154645


In [91]:
# Score the selected pipeline on the held-out test split (displayed as the
# cell's output).
best_model.score(X=X_test, y=y_test)


0.9115458370094476