In [82]:
import numpy as np
import pandas as pd

In [83]:
# Load the raw dataset from the notebook-relative data directory.
df = pd.read_csv('./data/student_habits_performance.csv')

# Features are every column except the first and the last; the target is the
# last column.
# NOTE(review): the first column is presumably a row identifier — confirm.
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

In [84]:
# Partition the feature names by dtype: the numeric group is scaled and the
# categorical group is one-hot encoded further down.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

In [85]:
# Show the numeric feature names. The `dtype='object'` in the output describes
# the Index of column labels (strings), not the dtype of the columns themselves.
numerical_cols


Index(['age', 'study_hours_per_day', 'social_media_hours', 'netflix_hours',
       'attendance_percentage', 'sleep_hours', 'exercise_frequency',
       'mental_health_rating'],
      dtype='object')

In [86]:
# Show the categorical feature names (columns with object/category dtype).
categorical_cols

Index(['gender', 'part_time_job', 'diet_quality', 'parental_education_level',
       'internet_quality', 'extracurricular_participation'],
      dtype='object')

In [87]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [88]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

scaler = StandardScaler()
encoder = OneHotEncoder(drop= 'first', sparse_output= False)

# Work on explicit copies. The frames returned by train_test_split are row
# selections of X, and assigning columns into such a selection is the classic
# trigger for pandas' SettingWithCopyWarning (whether it fires depends on the
# pandas version / copy-on-write mode). Copying makes the intent unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

# Scaling statistics are learned from the training rows only and then reused
# for the test rows, so no information from the test set leaks into the fit.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Same fit-on-train / apply-to-test discipline for the one-hot encoder.
# drop='first' removes one indicator per feature to avoid perfectly collinear
# columns; sparse_output=False yields a dense ndarray.
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Names of the generated indicator columns, in the order the encoder emits them.
encoded_cols = encoder.get_feature_names_out(categorical_cols)

# The raw categorical columns are now represented by the encoded arrays.
X_train = X_train.drop(columns= categorical_cols)
X_test = X_test.drop(columns= categorical_cols)

In [89]:
# Wrap the encoder output in DataFrames that share the row index of the
# corresponding numeric frame, so the column-wise concat below aligns rows.
X_train_encoded_cols = pd.DataFrame(
    X_train_encoded, index=X_train.index, columns=encoded_cols
)
X_test_encoded_cols = pd.DataFrame(
    X_test_encoded, index=X_test.index, columns=encoded_cols
)

# Put the remaining columns and the one-hot columns side by side, then hand
# plain NumPy arrays to the model. From here on X_train / X_test are ndarrays.
X_train = pd.concat((X_train, X_train_encoded_cols), axis=1).to_numpy()
X_test = pd.concat((X_test, X_test_encoded_cols), axis=1).to_numpy()

In [90]:
class LinearRegression():
    """Linear regression trained with batch gradient descent on the MSE.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to each gradient.
    max_iteration : int, default 1000
        Upper bound on the number of gradient steps.
    tolerance : float, default 1e-6
        Training stops early once every parameter update (weights and bias)
        is smaller than this value in absolute terms.
    random_state : int or None, default None
        Seed for the random weight initialisation. ``None`` draws from
        NumPy's global RNG via ``np.random.randn``.
    verbose : bool, default True
        Print the current MSE before every gradient step.

    Attributes
    ----------
    weights : ndarray of shape (n_features,) or None
        Learned coefficients; ``None`` until ``fit`` has been called.
    bias : float
        Learned intercept.
    loss_history : list of float
        MSE measured before each gradient step of the most recent ``fit``.
    """

    def __init__(self, learning_rate= 0.01, max_iteration= 1000, tolerance= 1e-6,
                 random_state= None, verbose= True):
        self.weights = None
        self.bias = 0.0
        self.learning_rate = learning_rate
        self.max_iteration = max_iteration
        self.tolerance = tolerance
        self.random_state = random_state
        self.verbose = verbose
        self.loss_history = []

    def fit(self, X_train, y_train):
        """Fit weights and bias to the training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        list
            ``[weights, bias]`` after training.

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D, is empty, or its row count does not
            match the length of ``y_train``.
        """
        X = np.asarray(X_train, dtype=float)
        # Flatten the target: an (n, 1) column would otherwise broadcast
        # against the (n,) predictions into an (n, n) error matrix.
        y = np.asarray(y_train, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f'X_train must be 2-dimensional, got {X.ndim} dimension(s)')
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError('X_train must contain at least one sample')
        if y.shape[0] != n_samples:
            raise ValueError(
                f'X_train has {n_samples} rows but y_train has {y.shape[0]} values'
            )

        if self.random_state is None:
            # Keep drawing from the global RNG so np.random.seed(...) still
            # controls the initialisation for existing callers.
            self.weights = np.random.randn(n_features)
        else:
            self.weights = np.random.default_rng(self.random_state).standard_normal(n_features)
        # Start every fit from the same intercept; otherwise a refit would mix
        # fresh random weights with the bias learned by the previous fit.
        self.bias = 0.0
        self.loss_history = []

        for _ in range(self.max_iteration):
            error = X @ self.weights + self.bias - y

            mse = np.mean(error ** 2)
            self.loss_history.append(float(mse))
            if self.verbose:
                print(f'MSE: {mse}')

            # Gradients of the MSE. The matrix product is evaluated first so
            # the full design matrix is not rescaled on every iteration.
            dW = (2 / n_samples) * (X.T @ error)
            dB = (2 / n_samples) * np.sum(error)

            update_w = self.learning_rate * dW
            update_b = self.learning_rate * dB

            # Converged: the pending step would change no parameter by more
            # than the tolerance, so stop without applying it.
            if np.all(np.abs(update_w) < self.tolerance) and (abs(update_b) < self.tolerance):
                break

            self.weights -= update_w
            self.bias -= update_b

        return [self.weights, self.bias]

    def predict(self, X_test):
        """Return ``X_test @ weights + bias`` for each row of ``X_test``."""
        return X_test @ self.weights + self.bias

    def score(self, X_test, y_test):
        """Return the coefficient of determination (R^2) on the given data.

        When the target is constant the total sum of squares is zero and R^2
        is undefined; in that case 1.0 is returned for a perfect prediction
        and 0.0 otherwise, instead of dividing by zero.
        """
        y_true = np.asarray(y_test, dtype=float).ravel()
        y_pred = np.asarray(self.predict(X_test), dtype=float).ravel()

        rss = np.sum((y_true - y_pred) ** 2)
        tss = np.sum((y_true - np.mean(y_true)) ** 2)
        if tss == 0:
            return 1.0 if rss == 0 else 0.0
        return 1 - rss / tss


In [91]:
# Seed NumPy's global RNG: fit() draws its initial weights from it, so without
# a seed the printed MSE trace and the final score change on every run.
np.random.seed(0)

model = LinearRegression()
model.fit(X_train, y_train)
# R^2 on the held-out rows; as the last expression it becomes the cell output.
model.score(X_test, y_test)

MSE: 5260.989755905974
MSE: 4867.1014141561145
MSE: 4504.680839531426
MSE: 4171.188720280585
MSE: 3864.2914069839444
MSE: 3581.8442277187364
MSE: 3321.8761579065713
MSE: 3082.5757348107472
MSE: 2862.2781155898424
MSE: 2659.453186026708
MSE: 2472.6946345974084
MSE: 2300.709913477063
MSE: 2142.311014448682
MSE: 1996.405993532824
MSE: 1861.9911835321516
MSE: 1738.1440386244876
MSE: 1624.0165596761462
MSE: 1518.8292531168022
MSE: 1421.865580047934
MSE: 1332.4668557763719
MSE: 1250.0275631981199
MSE: 1173.9910464284794
MSE: 1103.845553804067
MSE: 1039.120601890093
MSE: 979.3836344303229
MSE: 924.2369522940323
MSE: 873.3148924191592
MSE: 826.2812355377689
MSE: 782.8268241116873
MSE: 742.6673734145298
MSE: 705.5414600821948
MSE: 671.2086737271524
MSE: 639.4479183816802
MSE: 610.0558516100087
MSE: 582.845450116823
MSE: 557.6446915868446
MSE: 534.2953433237974
MSE: 512.6518490229287
MSE: 492.5803057149096
MSE: 473.9575235654475
MSE: 456.6701618089214
MSE: 440.6139346401059
MSE: 425.692881389453

np.float64(0.8883907177629746)