In [1]:
import numpy as np
import pandas as pd
from preprocessing import Preprocessor
from asserts import asserts

In [2]:
# Load the raw dataset. Features are every column except the first and the
# last; the last column is used as the regression target.
df = pd.read_csv('./data/student_habits_performance.csv')
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

In [3]:
# Instantiate the project-local Preprocessor (imported from `preprocessing`).
# The same instance is reused later to transform hand-written sample rows.
preprocessor = Preprocessor()

In [4]:
# Run the project-local preprocessing on the features and target. The call
# returns four values, unpacked here as train/test features and targets.
# NOTE(review): the split ratio and any encoding/scaling are decided inside
# Preprocessor.preprocess — confirm there.
X_train, X_test, y_train, y_test = preprocessor.preprocess(X, y)

In [5]:
class LinearRegression():
    """Linear regression trained with (mini-)batch gradient descent on squared error.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    max_epochs : int
        Maximum number of passes over the training data.
    treshold : float
        Early-stopping tolerance: training stops once the mean squared error
        changes by less than this amount between two consecutive epochs.
        (The spelling is kept as-is for backward compatibility.)
    batch_size : int or None
        Number of samples per gradient step. ``None`` (or 0) means the whole
        training set is used for every step.
    random_state : int or None
        Seed for weight initialisation and per-epoch shuffling. ``None``
        (the default) draws from NumPy's global RNG, so ``np.random.seed``
        still controls the run exactly as before.

    Attributes
    ----------
    weights : numpy.ndarray or None
        Learned coefficients, one per feature; ``None`` until ``fit`` is called.
    bias : float
        Learned intercept.
    """

    def __init__(self, learning_rate = 0.01, max_epochs = 100, treshold = 1e-6, batch_size = None, random_state = None):
        self.weights = None
        self.bias = 0
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.treshold = treshold
        self.batch_size = batch_size
        self.random_state = random_state

    def fit(self, X_train, y_train):
        """Fit weights and bias by gradient descent.

        Parameters
        ----------
        X_train : array-like, two-dimensional (samples x features)
        y_train : array-like, one value per sample

        Returns
        -------
        list
            ``[weights, bias]`` after training.

        Raises
        ------
        ValueError
            If the training set contains no samples.
        """
        asserts(X_train, y_train)

        # Work on plain ndarrays: the positional fancy indexing used for
        # shuffling below is not positional on pandas objects.
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float)

        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set.")

        # Both np.random (global state) and RandomState expose randn/shuffle.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        self.weights = rng.randn(n_features)
        # Restart the intercept too, so a refit does not begin from a stale bias.
        self.bias = 0.0

        batch_size = self.batch_size if self.batch_size else n_samples
        previous_loss = float('inf')

        for _ in range(self.max_epochs):
            # Reshuffle every epoch so mini-batches differ between passes.
            indices = np.arange(n_samples)
            rng.shuffle(indices)
            X = X[indices]
            y = y[indices]
            total_loss = 0.0

            for start in range(0, n_samples, batch_size):
                samples = X[start: start + batch_size]
                true_vals = y[start: start + batch_size]

                dW, dB = self.gradient_descent(samples, true_vals)

                self.weights -= self.learning_rate * dW
                self.bias -= self.learning_rate * dB

                # Loss is measured with the freshly updated parameters.
                error = samples @ self.weights + self.bias - true_vals
                total_loss += np.sum(error ** 2)

            epoch_loss = total_loss / n_samples
            if abs(epoch_loss - previous_loss) < self.treshold:
                break
            previous_loss = epoch_loss

        return [self.weights, self.bias]

    def gradient_descent(self, samples, true_vals):
        """Return ``[dW, dB]``, the gradients of the batch mean squared error.

        ``dW`` is the gradient with respect to the weights and ``dB`` the
        gradient with respect to the bias, both evaluated at the current
        parameters.
        """
        n_samples = samples.shape[0]
        error = samples @ self.weights + self.bias - true_vals

        scale = 2 / n_samples
        # Do the matrix-vector product first; scaling the result is cheaper
        # than scaling the whole transposed batch.
        dW = scale * (samples.T @ error)
        dB = scale * np.sum(error)

        return [dW, dB]

    def predict(self, X_test):
        """Return predictions ``X_test @ weights + bias``.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.weights is None:
            raise ValueError("This LinearRegression instance is not fitted yet; call fit() first.")
        return X_test @ self.weights + self.bias

    def score(self, X_test, y_test):
        """Return the coefficient of determination (R^2) on the given data.

        When ``y_test`` has zero variance the usual ratio is undefined; in
        that case 1.0 is returned for a perfect prediction and 0.0 otherwise,
        instead of NaN or -inf.
        """
        y_true = np.asarray(y_test, dtype=float)
        rss = np.sum((y_true - self.predict(X_test)) ** 2)
        tss = np.sum((y_true - np.mean(y_true)) ** 2)
        if tss == 0:
            return 1.0 if rss == 0 else 0.0
        return 1 - rss/tss



In [None]:
# Seed NumPy's global RNG before training: the model draws its initial weights
# and shuffles the samples with np.random, so without a seed the score shown
# by this cell changes on every run.
np.random.seed(42)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
# R^2 of the custom model on the held-out test split.
regressor.score(X_test, y_test)

np.float64(0.8749327136796486)

In [7]:
# Hand-written sample rows in raw (un-encoded) form. They are deliberately not
# stored under the name `input`, which would shadow the builtin for the rest
# of the kernel session.
sample_rows = [
    [23, 'Female', 0.0, 1.2, 1.1, 'No', 85.0, 8.0, 'Fair', 6, 'Master', 'Average', 8, 'Yes'],
    [20, 'Female', 6.9, 2.8, 2.3, 'No', 97.3, 4.6, 'Good', 6, 'High School', 'Average', 8, 'No'],
    [21, 'Male', 1.4, 3.1, 1.3, 'No', 94.8, 8.0, 'Poor', 1, 'High School', 'Poor', 1, 'No'],
    [23, 'Female', 1.0, 3.9, 1.0, 'No', 71.0, 9.2, 'Poor', 4, 'Master', 'Good', 1, 'Yes'],
    [19, 'Female', 5.0, 4.4, 0.5, 'No', 90.9, 4.9, 'Fair', 3, 'Master', 'Good', 1, 'No'],
    [24, 'Male', 7.2, 1.3, 0.0, 'No', 82.9, 7.4, 'Fair', 1, 'Master', 'Average', 4, 'No'],
    [21, 'Female', 5.6, 1.5, 1.4, 'Yes', 85.8, 6.5, 'Good', 2, 'Master', 'Poor', 4, 'No'],
    [21, 'Female', 4.3, 1.0, 2.0, 'Yes', 77.7, 4.6, 'Fair', 0, 'Bachelor', 'Average', 8, 'No'],
    [23, 'Female', 4.4, 2.2, 1.7, 'No', 100.0, 7.1, 'Good', 3, 'Bachelor', 'Good', 1, 'No'],
    [18, 'Female', 4.8, 3.1, 1.3, 'No', 95.4, 7.5, 'Good', 5, 'Bachelor', 'Good', 10, 'Yes'],
    [19, 'Female', 4.6, 3.7, 0.8, 'No', 77.6, 5.8, 'Fair', 1, None, 'Good', 3, 'No'],
]
# Pass the raw rows through the preprocessor's transform_input.
sample_rows_transformed = preprocessor.transform_input(sample_rows)

# NOTE(review): the transformed sample rows are not used for prediction here;
# my_pred holds the custom model's predictions on the training set.
my_pred = regressor.predict(X_train)



In [8]:
# Reference model: scikit-learn's LinearRegression fitted on the same training data.
# Import the submodule explicitly: a bare `import sklearn` is not guaranteed to
# load `sklearn.linear_model`, so the attribute access below could fail with
# AttributeError depending on the installed version and prior imports.
import sklearn.linear_model

lr = sklearn.linear_model.LinearRegression()
lr.fit(X_train, y_train)
sklearn_pred = lr.predict(X_train)
# NOTE(review): this is the R^2 on the training set, while the custom model's
# score cell evaluates on X_test, so the two numbers are not directly comparable.
lr.score(X_train, y_train)

0.8959948778509672

In [9]:
# Count how many paired predictions from the two models differ by at most 10,
# then report that count and its share of all predictions.
count = sum(
    1
    for own, reference in zip(my_pred, sklearn_pred)
    if abs(own - reference) <= 10
)
print(f"Number of predictions within 10 distance: {count}")
print(count/len(my_pred))

Number of predictions within 10 distance: 744
0.992
