In [1]:
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# 1. Building a model

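Build a linear regression model that works with any number of features and is trained with gradient descent. The parameter updates follow the gradient of the mean squared error, while the sum of the differences between the predicted and the true values is the cost used to decide when training has converged.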

In [2]:
class MyLinearRegression:
    """Linear regression trained with batch gradient descent.

    Attributes:
        params: Model weights. An empty list until the model is fitted, then a
            NumPy array of shape ``(n_columns, 1)``. When ``fit`` adds the bias
            column, row 0 holds the intercept and the remaining rows hold the
            feature coefficients.
    """

    def __init__(self):
        # Weights are created in fit(), once the number of columns is known.
        self.params = []

    def get_params(self):
        """Return the current parameters (the same object as ``self.params``).

        This replaces the former ``params()`` method, which could never be
        called because the ``params`` instance attribute shadowed it.
        """
        return self.params

    def cost(self, true, pred):
        """Return the signed sum of residuals, ``sum(pred - true)``.

        Positive and negative residuals cancel, so this is not a squared-error
        loss. ``gradient_desc`` only uses the change of this value between two
        iterations as its stopping signal.
        """
        return np.sum(pred - true)

    def _gradient_step(self, X, y, y_pred, learning_rate):
        """Apply one simultaneous update to all parameters.

        The update direction is ``X^T (y_pred - y) / n_samples``, i.e. the
        gradient of half the mean squared error.
        """
        residuals = np.ravel(y_pred) - y
        gradient = X.T.dot(residuals) / X.shape[0]
        params = np.asarray(self.params, dtype=float)
        # Keep whatever shape the parameters already have (normally (k, 1)).
        self.params = params - learning_rate * gradient.reshape(params.shape)

    def gradient_desc(self, X, y, iterations=None, learning_rate=0.1, tolerance=0.001):
        """Optimise ``self.params`` with batch gradient descent.

        Args:
            X: 2-D design matrix whose column count equals ``len(self.params)``.
            y: Targets, one per row of ``X``. Flattened internally, so both
                ``(n,)`` and ``(n, 1)`` are accepted.
            iterations: Number of update steps. If ``None``, steps are repeated
                until ``cost`` changes by less than ``tolerance`` between two
                consecutive iterations.
            learning_rate: Step size of each update.
            tolerance: Stopping threshold used when ``iterations`` is ``None``.

        Raises:
            ValueError: If ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        # Flatten the targets: a (n, 1) column would otherwise broadcast against
        # the (1, n) prediction into an (n, n) matrix and corrupt the gradient.
        y = np.ravel(y)
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                "X has %d samples but y has %d" % (X.shape[0], y.shape[0])
            )

        if iterations is not None:
            for _ in range(iterations):
                self._gradient_step(X, y, self.predict(X), learning_rate)
            return

        previous_cost = 0
        y_pred = self.predict(X)
        current_cost = self.cost(y, y_pred)
        while np.abs(previous_cost - current_cost) >= tolerance:
            previous_cost = current_cost
            self._gradient_step(X, y, y_pred, learning_rate)
            y_pred = self.predict(X)
            current_cost = self.cost(y, y_pred)

    def fit(self, X, y, iterations=None, learning_rate=0.1, add_bias_term=True, tolerance=0.001):
        """Fit a linear model with gradient descent.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.
            y: Targets, one per sample.
            iterations: Number of gradient steps. If ``None``, gradient descent
                repeats until the change in the cost function is below
                ``tolerance``.
            learning_rate: Step size of each update.
            add_bias_term: If true, a column of ones is prepended to ``X`` so
                that the first parameter acts as the intercept.
            tolerance: Stopping threshold used when ``iterations`` is ``None``.
        """
        X = np.asarray(X)
        if add_bias_term:
            X = np.append(np.ones([X.shape[0], 1]), X, 1)

        # (Re)initialise randomly only when the stored parameters do not match
        # the design matrix; matching parameters are kept and trained further.
        if len(self.params) != X.shape[1]:
            self.params = np.random.rand(X.shape[1], 1)

        self.gradient_desc(X, y, iterations, learning_rate, tolerance)

    def predict(self, X):
        """Predict using the current parameters.

        Args:
            X: 2-D array. It may have either as many columns as there are
                parameters, or exactly one fewer, in which case a leading
                column of ones (the bias term) is added.

        Returns:
            ``params^T . X^T``; for parameters of shape ``(k, 1)`` this is an
            array of shape ``(1, n_samples)``.

        Raises:
            ValueError: If the model has no parameters yet or the number of
                columns of ``X`` is incompatible with the parameters.
        """
        X = np.asarray(X)
        n_params = len(self.params)
        if n_params == 0:
            raise ValueError("The model has no parameters yet; call fit() first.")

        if X.shape[1] != n_params:
            if X.shape[1] + 1 != n_params:
                raise ValueError(
                    "X has %d columns but the model expects %d (or %d without the bias column)"
                    % (X.shape[1], n_params, n_params - 1)
                )
            # One column short: add the bias column of ones.
            X = np.append(np.ones([X.shape[0], 1]), X, 1)

        return np.dot(np.transpose(self.params), np.transpose(X))

# 2. Generate some sample test data

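Generate synthetic data from a linear function of 3 variables, $y = 5x_{1} + 3x_{2} + 2x_{3} + 9$, where each feature is drawn uniformly from $[0, 1)$ and standard normal noise is added to $y$. The same generator produces both the training and the test set.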

In [3]:
def generate_data(samples, seed=None):
    """Generate noisy samples of ``y = 5*x1 + 3*x2 + 2*x3 + 9``.

    Args:
        samples: Number of rows to generate.
        seed: Optional seed for a private random generator, making the result
            reproducible. If ``None`` (the default), NumPy's global random
            state is used.

    Returns:
        A tuple ``(x, y)``: ``x`` has shape ``(samples, 3)`` with values drawn
        uniformly from ``[0, 1)``; ``y`` has shape ``(samples,)`` and is the
        linear function of ``x`` plus standard normal noise.
    """
    # The np.random module and RandomState expose the same rand/normal API, so
    # the unseeded path draws from the global state as before.
    rng = np.random if seed is None else np.random.RandomState(seed)
    x = rng.rand(samples, 3)
    noise = rng.normal(0, 1, samples)
    y = 5 * x[:, 0] + 3 * x[:, 1] + 2 * x[:, 2] + 9 + noise
    return x, y

In [4]:
# Seed NumPy's global generator before the first random draw so that the
# generated data, and every later np.random call in this notebook, is
# reproducible under Restart & Run All.
np.random.seed(42)
x_train, y_train = generate_data(300)

In [5]:
# Held-out set of 100 samples, drawn from the same generator as the training data.
x_test, y_test = generate_data(100)

# 3. Compare to scikit-learn model

Evaluate the model on the training and test sets, and compare its coefficients and $R^2$ scores with scikit-learn's `LinearRegression` fitted on the same training data.

In [6]:
# Train the gradient-descent model with its default settings.
my_lr = MyLinearRegression()
my_lr.fit(x_train, y_train)

# predict() returns a 2-D array with a single row; [0] extracts the flat
# vector of predictions expected by r2_score.
train_predictions, test_predictions = (
    my_lr.predict(features)[0] for features in (x_train, x_test)
)

In [7]:
# Reference model. fit_intercept is a constructor option; the third positional
# argument of LinearRegression.fit is sample_weight, so nothing else is passed
# to fit().
sk_lr = LinearRegression(fit_intercept=True)
sk_lr.fit(x_train, y_train)
sk_train_pred = sk_lr.predict(x_train)
sk_test_pred = sk_lr.predict(x_test)

In [8]:
# my_lr.params is a single column: row 0 is the intercept (the bias column is
# prepended during fit), the remaining rows are the feature coefficients.
print("My model:")
print("Coefficients: ", my_lr.params[1:, 0], " Intercept: ", my_lr.params[0, 0])
print("Train r2 score: ", r2_score(y_train, train_predictions))
print("Test r2 score: ", r2_score(y_test, test_predictions))

My model:
Coefficients:  [5.11562903 3.34761631 1.96525477]  Intercept:  8.679908055055748
Train r2 score:  0.74645630985797
Test r2 score:  0.7850681908719949


In [9]:
# Same report for the scikit-learn reference model, for a side-by-side comparison.
print("Sklearn model:")
print(
    "Coefficients: ", sk_lr.coef_,
    " Intercept: ", sk_lr.intercept_,
)
print("Train r2 score: ", r2_score(y_train, sk_train_pred))
print("Test r2 score: ", r2_score(y_test, sk_test_pred))

Sklearn model:
Coefficients:  [5.10284578 3.33809439 1.94927766]  Intercept:  8.699324100478396
Train r2 score:  0.7464653597244935
Test r2 score:  0.7852865905779518
