In [1]:
#!/usr/bin/python3
import pandas as pd
import numpy as np

In [2]:
class InvalidArgumentTypeError(Exception):
    '''
        Description: Signals that an argument handed to a function is of a
                     type the function does not support
    '''

In [3]:
class CustomLinearRegression:
    '''
        Description: Ordinary least squares linear regression with an intercept term.
                     After calling fit, `coef_` holds one weight per feature and
                     `intercept_` holds the constant term.
    '''
    def check_input(self, X, y=None):
        '''
            Description: Checks if the input is a Pandas DataFrame,
                         Pandas Series or Numpy array.
                         An invalid argument is reported by printing a message;
                         no exception propagates to the caller.
            
            Parameters: X: Data
                        y: Target values (optional, None is accepted)
                        
            Returns: False if input is not valid,
                     True otherwise
        '''
        # The exception is raised and caught locally on purpose: the caller only
        # ever sees a printed diagnostic and a boolean result.
        try:
            if not isinstance(X, (pd.Series, pd.DataFrame, np.ndarray)):
                raise InvalidArgumentTypeError
        except InvalidArgumentTypeError:
            print("InvalidArgumentTypeError: X required as a DataFrame, Series or a Numpy array")
            print("Found type: {datatype_x}".format(datatype_x=type(X)))
            return False
        try:
            if y is not None and not isinstance(y, (pd.Series, pd.DataFrame, np.ndarray)):
                raise InvalidArgumentTypeError
        except InvalidArgumentTypeError:
            print("InvalidArgumentTypeError: y required as a DataFrame, Series or a Numpy array")
            print("Found type: {datatype_y}".format(datatype_y=type(y)))
            return False
        return True
    
    def transform_input(self, X, y=None):
        '''
            Description: Transforms the input into numpy arrays before fitting the model
            
            Parameters: X: Data
                        y: Target values
                        
            Returns: If input is of valid format:
                         X: Returns data X as a 2D numpy array. A Series or a
                            1D numpy array is treated as a single feature column
                         y: Returns target y as a numpy array. A Series becomes a
                            2D column, a DataFrame becomes its 2D values and a
                            numpy array is returned unchanged
                     Otherwise returns the input as it is
                     Only X is returned when y is None, otherwise the tuple (X, y)
        '''
        if self.check_input(X, y):
            # Convert X into numpy array
            if isinstance(X, (pd.DataFrame, pd.Series)):
                X = X.values
            # A 1D array (from a Series or passed directly) is one feature
            # observed n times, so make it an (n, 1) column. Without this,
            # inserting the intercept column along axis=1 fails.
            if X.ndim == 1:
                X = X[..., None]
            # Convert y into numpy array
            if isinstance(y, pd.DataFrame):
                y = y.values
            elif isinstance(y, pd.Series):
                y = y.values[..., None]
        if y is None:
            return X
        return X, y
        
    def fit(self, X, y):
        '''
            Description: Finds the line of best fit for the given training set
                         using the ordinary least squares estimator.
                         It uses the training data to find the beta coefficients needed
                         to minimize the least squared error.
                         The system is solved with numpy.linalg.lstsq rather than by
                         inverting X.T @ X, so collinear or constant features do
                         not raise a singular matrix error; in that case the
                         minimum-norm solution is used.
                         
            Parameters: X: Training data
                           Type: Numpy array, Pandas Series or Pandas DataFrame
                        y: Target values
                           Type: Numpy array, Pandas Series or Pandas DataFrame
                         
            Returns: self: Returns an instance of self
        '''
        X, y = self.transform_input(X, y)
        # Prepend a column of ones so the first coefficient is the intercept
        X = np.insert(X, 0, values=1, axis=1)
        solution = np.linalg.lstsq(X, y, rcond=None)[0]
        # y may be 1D or an (n, 1) column; flatten so both give a 1D vector
        coef = np.asarray(solution).ravel()
        self.coef_ = coef[1:]
        self.intercept_ = coef[0]
        return self
    
    def predict(self, X):
        '''
            Description: Predict target values using linear model
            
            Parameters: X: Sample data
                           Type: Numpy array, Pandas Series or Pandas DataFrame
                           
            Returns: C: Predicted target values
                        Type: Numpy array
                        Shape: (n_samples,)
        ''' 
        X = self.transform_input(X)
        # Same result as prepending a ones column and multiplying by
        # [intercept_, *coef_], without copying the whole design matrix
        return np.dot(X, self.coef_) + self.intercept_

In [4]:
# Loading the dataset
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably needs scikit-learn < 1.2 — confirm the environment.
from sklearn.datasets import load_boston
dataset = load_boston()
# Print the description text bundled with the dataset under the 'DESCR' key.
print(dataset['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
# Split the dataset into training and testing
# Features and target are read from the loaded dataset by key.
X = dataset['data']
y = dataset['target']
from sklearn.model_selection import train_test_split
# test_size=0.44 holds out 44% of the rows for testing; random_state=1 fixes the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.44, random_state=1)

In [6]:
# Predicting target values using custom CustomLinearRegression class
my_lm = CustomLinearRegression()
# fit returns the model itself, so predict can be chained onto it; the
# predictions for the held-out rows are this cell's displayed output.
my_lm.fit(X_train, y_train).predict(X_test)

array([32.8867694 , 28.46566481, 18.16621576, 21.1967802 , 18.56246685,
       20.6835774 , 33.09820187, 18.54410492, 24.19037742, 26.68701831,
       27.06060251, 29.14244418, 20.6059348 , 26.722208  , 23.29994681,
       20.3051452 , 18.00058329, 38.92546517, 30.27596485,  8.72397885,
       20.63372329, 15.71751444, 25.42392581, 24.93327124, 31.33214768,
       10.2543323 , 14.14557266, 16.61517789, 36.4267268 , 14.66090088,
       21.72141346, 14.21810835, 43.91449092, 18.44885203, 21.05740924,
       20.42176576, 17.89888538, 27.58573935,  8.91105474, 19.86848526,
       23.52645586, 21.56305687, 29.30661444, 16.35790226, 19.49660144,
       14.59663418, 39.25920327, 18.32396592, 25.07593871, 19.79907719,
       25.43173919, 24.39329413, 25.0571002 , 26.08721963,  4.57552125,
       24.59425862, 11.19163896, 26.60077484, 17.31995465, 36.34426546,
       19.85024806, 27.43763258, 16.69395337, 18.17546413, 10.99515984,
       32.25585006, 37.2390627 , 21.54988422, 24.12265621, 24.60

In [7]:
# Predicting target values using sklearn LinearRegression class
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
# Fitted and evaluated on the same train/test split as the custom model above,
# so the two displayed prediction arrays can be compared directly.
lm.fit(X_train, y_train).predict(X_test)

array([32.8867694 , 28.46566481, 18.16621576, 21.1967802 , 18.56246685,
       20.6835774 , 33.09820187, 18.54410492, 24.19037742, 26.68701831,
       27.06060251, 29.14244418, 20.6059348 , 26.722208  , 23.29994681,
       20.3051452 , 18.00058329, 38.92546517, 30.27596485,  8.72397885,
       20.63372329, 15.71751444, 25.42392581, 24.93327124, 31.33214768,
       10.2543323 , 14.14557266, 16.61517789, 36.4267268 , 14.66090088,
       21.72141346, 14.21810835, 43.91449092, 18.44885203, 21.05740924,
       20.42176576, 17.89888538, 27.58573935,  8.91105474, 19.86848526,
       23.52645586, 21.56305687, 29.30661444, 16.35790226, 19.49660144,
       14.59663418, 39.25920327, 18.32396592, 25.07593871, 19.79907719,
       25.43173919, 24.39329413, 25.0571002 , 26.08721963,  4.57552125,
       24.59425862, 11.19163896, 26.60077484, 17.31995465, 36.34426546,
       19.85024806, 27.43763258, 16.69395337, 18.17546413, 10.99515984,
       32.25585006, 37.2390627 , 21.54988422, 24.12265621, 24.60