In [48]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [49]:
# Read the training and test splits from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_path) for csv_path in ("Trainfile.csv", "Testfile.csv")
)

In [50]:
# Summarize the raw training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138 entries, 0 to 1137
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Idx            1138 non-null   int64  
 1   MSSubClass     1138 non-null   int64  
 2   MSZoning       1138 non-null   object 
 3   LotFrontage    938 non-null    float64
 4   LotArea        1138 non-null   int64  
 5   Street         1138 non-null   object 
 6   Alley          66 non-null     object 
 7   LotShape       1138 non-null   object 
 8   LandContour    1138 non-null   object 
 9   Utilities      1138 non-null   object 
 10  LotConfig      1138 non-null   object 
 11  LandSlope      1138 non-null   object 
 12  Neighborhood   1138 non-null   object 
 13  Condition1     1138 non-null   object 
 14  Condition2     1138 non-null   object 
 15  BldgType       1138 non-null   object 
 16  HouseStyle     1138 non-null   object 
 17  OverallQual    1138 non-null   int64  
 18  OverallC

In [51]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

Idx                0
MSSubClass         0
MSZoning           0
LotFrontage      200
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [52]:
# Names of all training columns that contain at least one missing value.
missing_mask = train.isna().any()
nullableColumns = train.columns[missing_mask].tolist()

In [53]:
# Display the columns that have missing values in the training set.
nullableColumns

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [54]:
# Remove every column with missing training values instead of imputing it.
train = train.drop(columns=nullableColumns)

In [55]:
# Drop the same columns from the test frame so both frames keep identical columns.
test = test.drop(columns=nullableColumns)

In [56]:
# (rows, columns) of the training frame after the column drop.
train.shape

(1138, 62)

In [57]:
# (rows, columns) of the test frame after the column drop.
test.shape

(322, 62)

In [58]:
# Re-inspect the training frame to check the remaining columns and their non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138 entries, 0 to 1137
Data columns (total 62 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Idx            1138 non-null   int64 
 1   MSSubClass     1138 non-null   int64 
 2   MSZoning       1138 non-null   object
 3   LotArea        1138 non-null   int64 
 4   Street         1138 non-null   object
 5   LotShape       1138 non-null   object
 6   LandContour    1138 non-null   object
 7   Utilities      1138 non-null   object
 8   LotConfig      1138 non-null   object
 9   LandSlope      1138 non-null   object
 10  Neighborhood   1138 non-null   object
 11  Condition1     1138 non-null   object
 12  Condition2     1138 non-null   object
 13  BldgType       1138 non-null   object
 14  HouseStyle     1138 non-null   object
 15  OverallQual    1138 non-null   int64 
 16  OverallCond    1138 non-null   int64 
 17  YearBuilt      1138 non-null   int64 
 18  YearRemodAdd   1138 non-null

In [59]:
# Inspect the test frame's remaining columns and their non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 62 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Idx            322 non-null    int64 
 1   MSSubClass     322 non-null    int64 
 2   MSZoning       322 non-null    object
 3   LotArea        322 non-null    int64 
 4   Street         322 non-null    object
 5   LotShape       322 non-null    object
 6   LandContour    322 non-null    object
 7   Utilities      322 non-null    object
 8   LotConfig      322 non-null    object
 9   LandSlope      322 non-null    object
 10  Neighborhood   322 non-null    object
 11  Condition1     322 non-null    object
 12  Condition2     322 non-null    object
 13  BldgType       322 non-null    object
 14  HouseStyle     322 non-null    object
 15  OverallQual    322 non-null    int64 
 16  OverallCond    322 non-null    int64 
 17  YearBuilt      322 non-null    int64 
 18  YearRemodAdd   322 non-null   

In [60]:
# Keep only the numeric-dtype columns of the training frame and list their names.
numeric_features = train.select_dtypes(include=np.number)

list(numeric_features.columns)

['Idx',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [61]:
# Predictor columns used by the model: the numeric columns of the training
# frame without the 'Idx' identifier and the 'SalePrice' target.
numerical_features = [
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
    'MoSold', 'YrSold',
]

In [62]:
# Build independent copies of the feature matrices and of the SalePrice target
# (kept as single-column frames) for the training and test splits.
features = numerical_features

X_train = train.loc[:, features].copy()
X_test = test.loc[:, features].copy()

y_train = train.loc[:, ['SalePrice']].copy()
y_test = test.loc[:, ['SalePrice']].copy()

In [63]:
# Mean-normalize the inputs so that gradient descent converges faster.
# The mean and range are computed on the TRAINING set only and reused for the
# test set: the learned weights are only meaningful for inputs scaled the same
# way as the data they were fitted on, and test statistics must not influence
# preprocessing.
feature_mean = X_train.mean()
# A constant column has a range of 0; divide by 1 instead so it becomes all
# zeros rather than NaN (0/0).
feature_range = (X_train.max() - X_train.min()).replace(0, 1)

X_train = (X_train - feature_mean) / feature_range
X_test = (X_test - feature_mean) / feature_range

In [64]:
# Shape of the training feature frame before it is transposed for the model.
X_train.shape

(1138, 33)

In [65]:
# The model expects features on rows and training examples on columns,
# i.e. an array of shape (n_features, n_training_examples).
X_train = X_train.transpose().to_numpy()
X_train.shape

(33, 1138)

In [66]:
# Shape of the training target frame before it is transposed for the model.
y_train.shape

(1138, 1)

In [67]:
# The model expects the training target as a row vector of shape
# (1, n_training_examples).
y_train = y_train.transpose().to_numpy()
y_train.shape

(1, 1138)

In [68]:
# Shape of the test feature frame before it is transposed for the model.
X_test.shape

(322, 33)

In [69]:
# The model expects X_test with features on rows and test examples on columns,
# i.e. an array of shape (n_features, n_test_examples).
X_test = X_test.transpose().to_numpy()
X_test.shape

(33, 322)

In [70]:
# Shape of the test target frame before it is transposed for the model.
y_test.shape

(322, 1)

In [71]:
# The model expects the test target as a row vector of shape
# (1, n_test_examples).
y_test = y_test.transpose().to_numpy()
y_test.shape

(1, 322)

In [None]:
# multivariable linear regression model from scratch

In [72]:
def init_param(len_w):
    """Create the starting parameters for the linear model.

    Args:
        len_w: Number of weights to allocate (one per input feature).

    Returns:
        Tuple ``(w, b)``: ``w`` is a ``(len_w, 1)`` array of zeros and ``b``
        is the scalar bias ``0``.
    """
    weights = np.zeros(shape=(len_w, 1))
    bias = 0

    # Sanity checks on the freshly created parameters.
    assert weights.shape == (len_w, 1)
    assert isinstance(bias, (float, int))

    return weights, bias

In [73]:
def propagate(w,b,X,Y):
    """Run one forward/backward pass of linear regression.

    Args:
        w: Weight column vector of shape (n_features, 1).
        b: Scalar bias.
        X: Input matrix of shape (n_features, m), one example per column.
        Y: Target row vector of shape (1, m).

    Returns:
        Tuple ``(grads, cost)``. ``grads`` is a dict with ``"dw"`` (same shape
        as ``w``) and ``"db"`` (scalar), the gradients of the cost with respect
        to ``w`` and ``b``. ``cost`` is the half mean squared error
        ``sum((A - Y)^2) / (2 * m)``.
    """
    m = X.shape[1]

    # Forward pass: linear prediction and its error against the targets.
    A = np.dot(w.T, X) + b
    residual = A - Y

    # Accumulate in float64: squared errors on large targets exceed what a
    # float32 accumulator can represent exactly, which distorts the cost.
    cost = np.sum(np.square(residual), dtype=np.float64) / (2 * m)

    # Backward pass: gradients of the cost.
    dw = np.dot(X, residual.T) / m
    db = np.sum(residual) / m

    assert dw.shape == w.shape
    assert db.dtype == float
    cost = np.squeeze(cost)
    assert cost.shape == ()

    grads = {"dw": dw,
             "db": db}

    return grads, cost

In [74]:
def grad_desc_update(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    """Optimize ``w`` and ``b`` with batch gradient descent.

    Args:
        w: Initial weight column vector of shape (n_features, 1).
        b: Initial scalar bias.
        X: Input matrix of shape (n_features, m).
        Y: Target row vector of shape (1, m).
        num_iterations: Number of gradient-descent steps to perform.
        learning_rate: Step size applied to each gradient.
        print_cost: If True, print the cost every 100 iterations.

    Returns:
        Tuple ``(params, grads, costs)``:
        ``params`` is ``{"w": ..., "b": ...}`` with the final parameters;
        ``grads`` is ``{"dw": ..., "db": ...}`` from the last iteration (both
        ``None`` when ``num_iterations`` is 0); ``costs`` is the list of costs
        recorded every 100 iterations, starting at iteration 0.
    """
    # The history must live outside the loop, otherwise it is wiped on every
    # iteration and the caller gets an (almost always) empty list.
    costs = []
    # Defined up front so a zero-iteration call still returns cleanly.
    dw, db = None, None

    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)

        dw = grads["dw"]
        db = grads["db"]

        # Step against the gradient.
        w = w - learning_rate * dw
        b = b - learning_rate * db

        # Record (and optionally print) the cost every 100 iterations.
        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs

In [75]:
def predict(w, b, X):
    """Compute linear-model predictions for every column of ``X``.

    Args:
        w: Weights; reshaped internally to (n_features, 1).
        b: Scalar bias.
        X: Input matrix of shape (n_features, m), one example per column.

    Returns:
        Array of shape (1, m) holding ``w.T @ X + b``.
    """
    n_examples = X.shape[1]
    column_weights = w.reshape(X.shape[0], 1)

    linear_output = np.dot(column_weights.T, X) + b

    # Copy the single output row into a float result buffer in one assignment.
    Y_prediction = np.zeros((1, n_examples))
    Y_prediction[0, :] = linear_output[0, :]

    assert Y_prediction.shape == (1, n_examples)

    return Y_prediction

In [103]:
def MultivariateLinearReg_model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False):
    """Train a linear regression with gradient descent and report its error.

    Args:
        X_train: Training inputs of shape (n_features, m_train).
        Y_train: Training targets of shape (1, m_train).
        X_test: Test inputs of shape (n_features, m_test).
        Y_test: Test targets of shape (1, m_test).
        num_iterations: Number of gradient-descent steps.
        learning_rate: Gradient-descent step size.
        print_cost: If True, the optimizer prints the cost every 100 iterations.

    Returns:
        Dict with the learned parameters (``"w"``, ``"b"``), the recorded
        ``"costs"``, the predictions (``"Y_prediction_train"``,
        ``"Y_prediction_test"``), the error percentages
        (``"train_error_pct"``, ``"test_error_pct"``) and the hyperparameters
        used (``"learning_rate"``, ``"num_iterations"``).

    Note:
        The printed "MAPE" figure is computed as
        ``mean(|prediction - target|) * 100 / mean(target)``, i.e. the mean
        absolute error relative to the mean target, not a per-sample
        percentage error.
    """
    # Start from zero weights, one per feature row of X_train.
    w, b = init_param(X_train.shape[0])

    # Fit the parameters.
    parameters, grads, costs = grad_desc_update(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost = print_cost)
    w = parameters["w"]
    b = parameters["b"]

    # Predict on both splits with the fitted parameters.
    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    # Mean absolute error expressed as a percentage of the mean target.
    train_error_pct = (np.mean(np.abs(Y_prediction_train - Y_train)) * 100) / np.mean(Y_train)
    test_error_pct = (np.mean(np.abs(Y_prediction_test - Y_test)) * 100) / np.mean(Y_test)
    print("MAPE of training dataset: {} %".format(train_error_pct))
    print("MAPE of test dataset: {} %".format(test_error_pct))

    # Hand the fitted model back so it can be reused instead of being discarded.
    return {"w": w,
            "b": b,
            "costs": costs,
            "Y_prediction_train": Y_prediction_train,
            "Y_prediction_test": Y_prediction_test,
            "train_error_pct": train_error_pct,
            "test_error_pct": test_error_pct,
            "learning_rate": learning_rate,
            "num_iterations": num_iterations}
    

In [104]:
# Train for 5000 iterations with a 0.03 learning rate, printing the cost as it goes.
MultivariateLinearReg_model(
    X_train,
    y_train,
    X_test,
    y_test,
    num_iterations=5000,
    learning_rate=0.03,
    print_cost=True,
)

Cost after iteration 0: 19013861779.121265
Cost after iteration 100: 1544460766.706503
Cost after iteration 200: 1064052516.442882
Cost after iteration 300: 903268348.400703
Cost after iteration 400: 830447889.546573
Cost after iteration 500: 788889830.355009
Cost after iteration 600: 760945521.827768
Cost after iteration 700: 740308016.590510
Cost after iteration 800: 724237760.112478
Cost after iteration 900: 711287028.752197
Cost after iteration 1000: 700575060.133568
Cost after iteration 1100: 691521474.811951
Cost after iteration 1200: 683727355.500879
Cost after iteration 1300: 676911553.912127
Cost after iteration 1400: 670872336.646749
Cost after iteration 1500: 665462276.499121
Cost after iteration 1600: 660572300.372584
Cost after iteration 1700: 656119768.407733
Cost after iteration 1800: 652041505.743410
Cost after iteration 1900: 648287755.697715
Cost after iteration 2000: 644818954.797891
Cost after iteration 2100: 641603083.697715
Cost after iteration 2200: 638613536.393