### Importing Libraries

In [1]:
#Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

### Importing training dataset

In [2]:
# Read the training set and discard the 'Id' column in one chained expression.
df = pd.read_csv('train.csv').drop('Id', axis=1)
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


### Preprocessing the data

In [3]:
# Replacing all the NaN values: back-fill from the next row first, then
# forward-fill whatever is still missing at the end of a column.
# DataFrame.bfill()/ffill() are used instead of `replace(np.NaN, method=...)`
# because the `np.NaN` alias was removed in NumPy 2.0 and the `method`
# argument of `replace` is deprecated in recent pandas releases. Rebinding
# `df` (instead of inplace=True) keeps the cell safe to re-run.
# NOTE(review): this copies values from neighbouring rows into columns such as
# Alley/PoolQC/Fence, where NaN presumably means "feature absent" - confirm
# that this imputation is what is intended.
df = df.bfill().ffill()
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,Ex,MnPrv,Shed,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,Ex,MnPrv,Shed,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,Ex,MnPrv,Shed,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,Ex,MnPrv,Shed,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,Ex,MnPrv,Shed,0,12,2008,WD,Normal,250000


### Function to convert string values

In [4]:
def changeStringToNum(x):
    """Encode the strings of a column as integers; leave other values as they are.

    Each string is replaced by the sum of the Unicode code points of its
    characters (e.g. 'RL' -> 82 + 76 = 158). The encoding is not unique:
    different strings with the same character sum map to the same number.

    Parameters
    ----------
    x : pandas.Series or other iterable
        One column of values, as passed by ``DataFrame.apply``.

    Returns
    -------
    The input object itself when it contains no strings; otherwise a list in
    the same order as ``x`` with every string replaced by its character sum
    and every non-string value copied through unchanged.
    """
    values = list(x)
    # Inspect the values positionally instead of via x[0]: on a Series, x[0]
    # is a label lookup that fails when the index has no label 0, and it also
    # fails on an empty column. Checking every value also covers string
    # columns whose first entry is not a string (e.g. a missing value).
    if not any(isinstance(value, str) for value in values):
        return x
    return [
        sum(ord(char) for char in value) if isinstance(value, str) else value
        for value in values
    ]
# Run the encoder over every column of the frame: string columns come back as
# integers, columns that hold no strings are returned unchanged.
df = df.apply( changeStringToNum)

In [5]:
# Inspect the last rows to confirm the string columns are now numeric.
df.tail()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,60,158,62.0,7917,396,396,286,302,576,604,...,0,171,499,388,0,8,2007,155,617,175000
1456,20,158,85.0,13175,396,396,286,302,576,604,...,0,171,499,388,0,2,2010,155,617,210000
1457,70,158,66.0,9042,396,396,286,302,576,604,...,0,171,483,388,2500,5,2010,155,617,266500
1458,20,158,68.0,9717,396,396,286,302,576,604,...,0,171,483,388,0,4,2010,155,617,142125
1459,20,158,75.0,9937,396,396,286,302,576,604,...,0,171,483,388,0,6,2008,155,617,147500


In [6]:
# Split the frame into model inputs and target: every column except the last
# one forms the feature matrix, and 'SalePrice' is the regression target.
X = df.iloc[:, :-1].to_numpy()
y = df['SalePrice'].to_numpy(dtype='float64')

In [7]:
# Sanity check: show the feature matrix and the target vector, then their shapes.
print(X,y)
print(X.shape,y.shape)

[[  60.  158.   65. ... 2008.  155.  617.]
 [  20.  158.   80. ... 2007.  155.  617.]
 [  60.  158.   68. ... 2008.  155.  617.]
 ...
 [  70.  158.   66. ... 2010.  155.  617.]
 [  20.  158.   68. ... 2010.  155.  617.]
 [  20.  158.   75. ... 2008.  155.  617.]] [208500. 181500. 223500. ... 266500. 142125. 147500.]
(1460, 79) (1460,)


### Train-Test split

In [8]:
from sklearn.model_selection import train_test_split

# Hold out a third of the rows for evaluation. The split is shuffled, so a
# fixed random_state is needed for the reported score to be reproducible
# after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

### Importing regression function

In [9]:
#Regression libraries
from sklearn.linear_model import Ridge
from sklearn import preprocessing
from sklearn.metrics import r2_score

In [10]:
# Standardize the training features; the scaler learns its statistics from the
# training split only and is reused later for the other feature matrices.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)

In [11]:
# Inspect the standardized training features.
X_train

array([[ 0.04101151, -0.09738286,  0.65971693, ...,  0.13693064,
        -0.34935953, -0.43164769],
       [-0.87577152, -0.09738286,  0.35030937, ..., -1.35104898,
        -0.34935953, -0.43164769],
       [-0.87577152, -0.09738286, -0.44531006, ..., -0.60705917,
        -0.34935953, -0.43164769],
       ...,
       [ 2.33296909, -0.1598971 , -1.77134244, ..., -1.35104898,
        -0.34935953,  2.23159689],
       [ 1.41618606, -0.09738286,  0.30610829, ..., -1.35104898,
        -0.34935953, -0.43164769],
       [-0.87577152, -0.09738286, -0.44531006, ...,  1.62491025,
        -0.34935953,  2.23159689]])

In [12]:
# Confirm that the training features and targets have the same number of rows.
print(X_train.shape,y_train.shape)

(978, 79) (978,)


### Using Ridge regression with L2 regularization

In [13]:
# Ridge regression: a linear model with an L2 penalty on the coefficients;
# alpha sets the strength of that penalty. Fitted on the standardized
# training split.
model = Ridge(alpha=0.1,fit_intercept= True)
model.fit(X_train,y_train)

### Function to pre-process data for prediction

In [14]:
def dataPreprocessor(dataframe, func):
    """Turn a raw feature frame into the array form used for prediction.

    Missing values are back-filled and then forward-filled, ``func`` is applied
    to every column, and the result is returned as a NumPy array.

    The caller's ``dataframe`` is not modified: ``bfill()``/``ffill()`` return
    new frames. They are used instead of ``replace(np.NaN, method=...,
    inplace=True)`` because that form mutated the caller's frame through the
    alias, relied on the ``np.NaN`` alias removed in NumPy 2.0, and on the
    ``method`` argument of ``replace`` that is deprecated in recent pandas.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw features, one row per sample.
    func : callable
        Column-wise transform passed to ``DataFrame.apply``.

    Returns
    -------
    numpy.ndarray
        All columns of the transformed frame, in their original order.
    """
    filled = dataframe.bfill().ffill()
    encoded = filled.apply(func)
    return np.array(encoded)

### Importing data to predict

In [15]:
# Load the data to predict on and discard its 'Id' column, as was done for the
# training data.
test = pd.read_csv('test.csv').drop('Id', axis=1)
test.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


### Preprocessing the data to predict

In [16]:
# Preprocess the prediction features and scale them with the already-fitted scaler.
X_pred = scaler.transform(dataPreprocessor(test, changeStringToNum))
# Comparison values come from sample_submission.csv, cast to 32-bit integers.
# NOTE(review): presumably these are example-submission values rather than
# ground-truth prices - confirm before treating them as targets.
y_pred = pd.read_csv('sample_submission.csv')['SalePrice'].to_numpy().astype('int32')
print(X_pred.shape, y_pred.shape)

(1459, 79) (1459,)


In [17]:
# Show the scaled prediction features next to the values read from sample_submission.csv.
print(X_pred,y_pred)

[[-0.87577152 -0.22241133  0.43871153 ...  1.62491025 -0.34935953
  -0.43164769]
 [-0.87577152 -0.09738286  0.48291261 ...  1.62491025 -0.34935953
  -0.43164769]
 [ 0.04101151 -0.09738286  0.17350506 ...  1.62491025 -0.34935953
  -0.43164769]
 ...
 [-0.87577152 -0.09738286  3.97479788 ... -1.35104898 -0.34935953
   2.23159689]
 [ 0.61400091 -0.09738286 -0.3569079  ... -1.35104898 -0.34935953
  -0.43164769]
 [ 0.04101151 -0.09738286  0.17350506 ... -1.35104898 -0.34935953
  -0.43164769]] [169277 187758 183583 ... 219222 184924 187741]


### Calculating accuracy with the test data

In [18]:
# Shape of the held-out split produced by train_test_split (not the test.csv data)
print(X_test.shape,y_test.shape)

(482, 79) (482,)


In [19]:
# Normalizing the held-out split with the scaler that was fitted on the training split.
# NOTE: this overwrites X_test, so running this cell a second time scales the
# data twice; restart the kernel and run all cells in order instead.
X_test = scaler.transform(X_test)

In [20]:
# Scoring the model on the held-out split.
# NOTE: r2_score returns the coefficient of determination (R^2), not a
# classification accuracy; the "accuracy" printed below is R^2 expressed as a
# percentage.
y_predicted = model.predict(X_test)
accuracy = r2_score(y_test,y_predicted)
print(f'The accuracy of the model is {accuracy*100}%')

The accuracy of the model is 82.93034680274152%


### Predicting a few samples

In [21]:
# Predicting a few rows of the test data.
# The comparison values in y_pred are read from sample_submission.csv, and
# test.csv carries no SalePrice column, so they are labelled as sample-submission
# values instead of being presented as actual sale prices.
index_to_predict = [5, 180, 330]
for row_index in index_to_predict:
    print(f'Model prediction of test data at index {row_index} is:')
    print('----------------------------------')
    # The model expects a 2-D array, so the single row is reshaped to (1, n_features).
    sample = X_pred[row_index].reshape(1, -1)
    prediction = model.predict(sample)
    print(f"House price according to the model's prediction is: ${prediction[0]:.2f}\n")
    print(f'The price in the sample submission is: {y_pred[row_index]}\n')
    print('----------------------------------')

Model prediction of test data at index 5 is:
----------------------------------
House price according to the model's prediction is: $170248.22

The actual price of the house is: 177150

----------------------------------
Model prediction of test data at index 180 is:
----------------------------------
House price according to the model's prediction is: $172902.86

The actual price of the house is: 181294

----------------------------------
Model prediction of test data at index 330 is:
----------------------------------
House price according to the model's prediction is: $201623.25

The actual price of the house is: 221648

----------------------------------


### The model's R² score on the held-out split is around 0.8 (80%), which might be because the dataset has high variance

## In conclusion, we used the Ridge regression model (linear regression with L2 regularization) from scikit-learn for better generalization, calculated the R² score on the held-out split, and used the test data to predict a few samples