In [167]:
# I have used Jupyter for making this notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import csv

### House Prediction 

### Linear Regression

In [168]:
df = pd.read_excel('dataset.xlsx') # Reading the data
df.head() # Displaying first few columns of data

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [169]:
df.drop(columns='No', inplace=True) # Drop the column 'No' as it is not necessary
df.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [170]:
np_array = pd.DataFrame.to_numpy(df) # convert dataframe to numpy array
X = np_array[:,:6] # separate feature set
y = np_array[:,6] # separate output set

In [171]:
# split data into train and test sets
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y,test_size = 0.8,
                                                                            train_size=0.2)

In [172]:
lin_reg = sklearn.linear_model.LinearRegression()
lin_reg.fit(X_train, y_train) # train linear regression model

LinearRegression()

In [173]:
print(f'The intercept is: {lin_reg.intercept_}') # print intercept
print(f'The coefficients are: {lin_reg.coef_}') # print coefficients

The intercept is: -6624.553883159841
The coefficients are: [ 2.33330967e+00 -4.31061378e-01 -3.71503543e-03  1.22100881e+00
  3.48680587e+02 -5.54131174e+01]


In [174]:
pred = lin_reg.predict(X_test) # predict values from test set using the trained models

In [175]:
mean_squared_error = sklearn.metrics.mean_squared_error(y_test,pred) # find mean squared error in the predictions
r_sq = sklearn.metrics.r2_score(y_test,pred) # find r_2 in the predictions
print(f'Mean Squared Error is: {mean_squared_error}')
print(f'r_sq is: {r_sq}')

Mean Squared Error is: 94.34023785840638
r_sq is: 0.5028577402622865


In [176]:
for i, j in [[0.6, 0.4], [0.7,0.3], [0.9,0.1]]:
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y,train_size=i,
                                                                                test_size = j)                                                                                                                                               
    lin_reg = sklearn.linear_model.LinearRegression()
    lin_reg.fit(X_train, y_train)
    print(f'For train set size = {i*100}% of dataset:', end='\n\n')
    print(f'The intercept is: {lin_reg.intercept_}')
    print (f'The coefficients are: {lin_reg.coef_}')
    pred = lin_reg.predict(X_test)
    mean_squared_error = sklearn.metrics.mean_squared_error(y_test,pred)
    r_sq = sklearn.metrics.r2_score(y_test,pred)
    print(f'Mean Squared Error is: {mean_squared_error}')
    print(f'r_sq is: {r_sq}', end='\n\n')

For train set size = 60.0% of dataset:

The intercept is: -19576.817013763302
The coefficients are: [ 5.02981269e+00 -2.71328994e-01 -3.21599299e-03  1.57454728e+00
  2.40023765e+02  2.87802292e+01]
Mean Squared Error is: 97.61849667626569
r_sq is: 0.4588075327257106

For train set size = 70.0% of dataset:

The intercept is: -7999.260801492228
The coefficients are: [ 3.77804872e+00 -2.31654700e-01 -5.43398866e-03  9.53068878e-01
  2.04784688e+02 -3.84708372e+01]
Mean Squared Error is: 69.2889127694256
r_sq is: 0.6080953409647518

For train set size = 90.0% of dataset:

The intercept is: -13947.118174007515
The coefficients are: [ 5.67810114e+00 -2.87175162e-01 -4.87905153e-03  1.08116077e+00
  2.06943057e+02 -2.14501145e+01]
Mean Squared Error is: 95.25560914463364
r_sq is: 0.5073364361390894



#### We get the minimum mean squared error at train set size = 70%. Before that there is underfitting and after that there is overfitting

### Ridge Regression

In [177]:
for i, j in [[0.6, 0.4], [0.7,0.3], [0.8, 0.2], [0.9,0.1]]:
    for alpha in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]:
        # break data into train and test sets
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y,train_size=i,
                                                                                test_size = j)
        ridge_reg = sklearn.linear_model.Ridge(alpha = alpha) 
        ridge_reg.fit(X_train, y_train) # train ridge regression model
        print(f'For train set size = {i*100}% of dataset and alpha={alpha}:', end='\n\n')
        print(f'The intercept is: {ridge_reg.intercept_}') # print intercept
        print (f'The coefficients are: {ridge_reg.coef_}') # print cofficients
        pred = ridge_reg.predict(X_test)
        mean_squared_error = sklearn.metrics.mean_squared_error(y_test,pred)
        r_sq = sklearn.metrics.r2_score(y_test,pred)
        print(f'Mean Squared Error is: {mean_squared_error}')
        print(f'r_sq is: {r_sq}', end='\n\n')

For train set size = 60.0% of dataset and alpha=0.001:

The intercept is: -16045.46551199843
The coefficients are: [ 4.96541036e+00 -1.82865401e-01 -3.62189220e-03  1.60689909e+00
  2.16780525e+02  5.55488782e+00]
Mean Squared Error is: 96.62589632643196
r_sq is: 0.47526861714138335

For train set size = 60.0% of dataset and alpha=0.005:

The intercept is: -12734.250816400612
The coefficients are: [ 6.58889287e+00 -2.86266540e-01 -4.98567087e-03  1.37620986e+00
  1.34812585e+02 -3.17056220e+01]
Mean Squared Error is: 110.66608631709003
r_sq is: 0.4444985295305204

For train set size = 60.0% of dataset and alpha=0.01:

The intercept is: -12803.566063030143
The coefficients are: [ 5.53781935e+00 -2.84012415e-01 -6.15162423e-03  6.68946571e-01
  1.61587422e+02 -1.91973274e+01]
Mean Squared Error is: 70.81021800790406
r_sq is: 0.6016016703515178

For train set size = 60.0% of dataset and alpha=0.05:

The intercept is: -13127.451473760479
The coefficients are: [ 6.08167955e+00 -2.13315307e-

### LASSO Regression

In [178]:
for i, j in [[0.6, 0.4], [0.7,0.3], [0.8, 0.2], [0.9,0.1]]:
    for alpha in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]:
        # split data into train and test sets
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y,train_size=i,
                                                                                test_size = j)
        lasso_reg = sklearn.linear_model.Lasso(alpha = alpha)
        lasso_reg.fit(X_train, y_train) # train lasso model
        print(f'For train set size = {i*100}% of dataset and alpha={alpha}:', end='\n\n')
        print(f'The intercept is: {lasso_reg.intercept_}')
        print (f'The coefficients are: {lasso_reg.coef_}')
        pred = lasso_reg.predict(X_test)
        mean_squared_error = sklearn.metrics.mean_squared_error(y_test,pred)
        r_sq = sklearn.metrics.r2_score(y_test,pred)
        print(f'Mean Squared Error is: {mean_squared_error}')
        print(f'r_sq is: {r_sq}', end='\n\n')

For train set size = 60.0% of dataset and alpha=0.001:

The intercept is: -15937.203168885879
The coefficients are: [ 5.98338539e+00 -2.62773673e-01 -4.87613466e-03  1.15440467e+00
  1.69029634e+02 -2.34704254e+00]
Mean Squared Error is: 52.84186233057155
r_sq is: 0.643436536132604

For train set size = 60.0% of dataset and alpha=0.005:

The intercept is: -15980.006942113157
The coefficients are: [ 5.97830053e+00 -2.35780396e-01 -4.50077376e-03  1.42170582e+00
  1.59650242e+02 -0.00000000e+00]
Mean Squared Error is: 86.51474597721564
r_sq is: 0.5183817001677996

For train set size = 60.0% of dataset and alpha=0.01:

The intercept is: -14139.584900076257
The coefficients are: [ 5.05747709e+00 -2.55509894e-01 -4.43417547e-03  1.29366555e+00
  1.60205578e+02  0.00000000e+00]
Mean Squared Error is: 98.81318614295132
r_sq is: 0.5068583323069571

For train set size = 60.0% of dataset and alpha=0.05:

The intercept is: -16264.481296272903
The coefficients are: [ 8.10078021e+00 -2.51927356e-01