In [87]:
import pandas as pd
import numpy as np
from sklearn import preprocessing


## **Importing and analysing the dataset**

In [88]:
df = pd.read_csv('/content/London.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Property Name,Price,House Type,Area in sq ft,No. of Bedrooms,No. of Bathrooms,No. of Receptions,Location,City/County,Postal Code
0,0,Queens Road,1675000,House,2716,5,5,5,Wimbledon,London,SW19 8NY
1,1,Seward Street,650000,Flat / Apartment,814,2,2,2,Clerkenwell,London,EC1V 3PA
2,2,Hotham Road,735000,Flat / Apartment,761,2,2,2,Putney,London,SW15 1QL
3,3,Festing Road,1765000,House,1986,4,4,4,Putney,London,SW15 1LP
4,4,Spencer Walk,675000,Flat / Apartment,700,2,2,2,Putney,London,SW15 1PL


In [89]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Price,Area in sq ft,No. of Bedrooms,No. of Bathrooms,No. of Receptions
count,3480.0,3480.0,3480.0,3480.0,3480.0,3480.0
mean,1739.5,1864173.0,1712.973563,3.103736,3.103736,3.103736
std,1004.733796,2267283.0,1364.259351,1.517698,1.517698,1.517698
min,0.0,180000.0,274.0,0.0,0.0,0.0
25%,869.75,750000.0,834.0,2.0,2.0,2.0
50%,1739.5,1220000.0,1310.0,3.0,3.0,3.0
75%,2609.25,2150000.0,2157.25,4.0,4.0,4.0
max,3479.0,39750000.0,15405.0,10.0,10.0,10.0


In [90]:
df.isna().sum()

Unnamed: 0             0
Property Name          0
Price                  0
House Type             0
Area in sq ft          0
No. of Bedrooms        0
No. of Bathrooms       0
No. of Receptions      0
Location             962
City/County            0
Postal Code            0
dtype: int64

## **Creating X and y variables**

In [91]:
cols_needed = ['Price','Area in sq ft',	'No. of Bedrooms',	'No. of Bathrooms',	'No. of Receptions']
df = df[cols_needed]
df.head()

Unnamed: 0,Price,Area in sq ft,No. of Bedrooms,No. of Bathrooms,No. of Receptions
0,1675000,2716,5,5,5
1,650000,814,2,2,2
2,735000,761,2,2,2
3,1765000,1986,4,4,4
4,675000,700,2,2,2


In [92]:
sc = preprocessing.StandardScaler()
df = sc.fit_transform(df)
df = pd.DataFrame(df, columns =['Price',	'Area in sq ft'	,'No. of Bedrooms' , 	'No. of Bathrooms' , 	'No. of Receptions'])

In [93]:
X = df.drop('Price',axis=1)
X = np.array(X)
print(X)

y = df["Price"]
y = np.array(y)
print(y)

[[ 0.73532249  1.24961432  1.24961432  1.24961432]
 [-0.65904093 -0.72734787 -0.72734787 -0.72734787]
 [-0.69789543 -0.72734787 -0.72734787 -0.72734787]
 ...
 [ 1.99552791  1.90860171  1.90860171  1.90860171]
 [-0.1517331  -0.06836047 -0.06836047 -0.06836047]
 [ 2.69930756  1.90860171  1.90860171  1.90860171]]
[-0.08344778 -0.53559571 -0.49810052 ...  1.82219034  1.95673192
  2.99336376]


In [94]:
print(X.shape)
print(y.shape)

(3480, 4)
(3480,)


## **Creating Train and test variables for X and y**

In [95]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2)

## **Using the linear regression model to fit dataset**

In [96]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train,y_train)

In [97]:
reg.score(X_test, y_test)
#As the accuracy is very low
#We can use regularisation and improve the accuracy by adding a term to the mean square error

0.373855447843326

## **Hyperparameter Tuning for Lasso regression**

In [98]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = linear_model.Lasso()
# define grid
grid = dict()
grid['alpha'] = np.arange(0, 1, 0.01)
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X, y)
# summarize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -0.342
Config: {'alpha': 0.01}


## **Using Lasso regression(L1) technique to improve accuracy**

In [104]:
from sklearn import linear_model
lasso_reg = linear_model.Lasso(alpha=0.01 , max_iter=1)
lasso_reg.fit(X_train , y_train)


  positive)


Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1, normalize=False,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)

In [105]:
lasso_reg.score(X_test, y_test)

0.42355700997682966

## **Hyperparameter tuning for Ridge regression**

In [101]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = linear_model.Ridge()
# define grid
grid = dict()
grid['alpha'] = np.arange(0, 1, 0.01)
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X, y)
# summarize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -0.342
Config: {'alpha': 0.99}


## **Using Ridge regression(L2) to improve accuracy**

In [106]:
ridge_reg = linear_model.Ridge(alpha=0.99 )
ridge_reg.fit(X_train , y_train)

Ridge(alpha=0.99, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [107]:
ridge_reg.score(X_test, y_test)

0.3741937225992223