## Step 1 : Loading the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Step 2: Load the data

In [2]:
data = pd.read_csv('Melbourne_housing_Full.csv')
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [3]:
data.shape

(34857, 21)

In [4]:
data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [5]:
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 'Distance', 
              'Bedroom2', 'Bathroom', 'Car','CouncilArea', 'Landsize', 'BuildingArea', 'Price']

data = data[cols_to_use]
data.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,Bedroom2,Bathroom,Car,CouncilArea,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,2.0,1.0,1.0,Yarra City Council,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,2.0,1.0,1.0,Yarra City Council,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,2.0,1.0,0.0,Yarra City Council,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,3.0,2.0,1.0,Yarra City Council,0.0,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,3.0,2.0,0.0,Yarra City Council,134.0,150.0,1465000.0


In [6]:
data.shape

(34857, 15)

## Step 3: Data Cleaning, Data Preprocessing

In [8]:
## Checking missing values per column

data.isnull().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
Bedroom2          8217
Bathroom          8226
Car               8728
CouncilArea          3
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [11]:
## Checking percentage missing value per column

data.isnull().sum() / len(data) * 100

Suburb            0.000000
Rooms             0.000000
Type              0.000000
Method            0.000000
SellerG           0.000000
Regionname        0.008607
Propertycount     0.008607
Distance          0.002869
Bedroom2         23.573457
Bathroom         23.599277
Car              25.039447
CouncilArea       0.008607
Landsize         33.881286
BuildingArea     60.576068
Price            21.832057
dtype: float64

## Note: Landsize and Buidling Area have more than 30% missing value hence delete these columns

In [12]:
data = data.drop(['Landsize', 'BuildingArea'], axis = 1)
data.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,Bedroom2,Bathroom,Car,CouncilArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,2.0,1.0,1.0,Yarra City Council,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,2.0,1.0,1.0,Yarra City Council,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,2.0,1.0,0.0,Yarra City Council,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,3.0,2.0,1.0,Yarra City Council,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,3.0,2.0,0.0,Yarra City Council,1465000.0


In [13]:
## Performing Missing value treatment for remaining column having missing vlaues

data['Propertycount'] = data['Propertycount'].fillna(data['Propertycount'].mean())
data['Bedroom2'] = data['Bedroom2'].fillna(data['Bedroom2'].mean())
data['Bathroom'] = data['Bathroom'].fillna(data['Bathroom'].mean())
data['Car'] = data['Car'].fillna(data['Car'].mean())
data['Price'] = data['Price'].fillna(data['Price'].mean())

data['Distance'] = data['Distance'].fillna(data['Distance'].mode()[0])
data['Regionname'] = data['Regionname'].fillna(data['Regionname'].mode()[0])
data['CouncilArea'] = data['CouncilArea'].fillna(data['CouncilArea'].mode()[0])

In [14]:
data.isnull().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
CouncilArea      0
Price            0
dtype: int64

## Feature Scaling

In [16]:
data.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,Bedroom2,Bathroom,Car,CouncilArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,2.0,1.0,1.0,Yarra City Council,1050173.0
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,2.0,1.0,1.0,Yarra City Council,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,2.0,1.0,0.0,Yarra City Council,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,3.0,2.0,1.0,Yarra City Council,1050173.0
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,3.0,2.0,0.0,Yarra City Council,1465000.0


In [17]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

In [18]:
data[['Propertycount', 'Distance']] = mms.fit_transform(data[['Propertycount', 'Distance']])
data.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,Bedroom2,Bathroom,Car,CouncilArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,0.182501,0.051975,2.0,1.0,1.0,Yarra City Council,1050173.0
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,0.182501,0.051975,2.0,1.0,1.0,Yarra City Council,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,0.182501,0.051975,2.0,1.0,0.0,Yarra City Council,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,0.182501,0.051975,3.0,2.0,1.0,Yarra City Council,1050173.0
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,0.182501,0.051975,3.0,2.0,0.0,Yarra City Council,1465000.0


## Feature Encoding

In [19]:
data_ohe = pd.get_dummies(data[['Suburb', 'Type', 'Method', 'SellerG', 'Regionname', 'CouncilArea']])
data_ohe

Unnamed: 0,Suburb_Abbotsford,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,Suburb_Albert Park,Suburb_Albion,Suburb_Alphington,Suburb_Altona,Suburb_Altona Meadows,Suburb_Altona North,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34853,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34854,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34855,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
data = pd.concat([data, data_ohe], axis = 1)
data.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,Bedroom2,Bathroom,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,0.182501,0.051975,2.0,1.0,...,0,0,0,0,0,0,0,0,1,0
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,0.182501,0.051975,2.0,1.0,...,0,0,0,0,0,0,0,0,1,0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,0.182501,0.051975,2.0,1.0,...,0,0,0,0,0,0,0,0,1,0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,0.182501,0.051975,3.0,2.0,...,0,0,0,0,0,0,0,0,1,0
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,0.182501,0.051975,3.0,2.0,...,0,0,0,0,0,0,0,0,1,0


In [22]:
data =data.drop(['Suburb', 'Type', 'Method', 'SellerG', 'Regionname', 'CouncilArea'], axis = 1)
data.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Price,Suburb_Abbotsford,Suburb_Aberfeldie,Suburb_Airport West,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
0,2,0.182501,0.051975,2.0,1.0,1.0,1050173.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,0.182501,0.051975,2.0,1.0,1.0,1480000.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,0.182501,0.051975,2.0,1.0,0.0,1035000.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3,0.182501,0.051975,3.0,2.0,1.0,1050173.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,0.182501,0.051975,3.0,2.0,0.0,1465000.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


## Step 4: Seperate X and y

In [23]:
X = data.drop('Price', axis = 1)
y = data['Price']

## Step 5: Split the data into train and test set

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Apply Linear Regression on the train set

In [25]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr

In [26]:
lr.fit(X_train, y_train)

## Step 7: Perform Predictions on X_test

In [27]:
y_pred = lr.predict(X_test)
y_pred

array([ 771584.,  774144., 1222144., ..., 1536000.,  651776.,  641024.])

## Step 8: Checking the accuracy on the train set and test set

In [28]:
## Checking the accuracy on the train data

lr.score(X_train, y_train)

0.49461060111182964

In [30]:
## Checking the accuracy on the test data

lr.score(X_test, y_test)

-3.901921141426073e+21

## Note:

- As seen above, the accuracy on train set is high whereas on test set the accuracy is low. This is called as a case of overfitting.

- In Linear Regression, Accuracy and error are inversely related. (If acc high then error will be low, else if the acc is low it is clear that the error is high)

- In above dataset, accuracy on train data is high it simple indicates that error is low and accuracy on test data is low it simply indicates that error is very high.



- Bias : Error on the train data is called bias
- Variance : Error on the test data is called as variance

- In the above dataset, we can observe low bias and high variance.
- Overfitting happens when there is low bias and high variance

### Q. How to avoid overfitting?

1. Increase the data size(i.e. increase the total rows in the data) - This is a random approach to avoid overfitting. There is no gurantee that overfitting problem will be solved with this. There is only possibility of overfitting being solved.

2. Regularization Techniques -    
    a. Lasso Regression(L1 Regularization) : It does not consider the less important features from the data while fitting the algorithm.  
    b. Ridge Regression(L2 Regularization) : It considers all the features but makes the less important features to 0.  
    
3. Consider applying a Complex Machine learning Alogrithm on the data.

#### Working of Lasso Regression:

Eg: The data has 34000 rows and 27 columns, We are tyring to predict the house price in a particular location.
1. If Linear Regression is applied on this data then there can be a problem of Overfitting.
    a. Linear Regression is directly applied on all the 27 columns in the data hence the problem overfitting arises.

2. If the same data is applied with Lasso Regression then we can avoid overfitting problem. How?   
    a. Lasso Regression consider only very important colunms out 27 cols to predict house price. The unwanted columns are ignored by Lasso Regression.
    

#### Working of Ridge Regression:

Eg: The data has 34000 rows and 27 columns, We are tyring to predict the house price in a particular location.

    1. If Linear Regression is applied on this data then there can be a problem of Overfitting. a. Linear Regression is directly applied on all the 27 columns in the data hence the problem overfitting arises.
    
    2. If Ridge Regression is applied on the same data instead of normal Linear Regression then also we avoid the problem of overfitting. How?  
    a. Ridge Regression consider all the columns present in the data but the unwanted are zerod out.

In [32]:
from sklearn.linear_model import Lasso
las = Lasso()
las

In [33]:
las.fit(X_train, y_train)

In [43]:
y_pred = las.predict(X_test)
y_pred

array([ 750643.93873177,  797960.49514603, 1243602.78396804, ...,
       1512454.73296505,  647861.79008042,  658152.08997326])

In [44]:
## Checking accuracy on train data

las.score(X_train, y_train)

0.4957063272178599

In [45]:
## Checking accuracy on test data

las.score(X_test, y_test)

0.4717390770158346

### Note:

- In case of plain Linear Regression Alogrithm, the accuracy on train data = 49.47 and accuracy on test data = -3.90. (Error is low in train data and high in test data indicating Overfitting)
- In case of Lasso Regression, the accuracy on train data = 49.47 and accuracy on test data = 47.17.(Error on train data is low and error on test data is also low indicating no overfitting)

## Applying Ridge Regression on the data

In [37]:
from sklearn.linear_model import Ridge
rid = Ridge()
rid

In [38]:
rid.fit(X_train, y_train)

In [39]:
y_pred = rid.predict(X_test)
y_pred

array([ 764678.06495257,  791848.82526054, 1240250.82466861, ...,
       1515205.26027276,  638360.06998183,  647524.85385354])

In [41]:
rid.score(X_train, y_train)

0.4947786587205505

In [42]:
rid.score(X_test, y_test)

0.47542897153645625

### Note:

- In case of plain Linear Regression Alogrithm, the accuracy on train data = 49.47 and accuracy on test data = -3.90. (Error is low in train data and high in test data indicating Overfitting)
- In case of Ridge Regression, the accuracy on train data = 49.47 and accuracy on test data = 47.54.(Error on train data is low and error on test data is also low indicating no overfitting)

## Underfitting

- High Bias and Low Variance - Strategies to deal with underfitting as well