In [1]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from joblib import dump, load

from pandas.plotting import scatter_matrix
import pandas as pd

import numpy as np
import helpers

# Data Preprocessing

In [2]:
with open('housing.data') as myFile:
    content = myFile.read()

lines = content.split('\n')
data = []

for line in lines:
    data.append(line.split(' '))

# Remove extra whitespaces
for i in range(len(data)):
    for j in range(len(data[i])):
        data[i][j] = data[i][j].strip()

# Remove empty entries
newData = []
x = []

for i in data:
    for j in i:
        if (j != ''):
            x.append(j)

    newData.append(x)
    x = []

newData

# Building a DataFrame
columns = [ 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df = pd.DataFrame(newData, columns=columns)
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98,24.00
1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14,21.60
2,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03,34.70
3,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94,33.40
4,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33,36.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,0.04527,0.00,11.930,0,0.5730,6.1200,76.70,2.2875,1,273.0,21.00,396.90,9.08,20.60
503,0.06076,0.00,11.930,0,0.5730,6.9760,91.00,2.1675,1,273.0,21.00,396.90,5.64,23.90
504,0.10959,0.00,11.930,0,0.5730,6.7940,89.30,2.3889,1,273.0,21.00,393.45,6.48,22.00
505,0.04741,0.00,11.930,0,0.5730,6.0300,80.80,2.5050,1,273.0,21.00,396.90,7.88,11.90


In [3]:
# Drop the last row
df = df.drop(df.index[-1])
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98,24.00
1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14,21.60
2,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03,34.70
3,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94,33.40
4,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33,36.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.00,11.930,0,0.5730,6.5930,69.10,2.4786,1,273.0,21.00,391.99,9.67,22.40
502,0.04527,0.00,11.930,0,0.5730,6.1200,76.70,2.2875,1,273.0,21.00,396.90,9.08,20.60
503,0.06076,0.00,11.930,0,0.5730,6.9760,91.00,2.1675,1,273.0,21.00,396.90,5.64,23.90
504,0.10959,0.00,11.930,0,0.5730,6.7940,89.30,2.3889,1,273.0,21.00,393.45,6.48,22.00


In [4]:
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [5]:
# Giving columns better Names
df.rename(columns={'CRIM' : 'Crime Rate', 'ZN' : 'Residential Land Proportions', 'INDUS' : 'Non-Retail Business Proportions', 'CHAS' : 'Charles River', 'NOX' : 'NO Concentration', 'RM' : 'Avg no. of Rooms', 'AGE' : 'Owner Occupied Units', 'DIS' : 'Weighted Distances', 'RAD' : 'Radial Highways Accessibility', 'TAX' : 'Tax',
       'PTRATIO' : 'Pupil-Teacher Ratio', 'B' : 'Bks', 'LSTAT' : 'Lower Status Population %', 'MEDV' : 'Median Owner Occupied Home Value'}, inplace=True)

In [6]:
df

Unnamed: 0,Crime Rate,Residential Land Proportions,Non-Retail Business Proportions,Charles River,NO Concentration,Avg no. of Rooms,Owner Occupied Units,Weighted Distances,Radial Highways Accessibility,Tax,Pupil-Teacher Ratio,Bks,Lower Status Population %,Median Owner Occupied Home Value
0,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98,24.00
1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14,21.60
2,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03,34.70
3,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94,33.40
4,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33,36.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.00,11.930,0,0.5730,6.5930,69.10,2.4786,1,273.0,21.00,391.99,9.67,22.40
502,0.04527,0.00,11.930,0,0.5730,6.1200,76.70,2.2875,1,273.0,21.00,396.90,9.08,20.60
503,0.06076,0.00,11.930,0,0.5730,6.9760,91.00,2.1675,1,273.0,21.00,396.90,5.64,23.90
504,0.10959,0.00,11.930,0,0.5730,6.7940,89.30,2.3889,1,273.0,21.00,393.45,6.48,22.00


In [7]:
# Converting all Features to Float
df = df.astype(float)

# Data Exploration

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Crime Rate                        506 non-null    float64
 1   Residential Land Proportions      506 non-null    float64
 2   Non-Retail Business Proportions   506 non-null    float64
 3   Charles River                     506 non-null    float64
 4   NO Concentration                  506 non-null    float64
 5   Avg no. of Rooms                  506 non-null    float64
 6   Owner Occupied Units              506 non-null    float64
 7   Weighted Distances                506 non-null    float64
 8   Radial Highways Accessibility     506 non-null    float64
 9   Tax                               506 non-null    float64
 10  Pupil-Teacher Ratio               506 non-null    float64
 11  Bks                               506 non-null    float64
 12  Lower St

In [9]:
df['Charles River'].value_counts()

Charles River
0.0    471
1.0     35
Name: count, dtype: int64

In [10]:
# Saving my DataFrame
safe = df

In [11]:
df.describe()

Unnamed: 0,Crime Rate,Residential Land Proportions,Non-Retail Business Proportions,Charles River,NO Concentration,Avg no. of Rooms,Owner Occupied Units,Weighted Distances,Radial Highways Accessibility,Tax,Pupil-Teacher Ratio,Bks,Lower Status Population %,Median Owner Occupied Home Value
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [12]:
df.rename(columns={'Owner Occupied Units' : 'House Age'})
#df.hist(bins=50, figsize=(20, 15))

Unnamed: 0,Crime Rate,Residential Land Proportions,Non-Retail Business Proportions,Charles River,NO Concentration,Avg no. of Rooms,House Age,Weighted Distances,Radial Highways Accessibility,Tax,Pupil-Teacher Ratio,Bks,Lower Status Population %,Median Owner Occupied Home Value
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


# Seperating Training & Test Data

In [13]:
trainData, testData = helpers.splitData(df, 0.2)

In [14]:
print(f'Training Data Rows : {len(trainData)}\nTesting Data Rows  : {len(testData)}')

Training Data Rows : 405
Testing Data Rows  : 101


In [15]:
# NOTE : Can do the above splitting using sklearn built-in library as well
# trainData, testData = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
# Here there is a problem that we need to address. 
# It might be possible that the above eplitting method randomly does not include a particular value of a Feature.
# e.g : 405 (0) values of Charles River may be included and none of the (1) values
# Therefore, for our Training Data to represent all values equally, we should always use Stratified Sampling

x = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for trainIndices, testIndices in x.split(df, df['Charles River']):
    trainData = df.loc[trainIndices]
    testData  = df.loc[testIndices]

testData

Unnamed: 0,Crime Rate,Residential Land Proportions,Non-Retail Business Proportions,Charles River,NO Concentration,Avg no. of Rooms,Owner Occupied Units,Weighted Distances,Radial Highways Accessibility,Tax,Pupil-Teacher Ratio,Bks,Lower Status Population %,Median Owner Occupied Home Value
342,0.02498,0.0,1.89,0.0,0.518,6.540,59.7,6.2669,1.0,422.0,15.9,389.96,8.65,16.5
379,17.86670,0.0,18.10,0.0,0.671,6.223,100.0,1.3861,24.0,666.0,20.2,393.74,21.78,10.2
223,0.61470,0.0,6.20,0.0,0.507,6.618,80.8,3.2721,8.0,307.0,17.4,396.90,7.60,30.1
219,0.11425,0.0,13.89,1.0,0.550,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.50,23.0
48,0.25387,0.0,6.91,0.0,0.448,5.399,95.3,5.8700,3.0,233.0,17.9,396.90,30.81,14.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,0.05660,0.0,3.41,0.0,0.489,7.007,86.3,3.4217,2.0,270.0,17.8,396.90,5.50,23.6
466,3.77498,0.0,18.10,0.0,0.655,5.952,84.7,2.8715,24.0,666.0,20.2,22.01,17.15,19.0
52,0.05360,21.0,5.64,0.0,0.439,6.511,21.1,6.8147,4.0,243.0,16.8,396.90,5.28,25.0
121,0.07165,0.0,25.65,0.0,0.581,6.004,84.1,2.1974,2.0,188.0,19.1,377.67,14.27,20.3


In [17]:
# Defining a variable for the Label
trainDataX = trainData.drop(columns='Median Owner Occupied Home Value')
trainDfLabel = trainData['Median Owner Occupied Home Value'].copy()
trainDfLabel

254    21.9
348    24.5
476    16.7
321    23.1
326    23.0
       ... 
155    15.6
423    13.4
98     43.8
455    14.1
216    23.3
Name: Median Owner Occupied Home Value, Length: 404, dtype: float64

# Understanding Data Correlations

In [18]:
# Now, lets take a look at how the Features are related to the Label (Median Owner Occupied Home Value)
corrMatrix = df.corr()
corrMatrix['Median Owner Occupied Home Value'].sort_values(ascending=False)

Median Owner Occupied Home Value    1.000000
Avg no. of Rooms                    0.695360
Residential Land Proportions        0.360445
Bks                                 0.333461
Weighted Distances                  0.249929
Charles River                       0.175260
Owner Occupied Units               -0.376955
Radial Highways Accessibility      -0.381626
Crime Rate                         -0.388305
NO Concentration                   -0.427321
Tax                                -0.468536
Non-Retail Business Proportions    -0.483725
Pupil-Teacher Ratio                -0.507787
Lower Status Population %          -0.737663
Name: Median Owner Occupied Home Value, dtype: float64

In [19]:
# Now we can see which Features increase the Label and which of them decrease it.

In [20]:
# Plotting graphs of Strong +ve & -ve Correlations
#scatter_matrix(df[['Median Owner Occupied Home Value', 'Avg no. of Rooms', 'Lower Status Population %']], figsize=(12, 8))

# Histograms represents the value_counts()

In [21]:
# To get a better view of a graph, plot it seperately
#df.plot(kind='scatter', x='Avg no. of Rooms', y='Median Owner Occupied Home Value', alpha=1, figsize=(15, 6))

In [22]:
# Here we can and we should remove all outliers from this dataset as we do not want to confuse our Model with them

# Trying out Attribute Combinations

In [23]:
df['Tax per Room'] = df['Tax'] / df['Avg no. of Rooms']
# trainData['Tax per Room'] = trainData['Tax'] / trainData['Avg no. of Rooms']
# testData['Tax per Room'] = testData['Tax'] / testData['Avg no. of Rooms']
df

Unnamed: 0,Crime Rate,Residential Land Proportions,Non-Retail Business Proportions,Charles River,NO Concentration,Avg no. of Rooms,Owner Occupied Units,Weighted Distances,Radial Highways Accessibility,Tax,Pupil-Teacher Ratio,Bks,Lower Status Population %,Median Owner Occupied Home Value,Tax per Room
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0,45.019011
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6,37.688834
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,33.681280
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,31.723350
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2,31.061984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4,41.407553
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6,44.607843
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9,39.134174
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0,40.182514


In [24]:
corrMatrix = df.corr()
corrMatrix['Median Owner Occupied Home Value'].sort_values(ascending=False)

Median Owner Occupied Home Value    1.000000
Avg no. of Rooms                    0.695360
Residential Land Proportions        0.360445
Bks                                 0.333461
Weighted Distances                  0.249929
Charles River                       0.175260
Owner Occupied Units               -0.376955
Radial Highways Accessibility      -0.381626
Crime Rate                         -0.388305
NO Concentration                   -0.427321
Tax                                -0.468536
Non-Retail Business Proportions    -0.483725
Pupil-Teacher Ratio                -0.507787
Tax per Room                       -0.537650
Lower Status Population %          -0.737663
Name: Median Owner Occupied Home Value, dtype: float64

In [25]:
# Now, we can see that as the Tax per each Room increase, the Label decreases
#df.plot(kind='scatter', x='Tax per Room', y='Median Owner Occupied Home Value', alpha=1, figsize=(15, 6))

In [26]:
# Can remove Outliers from this as well

# Scikit-Learn

It has primarirly three types of objects :
   
     1. Estimators
        - It estimates some parameter based on a DataSet.
    
    2. Transformers
        - Takes input & gives output based on the learnings from fit().
        - Better to use fit_transform() once rather than using fit() & transform() seperately.

    3. Predictors
        - A Linear Regression Model is an example of a Predictor.
        - Three common functions : fit(), predict() & score()   # score() evaluates predictions

# Feature Scaling

We want all the values of our Features to fall in the same range, so we scale all of them with an appropriate number.

Two methods:

    1. Min-Max Scaling or Normalization
        - formula = (value - min) / (max - min)
        - Sklearn provides a class called MinMaxScaler for this.

    2. Standardization
        - (value - mean) / std
        - Sklearn provides a class called StandardScaler for this

# Building a Pipeline

In [27]:
myPipeline = Pipeline([

    ('imputer', SimpleImputer(strategy='median')),
    # ... add as many methods as you want here in your Pipeline
    ('std_scaler', StandardScaler())

])

In [28]:
safe = df.copy()
trainDataX = myPipeline.fit_transform(trainDataX)
trainDataX.shape

(404, 13)

# Model Setup

In [29]:
# trainData.drop(columns='Tax per Room', inplace=True)
# testData.drop(columns='Tax per Room', inplace=True)

In [30]:
# model = LinearRegression()
# model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(trainDataX, trainDfLabel)

In [31]:
# Now, lets try to predict some values and Analyze if the predictions are close or wrong

# Extract some data
someData = trainData.iloc[:5, :-1]
someLabels = trainDfLabel.iloc[:5]

# Prepare the Data by passing it through the built Pipeline
passedData = myPipeline.transform(someData)
model.predict(passedData)

array([22.508, 25.587, 16.363, 23.376, 23.391])

In [32]:
list(someLabels)

[21.9, 24.5, 16.7, 23.1, 23.0]

Therefore, the predictions are somewhat accurate and this is good.

# Checking the Mean Squared Error

In [33]:
dfPredictions = model.predict(trainDataX)
linMSE = mean_squared_error(trainDfLabel, dfPredictions)
linRMSE = np.sqrt(linMSE)
linRMSE

1.1631531338870584

This is acceptable, but we should test other models : DecisionTreeRegressor & CrossValidation

MSE with DecisionTreeRegressor model is 0 which means that this model is overfitting the data.


# Cross Validation Training

In [34]:
# 0 1 2 3 4 5 6 7 8 9
scores = cross_val_score(model, trainDataX, trainDfLabel, scoring='neg_mean_squared_error', cv=10)
rmseScores = np.sqrt(-scores)

In [35]:
rmseScores

array([2.79289168, 2.69441597, 4.40018895, 2.56972379, 3.33073436,
       2.62687167, 4.77007351, 3.27403209, 3.38378214, 3.16691711])

We have the least RMSE for the DecisionTreeRegressor CrossValidation

In [36]:
def printScores(scores):
    print('Scores : ', scores)
    print('Mean   : ', scores.mean())
    print('Std    : ', scores.std())

printScores(rmseScores)

Scores :  [2.79289168 2.69441597 4.40018895 2.56972379 3.33073436 2.62687167
 4.77007351 3.27403209 3.38378214 3.16691711]
Mean   :  3.3009631251857217
Std    :  0.7076841067486248


Writing these outputs of each Model to a Text File so that we can decide on the best Model

Upon comparision of the Three Models that were tested, it can clearly be seen that the RandomForestRegressor is the best Model

# Saving the chosen Model

In [37]:
dump(model, 'mlProject1.joblib')

['mlProject1.joblib']

# Model Testing

In [39]:
xTest = testData.drop(columns='Median Owner Occupied Home Value')
yTest = testData['Median Owner Occupied Home Value'].copy()
xTestPassed = myPipeline.transform(xTest)

finalPredictions = model.predict(xTestPassed)
finalMSE = mean_squared_error(yTest, finalPredictions)
finalRMSE = np.sqrt(finalMSE)

finalRMSE

2.948844070638726

Therefore, Model is working very nicely.