# MLRM (Multiple Linear Regression Model)

In [1]:
# Setting New Working Directory.
# All data files in this notebook are opened with relative paths, so the kernel
# has to be pointed at the project folder first.

# First Import Python's os Module.
import os

# Project folder. Written as a raw string (r"...") so the backslashes of the
# Windows path are taken literally: "\E" and "\C" are not valid escape sequences
# and a plain string literal triggers an invalid-escape warning on recent Python.
# Edit this one constant when running the notebook on another machine.
PROJECT_DIR = r"F:\EDP on BUSINESS ANALYTICS\CAPSTONE PROJECT"

# Print Current Working Directory.
print("Current Working Directory is:",os.getcwd())

# Change the Current Working Directory using os.chdir(path)
os.chdir(PROJECT_DIR)
print("New Working Directory is:",os.getcwd())

In [2]:
# Suppress Warnings
# Installs a blanket 'ignore' filter so warnings do not clutter the cell outputs.
# NOTE(review): no category or module is given, so this silences every warning
# issued through the warnings module in all later cells, deprecations included.
import warnings
warnings.filterwarnings('ignore')

In [3]:
## Import Required Packages
%matplotlib inline
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
import statsmodels.formula.api as sm
import matplotlib.pylab as plt

from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

# REGRESSION MODELLING USING TRAINING DATA

In [4]:
# Reduce data frame to the first N_ROWS rows and select columns for regression analysis
N_ROWS = 100  # sample size used for every model in this notebook
house_df = pd.read_csv('kc_house_data.csv')
house_df = house_df.iloc[0:N_ROWS]

# NOTE(review): 'age' and 'renovated' must already be columns of the CSV -
# confirm the file on disk is the preprocessed version that carries them.
predictors = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors','waterfront', 'view',
              'condition', 'grade', 'age', 'renovated']
outcome = 'price'

# partition data: 60% training / 40% validation; random_state fixes the split
# so the numbers are reproducible from run to run
X = pd.get_dummies(house_df[predictors], drop_first=True)
y = house_df[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

# fit a linear regression on the training partition only
house_lm = LinearRegression()
house_lm.fit(train_X, train_y)

# print coefficients
print('intercept ', house_lm.intercept_)
print(pd.DataFrame({'Predictor': X.columns, 'coefficient': house_lm.coef_}))

# print performance measures (computed on the training partition)
regressionSummary(train_y, house_lm.predict(train_X))


intercept  -814689.2580388592
      Predictor    coefficient
0      bedrooms  -36350.665606
1     bathrooms    -558.322408
2   sqft_living     105.292944
3      sqft_lot      -0.052909
4        floors    7787.064450
5    waterfront  341284.347719
6          view  121463.361918
7     condition  -31044.232601
8         grade  154998.152898
9           age    2531.364777
10    renovated   23341.937801

Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 171851.2540
            Mean Absolute Error (MAE) : 127425.4558
          Mean Percentage Error (MPE) : -8.8076
Mean Absolute Percentage Error (MAPE) : 26.9430


In [5]:
# Fit statistics of the current house_lm on the training partition.
pred_y = house_lm.predict(train_X)

# Each metric takes the same (actual, predicted, model) arguments, so pair every
# label with its scoring function and print them in order.
for metric_label, metric_fn in (
    ('adjusted r2 : ', adjusted_r2_score),
    ('AIC : ', AIC_score),
    ('BIC : ', BIC_score),
):
    print(metric_label, metric_fn(train_y, pred_y, house_lm))

adjusted r2 :  0.6746799489047288
AIC :  1642.7987735076026
BIC :  1670.02525281649


# Regression Modelling on Validation Data

In [6]:
# Score the held-out validation rows with the fitted model.
house_lm_pred = house_lm.predict(valid_X)

# Residual = actual price minus predicted price, one value per validation row.
residuals = valid_y - house_lm_pred
result = pd.DataFrame({
    'Predicted': house_lm_pred,
    'Actual': valid_y,
    'Residual': residuals,
})
print(result.head(20))

# Error metrics on the validation partition.
regressionSummary(valid_y, house_lm_pred)

        Predicted     Actual       Residual
80  514834.550491   390000.0 -124834.550491
84  180549.912500   315000.0  134450.087500
33  535497.263422   535000.0    -497.263422
81  303907.174267   360000.0   56092.825733
93  588542.878393   430000.0 -158542.878393
17  430686.806078   485000.0   54313.193922
36  171597.763455   550000.0  378402.236545
82  493630.160334   355000.0 -138630.160334
69  619379.888722  1330000.0  710620.111278
65  535452.614668   317625.0 -217827.614668
92  335300.176215   153000.0 -182300.176215
39  524502.746275   605000.0   80497.253725
56  300331.623937   292500.0   -7831.623937
52  356664.775989   600000.0  243335.224011
51  580983.337309   345000.0 -235983.337309
32  474187.134536   687500.0  213312.865464
31  277532.813421   280000.0    2467.186579
44  165319.976125   309000.0  143680.023875
78  341843.477874   410000.0   68156.522126
10  730395.346804   662500.0  -67895.346804

Regression statistics

                      Mean Error (ME) : 17567.2572
 

# Exhaustive Search Model

In [7]:
def train_model(variables):
    """Fit a LinearRegression on the training rows using only `variables`.

    `variables` selects columns of the notebook-level `train_X`; the target is
    the notebook-level `train_y`. Returns the fitted estimator.
    """
    # fit() hands back the estimator itself, so build, fit and return in one step.
    return LinearRegression().fit(train_X[variables], train_y)

def score_model(model, variables):
    """Return the negated training adjusted R^2 of `model` on `variables`."""
    fitted = model.predict(train_X[variables])
    adj_r2 = adjusted_r2_score(train_y, fitted, model)
    # The search treats lower scores as better, so flip the sign to make a
    # higher adjusted R^2 win.
    return -adj_r2

# Run the exhaustive search over all training columns and tabulate, for each
# returned model, its size, adjusted R^2, AIC and which predictors it contains.
allVariables = train_X.columns
results = exhaustive_search(allVariables, train_model, score_model)

data = []
# The loop variable is deliberately not called `result`: that name holds the
# validation-prediction DataFrame built earlier and must not be overwritten.
for entry in results:
    chosen = entry['variables']
    fitted_model = entry['model']
    aic = AIC_score(train_y, fitted_model.predict(train_X[chosen]), fitted_model)

    # score_model returned the negated adjusted R^2, so undo the sign here.
    row = {'n': entry['n'], 'r2adj': -entry['score'], 'AIC': aic}
    # One boolean column per candidate predictor: True when the model uses it.
    row.update({var: var in chosen for var in allVariables})
    data.append(row)

# option_context widens the display for this print only and restores the
# previous setting afterwards, even if printing raises.
with pd.option_context('display.width', 100):
    print(pd.DataFrame(data, columns=('n', 'r2adj', 'AIC') + tuple(sorted(allVariables))))

     n     r2adj          AIC    age  bathrooms  bedrooms  condition  floors  grade  renovated  \
0    1  0.511919  1658.493557  False      False     False      False   False   True      False   
1    2  0.655043  1638.626420  False      False     False      False   False   True      False   
2    3  0.682112  1634.661128   True      False     False      False   False   True      False   
3    4  0.691342  1633.812255   True      False     False      False   False   True      False   
4    5  0.700169  1632.970260   True      False     False      False   False   True      False   
5    6  0.700956  1633.691164   True      False      True      False   False   True      False   
6    7  0.699426  1634.854372   True      False      True       True   False   True      False   
7    8  0.693734  1636.814869   True      False      True       True    True   True      False   
8    9  0.687689  1638.799493   True      False      True       True    True   True       True   
9   10  0.681319  16

# Backward Elimination Model

In [8]:
def train_model(variables):
    """Return a LinearRegression fitted on `train_X[variables]` against `train_y`."""
    subset = train_X[variables]
    estimator = LinearRegression()
    estimator.fit(subset, train_y)
    return estimator

def score_model(model, variables):
    """Return the AIC of `model` on the training rows restricted to `variables`."""
    fitted = model.predict(train_X[variables])
    return AIC_score(train_y, fitted, model)

# Backward elimination over every training column, scored by score_model;
# verbose=True prints the score after each step.
best_model, best_variables = backward_elimination(
    train_X.columns,
    train_model,
    score_model,
    verbose=True,
)

# Predictors that remain in the selected model.
print(best_variables)

Variables: bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, age, renovated
Start: score=1642.80
Step: score=1640.80, remove bathrooms
Step: score=1638.80, remove sqft_lot
Step: score=1636.81, remove renovated
Step: score=1634.85, remove floors
Step: score=1633.69, remove condition
Step: score=1632.97, remove bedrooms
Step: score=1632.97, remove None
['sqft_living', 'waterfront', 'view', 'grade', 'age']


In [9]:
# Validation-set error metrics for the currently selected model, restricted to
# the predictors it was trained on.
selected_valid_pred = best_model.predict(valid_X[best_variables])
regressionSummary(valid_y, selected_valid_pred)


Regression statistics

                      Mean Error (ME) : 20020.2085
       Root Mean Squared Error (RMSE) : 195727.2469
            Mean Absolute Error (MAE) : 133386.7047
          Mean Percentage Error (MPE) : -6.0251
Mean Absolute Percentage Error (MAPE) : 30.8875


# Forward Selection Model

In [10]:
# The initial model is the constant model - this requires special handling
# in train_model and score_model
def train_model(variables):
    """Fit a LinearRegression on `train_X[variables]`, or return None when empty.

    An empty `variables` stands for the constant model, which has nothing to
    fit; score_model handles that case separately.
    """
    if len(variables) > 0:
        return LinearRegression().fit(train_X[variables], train_y)
    return None

def score_model(model, variables):
    """Return the training AIC for `model`, with a fallback for the constant model."""
    if len(variables) > 0:
        return AIC_score(train_y, model.predict(train_X[variables]), model)
    # No predictors: score a baseline that predicts the training mean for every
    # row, with the degrees of freedom passed explicitly as df=1.
    mean_baseline = [train_y.mean()] * len(train_y)
    return AIC_score(train_y, mean_baseline, model, df=1)

# Forward selection over every training column, scored by score_model;
# verbose=True prints the score after each step.
best_model, best_variables = forward_selection(
    train_X.columns,
    train_model,
    score_model,
    verbose=True,
)

# Predictors in the selected model, in the order returned by the search.
print(best_variables)

Variables: bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, age, renovated
Start: score=1700.56, constant
Step: score=1658.49, add grade
Step: score=1638.63, add view
Step: score=1634.66, add age
Step: score=1633.81, add sqft_living
Step: score=1632.97, add waterfront
Step: score=1632.97, add None
['grade', 'view', 'age', 'sqft_living', 'waterfront']


# Stepwise Selection Model

In [11]:
# Stepwise selection; reuses whichever train_model / score_model are currently
# defined in the kernel. verbose=True prints the score after each step.
best_model, best_variables = stepwise_selection(
    train_X.columns,
    train_model,
    score_model,
    verbose=True,
)

# Predictors in the selected model.
print(best_variables)

Variables: bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, age, renovated
Start: score=1700.56, constant
Step: score=1658.49, add grade
Step: score=1638.63, add view
Step: score=1634.66, add age
Step: score=1633.81, add sqft_living
Step: score=1632.97, add waterfront
Step: score=1632.97, unchanged None
['grade', 'view', 'age', 'sqft_living', 'waterfront']


In [13]:
# Refit the regression on a reduced, hard-coded set of five predictors.
# NOTE: this cell rebinds predictors, X, y, the train/valid partitions and
# house_lm, so any earlier cell re-run afterwards sees the reduced data.
predictors = ['grade', 'view', 'age', 'sqft_living', 'waterfront']
outcome = 'price'

# Same 60/40 split and seed as the full model, applied to the reduced matrix.
y = house_df[outcome]
X = pd.get_dummies(house_df[predictors], drop_first=True)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

# fit() returns the estimator itself, so construct and fit in one expression.
house_lm = LinearRegression().fit(train_X, train_y)

# Intercept followed by one coefficient per predictor column.
print('intercept ', house_lm.intercept_)
coef_table = pd.DataFrame({'Predictor': X.columns, 'coefficient': house_lm.coef_})
print(coef_table)

# Error metrics on the training partition.
train_pred = house_lm.predict(train_X)
regressionSummary(train_y, train_pred)


intercept  -1008749.699889056
     Predictor    coefficient
0        grade  161126.892851
1         view  121151.274110
2          age    2198.502151
3  sqft_living      78.059468
4   waterfront  315100.268110

Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 174989.3334
            Mean Absolute Error (MAE) : 130751.5255
          Mean Percentage Error (MPE) : -9.0646
Mean Absolute Percentage Error (MAPE) : 27.4209


In [14]:
# Fit statistics of the current house_lm on the training partition.
pred_y = house_lm.predict(train_X)

# All three scorers take (actual, predicted, model); bundle the arguments once.
fit_args = (train_y, pred_y, house_lm)

print('adjusted r2 : ', adjusted_r2_score(*fit_args))
print('AIC : ', AIC_score(*fit_args))
print('BIC : ', BIC_score(*fit_args))

adjusted r2 :  0.7001693323896968
AIC :  1632.970259865393
BIC :  1647.6306718009478
