In [1]:
import pandas as pd
import numpy as np

# Location of the raw housing data (absolute path on the author's machine).
housing_csv_path = 'D:/upgrad/ML/Housing.csv'
housing = pd.read_csv(housing_csv_path)

## Data Understanding

In [2]:
# Peek at the first rows to check that the columns loaded as expected.
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
# Summarise dtypes and non-null counts; object columns still need encoding.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
price               545 non-null int64
area                545 non-null int64
bedrooms            545 non-null int64
bathrooms           545 non-null int64
stories             545 non-null int64
mainroad            545 non-null object
guestroom           545 non-null object
basement            545 non-null object
hotwaterheating     545 non-null object
airconditioning     545 non-null object
parking             545 non-null int64
prefarea            545 non-null object
furnishingstatus    545 non-null object
dtypes: int64(6), object(7)
memory usage: 55.4+ KB


## Data Preparation

The categorical columns must be converted to numeric values before a linear regression (LR) model can be fitted.
Several columns contain only yes/no values, so these are mapped to 1/0.

In [4]:
def convert_boolean_to_numeric(df):
    """
    Convert the yes/no columns of ``df`` into 1/0, in place.

    A non-numeric column is converted only when every non-missing value in
    it is exactly 'yes' or 'no'; all other columns are left untouched.
    The whole column is inspected (not just the row labelled 0), so the
    function works on empty frames and on frames whose index does not
    contain the label 0, and it never turns an unexpected value such as
    'maybe' into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Its yes/no columns are overwritten.

    Returns
    -------
    None
        The frame is modified in place.

    @author Aryan Singh
    """
    yes_no_codes = {'yes': 1, 'no': 0}
    # Only non-numeric columns can hold the 'yes'/'no' strings.
    candidates = df.select_dtypes(exclude=['number', 'bool']).columns
    for column in candidates:
        observed = df[column].dropna()
        # Skip columns with no data or with any value other than yes/no.
        if not observed.empty and observed.isin(list(yes_no_codes)).all():
            df[column] = df[column].map(yes_no_codes)

In [5]:
# Called for its side effect: the yes/no columns of `housing` are overwritten in place.
convert_boolean_to_numeric(housing)

In [6]:
# Re-check dtypes: the mapped yes/no columns should now report as numeric.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
price               545 non-null int64
area                545 non-null int64
bedrooms            545 non-null int64
bathrooms           545 non-null int64
stories             545 non-null int64
mainroad            545 non-null int64
guestroom           545 non-null int64
basement            545 non-null int64
hotwaterheating     545 non-null int64
airconditioning     545 non-null int64
parking             545 non-null int64
prefarea            545 non-null int64
furnishingstatus    545 non-null object
dtypes: int64(12), object(1)
memory usage: 55.4+ KB


In [7]:
# Eyeball the mapped values after the yes/no conversion.
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [8]:
def convert_categorical_to_numeric(df):
    """
    Replace every non-numeric column of ``df`` with dummy (indicator) columns.

    Each non-numeric column is one-hot encoded with its first level dropped.
    The dummy columns are named after the category values (no prefix) and
    are appended after the remaining columns, in the order of the columns
    they replace. The dummies are stored as ``uint8`` so that they are
    numeric regardless of the installed pandas version (newer releases
    default to ``bool``, which breaks later arithmetic such as
    ``max(x) - min(x)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the non-numeric columns replaced by dummies, or
        ``df`` itself when it has no non-numeric columns.
    """
    cat_columns = list(df.select_dtypes(exclude=['number', 'bool']).columns)
    if not cat_columns:
        return df
    dummies = [
        pd.get_dummies(df[column], drop_first=True, dtype='uint8')
        for column in cat_columns
    ]
    # One concat for all columns instead of growing the frame inside a loop.
    return pd.concat([df.drop(cat_columns, axis=1)] + dummies, axis=1)

In [9]:
# Swap the remaining non-numeric columns for dummy columns and rebind `housing` to the returned frame.
housing = convert_categorical_to_numeric(housing)

In [10]:
# Confirm the dummy columns now appear in place of the original categorical column.
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0


In [11]:
# Check whether any non-numeric columns remain after encoding.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
price              545 non-null int64
area               545 non-null int64
bedrooms           545 non-null int64
bathrooms          545 non-null int64
stories            545 non-null int64
mainroad           545 non-null int64
guestroom          545 non-null int64
basement           545 non-null int64
hotwaterheating    545 non-null int64
airconditioning    545 non-null int64
parking            545 non-null int64
prefarea           545 non-null int64
semi-furnished     545 non-null uint8
unfurnished        545 non-null uint8
dtypes: int64(12), uint8(2)
memory usage: 52.2 KB


## Creating derived variables

In [12]:
# Derived feature: area available per bedroom.
housing['areaperbedroom'] = housing['area'].div(housing['bedrooms'])

In [13]:
# Derived feature: number of bathrooms per bedroom.
housing['bbratio'] = housing['bathrooms'].div(housing['bedrooms'])

In [14]:
# Inspect the two derived ratio columns.
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished,areaperbedroom,bbratio
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0,1855.0,0.5
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0,2240.0,1.0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0,3320.0,0.666667
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0,1875.0,0.5
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0,1855.0,0.25


## Rescaling the features

In [15]:
def normalise(x):
    """
    Mean-normalise ``x``: subtract its mean and divide by its range.

    Computes ``(x - mean(x)) / (max(x) - min(x))``. The extremes are taken
    with ``np.max``/``np.min`` so that, for a pandas Series, missing values
    are skipped consistently with ``np.mean`` instead of giving an
    order-dependent result as the built-in ``max``/``min`` do.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric (or boolean) values to rescale.

    Returns
    -------
    Same container type as ``x``
        The rescaled values. When all values are equal the range is 0; the
        centred values (all zeros) are returned instead of dividing 0 by 0.
    """
    centred = x - np.mean(x)
    # float() so boolean extremes subtract as 1.0 - 0.0 rather than failing in NumPy.
    span = float(np.max(x)) - float(np.min(x))
    if span == 0:
        # Constant input: every centred value is already 0; avoid 0/0 -> NaN.
        return centred
    return centred / span

In [16]:
# Rescale every column with `normalise` (DataFrame.apply works column by column by default).
# NOTE(review): this also rescales the target `price`, and it runs before the
# train/test split, so the scaling statistics come from the full data set —
# confirm this is intended.
housing = housing.apply(normalise)

In [17]:
# Check the rescaled values.
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished,areaperbedroom,bbratio
0,0.738811,0.155977,0.206972,0.23792,0.398165,0.141284,-0.177982,-0.350459,-0.045872,0.684404,0.435474,0.765138,-0.416514,-0.326606,0.005652,0.064367
1,0.647902,0.261818,0.206972,0.904587,0.731498,0.141284,-0.177982,-0.350459,-0.045872,0.684404,0.768807,-0.234862,-0.416514,-0.326606,0.067559,0.664367
2,0.647902,0.330547,0.006972,0.23792,0.064832,0.141284,-0.177982,0.649541,-0.045872,-0.315596,0.435474,0.765138,0.583486,-0.326606,0.24122,0.264367
3,0.644872,0.161475,0.206972,0.23792,0.064832,0.141284,-0.177982,0.649541,-0.045872,0.684404,0.768807,0.765138,-0.416514,-0.326606,0.008868,0.064367
4,0.575175,0.155977,0.206972,-0.095413,0.064832,0.141284,0.822018,0.649541,-0.045872,0.684404,0.435474,-0.234862,-0.416514,-0.326606,0.005652,-0.235633


## Splitting into train and test dataset

In [18]:
# The target is the price column; every other column is a predictor.
y = housing['price']
X = housing.drop('price', axis=1)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# 70/30 train/test partition; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

## Build Linear Model using RFE

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

lm = LinearRegression()
# Recursive feature elimination down to 10 features. `n_features_to_select`
# is passed by keyword: newer scikit-learn releases accept it as
# keyword-only, so the positional form `RFE(lm, 10)` raises TypeError there.
rfe = RFE(lm, n_features_to_select=10)
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
  n_features_to_select=10, step=1, verbose=0)

In [22]:
# Boolean mask of the features RFE kept, then each feature's rank (rank 1 = kept).
print(rfe.support_)
print(rfe.ranking_)

[ True False  True  True  True False False  True  True  True  True False
 False  True  True]
[1 2 1 1 1 3 5 1 1 1 1 6 4 1 1]


In [23]:
# List the feature names so the RFE mask and ranking can be matched to columns by position.
X_train.columns

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom',
       'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea',
       'semi-furnished', 'unfurnished', 'areaperbedroom', 'bbratio'],
      dtype='object')

In [24]:
# Names of the columns RFE rejected (mask inverted to pick the unselected ones).
cdrop = X_train.columns[~rfe.support_].tolist()

In [25]:
import statsmodels.api as sm

# Keep only the RFE-selected features. errors='ignore' lets this cell be re-run
# after the columns have already been dropped, instead of raising KeyError.
X_train = X_train.drop(cdrop, axis=1, errors='ignore')

# statsmodels' OLS does not add an intercept on its own, so add a constant column.
X_train_1 = sm.add_constant(X_train)

# Fit ordinary least squares on the selected features and show the full report.
lsm_2 = sm.OLS(y_train, X_train_1).fit()
print(lsm_2.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.668
Model:                            OLS   Adj. R-squared:                  0.659
Method:                 Least Squares   F-statistic:                     74.47
Date:                Sat, 09 Jun 2018   Prob (F-statistic):           2.57e-82
Time:                        10:41:37   Log-Likelihood:                 373.93
No. Observations:                 381   AIC:                            -725.9
Df Residuals:                     370   BIC:                            -682.5
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0025      0.005     