In [137]:
#Loading packages

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [185]:
#Defining Root Mean Square Logarithmic Error as Kaggle uses that as the score

def rmsle(y_pred, y_true):
    assert len(y_pred) == len(y_true), "Lengths do not match!"
    return np.sqrt( np.mean( np.power( np.log1p(y_pred)-np.log1p(y_true), 2 ) ) )

In [176]:
#Loading data

df_Original = pd.read_csv('train.csv')

#Saving and dropping the dependant variable
#Y_SalePrice = df_Original.SalePrice
#df_Original = df_Original.drop(["SalePrice"], axis = 1)

#Displaying first 5 rows
df_Original.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [177]:
#Removing Id column

df_PreProcessed = df_Original.drop(["Id"], axis = 1)

df_Original.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [178]:
#Selecting categorical variable

FreqCountDict = {}

for column in df_PreProcessed:
    FreqCount = df_PreProcessed[column].value_counts()
    if len(FreqCount) < 50:
        FreqCountDict[column] = FreqCount

#All the categorical columns
#L
print(FreqCountDict.keys(), "\n\n Number of categorical variables = ", len(FreqCountDict))


dict_keys(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'PavedDrive', '3SsnPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']) 

 Number of categorical variables =  61


In [179]:
#Treating NAs in categorical variables before OneHotEncoding
#We first get the list of all categorical features with NAs as the data description has NA as a value in
#most categorical features

CategoricalFeatures_WithNAs = []

for cat_feature in list(FreqCountDict.keys()):
    if df_PreProcessed[cat_feature].isnull().any():
        CategoricalFeatures_WithNAs.append(cat_feature)

CategoricalFeatures_WithNAs

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [180]:
#Looking at the "data_description.txt" file, only MasVnrType & Electrical features are truly NAs. NAs in the rest
#of the features are explained in data description file

print("Number of NAs in MasVnrType : ", df_PreProcessed['MasVnrType'].isnull().sum())
print("\nNumber of NAs in Electrical : ", df_PreProcessed['Electrical'].isnull().sum())

#Replacing NAs with relevant category

df_PreProcessed['MasVnrType'] = df_PreProcessed['MasVnrType'].fillna("None")
df_PreProcessed['Electrical'] = df_PreProcessed['Electrical'].fillna("SBrkr")

#Setting all other categorical feature NAs to "Absent"
df_PreProcessed[['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
                 'GarageType', 'GarageFinish', 'GarageQual',
                 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']] = df_PreProcessed[['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
                 'GarageType', 'GarageFinish', 'GarageQual',
                 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']].fillna("Absent")

Number of NAs in MasVnrType :  8

Number of NAs in Electrical :  1


In [181]:
#Treating NAs of numerical variables before OneHotEncoding adds columns to the data

#Identifying the numerical columns with NAs
print(df_PreProcessed.dtypes[df_PreProcessed.isnull().any().values])

print("\nNumber of NAs in LotFrontage : ", df_PreProcessed['LotFrontage'].isnull().sum())
print("\nNumber of NAs in MasVnrArea : ", df_PreProcessed['MasVnrArea'].isnull().sum())
print("\nNumber of NAs in GarageYrBlt : ", df_PreProcessed['GarageYrBlt'].isnull().sum())

#Replacing LotFrontage with mean value
df_PreProcessed['LotFrontage'] = df_PreProcessed['LotFrontage'].fillna(np.nanmean(df_PreProcessed.LotFrontage.values))

#Replacing LotFrontage with mean value
df_PreProcessed['MasVnrArea'] = df_PreProcessed['MasVnrArea'].fillna(np.nanmean(df_PreProcessed['MasVnrArea']))

#Replacing NAs in GarageYrBlt to 0
df_PreProcessed['GarageYrBlt'] = df_PreProcessed['GarageYrBlt'].fillna(0)

LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

Number of NAs in LotFrontage :  259

Number of NAs in MasVnrArea :  8

Number of NAs in GarageYrBlt :  81


In [182]:
#OneHotEncoding categorical features

for cat_feature in list(FreqCountDict.keys()):
    df_PreProcessed = df_PreProcessed.join(pd.get_dummies(df_Original[cat_feature], prefix = cat_feature, prefix_sep= "_"))
    df_PreProcessed = df_PreProcessed.drop([cat_feature], axis = 1)

print("Length of original DataFrame : ", len(df_Original.columns),
      "\n\nLength of DataFrame after OneHotEncoding : ", len(df_PreProcessed.columns))

Length of original DataFrame :  81 

Length of DataFrame after OneHotEncoding :  442


In [186]:
#Running linear regression to set baseline

#Splitting into test and train datasets on a 70/30 split
train = df_PreProcessed[:round(0.7*len(df_PreProcessed))]
test = df_PreProcessed[round(0.7*len(df_PreProcessed)) + 1:len(df_PreProcessed)]

#Removing dependent variable from test and train set
Y = train.SalePrice
X = train.drop(["SalePrice"], axis = 1)

Y_Truth = test.SalePrice
test = test.drop(["SalePrice"], axis = 1)

#Training the regression model
LinReg = LinearRegression()
LinReg.fit(X=X, y=Y)

predY = LinReg.predict(X=test)

print("\n R^2 = ", r2_score(y_pred=predY, y_true= Y_Truth))

print("\n RMSLE = ", rmsle(y_pred=predY, y_true= Y_Truth))


 R^2 =  0.667677813537

 RMSLE =  0.176666793536


In [173]:
#Calculating House & Garage Age

df_PreProcessed["GarageAge"] = df_PreProcessed.YrSdf_PreProcessed.GarageYrBlt

array([], dtype=float64)

In [None]:
1 !