In [28]:
#import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

In [39]:
# Import data
train = pd.read_csv('SupportingFiles/train.csv')
test = pd.read_csv('SupportingFiles/test.csv')
# Keep the test Ids (as a one-column DataFrame) for the submission files
test_ids = test[["Id"]]
# Combine so the cleaning steps only have to be written once for both sets.
# `sort` expects a boolean; the string 'True' only worked because it is truthy.
# The original row labels are kept, so index values repeat across the
# train and test parts of `combine`.
combine = pd.concat([train, test], sort=True)
# Drop all Ids
combine = combine.drop(columns=['Id'])
# Target (dependent variable): sale price of the training rows
y = train['SalePrice']

# Data Cleaning
Find the columns that contain null values and fill them in according to the data description file.

In [40]:
# Percentage of missing values per column; show only columns that have any
null_counts = combine.isnull().sum()
nulls = null_counts / len(combine) * 100
has_nulls = nulls > 0
print(nulls[has_nulls])

Alley           93.216855
BsmtCond         2.809181
BsmtExposure     2.809181
BsmtFinSF1       0.034258
BsmtFinSF2       0.034258
BsmtFinType1     2.706406
BsmtFinType2     2.740665
BsmtFullBath     0.068517
BsmtHalfBath     0.068517
BsmtQual         2.774923
BsmtUnfSF        0.034258
Electrical       0.034258
Exterior1st      0.034258
Exterior2nd      0.034258
Fence           80.438506
FireplaceQu     48.646797
Functional       0.068517
GarageArea       0.034258
GarageCars       0.034258
GarageCond       5.447071
GarageFinish     5.447071
GarageQual       5.447071
GarageType       5.378554
GarageYrBlt      5.447071
KitchenQual      0.034258
LotFrontage     16.649538
MSZoning         0.137033
MasVnrArea       0.787941
MasVnrType       0.822199
MiscFeature     96.402878
PoolQC          99.657417
SalePrice       49.982871
SaleType         0.034258
TotalBsmtSF      0.034258
Utilities        0.068517
dtype: float64


In [41]:
# Nulls are replaced based on the text description file.
#
# Every fill is assigned back to its column. Calling fillna(inplace=True) on
# combine[col] operates on a temporary selection (chained assignment), which
# pandas deprecates and which leaves `combine` unchanged under Copy-on-Write.

# Lot Frontage is replaced with the training-set median
combine['LotFrontage'] = combine['LotFrontage'].fillna(train['LotFrontage'].median())

# Garage Year Built is replaced with the same row's YearBuilt if it is null.
# The fill is done positionally: fillna with a Series aligns on index labels,
# and the labels of `combine` repeat across the train and test parts, so
# filling from train['YearBuilt'] would hand test rows the build year of
# unrelated training houses.
garage_year_missing = combine['GarageYrBlt'].isnull().values
combine['GarageYrBlt'] = np.where(
    garage_year_missing,
    combine['YearBuilt'].values,
    combine['GarageYrBlt'].values,
)

# Masonry veneer area: null -> 0
combine['MasVnrArea'] = combine['MasVnrArea'].fillna(0)

# Categorical columns whose nulls are replaced with "none":
# alley, masonry veneer type, basement, fireplace, garage, pool, fence, misc
none_fill_columns = [
    'Alley',
    'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
]
combine[none_fill_columns] = combine[none_fill_columns].fillna('none')

# Electrical is replaced with the training-set mode
combine['Electrical'] = combine['Electrical'].fillna(train['Electrical'].mode()[0])

# Others: filled with the mode of the combined data
mode_fill_columns = [
    'Utilities', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType',
    'Functional', 'MSZoning', 'GarageCars', 'GarageArea',
]
for column in mode_fill_columns:
    combine[column] = combine[column].fillna(combine[column].mode()[0])

# Others: filled with the median of the combined data
median_fill_columns = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
]
for column in median_fill_columns:
    combine[column] = combine[column].fillna(combine[column].median())

In [42]:
# Re-check: percentage of missing values per column after the fills
missing_per_column = combine.isnull().sum()
nulls = missing_per_column / len(combine) * 100
still_missing = nulls > 0
print(nulls[still_missing])

SalePrice    49.982871
dtype: float64


In [43]:
# Training rows are the rows of the combined frame that have a SalePrice
has_sale_price = combine['SalePrice'].notnull()
train = combine[has_sale_price]

In [44]:
# Find the correlation of the numeric attributes to SalePrice.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on
# string columns in pandas >= 2.0 instead of silently skipping them.
numeric_train = train.select_dtypes(include=[np.number])
# One-column DataFrame labelled "Correlation" so it can be filtered/sorted
traincorr = numeric_train.corr()['SalePrice'].to_frame(name='Correlation')
# Keep the attributes whose absolute correlation with SalePrice exceeds 0.4
# (NaN correlations compare as False, so they are dropped as well)
traincorr_sig = traincorr[traincorr['Correlation'].abs() > .4]
traincorr_sig

Unnamed: 0,Correlation
1stFlrSF,0.605852
Fireplaces,0.466929
FullBath,0.560664
GarageArea,0.623431
GarageCars,0.640409
GarageYrBlt,0.508043
GrLivArea,0.708624
MasVnrArea,0.472614
OverallQual,0.790982
SalePrice,1.0


# Simple Linear Regression
The first attempt at predicting prices uses a plain linear regression on the attributes correlated with SalePrice, to get a baseline result.

In [35]:
# Restrict the training frame to the columns that passed the correlation filter
train_linReg = train.loc[:, traincorr_sig.index]

In [36]:
# One-hot encode any categorical columns, dropping the first level of each
train_linReg = pd.get_dummies(data=train_linReg, drop_first=True)

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import sklearn.metrics as met

# Features: every selected column except the target
X = train_linReg.drop(columns='SalePrice')

# Hold out 30% of the training rows for validation.
# The seed is fixed so the split (and the scores below) are reproducible.
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit multiple linear regression to the training split
model = LinearRegression()
model.fit(x_train, y_train)

# Predict on the validation split with the linear model
y_pred = model.predict(x_cv)

print(f"MAE: {met.mean_absolute_error(y_cv,y_pred)}")

MAE: 23021.7062348616


# Random Forest Regression

In [38]:
# One-hot encode the selected training columns for the forest model
train_forReg = pd.get_dummies(data=train_linReg, drop_first=True)

In [None]:
# Fitting Random Forest Regression to the same training split
from sklearn.ensemble import RandomForestRegressor

# 10 trees, not tuned; the seed is fixed so repeated runs give the same forest
model_for = RandomForestRegressor(n_estimators=10, random_state=42)
model_for.fit(x_train, y_train)

# Validation MAE. The metrics module is imported as `met` in the linear
# regression cell; the bare name `mean_absolute_error` is never imported.
y_pred2 = model_for.predict(x_cv)
print(met.mean_absolute_error(y_cv, y_pred2))

In [356]:

# Compare each model's score on the training split with its score on the
# validation split; a large gap between the two indicates overfitting
for estimator in (model, model_for):
    train_score = estimator.score(x_train, y_train)
    cv_score = estimator.score(x_cv, y_cv)
    print(train_score, cv_score)

0.7969531255192451 0.7834681486300468
0.970619341254852 0.8357042327970008


In [357]:
# Predict on the test set: the rows of `combine` that have no SalePrice
test = combine[combine['SalePrice'].isnull()]
# Same correlated columns as used for training, minus the (empty) target.
# The column labels come from traincorr_sig.index; `traincorr_index` was never defined.
test = test[traincorr_sig.index].drop(columns='SalePrice')
test = pd.get_dummies(test, drop_first=True)
# Match the exact column set and order the models were fitted on
test = test.reindex(columns=X.columns, fill_value=0)

# Linear regression predictions
pred_test = model.predict(test)

# Random forest predictions
pred_test_for = model_for.predict(test)


In [358]:
# Quick look at the linear regression predictions for the test rows
print(pred_test)

[ 96130.38818735 164755.17168341 171889.06221405 ... 179273.24614164
 107478.07181373 246410.09024776]


In [362]:
# The saved test Ids live in `test_ids` (a one-column DataFrame); `test_IDs`
# was never defined. Take the Id column as a 1-D array so each Id pairs
# positionally with its prediction.
submission_ids = test_ids['Id'].values

# Linear Regression
output = pd.DataFrame({'id': submission_ids, 'SalePrice': pred_test})
output = output[['id', 'SalePrice']]

# Forest Regression
output2 = pd.DataFrame({'id': submission_ids, 'SalePrice': pred_test_for})
output2 = output2[['id', 'SalePrice']]

In [363]:
# Write both submission files without the DataFrame index column
for submission, csv_path in (
    (output, 'adam_submission.csv'),
    (output2, 'adam_submission_for.csv'),
):
    submission.to_csv(csv_path, index=False)