# Kaggle - House Pricing Prediction
---

In [90]:
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy import stats
from sklearn.metrics import confusion_matrix
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold

%matplotlib inline

In [69]:
# Load data
df = pd.read_csv('./data/MELBOURNE_HOUSE_PRICES_LESS.csv')

In [70]:
print(df.shape)
df.head()

(60672, 13)


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Postcode,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,3040,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,3042,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [71]:
df['Price'].isnull().value_counts()

False    46836
True     13836
Name: Price, dtype: int64

In [72]:
# Drop null rows
df = df.dropna(subset=['Price'])

In [73]:
# Convert year feature
df['Date'] = pd.to_datetime(df['Date'])
df['year'], df['month'] = df['Date'].dt.year, df['Date'].dt.month
df = df.loc[:, ~df.columns.isin(['Date'])]

In [74]:
X = df.loc[:, ~df.columns.isin(['Price'])]
Y = df['Price']

In [75]:
nans = lambda X: X[X.isnull().any(axis=1)]
len(nans(X))

0

In [76]:
# Converting to categorical
convert_cat = []
to_drop = []
unique = None

categorical = X.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    unique = column.nunique()
    print(unique)
    
    
    cont = pd.crosstab(X[i], Y)
    chi2_res = scipy.stats.chi2_contingency(cont)
    
    # Keep all features with a significant P-value and drop the others
    if chi2_res[1] <= 0.05:
        convert_cat.append(i)
    else:
        to_drop.append(i)

Suburb
370
Address
43240
Type
3
Method
5
SellerG
416
Regionname
8
CouncilArea
34


In [77]:
print(f'convert_cat: {convert_cat}')
print(f'to_drop: {to_drop}')

convert_cat: ['Suburb', 'Address', 'Type', 'Method', 'Regionname', 'CouncilArea']
to_drop: ['SellerG']


In [81]:
X = X.loc[:, ~X.columns.isin(to_drop)]
# Drop Address
X = X.loc[:, ~X.columns.isin(['Address'])]
convert_cat.remove('Address')

In [82]:
# Deleting duplicate rows
X = X.loc[:,~X.columns.duplicated()]

# Get dummies - conver to categroical
X = pd.get_dummies(data=X, columns=convert_cat)

In [84]:
X.shape

(46836, 426)

In [85]:
# Find all features created by get_dummies
cat_feat = []

for feature in convert_cat:
    for col in X.loc[:, X.columns.str.startswith(feature + "_")].columns:
        cat_feat.append(col)

In [87]:
gd_convert_cat = []
gd_to_drop = []

for feature in cat_feat:
    cont = pd.crosstab(X[feature], Y)
    chi2_res = scipy.stats.chi2_contingency(cont)
    
    # Keep all features with a significant P-value and drop the others
    if chi2_res[1] <= 0.05:
        gd_convert_cat.append(feature)
    else:
        gd_to_drop.append(feature)

In [88]:
len(gd_to_drop)

224

In [91]:
# Removes all low-variance features
def variance_threshold_selector(data, threshold=0.05):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

vt_to_keep = variance_threshold_selector(X)

vt_to_keep.head()

Unnamed: 0,Rooms,Postcode,Propertycount,Distance,year,month,Type_h,Type_t,Type_u,Method_PI,...,Method_VB,Regionname_Eastern Metropolitan,Regionname_Northern Metropolitan,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,CouncilArea_Banyule City Council,CouncilArea_Boroondara City Council,CouncilArea_Brimbank City Council,CouncilArea_Darebin City Council
0,3,3067,4019,3.0,2017,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,3,3067,4019,3.0,2017,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3,3067,4019,3.0,2017,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,3,3040,1543,7.5,2017,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2,3042,3464,10.4,2017,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [92]:
X = X[vt_to_keep.columns]

In [93]:
correlation_matrix = X.corr()
display(correlation_matrix)

Unnamed: 0,Rooms,Postcode,Propertycount,Distance,year,month,Type_h,Type_t,Type_u,Method_PI,...,Method_VB,Regionname_Eastern Metropolitan,Regionname_Northern Metropolitan,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,CouncilArea_Banyule City Council,CouncilArea_Boroondara City Council,CouncilArea_Brimbank City Council,CouncilArea_Darebin City Council
Rooms,1.0,0.091477,-0.057221,0.277865,0.001971,0.001886,0.488106,-0.057811,-0.520716,0.052548,...,0.007831,0.152197,-0.114701,0.054635,-0.09559,0.033629,0.02567,0.018835,0.061291,-0.083746
Postcode,0.091477,1.0,7e-05,0.504504,0.02191,0.009651,0.047071,-0.028183,-0.03277,-0.023121,...,-0.013326,-0.003934,-0.131544,0.23262,0.171612,-0.334633,-0.075481,-0.007904,-0.184911,-0.117536
Propertycount,-0.057221,7e-05,1.0,0.008014,0.015608,0.005655,-0.045298,-0.011189,0.061098,0.009239,...,-0.026068,-0.077785,0.25805,-0.052503,-0.022017,-0.141975,-0.140789,0.009829,-0.13141,0.379743
Distance,0.277865,0.504504,0.008014,1.0,0.038881,0.014742,0.236496,-0.05336,-0.232734,-0.053456,...,-0.108814,0.150152,-0.172619,0.465906,-0.2808,-0.096333,-0.006915,-0.187862,0.016928,-0.154965
year,0.001971,0.02191,0.015608,0.038881,1.0,-0.258643,-0.004188,0.016248,-0.007686,0.025952,...,0.080028,-0.02074,0.03291,-0.016382,-0.038577,0.02267,-0.008643,-0.014754,-0.003563,0.012332
month,0.001886,0.009651,0.005655,0.014742,-0.258643,1.0,-0.000594,-0.003073,0.003059,-0.014182,...,0.004507,-0.009702,0.003761,0.002884,-0.008864,0.007236,-0.000249,0.002765,0.00761,0.002716
Type_h,0.488106,0.047071,-0.045298,0.236496,-0.004188,-0.000594,1.0,-0.524141,-0.753748,-0.010631,...,-0.035778,0.077886,0.003653,0.032883,-0.198037,0.088815,0.043985,-0.046744,0.084313,-0.014055
Type_t,-0.057811,-0.028183,-0.011189,-0.05336,0.016248,-0.003073,-0.524141,1.0,-0.164591,0.029221,...,0.025408,-0.02553,0.007185,4e-06,0.05017,-0.023185,-0.017702,-0.013433,-0.032924,0.029142
Type_u,-0.520716,-0.03277,0.061098,-0.232734,-0.007686,0.003059,-0.753748,-0.164591,1.0,-0.010236,...,0.021832,-0.070507,-0.009776,-0.038088,0.190653,-0.084974,-0.037283,0.064504,-0.072246,-0.006209
Method_PI,0.052548,-0.023121,0.009239,-0.053456,0.025952,-0.014182,-0.010631,0.029221,-0.010236,1.0,...,-0.123609,0.012488,-0.036917,-0.016703,0.051536,-0.006566,-0.031725,0.040702,0.013383,0.021799


In [94]:
# Correlated features to be dropped
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
print(f'Number of correlated features to drop: {len(to_drop)}')

Number of correlated features to drop: 0


In [95]:
# Save the DF for future reference
X.to_csv('./data/X_house.csv')

In [96]:
# Train splits
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.40, random_state=42)

# Dev and Test splits
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

In [101]:
X.head()

Unnamed: 0,Rooms,Postcode,Propertycount,Distance,year,month,Type_h,Type_t,Type_u,Method_PI,...,Method_VB,Regionname_Eastern Metropolitan,Regionname_Northern Metropolitan,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,CouncilArea_Banyule City Council,CouncilArea_Boroondara City Council,CouncilArea_Brimbank City Council,CouncilArea_Darebin City Council
0,3,3067,4019,3.0,2017,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,3,3067,4019,3.0,2017,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3,3067,4019,3.0,2017,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,3,3040,1543,7.5,2017,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2,3042,3464,10.4,2017,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Linear Regression

In [110]:
# Fit a linear model.
regr = linear_model.LinearRegression(n_jobs=-1)
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [111]:
print('\nR-squared:')
print(regr.score(X_train, y_train))

print('\nR-squared:')
print(regr.score(X_dev, y_dev))


R-squared:
0.542961180319223

R-squared:
0.5293348946805985


## Ridge

In [132]:
ridge_r = linear_model.Ridge(0.1)
ridge_r.fit(X_train, y_train)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [131]:
print('\nR-squared:')
print(ridge_r.score(X_train, y_train))

print('\nR-squared:')
print(ridge_r.score(X_dev, y_dev))


R-squared:
0.5429611803191381

R-squared:
0.5293348978799002


## Lasso

In [135]:
lass_r = linear_model.Lasso(0.01)
lass_r.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [134]:
print('\nR-squared:')
print(lass_r.score(X_train, y_train))

print('\nR-squared:')
print(lass_r.score(X_dev, y_dev))


R-squared:
0.542961180319216

R-squared:
0.529334896048428
