In [32]:
import math

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from matplotlib import pyplot as plt
from sklearn import ensemble
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
# GridSearchCV is imported from sklearn.model_selection: the old
# sklearn.grid_search module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'white' style to subsequent plots.
sns.set_style('white')



In [5]:
# Load the raw housing data from a CSV path relative to the working directory.
housing_csv = 'Melbourne_housing_FULL.csv'
data = pd.read_csv(housing_csv)

In [21]:
# Keep only the rows that have a Price value, then show how many remain.
has_price = data['Price'].notnull()
data_drop = data.loc[has_price, :]
len(data_drop)

27247

In [29]:
# Print each column's name next to its number of distinct non-null values.
for col_name, n_distinct in data_drop.nunique().items():
    print(col_name, n_distinct)

Rooms 11
Price 2607
Distance 211
Bedroom2 14
Bathroom 10
Propertycount 332
Regionname_Eastern Metropolitan 2
Regionname_Eastern Victoria 2
Regionname_Northern Metropolitan 2
Regionname_Northern Victoria 2
Regionname_South-Eastern Metropolitan 2
Regionname_Southern Metropolitan 2
Regionname_Western Metropolitan 2
Regionname_Western Victoria 2
Type_h 2
Type_t 2
Type_u 2
CouncilArea_Banyule City Council 2
CouncilArea_Bayside City Council 2
CouncilArea_Boroondara City Council 2
CouncilArea_Brimbank City Council 2
CouncilArea_Cardinia Shire Council 2
CouncilArea_Casey City Council 2
CouncilArea_Darebin City Council 2
CouncilArea_Frankston City Council 2
CouncilArea_Glen Eira City Council 2
CouncilArea_Greater Dandenong City Council 2
CouncilArea_Hobsons Bay City Council 2
CouncilArea_Hume City Council 2
CouncilArea_Kingston City Council 2
CouncilArea_Knox City Council 2
CouncilArea_Macedon Ranges Shire Council 2
CouncilArea_Manningham City Council 2
CouncilArea_Maribyrnong City Council 2
Co

In [23]:
# Check which columns contribute most to the data lost when calling dropna.
# By removing BuildingArea and YearBuilt we double the data we have for testing.
#for col in data_drop.columns:
#    print(str(col),len(data_drop[col].dropna()))


In [24]:
# Columns whose missing values would drop the usable data below 21000 rows.
# NOTE(review): 'Lattitude' / 'Longtitude' presumably match the CSV headers
# as spelled -- verify before "correcting" them.
sparse_cols = ['BuildingArea', 'YearBuilt', 'Landsize', 'Car', 'Lattitude', 'Longtitude']

# Columns with too many unique values to keep.
high_cardinality_cols = ['Address', 'Date', 'Method', 'Suburb', 'SellerG', 'Postcode']

data_drop = data_drop.drop(columns=sparse_cols).drop(columns=high_cardinality_cols)

In [25]:
#Turn string value columns we are keeping into binary
data_drop = pd.get_dummies(data_drop, columns=['Regionname','Type','CouncilArea'
                                              ])



In [26]:
# Discard every row that still contains at least one missing value.
data_drop = data_drop.dropna(axis=0, how='any')

In [27]:
# Separate the regression target (Price) from the feature matrix.
target_col = 'Price'
y = data_drop[target_col]
X = data_drop.drop(columns=[target_col])

In [28]:
# Hold out 20% of the rows for testing. The split is seeded so that the
# train/test partition -- and every score computed from it -- is reproducible
# after a kernel restart instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [30]:
#Predicting House price - Regression!
# Predicting house price is a regression problem: start with ordinary least squares.
# Fit on the training split, then display the score on the held-out test split.
olr = linear_model.LinearRegression().fit(X_train, y_train)
olr.score(X_test, y_test)

0.6340090984488932

In [43]:
# Lasso regression with alpha=13: fit on the training split and display the
# score on the held-out test split.
lasso_alpha = 13
lasso = linear_model.Lasso(alpha=lasso_alpha).fit(X_train, y_train)
lasso.score(X_test, y_test)


[ 1.53339512e+05 -4.05519251e+04  2.81673842e+04  1.85157091e+05
  5.29166762e-01  5.79748966e+04  8.06215277e+04 -2.01618097e+05
  3.12869568e+05 -0.00000000e+00 -7.61048527e+04 -6.09437580e+04
  3.51893254e+05  4.38446435e+05  7.94919310e+04 -1.26197578e+05
 -2.08675571e+05  6.11355530e+05  4.99344764e+05 -4.15463892e+05
  3.46650058e+05  1.29024898e+05 -5.05355274e+04  4.46032149e+05
  1.90036387e+05  8.64707105e+03 -1.63391987e+05 -2.30017063e+05
  1.40020493e+05 -1.17877303e+05  4.05513278e+05 -8.16041244e+04
 -3.23127686e+05 -1.28546940e+04  7.68958399e+04 -4.57864982e+05
  9.07960587e+04  9.24719177e+04 -1.39497830e+05 -1.30102111e+04
 -9.60478234e+04 -4.80561980e+05  2.43303693e+05  4.67714288e+05
  0.00000000e+00 -1.37603742e+05 -4.85401678e+05  6.54280891e+04
  1.82790202e+04]




In [40]:
# 3-fold cross-validated grid search over candidate Lasso alpha values.
alpha_grid = {'alpha': [11, 12, 13, 14, 15]}
gsc = GridSearchCV(estimator=lasso, param_grid=alpha_grid, cv=3, verbose=1)
gsc.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   38.9s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Lasso(alpha=10, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [11, 12, 13, 14, 15]}, pre_dispatch='2*n_jobs',
       refit=True, scoring=None, verbose=1)

In [41]:
# Report the best parameter setting found by the grid search and its score.
best_params, best_cv_score = gsc.best_params_, gsc.best_score_
print(best_params, best_cv_score)

{'alpha': 13} 0.6173369286266706


In [52]:
# Indices of the Lasso coefficients, ordered from smallest to largest magnitude.
coef_magnitude = np.abs(lasso.coef_)
s_coef = coef_magnitude.argsort()

In [55]:
# Show the ten feature names whose coefficients have the largest absolute
# value, listed in order of increasing magnitude.
top10_positions = s_coef[-10:]
print(X.columns[top10_positions])

Index(['CouncilArea_Macedon Ranges Shire Council',
       'CouncilArea_Brimbank City Council', 'Type_h',
       'CouncilArea_Frankston City Council', 'CouncilArea_Melton City Council',
       'CouncilArea_Stonnington City Council',
       'CouncilArea_Nillumbik Shire Council',
       'CouncilArea_Wyndham City Council',
       'CouncilArea_Boroondara City Council',
       'CouncilArea_Bayside City Council'],
      dtype='object')
