In [1]:
import pandas as pd
import numpy as np
import requests
import sqlite3
import datetime
import re
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sqlite_api import SLOHouseDatabase

In [2]:
# Load the data: every row of HOUSES joined to its MLS_LISTINGS record on MLS_ID.
# SLOHouseDatabase is the project-local wrapper imported from sqlite_api;
# presumably get_dataframe_from_query returns a pandas DataFrame -- confirm in sqlite_api.
dbquery = SLOHouseDatabase()
df = dbquery.get_dataframe_from_query("SELECT * FROM HOUSES JOIN MLS_LISTINGS USING (MLS_ID)")
# Preview the first rows as a sanity check on the join.
df.head()

Unnamed: 0,ID,MLS_ID,CITY,ADDRESS,BED,BATH,LIST_PRICE,SQ_FOOTAGE,PRICE_PER_SQFT,LIST_DATE,ID.1,SUBTYPE,AREA,YR_BUILT,LOT_SQFT,VIEW,POOL,ARB_COMISSION
0,8538,1069605,Arroyo Grande,4410 Upper Lopez Canyon,1,1,345000.0,720.0,479.0,2016-10-13,1,SFR/D,ARRG,1985,1764842,1,0,3.5
1,8305,1070362,Arroyo Grande,345 Tiger Tail,2,2,379000.0,1312.0,288.0,2016-10-21,4,SFR/A,ARRG,1976,3999,1,0,2.5
2,8519,1069663,Arroyo Grande,1164 Pacific Pointe,3,2,449000.0,1326.0,338.0,2016-10-13,5,SFR/A,ARRG,1989,3999,1,0,2.25
3,4317,1072933,Arroyo Grande,306 Hondonada,2,2,519000.0,1746.0,297.0,2016-11-15,10,SFR/D,ARRG,0,206309,1,0,2.5
4,4318,1072919,Arroyo Grande,327 Corona Del Terra,3,2,560000.0,1608.0,348.0,2016-11-15,15,SFR/D,ARRG,1975,7802,1,0,3.0


In [3]:
def extract_subtype(subtype):
    """Encode a listing subtype string as a binary flag.

    The text after the first '/' is inspected: 'D' maps to 0, anything else
    maps to 1 (e.g. 'SFR/D' -> 0, 'SFR/A' -> 1). The notebook stores the
    result in the 'Apartment' column.

    Raises IndexError when ``subtype`` contains no '/'.
    """
    suffix = subtype.split('/')[1]
    return 0 if suffix == 'D' else 1

In [4]:
def extract_decade(year):
    """Bucket a build year into a coarse decade label.

    A year of exactly 0 is reported as 'N/A'. Otherwise the first upper
    bound the year falls below decides the label; years from 2010 onward
    are all labelled '2010s'.
    """
    if year == 0:
        return 'N/A'

    # (exclusive upper bound, label), checked in ascending order.
    decade_bounds = (
        (1960, '< 60s'),
        (1970, '60s'),
        (1980, '70s'),
        (1990, '80s'),
        (2000, '90s'),
        (2010, '2000s'),
    )
    for upper_bound, label in decade_bounds:
        if year < upper_bound:
            return label
    return '2010s'

In [5]:
def classify(df, X_vars, y_var):
    """Fit several scikit-learn classifiers and print each one's test accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the feature column(s) and the target column.
    X_vars : str or list of str
        A single feature column name, or a list of feature column names.
    y_var : str
        Name of the target column.

    Returns
    -------
    None
        Results are printed, one line per classifier.

    Notes
    -----
    The data is split 67/33 into train/test with ``random_state=26`` for the
    split. For k-nearest neighbours, k = 1..5 are tried and only the
    best-scoring k is reported (the smallest k wins ties). Because k is
    chosen using the test set, the reported kNN accuracy is optimistic.
    """
    if isinstance(X_vars, list):
        X = df[X_vars]
    else:
        # scikit-learn needs a 2-D feature matrix. Series.reshape is gone from
        # current pandas, so select a one-column DataFrame instead.
        X = df[[X_vars]]
    y = df[y_var]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=26)

    print('Classification Accuracies')
    svc = SVC()
    svc.fit(X_train, y_train)
    print('Support Vector: ' + str(svc.score(X_test, y_test)))

    # Start below any achievable accuracy so best_k is always bound, even when
    # every candidate scores 0.0.
    best_k, best_score = None, -1.0
    for k in range(1, 6):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        score = knn.score(X_test, y_test)
        if score > best_score:
            best_k, best_score = k, score
    print('kNN (N = ' + str(best_k) + '): ' + str(best_score))

    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    print('Decision Tree: ' + str(dtc.score(X_test, y_test)))

    nbc = GaussianNB()
    nbc.fit(X_train, y_train)
    print('Naive Bayes: ' + str(nbc.score(X_test, y_test)))

    gpc = GaussianProcessClassifier()
    gpc.fit(X_train, y_train)
    print('Gaussian Process: ' + str(gpc.score(X_test, y_test)))

In [6]:
# Binary target derived from the subtype suffix (see extract_subtype).
df['Apartment'] = df['SUBTYPE'].apply(extract_subtype)

# One-hot encode CITY. Indicator columns left over from a previous run of this
# cell are filtered out first, so re-running the cell does not append duplicates.
city_dummies = pd.get_dummies(df['CITY'])
df = pd.concat([df.loc[:, ~df.columns.isin(city_dummies.columns)], city_dummies], axis=1)

In [7]:
# Feature columns used by the classifiers: city indicator columns first,
# then the numeric listing attributes.
class_vars = [
    'Arroyo Grande', 'Atascadero', 'Avila Beach', 'Morro Bay',
    'Pismo Beach', 'San Luis Obispo', 'Templeton',
] + [
    'BED', 'BATH', 'LIST_PRICE', 'SQ_FOOTAGE',
    'PRICE_PER_SQFT', 'LOT_SQFT', 'ARB_COMISSION', 'POOL',
]

In [8]:
# Feature matrix and target for the apartment classifier, with the same
# 67/33 split settings that classify() uses internally.
X, y = df[class_vars], df['Apartment']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=26,
)

Square footage of the entire lot is already a good predictor of whether the house is an apartment, so let's first test the `classify` function on that single variable.

In [9]:
# Single-feature run: predict 'Apartment' from lot square footage alone.
classify(df, 'LOT_SQFT', 'Apartment')

Classification Accuracies
Support Vector: 0.947136563877
kNN (N = 1): 0.929515418502
Decision Tree: 0.947136563877
Naive Bayes: 0.378854625551
Gaussian Process: 0.947136563877


Now test the full model to predict apartments and see which classifiers perform best

In [10]:
# Full model: predict 'Apartment' from every column listed in class_vars.
classify(df, class_vars, 'Apartment')

Classification Accuracies
Support Vector: 1.0
kNN (N = 1): 0.986784140969
Decision Tree: 0.986784140969
Naive Bayes: 0.519823788546
Gaussian Process: 1.0


In [11]:
# Label each listing with the decade it was built in, then show how many
# listings fall into each label.
df['Decade'] = df['YR_BUILT'].apply(extract_decade)
df['Decade'].value_counts()

80s      156
2000s    132
< 60s    105
N/A       78
70s       66
90s       60
60s       48
2010s     42
Name: Decade, dtype: int64

Next, classify which decade each house was built in, excluding listings whose build year is unknown (`N/A`).

In [12]:
# Keep only listings with a known decade label, then run the same set of
# classifiers with 'Decade' as the target.
mod_df = df.loc[df['Decade'] != 'N/A']
classify(
    mod_df,
    class_vars,
    'Decade',
)

Classification Accuracies
Support Vector: 0.89552238806
kNN (N = 1): 0.89552238806
Decision Tree: 0.925373134328
Naive Bayes: 0.203980099502
Gaussian Process: 0.925373134328
