In [55]:
import pandas as pd
import numpy as np
import requests
import sqlite3
import datetime
import re
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sqlite_api import SLOHouseDatabase
def classify_predict(df, X_vars, y_var,SVC_inclusion=True,kNN=True,Naive=True,Gaussian=True,DecTree=True):
    """Fit the selected classifiers, print their test accuracies, and majority-vote them.

    The frame is split 67/33 into train/test with a fixed random_state, each
    enabled classifier is fitted on the training part and scored on the test
    part, and the per-row majority vote of all enabled classifiers is scored
    as the "ensemble".

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing the feature and target columns.
    X_vars : list or single column label
        Feature column(s). A single label is treated as a one-column selection.
    y_var : column label
        Target column.
    SVC_inclusion, kNN, Naive, Gaussian, DecTree : bool
        Switches for the support-vector, k-nearest-neighbours, Gaussian naive
        Bayes, Gaussian-process and decision-tree classifiers respectively.

    Returns
    -------
    list
        The majority-vote prediction for every row of the test split, in
        test-split order.

    Raises
    ------
    ValueError
        If every classifier switch is False (there would be nothing to vote on).
    """
    if not (SVC_inclusion or kNN or Naive or Gaussian or DecTree):
        raise ValueError('classify_predict needs at least one classifier enabled')

    # Always select with a list so X is two-dimensional; a bare label would give
    # a Series, which has no reshape() in current pandas.
    columns = X_vars if isinstance(X_vars, list) else [X_vars]
    X = df[columns]
    y = df[y_var]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=26)

    # One sequence of test-set predictions per enabled classifier, in the order
    # the classifiers are run below.
    votes = []

    print('Classification Accuracies')
    if SVC_inclusion:
        svc1 = SVC()
        svc1.fit(X_train, y_train)
        votes.append(svc1.predict(X_test))
        print('Support Vector: ' + str(svc1.score(X_test, y_test)))
    if kNN:
        # NOTE(review): the neighbour count is chosen by its score on the test
        # split, so the reported kNN accuracy is optimistic.
        best_score = -1  # below any real accuracy, so the first candidate always wins
        for n_neighbors in range(1, 6):
            candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
            candidate.fit(X_train, y_train)
            cur_score = candidate.score(X_test, y_test)
            if cur_score > best_score:
                best_score = cur_score
                best_n = n_neighbors
                best_knn = candidate
        votes.append(best_knn.predict(X_test))
        print('kNN (N = '+str(best_n)+'): ' + str(best_score))
    if DecTree:
        dtc = DecisionTreeClassifier()
        dtc.fit(X_train, y_train)
        votes.append(dtc.predict(X_test))
        print('Decision Tree: ' + str(dtc.score(X_test, y_test)))
    if Naive:
        nbc = GaussianNB()
        nbc.fit(X_train, y_train)
        votes.append(nbc.predict(X_test))
        print('Naive Bayes: '+ str(nbc.score(X_test, y_test)))
    if Gaussian:
        gpc = GaussianProcessClassifier()
        gpc.fit(X_train, y_train)
        votes.append(gpc.predict(X_test))
        print('Gaussian Process: '+ str(gpc.score(X_test, y_test)))

    # Transpose to one tuple of votes per test row, then take the most common
    # vote. max() returns the first maximal item, so a tie goes to the earliest
    # classifier in the run order above.
    prediction = [max(row, key=row.count) for row in zip(*votes)]

    # Iterating the Series yields the true labels in test-split order.
    right = [predicted == actual for predicted, actual in zip(prediction, y_test)]
    print("Ensemble Accuracy is",np.sum(right)/len(right))
    return prediction

In [4]:
# Database wrapper imported from the project-local sqlite_api module.
dbquery = SLOHouseDatabase()
# Join HOUSES with MLS_LISTINGS on MLS_ID and keep every column of both.
# The result is presumably a pandas DataFrame, given the .head() call below.
df = dbquery.get_dataframe_from_query("SELECT * FROM HOUSES JOIN MLS_LISTINGS USING (MLS_ID)")
# Preview the first rows of the joined data.
df.head()

Unnamed: 0,ID,MLS_ID,CITY,ADDRESS,BED,BATH,LIST_PRICE,SQ_FOOTAGE,PRICE_PER_SQFT,LIST_DATE,ID.1,SUBTYPE,AREA,YR_BUILT,LOT_SQFT,VIEW,POOL,ARB_COMISSION
0,8538,1069605,Arroyo Grande,4410 Upper Lopez Canyon,1,1,345000.0,720.0,479.0,2016-10-13,1,SFR/D,ARRG,1985,1764842,1,0,3.5
1,8305,1070362,Arroyo Grande,345 Tiger Tail,2,2,379000.0,1312.0,288.0,2016-10-21,4,SFR/A,ARRG,1976,3999,1,0,2.5
2,8519,1069663,Arroyo Grande,1164 Pacific Pointe,3,2,449000.0,1326.0,338.0,2016-10-13,5,SFR/A,ARRG,1989,3999,1,0,2.25
3,4317,1072933,Arroyo Grande,306 Hondonada,2,2,519000.0,1746.0,297.0,2016-11-15,10,SFR/D,ARRG,0,206309,1,0,2.5
4,4318,1072919,Arroyo Grande,327 Corona Del Terra,3,2,560000.0,1608.0,348.0,2016-11-15,15,SFR/D,ARRG,1975,7802,1,0,3.0


In [6]:
def extract_subtype(subtype):
    """Turn a slash-separated subtype code into a 0/1 flag.

    The code is split on '/' and the second piece decides the result:
    'D' gives 0, anything else gives 1 (so 'SFR/D' -> 0, 'SFR/A' -> 1).
    NOTE(review): 'D' presumably marks a detached home - confirm against
    the listing data dictionary.

    Raises IndexError when the code contains no '/'.
    """
    suffix = subtype.split('/')[1]
    return 0 if suffix == 'D' else 1
def extract_decade(year):
    """Bucket a build year into a decade label.

    Returns 'N/A' for a missing year (0, None or NaN), '< 60s' for anything
    before 1960, two-digit labels for the 1960s-1990s ('60s' ... '90s') and
    four-digit labels from 2000 on ('2000s', '2010s', '2020s', ...).
    """
    # `year != year` is only true for NaN. Without this check a NaN fails every
    # comparison below and would be labelled as the most recent decade.
    if year is None or year != year or year == 0:
        return 'N/A'
    if year < 1960:
        return '< 60s'
    decade = int(year) // 10 * 10
    if decade < 2000:
        return str(decade % 100) + 's'
    # Derived from the year rather than hard-coded, so 2020 and later get
    # their own label.
    return str(decade) + 's'

In [7]:
# Binary target: 0 when the SUBTYPE suffix is 'D', 1 otherwise.
df['Apartment'] = df['SUBTYPE'].apply(extract_subtype)

# One indicator column per distinct CITY value. Any dummy columns left over
# from an earlier run of this cell are dropped first, so re-running the cell
# does not append a second copy of every city column.
city_dummies = pd.get_dummies(df['CITY'])
df = pd.concat([df.drop(columns=city_dummies.columns, errors='ignore'), city_dummies], axis=1)

In [10]:
# Feature columns used to build the model input.
class_vars = [
    # City names; expected to match the indicator columns derived from CITY.
    'Arroyo Grande', 'Atascadero', 'Avila Beach', 'Morro Bay',
    'Pismo Beach', 'San Luis Obispo', 'Templeton',
    # Listing attribute columns.
    'BED', 'BATH', 'LIST_PRICE', 'SQ_FOOTAGE', 'PRICE_PER_SQFT',
    'LOT_SQFT', 'ARB_COMISSION', 'POOL',
]

In [11]:
# Feature matrix: the columns listed in class_vars.
X = df[class_vars]
# Target: the 'Apartment' column.
y = df['Apartment']

# Hold out 33% of the rows for testing; random_state=26 fixes the shuffle.
# NOTE(review): classify_predict builds its own split from df, so these
# variables do not appear to feed the calls below - confirm they are used.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=26)

In [27]:
# Run every classifier with LOT_SQFT as the only feature and 'Apartment' as the target.
classify_predict(df, 'LOT_SQFT', 'Apartment')

Classification Accuracies
Support Vector: 0.868421052632
kNN (N = 2): 0.848684210526
Decision Tree: 0.855263157895
Naive Bayes: 0.328947368421
Gaussian Process: 0.868421052632


[0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0]

In [31]:
# Number of rows in the full frame.
print(len(df['Apartment']))
# Size of the value returned by classify_predict; this only works when the
# function returns a sized collection.
len(classify_predict(df, 'LOT_SQFT', 'Apartment'))

458
Classification Accuracies
Support Vector: 0.868421052632
kNN (N = 2): 0.848684210526
Decision Tree: 0.855263157895
Naive Bayes: 0.328947368421
Gaussian Process: 0.868421052632


152

In [32]:
# Scratch check: == on two lists compares them as a whole and yields a single bool.
[1,1]==[0,0]

False

In [56]:
# Same run with the Naive Bayes and kNN classifiers switched off.
classify_predict(df, 'LOT_SQFT', 'Apartment',Naive=False,kNN=False)

Classification Accuracies
Support Vector: 0.868421052632
Decision Tree: 0.855263157895
Gaussian Process: 0.868421052632
Ensemble Accuracy is 0.868421052632


In [36]:
# Scratch check: look up one 'Apartment' value by key 1 (presumably the row labelled 1 - confirm the index).
df['Apartment'][1]

1

In [46]:
# Scratch check: booleans sum as 1/0, so np.sum counts the True entries.
np.sum([True,True,False])

2