In [146]:
import pandas as pd
import numpy as np
import requests
import sqlite3
import datetime
import re
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sqlite_api import SLOHouseDatabase
def classify_predict(df, X_vars, y_var,SVC_inclusion=True,kNN=True,Naive=True,Gaussian=True,DecTree=True):
    """Fit several classifiers on one train/test split and print their accuracies.

    Each enabled classifier is trained on the same 67/33 split
    (``random_state=26``), its test accuracy is printed, and its test-set
    predictions are collected as votes. A majority vote over the enabled
    classifiers is then scored and printed as the ensemble accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature column(s) and the target column.
    X_vars : list, tuple or single column label
        Feature column(s). A single label is turned into a one-column matrix.
    y_var : column label
        Target column.
    SVC_inclusion, kNN, Naive, Gaussian, DecTree : bool
        Switch the support-vector, k-nearest-neighbours, Gaussian naive Bayes,
        Gaussian-process and decision-tree classifiers on or off.

    Returns
    -------
    None
        Results are reported through ``print`` only.

    Raises
    ------
    ValueError
        If every classifier is switched off (there would be nothing to vote).
    """
    if isinstance(X_vars, (list, tuple)):
        X = df[list(X_vars)]
    else:
        # scikit-learn needs a 2-D feature matrix. Series has no ``reshape`` in
        # current pandas, so reshape the underlying array instead.
        X = df[X_vars].values.reshape(-1, 1)
    y = df[y_var]

    if not (SVC_inclusion or kNN or DecTree or Naive or Gaussian):
        raise ValueError('classify_predict needs at least one classifier enabled')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=26)

    # One vote list per test row; created up front so any combination of
    # classifier flags works, not only those that include SVC or kNN.
    votes = [[] for _ in range(len(y_test))]

    def _record_votes(model):
        # Append the fitted model's prediction for each test row to that row's votes.
        for row_votes, label in zip(votes, model.predict(X_test)):
            row_votes.append(label)

    print('Classification Accuracies')
    if SVC_inclusion:
        svc1 = SVC()
        svc1.fit(X_train, y_train)
        _record_votes(svc1)
        print('Support Vector: ' + str(svc1.score(X_test, y_test)))
    if kNN:
        # Try 1..5 neighbours (capped at the training-set size) and keep the best.
        # NOTE: the choice is made on the test split, so the printed kNN score
        # is optimistically biased.
        max_score = -1.0
        maxN = 1
        for n_neighbors in range(1, min(5, len(X_train)) + 1):
            candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
            candidate.fit(X_train, y_train)
            cur_score = candidate.score(X_test, y_test)
            if cur_score > max_score:
                max_score = cur_score
                maxN = n_neighbors
        # Local name distinct from the ``kNN`` flag so the parameter is not rebound.
        best_knn = KNeighborsClassifier(n_neighbors=maxN)
        best_knn.fit(X_train, y_train)
        _record_votes(best_knn)
        print('kNN (N = '+str(maxN)+'): ' + str(max_score))
    if DecTree:
        dtc = DecisionTreeClassifier()
        dtc.fit(X_train, y_train)
        _record_votes(dtc)
        print('Decision Tree: ' + str(dtc.score(X_test, y_test)))
    if Naive:
        nbc = GaussianNB()
        nbc.fit(X_train, y_train)
        _record_votes(nbc)
        print('Naive Bayes: '+ str(nbc.score(X_test, y_test)))
    if Gaussian:
        gpc = GaussianProcessClassifier()
        gpc.fit(X_train, y_train)
        _record_votes(gpc)
        print('Gaussian Process: '+ str(gpc.score(X_test, y_test)))

    # Majority vote per test row. Candidates are sorted first so ties always
    # resolve to the smallest label instead of depending on set iteration order.
    prediction = [max(sorted(set(row_votes)), key=row_votes.count) for row_votes in votes]
    # Positional comparison against the true labels (extracted once).
    truth = y_test.values
    right = [guess == actual for guess, actual in zip(prediction, truth)]
    print("Ensemble Accuracy is",np.sum(right)/len(right))

In [109]:
# Load the HOUSES rows joined to MLS_LISTINGS on MLS_ID into one DataFrame.
listings_sql = "SELECT * FROM HOUSES JOIN MLS_LISTINGS USING (MLS_ID)"
dbquery = SLOHouseDatabase()
df = dbquery.get_dataframe_from_query(listings_sql)
df.head(1)

Unnamed: 0,ID,MLS_ID,CITY,ADDRESS,BED,BATH,LIST_PRICE,SQ_FOOTAGE,PRICE_PER_SQFT,LIST_DATE,ID.1,SUBTYPE,AREA,YR_BUILT,LOT_SQFT,VIEW,POOL,ARB_COMISSION
0,8538,1069605,Arroyo Grande,4410 Upper Lopez Canyon,1,1,345000.0,720.0,479.0,2016-10-13,1,SFR/D,ARRG,1985,1764842,1,0,3.5


In [110]:
def extract_subtype(subtype):
    """Return 0 when the second '/'-separated field of ``subtype`` is 'D', else 1.

    Raises IndexError when ``subtype`` contains no '/'.
    """
    fields = subtype.split('/')
    return 0 if fields[1] == 'D' else 1
def extract_decade(year):
    """Map a build year to a decade label string.

    A year of 0 yields 'N/A'. Otherwise the label of the first exclusive
    upper bound greater than ``year`` is returned; anything from 2010
    onwards is labelled '2010s'.
    """
    if year == 0:
        return 'N/A'
    # (exclusive upper bound, label) pairs in ascending order.
    decade_bounds = (
        (1960, '< 60s'),
        (1970, '60s'),
        (1980, '70s'),
        (1990, '80s'),
        (2000, '90s'),
        (2010, '2000s'),
    )
    for upper_bound, label in decade_bounds:
        if year < upper_bound:
            return label
    return '2010s'

In [111]:
# Binary column derived from the SUBTYPE code via extract_subtype.
df['Apartment'] = df['SUBTYPE'].apply(extract_subtype)
# Append one indicator column per distinct CITY value.
city_dummies = pd.get_dummies(df['CITY'])
df = pd.concat([df, city_dummies], axis=1)

In [112]:
class_vars = (
    # City columns (presumably the indicators produced by get_dummies on CITY).
    ['Arroyo Grande', 'Atascadero', 'Avila Beach', 'Morro Bay',
     'Pismo Beach', 'San Luis Obispo', 'Templeton']
    # Listing attributes.
    + ['BED', 'BATH', 'LIST_PRICE', 'SQ_FOOTAGE', 'PRICE_PER_SQFT',
       'LOT_SQFT', 'ARB_COMISSION', 'POOL']
)

In [113]:
# Notebook-level feature matrix / target and a 67/33 split of them.
# classify_predict performs its own split internally from the frame it is given.
X, y = df[class_vars], df['Apartment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=26
)

In [114]:
# Predict the Apartment flag from lot size alone, with every classifier enabled.
classify_predict(df, X_vars='LOT_SQFT', y_var='Apartment')

Classification Accuracies
Support Vector: 0.868421052632
kNN (N = 2): 0.848684210526
Decision Tree: 0.855263157895
Naive Bayes: 0.328947368421
Gaussian Process: 0.868421052632
Ensemble Accuracy is 0.855263157895


In [115]:
# Same single-feature run with Gaussian naive Bayes left out of the ensemble.
classify_predict(df, X_vars='LOT_SQFT', y_var='Apartment', Naive=False)

Classification Accuracies
Support Vector: 0.868421052632
kNN (N = 2): 0.848684210526
Decision Tree: 0.855263157895
Gaussian Process: 0.868421052632
Ensemble Accuracy is 0.881578947368


In [116]:
# Label each listing with its build decade, then drop rows labelled 'N/A'
# (extract_decade returns that for a year of 0) before classifying.
df['Decade'] = df['YR_BUILT'].apply(extract_decade)
has_decade = df['Decade'] != 'N/A'
mod_df = df[has_decade]
classify_predict(mod_df, class_vars, 'Decade', kNN=False, Naive=False)

Classification Accuracies
Support Vector: 0.731343283582
Decision Tree: 0.686567164179
Gaussian Process: 0.716417910448
Ensemble Accuracy is 0.731343283582


In [117]:
class_vars = (
    # City columns (presumably the indicators produced by get_dummies on CITY).
    ['Arroyo Grande', 'Atascadero', 'Avila Beach', 'Morro Bay',
     'Pismo Beach', 'San Luis Obispo', 'Templeton']
    # Listing attributes; BED and BATH are not in this feature set.
    + ['Apartment', 'LIST_PRICE', 'SQ_FOOTAGE', 'PRICE_PER_SQFT',
       'LOT_SQFT', 'ARB_COMISSION', 'POOL']
)


In [118]:
# Total room count, and a 0/1 flag for listings with more than four rooms.
df['Rooms'] = df['BED'] + df['BATH']
df['Big'] = (df['Rooms'] > 4).astype('int64')

In [119]:
# Rows whose CITY is San Luis Obispo. ``.ix`` has been removed from pandas;
# ``.loc`` with the same boolean mask selects the same rows.
slo = df.loc[df['CITY'] == "San Luis Obispo"]

# Classify the 'Big' flag on the full frame.
classify_predict(df, class_vars, 'Big')

Classification Accuracies
Support Vector: 0.973684210526
kNN (N = 1): 0.973684210526
Decision Tree: 0.973684210526
Naive Bayes: 0.842105263158
Gaussian Process: 0.75
Ensemble Accuracy is 0.986842105263


In [120]:
# Repeat the 'Big' classification without the Gaussian-process classifier.
classify_predict(df, class_vars, 'Big', Gaussian=False)

Classification Accuracies
Support Vector: 0.973684210526
kNN (N = 1): 0.973684210526
Decision Tree: 0.960526315789
Naive Bayes: 0.842105263158
Ensemble Accuracy is 0.986842105263


In [121]:
# Feature set without any city columns, used on the San Luis Obispo subset.
class_vars = ['Apartment', 'LIST_PRICE', 'SQ_FOOTAGE', 'PRICE_PER_SQFT',
              'LOT_SQFT', 'ARB_COMISSION', 'POOL']
classify_predict(slo, class_vars, 'Big')

Classification Accuracies
Support Vector: 0.9
kNN (N = 2): 0.95
Decision Tree: 0.9
Naive Bayes: 0.75
Gaussian Process: 0.8
Ensemble Accuracy is 1.0


In [122]:
class_vars = (
    # City columns (presumably the indicators produced by get_dummies on CITY).
    ['Arroyo Grande', 'Atascadero', 'Avila Beach', 'Morro Bay',
     'Pismo Beach', 'San Luis Obispo', 'Templeton']
    # Listing attributes; LIST_PRICE is left out because the target is derived from it.
    + ['BED', 'BATH', 'Apartment', 'SQ_FOOTAGE', 'PRICE_PER_SQFT',
       'LOT_SQFT', 'ARB_COMISSION', 'POOL']
)
# 0/1 flag: 1 where the list price is above the mean list price.
df['Expensive'] = (df['LIST_PRICE'] > df['LIST_PRICE'].mean()).astype('int64')

In [123]:
# Classify the 'Expensive' flag on the full frame, without naive Bayes.
classify_predict(df, class_vars, 'Expensive', Naive=False)

Classification Accuracies
Support Vector: 0.868421052632
kNN (N = 1): 0.973684210526
Decision Tree: 1.0
Gaussian Process: 0.868421052632
Ensemble Accuracy is 0.868421052632


In [124]:
# Rebuild the San Luis Obispo subset from the current df so it carries the
# columns added since the subset was last taken. ``.ix`` has been removed from
# pandas; ``.loc`` with the same boolean mask selects the same rows.
slo = df.loc[df['CITY'] == "San Luis Obispo"]

In [125]:
# Feature set without any city columns, used on the San Luis Obispo subset.
class_vars = ['BED', 'BATH', 'Apartment', 'SQ_FOOTAGE', 'PRICE_PER_SQFT',
              'LOT_SQFT', 'ARB_COMISSION', 'POOL']
classify_predict(slo, class_vars, 'Expensive')

Classification Accuracies
Support Vector: 0.95
kNN (N = 1): 0.95
Decision Tree: 0.95
Naive Bayes: 0.875
Gaussian Process: 0.95
Ensemble Accuracy is 1.0


In [137]:
class_vars = (
    # City columns (presumably the indicators produced by get_dummies on CITY).
    ['Arroyo Grande', 'Atascadero', 'Avila Beach', 'Morro Bay',
     'Pismo Beach', 'San Luis Obispo', 'Templeton']
    # Listing attributes; PRICE_PER_SQFT is not in this feature set.
    + ['Apartment', 'LIST_PRICE', 'SQ_FOOTAGE', 'LOT_SQFT',
       'ARB_COMISSION', 'BED', 'BATH', 'POOL']
)


In [147]:
# 0/1 flag: 1 where price per square foot is above its mean.
df['Overpriced'] = (df['PRICE_PER_SQFT'] > df['PRICE_PER_SQFT'].mean()).astype('int64')

In [148]:
# Classify 'Overpriced' with the support-vector and naive Bayes models switched off.
classify_predict(df, class_vars, 'Overpriced', SVC_inclusion=False, Naive=False)

Classification Accuracies
kNN (N = 1): 0.868421052632
Decision Tree: 0.960526315789
Gaussian Process: 0.855263157895
Ensemble Accuracy is 0.907894736842


In [149]:
# Classify 'Overpriced' with every classifier enabled.
classify_predict(df, class_vars, y_var='Overpriced')

Classification Accuracies
Support Vector: 0.855263157895
kNN (N = 1): 0.868421052632
Decision Tree: 0.973684210526
Naive Bayes: 0.723684210526
Gaussian Process: 0.855263157895
Ensemble Accuracy is 0.894736842105


In [150]:
# Every listing outside San Luis Obispo. ``.ix`` has been removed from pandas;
# ``.loc`` with the same boolean mask selects the same rows.
notslo = df.loc[df['CITY'] != "San Luis Obispo"]

In [157]:
# Classify 'Overpriced' on the listings outside San Luis Obispo.
# NOTE(review): the output saved under this cell has no Support Vector or
# Naive Bayes lines although this call enables both by default; the saved
# output looks stale. Re-run to confirm.
classify_predict(notslo, class_vars, y_var='Overpriced')

Classification Accuracies
kNN (N = 1): 0.892857142857
Decision Tree: 0.910714285714
Gaussian Process: 0.928571428571
Ensemble Accuracy is 0.928571428571


In [158]:
# Rebuild the San Luis Obispo subset from the current df so it carries the
# columns added since the subset was last taken. ``.ix`` has been removed from
# pandas; ``.loc`` with the same boolean mask selects the same rows.
slo = df.loc[df['CITY'] == "San Luis Obispo"]

In [163]:
# Classify 'Overpriced' on the San Luis Obispo subset, without SVC or naive Bayes.
classify_predict(slo, class_vars, 'Overpriced', SVC_inclusion=False, Naive=False)

Classification Accuracies
kNN (N = 1): 0.95
Decision Tree: 1.0
Gaussian Process: 0.9
Ensemble Accuracy is 0.95
