In [1]:
import pandas as pd
import numpy as np
import requests
import sqlite3
import datetime
import re
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sqlite_api import SLOHouseDatabase
def classify_predict(df, X_vars, y_var,SVC_inclusion=True,kNN=True,Naive=True,Gaussian=True,DecTree=True):
    if type(X_vars) is not list:
        X = df[X_vars].reshape(-1,1)
    else:
        X = df[X_vars]
    y = df[y_var]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=26)
    length=len(df)
    
    print('Classification Accuracies')
    if SVC_inclusion:
        svc1=SVC()
        svc1.fit(X_train, y_train)
        initial=[[predict] for predict in svc1.predict(X_test)]

        print('Support Vector: ' + str(svc1.score(X_test, y_test)))
    if kNN:
        max_score = 0
        for i in range(1, 6):
            kNN = KNeighborsClassifier(n_neighbors=i)
            kNN.fit(X_train, y_train)
            cur_score = kNN.score(X_test, y_test)
            if cur_score > max_score:
                max_score = cur_score
                maxN = i
        kNN = KNeighborsClassifier(n_neighbors=maxN)
        kNN.fit(X_train, y_train)
        i=0
        if SVC_inclusion==False:
            initial=[[predict] for predict in kNN.predict(X_test)]
        else:
            for predict in kNN.predict(X_test):
                initial[i].append(predict)
                i+=1
        print('kNN (N = '+str(maxN)+'): ' + str(max_score))
    if DecTree:
        dtc = DecisionTreeClassifier()
        dtc.fit(X_train, y_train)
        i=0
        for predict in dtc.predict(X_test):
            initial[i].append(predict)
            i+=1
        print('Decision Tree: ' + str(dtc.score(X_test, y_test)))
    if Naive:
        nbc = GaussianNB()
        nbc.fit(X_train, y_train)
        i=0
        for predict in nbc.predict(X_test):
            initial[i].append(predict)
            i+=1
        print('Naive Bayes: '+ str(nbc.score(X_test, y_test)))
    if Gaussian:
        gpc = GaussianProcessClassifier()
        gpc.fit(X_train, y_train)
        i=0
        for predict in gpc.predict(X_test):
            initial[i].append(predict)
            i+=1
        print('Gaussian Process: '+ str(gpc.score(X_test, y_test)))
    prediction=[max(set(lst), key=lst.count) for lst in initial]
    right=[prediction[i]==y_test.reset_index()[y_var][i] for i in range(len(y_test))]
    print("Ensemble Accuracy is",np.sum(right)/len(right))

In [2]:
dbquery = SLOHouseDatabase()
df = dbquery.get_dataframe_from_query("SELECT * FROM HOUSES JOIN MLS_LISTINGS USING (MLS_ID)")
df.head(1)

Unnamed: 0,ID,MLS_ID,CITY,ADDRESS,BED,BATH,LIST_PRICE,SQ_FOOTAGE,PRICE_PER_SQFT,LIST_DATE,ID.1,SUBTYPE,AREA,YR_BUILT,LOT_SQFT,VIEW,POOL,ARB_COMISSION
0,8538,1069605,Arroyo Grande,4410 Upper Lopez Canyon,1,1,345000.0,720.0,479.0,2016-10-13,1,SFR/D,ARRG,1985,1764842,1,0,3.5


In [3]:
def extract_subtype(subtype):
    if (subtype.split('/')[1] == 'D'):
        return 0
    return 1
def extract_decade(year):
    if year == 0:
        return 'N/A'
    if year < 1960:
        return '< 60s'
    if year < 1970:
        return '60s'
    if year < 1980:
        return '70s'
    if year < 1990:
        return '80s'
    if year < 2000:
        return '90s'
    if year < 2010:
        return '2000s'
    return '2010s'

##### Create column for 1hot encoding whether property is an apartment & expand City feature with dummy vars.

In [4]:
df['Apartment'] = df['SUBTYPE'].apply(extract_subtype)
df = pd.concat([df, pd.get_dummies(df.CITY)], axis=1)

In [5]:
### set of columns for use in classification
class_vars = ['Arroyo Grande',
             'Atascadero',
             'Avila Beach',
             'Morro Bay',
             'Pismo Beach',
             'San Luis Obispo',
             'Templeton',
             'BED',
             'BATH',
             'LIST_PRICE',
             'SQ_FOOTAGE',
             'PRICE_PER_SQFT',
             'LOT_SQFT',
             'ARB_COMISSION',
             'POOL'
             ]

###### Split our data into train and test for use in classifier.

In [6]:
X = df[class_vars]
y = df['Apartment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=26)

### Experiment 1:
###### Using entire dataframe, predict/classify whether property is an appartment based in the Lot size (SqFt.)

In [7]:
classify_predict(df, 'LOT_SQFT', 'Apartment')

Classification Accuracies
Support Vector: 0.868421052632
kNN (N = 2): 0.848684210526
Decision Tree: 0.855263157895
Naive Bayes: 0.328947368421
Gaussian Process: 0.868421052632
Ensemble Accuracy is 0.855263157895


###### Repeat the question, exclude Naive Bayes classifier this time.

In [8]:
classify_predict(df, 'LOT_SQFT', 'Apartment',Naive=False)

Classification Accuracies
Support Vector: 0.868421052632
kNN (N = 2): 0.848684210526
Decision Tree: 0.855263157895
Gaussian Process: 0.868421052632
Ensemble Accuracy is 0.881578947368


#### Observations:
###### Note the 3% gain in accuracy in our ensembled classifier when Naive Bayes is excluded.

### Experiment 2:
###### Excluding records without a valid YR_BUILT feature, predict/classify the decade in which a property was build using the set of properties listed in class_vars.

In [9]:
### exclude homes without a valid year built datapoint

df['Decade'] = df.YR_BUILT.apply(extract_decade)
mod_df = df[df['Decade'] != 'N/A']

In [10]:
classify_predict(mod_df, class_vars, 'Decade')

Classification Accuracies
Support Vector: 0.731343283582
kNN (N = 1): 0.65671641791
Decision Tree: 0.671641791045
Naive Bayes: 0.149253731343
Gaussian Process: 0.716417910448
Ensemble Accuracy is 0.65671641791


In [11]:
### Naive & KNN performed poorly -- exclude them.
classify_predict(mod_df, class_vars, 'Decade', Naive=False,kNN=False)

Classification Accuracies
Support Vector: 0.731343283582
Decision Tree: 0.671641791045
Gaussian Process: 0.716417910448
Ensemble Accuracy is 0.671641791045


#### Observations
###### Note 2% gain in Accuracy in ensembled classifier when Naive Bayes & KNN excluded. 

### Experiment 3:
###### Predict  the size of a home, without the use of features describing #Beds / #Bathrooms. (Home is 'BIG' if Number of BED + BATH > 4)

In [12]:
class_vars = ['Arroyo Grande',
             'Atascadero',
             'Avila Beach',
             'Morro Bay',
             'Pismo Beach',
             'San Luis Obispo',
             'Templeton',
             'Apartment',
             'LIST_PRICE',
             'SQ_FOOTAGE',
             'PRICE_PER_SQFT',
             'LOT_SQFT',
             'ARB_COMISSION',
             'POOL'
             ]


In [13]:
df['Rooms']=df['BED']+df['BATH']
df['Big']=0
df.loc[df['Rooms']>4,'Big']=1

In [14]:
slo=df.ix[df['CITY']=="San Luis Obispo"]

classify_predict(df, class_vars, 'Big')

Classification Accuracies
Support Vector: 0.973684210526
kNN (N = 1): 0.973684210526
Decision Tree: 0.960526315789
Naive Bayes: 0.842105263158
Gaussian Process: 0.75
Ensemble Accuracy is 0.973684210526


###### Exclude Gaussian Process to try to improve ensemble.

In [15]:
classify_predict(df,class_vars,'Big',Gaussian=False)

Classification Accuracies
Support Vector: 0.973684210526
kNN (N = 1): 0.973684210526
Decision Tree: 0.947368421053
Naive Bayes: 0.842105263158
Ensemble Accuracy is 0.973684210526


##### Observations:
###### 1.3% gain in ensembled classifier accuracy using City, ApartmentYN, List Price,  SqFt, PricePerSqFt, LotSqFt, BuyerComission, Pool excluding Gaussian Process

### Experiment 3.1:
###### Repeat the experiment for the subset of properties in SLO.

In [16]:
### Exclude CITY features from our set of predictors.
class_vars = ['Apartment',
             'LIST_PRICE',
             'SQ_FOOTAGE',
             'PRICE_PER_SQFT',
             'LOT_SQFT',
             'ARB_COMISSION',
             'POOL'
             ]

In [17]:
classify_predict(slo,class_vars,'Big')

Classification Accuracies
Support Vector: 0.9
kNN (N = 2): 0.95
Decision Tree: 0.9
Naive Bayes: 0.75
Gaussian Process: 0.8
Ensemble Accuracy is 1.0


##### Observations:
###### 100% accuracy in classifying 'Big' homes in SLO using ensembled classifier.

### Experiment 4:
###### Predict whether a home is Expensive, using the set of predictors below.  Expensive will be defined as any home such that LIST_PRICE > 2 Standard Deviations from the mean.

In [18]:
class_vars = ['Arroyo Grande',
             'Atascadero',
             'Avila Beach',
             'Morro Bay',
             'Pismo Beach',
             'San Luis Obispo',
             'Templeton',
             'BED',
             'BATH',
             'Apartment',
             'SQ_FOOTAGE',
             'PRICE_PER_SQFT',
             'LOT_SQFT',
             'ARB_COMISSION',
             'POOL'
             ]
df['Expensive']=0
## 1hot encode 'Expensive homes'
df.loc[df['LIST_PRICE']>(df['LIST_PRICE'].mean() + 2*df['LIST_PRICE'].std()),'Expensive']=1

In [19]:
classify_predict(df,class_vars,'Expensive',Naive=False)

Classification Accuracies
Support Vector: 0.973684210526
kNN (N = 1): 0.960526315789
Decision Tree: 1.0
Gaussian Process: 0.973684210526
Ensemble Accuracy is 0.973684210526


### Experiment 4.1:
###### Repeat experiment 4 for the subset of properties in SLO.

In [20]:
slo=df.ix[df['CITY']=="San Luis Obispo"]
class_vars = ['BED',
             'BATH',
             'Apartment',
             'SQ_FOOTAGE',
             'PRICE_PER_SQFT',
             'LOT_SQFT',
             'ARB_COMISSION',
             'POOL'
             ]

classify_predict(slo,class_vars,'Expensive')

Classification Accuracies
Support Vector: 1.0
kNN (N = 1): 1.0
Decision Tree: 1.0
Naive Bayes: 0.975
Gaussian Process: 1.0
Ensemble Accuracy is 1.0


##### Observations:
###### 97.4% accuracy for predicting whether a home in SLO County is expensive using ensembled classifier.
###### 100% accuracy using ensembled classifier for prediction Expensive homes in SLO

### Experiment 5:
###### Predict whether a property is Overpriced in SLO County. Overpriced defined as 2 standard deviations from mean price per square foot.

In [21]:
class_vars = ['Arroyo Grande',
             'Atascadero',
             'Avila Beach',
             'Morro Bay',
             'Pismo Beach',
             'San Luis Obispo',
             'Templeton',
             'Apartment',
             'LIST_PRICE',
             'SQ_FOOTAGE',
             'LOT_SQFT',
             'ARB_COMISSION',
             'BED',
             'BATH',
             'POOL'
             ]
county_class_vars = class_vars.copy()

In [22]:
df['Overpriced']=0
df.loc[df['PRICE_PER_SQFT'] > (df['PRICE_PER_SQFT'].mean() + 2 * df['PRICE_PER_SQFT'].std()),'Overpriced']=1

In [23]:
class_vars.remove('LIST_PRICE')
classify_predict(df,class_vars,'Overpriced')

Classification Accuracies
Support Vector: 0.960526315789
kNN (N = 1): 0.960526315789
Decision Tree: 0.986842105263
Naive Bayes: 0.947368421053
Gaussian Process: 0.960526315789
Ensemble Accuracy is 0.960526315789


In [25]:
classify_predict(df,class_vars,'Overpriced',Naive=False,SVC_inclusion=False)

Classification Accuracies
kNN (N = 1): 0.960526315789
Decision Tree: 0.986842105263
Gaussian Process: 0.960526315789
Ensemble Accuracy is 0.960526315789


### Experiment 5.1:
###### Repeat experiment for properties not in SLO city.

In [26]:
notslo=df.ix[df['CITY']!="San Luis Obispo"]

In [27]:
classify_predict(notslo,class_vars,'Overpriced')

Classification Accuracies
Support Vector: 0.982142857143
kNN (N = 1): 0.982142857143
Decision Tree: 0.982142857143
Naive Bayes: 0.482142857143
Gaussian Process: 0.982142857143
Ensemble Accuracy is 0.982142857143


### Experiment 5.2:
###### Repeat experiment for properties in SLO city. 

In [28]:
slo=df.ix[df['CITY']=="San Luis Obispo"]

In [29]:
classify_predict(slo,class_vars,'Overpriced')

Classification Accuracies
Support Vector: 1.0
kNN (N = 1): 1.0
Decision Tree: 1.0
Naive Bayes: 1.0
Gaussian Process: 1.0
Ensemble Accuracy is 1.0


### Experiment 5.3
##### Lets change our definition of overpriced. Overpriced  if Price/SqFt > Mean(Price/SqFt)

In [30]:
df2 = df.copy()
df2['Overpriced']=0
df2.loc[df2['PRICE_PER_SQFT'] > df2['PRICE_PER_SQFT'].mean(),'Overpriced']=1

### Experiment 5.3.1: All properties in SLO County

In [53]:
classify_predict(df2,class_vars,'Overpriced')

Classification Accuracies
Support Vector: 0.848684210526
kNN (N = 1): 0.802631578947
Decision Tree: 0.934210526316
Naive Bayes: 0.598684210526
Gaussian Process: 0.855263157895
Ensemble Accuracy is 0.868421052632


In [57]:
classify_predict(df2,class_vars,'Overpriced',Naive=False,kNN=False)

Classification Accuracies
Support Vector: 0.848684210526
Decision Tree: 0.934210526316
Gaussian Process: 0.855263157895
Ensemble Accuracy is 0.855263157895


### Experiment 5.3.2: Properties not in SLO city.

In [33]:
notslo2=df2.ix[df2['CITY']!="San Luis Obispo"]
classify_predict(notslo2,class_vars,'Overpriced')

Classification Accuracies
Support Vector: 0.910714285714
kNN (N = 1): 0.910714285714
Decision Tree: 0.875
Naive Bayes: 0.464285714286
Gaussian Process: 0.928571428571
Ensemble Accuracy is 0.875


In [58]:
classify_predict(notslo2, class_vars, 'Overpriced', Naive=False, DecTree=False)

Classification Accuracies
Support Vector: 0.910714285714
kNN (N = 1): 0.910714285714
Gaussian Process: 0.928571428571
Ensemble Accuracy is 0.928571428571


### Experiment 5.3.3: Properties in SLO city.

In [34]:
slo2=df2.ix[df2['CITY']=="San Luis Obispo"]
classify_predict(slo2,class_vars,'Overpriced')

Classification Accuracies
Support Vector: 0.9
kNN (N = 1): 0.95
Decision Tree: 0.9
Naive Bayes: 0.6
Gaussian Process: 0.9
Ensemble Accuracy is 0.9


#### Observations:
###### Note difference in accuracies with a change in definition of overpriced.

### Experiment 6:
###### Predict whether a home has a View in SLO county.


In [35]:
classify_predict(df, county_class_vars, 'VIEW')

Classification Accuracies
Support Vector: 1.0
kNN (N = 1): 0.973684210526
Decision Tree: 0.973684210526
Naive Bayes: 0.644736842105
Gaussian Process: 0.723684210526
Ensemble Accuracy is 0.973684210526


### 6.1
###### Predict whether a home in SLO city has a View

In [36]:
slo=df.ix[df['CITY']=="San Luis Obispo"]
classify_predict(slo, class_vars, 'VIEW')

Classification Accuracies
Support Vector: 1.0
kNN (N = 1): 1.0
Decision Tree: 1.0
Naive Bayes: 0.45
Gaussian Process: 0.75
Ensemble Accuracy is 1.0


### Experiment 6.2
###### Predict whether a home not in SLO city has a view. 

In [37]:
notslo=df.ix[df['CITY']!="San Luis Obispo"]
classify_predict(notslo, class_vars, 'VIEW')

Classification Accuracies
Support Vector: 1.0
kNN (N = 1): 0.982142857143
Decision Tree: 0.982142857143
Naive Bayes: 0.589285714286
Gaussian Process: 0.75
Ensemble Accuracy is 0.964285714286


### Observations:
##### Support Vector
SVM seemed to be one of the most consistent classifiers of the group (close to kNN and Decision Tree). In most situations it performed as good or better than the best classifier. When it was outperformed, it wasn't outperformed by much.

##### k-Nearest Neighbors
kNN was amongst the group that performed the best. Like SVM and Decision Trees, it was at or near the top for most of the questions we asked. We checked every k from 1 to 6, but it seemed like only k=1 or k=2 were the best performers. For this particular dataset we can say a lower k seems to perform better.

##### Decision Tree
In terms of accuracy, it looks like Decision Trees might be the single strongest predictor. It was close to SVM and kNN, but edged out those algorithms more often than it was edged out. This assumes an unlimited depth of decision tree though. In many cases, a decision tree is desired because it is intuitive and simple to understand. In all of the cases we tried, limiting the depth caused the accuracy to dip well below the SVM and kNN classifiers, so we would only recommend decision trees if you didn't need them to be simple and just needed them to be strong predictors.

##### Naive Bayes
The weakest performer by far. In nearly every case, Naive Bayes had the lowest accuracy by a significant margin. Only occasionally did it match the other classifiers, and only one time did it beat another (beat Gaussian Process when classifying if a home was 'big' or not). Some research shows that Naive Bayes has a strong assumption of independence between the explanatory variables. Obviously our dataset has many variables that are strongly correlated with each other, so we determined that was the reason for Naive Bayes not being a great choice for this dataset.

##### Gaussian Process
The Guassian Process classifier seemed inconsistent and unreliable for the questions we asked from this dataset. Sometimes it was one of the best predictors, but in some other cases it was well below the top 3 classifiers. There also didn't seem to be a pattern of when it worked well and when it didn't.