In [2]:
from datetime import datetime, date
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn import grid_search
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

In [3]:
# Load the per-building feature table and keep the columns used in this notebook.
# NOTE(review): `file` shadows the Python 2 builtin of the same name; the name is
# kept unchanged in case cells outside this section still refer to it.
file = 'building_blight_features.csv'
data = pd.read_csv(file)

building_columns = [
    'Address', 'Latitude', 'Longitude', 'blight',
    'CrimeCount', '311Count', 'BlightViolationCount',
    'Ward', 'ParcelNo', 'PropAddr', 'SalePrice', 'TaxStatus',
    'ResYrBuilt', 'IsImproved', 'AppraisedValue', 'TaxedValue',
]
# Passing `columns=` selects (and orders) these columns from `data`.
buildings = pd.DataFrame(data, columns=building_columns)

# Single-argument print: valid in both Python 2 and Python 3, and emits exactly
# the text the old print statement did (note the two spaces after the colon).
print("Rowcount:  {:,}".format(len(buildings)))

Rowcount:  160,985


In [4]:
# Column labels used as model inputs by every classifier in this notebook.
# Built directly as an Index: there is no need to materialise a copy of the
# selected `buildings` columns just to read back the labels we passed in.
features = pd.Index([
    'CrimeCount', '311Count', 'BlightViolationCount', 'SalePrice',
    'IsImproved', 'AppraisedValue', 'TaxedValue',
])

In [5]:
# Show the selected feature labels as this cell's output.
features

Index([u'CrimeCount', u'311Count', u'BlightViolationCount', u'SalePrice',
       u'IsImproved', u'AppraisedValue', u'TaxedValue'],
      dtype='object')

In [6]:
# create training and testing sets (75% train / 25% test)
# A fixed random_state makes the shuffle -- and therefore every model and
# metric derived from this split -- reproducible across kernel restarts.
train, test = train_test_split(buildings, test_size=0.25, random_state=42)

In [7]:
# identify what we are predicting!
# pd.factorize numbers labels in order of first appearance by default, so after
# a random shuffle the codes could come out inverted relative to the `blight`
# values they are later compared against. sort=True ties the codes to the
# sorted label values instead (0.0 -> 0, 1.0 -> 1), independent of row order.
y, _ = pd.factorize(train['blight'], sort=True)

In [8]:
# Fit three baseline classifiers on the same feature columns and predict the
# held-out rows with each one. The feature matrices are selected once and
# shared by all three models.
X_train = train[features]
X_test = test[features]

# Naive Bayes
bayes = GaussianNB().fit(X_train, y)
bayes_predict = bayes.predict(X_test)

# Logistic regression
logistic = linear_model.LogisticRegression().fit(X_train, y)
logistic_predict = logistic.predict(X_test)

# Random Forest
# Seeded: the forest is built with randomness, so without a fixed random_state
# its predictions can differ from one run to the next.
rf = RandomForestClassifier(random_state=42).fit(X_train, y)
rf_predict = rf.predict(X_test)

In [9]:
# Confusion table for the random forest: rows are the actual `blight` values
# of the test rows, columns are the predicted class codes.
pd.crosstab(
    index=test['blight'],
    columns=rf_predict,
    rownames=['actual'],
    colnames=['preds'],
)

preds,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,38952,0
1.0,0,1295


In [10]:
# Classification Metrics
# One report per model, printed in this order: Naive Bayes, logistic
# regression, random forest.
# NOTE(review): the recorded random-forest report is perfect on every metric;
# worth confirming that no feature leaks the target.
for predictions in (bayes_predict, logistic_predict, rf_predict):
    print(metrics.classification_report(test.blight, predictions))

             precision    recall  f1-score   support

        0.0       1.00      0.14      0.24     38952
        1.0       0.04      0.98      0.07      1295

avg / total       0.96      0.16      0.23     40247

             precision    recall  f1-score   support

        0.0       0.97      1.00      0.99     38952
        1.0       1.00      0.13      0.22      1295

avg / total       0.97      0.97      0.96     40247

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00     38952
        1.0       1.00      1.00      1.00      1295

avg / total       1.00      1.00      1.00     40247

