In [48]:
from datetime import datetime, date
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

In [52]:
# Load the per-building feature table and keep only the columns used below.
file = 'data/building_blight_features.csv'
data = pd.read_csv(file)
buildings = pd.DataFrame(
    data,
    columns=['Address', 'Latitude', 'Longitude', 'blight',
             'CrimeCount', '311Count', 'BlightViolationCount'],
)
# Single-argument print(...) emits the same text on Python 2 and Python 3
# (the two spaces after the colon reproduce the original statement's output).
print("Rowcount:  {:,}".format(len(buildings)))

Rowcount:  170,196


In [53]:
# Identify the features by name rather than by column position, so the
# selection does not silently change if the column list is reordered or
# extended.
features = pd.Index(['CrimeCount', '311Count', 'BlightViolationCount'])

In [54]:
# Display the selected feature column labels as the cell output.
features

Index([u'CrimeCount', u'311Count', u'BlightViolationCount'], dtype='object')

In [56]:
# Create training and testing sets (75% / 25%).
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
train, test = train_test_split(buildings, test_size=0.25, random_state=42)

In [57]:
# Identify what we are predicting: integer class codes for the 'blight' column.
# sort=True assigns codes in sorted order of the unique values, so the smaller
# label always gets code 0. Without it, codes follow order of first appearance
# in the shuffled training rows and can come out inverted relative to the raw
# test['blight'] values the predictions are compared against.
y, _ = pd.factorize(train['blight'], sort=True)

In [58]:
# Fit three classifiers on the same training features and predict on the
# held-out rows. The feature matrices are selected once and reused.
X_train = train[features]
X_test = test[features]

# Naive Bayes
bayes = GaussianNB().fit(X_train, y)
bayes_predict = bayes.predict(X_test)

# Logistic regression
logistic = linear_model.LogisticRegression().fit(X_train, y)
logistic_predict = logistic.predict(X_test)

# Random Forest
# Seeded so the bootstrap samples and feature sub-sampling are repeatable.
rf = RandomForestClassifier(random_state=42).fit(X_train, y)
rf_predict = rf.predict(X_test)

In [59]:
# Confusion table for the random forest: actual labels as rows,
# predicted class codes as columns.
pd.crosstab(
    index=test['blight'],
    columns=rf_predict,
    rownames=['actual'],
    colnames=['preds'],
)

preds,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,41212,0
1.0,0,1337


In [60]:
# Classification metrics, printed in order: Naive Bayes, logistic regression,
# random forest.
# NOTE(review): the recorded run shows 1.00 precision/recall for every model;
# confirm the feature columns do not leak the 'blight' label.
for predictions in (bayes_predict, logistic_predict, rf_predict):
    print(metrics.classification_report(test['blight'], predictions))

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00     41212
        1.0       1.00      1.00      1.00      1337

avg / total       1.00      1.00      1.00     42549

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00     41212
        1.0       1.00      1.00      1.00      1337

avg / total       1.00      1.00      1.00     42549

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00     41212
        1.0       1.00      1.00      1.00      1337

avg / total       1.00      1.00      1.00     42549

