In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import datetime

import numpy as np
from time import time

from sklearn import metrics
from sklearn import preprocessing

from sklearn.datasets.species_distributions import construct_grids
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.linear_model import LogisticRegression

import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and chained-assignment warnings
# raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')



def parse_location(loc):
    """Parse a "(lat, long)" string into a pair of floats.

    Parentheses are stripped from both ends of ``loc``, the remainder is split
    on commas, and the first two fields are converted with ``float``.

    Returns:
        tuple: ``(latitude, longitude)`` as floats.

    Raises:
        IndexError: if fewer than two comma-separated fields are present.
        ValueError: if either field is not a valid float literal.
    """
    pieces = loc.strip("()").split(',')
    # Both fields are extracted before any conversion happens.
    lat_text, long_text = pieces[0].strip(), pieces[1].strip()
    return float(lat_text), float(long_text)

def classification_report(y_true, y_pred, verbose=False):
    """Compute a fixed set of binary-classification metrics.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels; also passed unchanged to ``metrics.roc_curve``
            as the score argument.
        verbose: when true, print every metric plus ``null_accuracy``, which is
            ``(len(y_true) - sum(y_true)) / len(y_true)`` (the share of zeros
            when labels are 0/1).

    Returns:
        tuple: ``(confusion_matrix, accuracy, recall, precision, f1, auc)``.
    """
    cm = metrics.confusion_matrix(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    auc = metrics.auc(fpr, tpr)

    results = (cm, accuracy, recall, precision, f1, auc)
    if not verbose:
        return results

    print('confusion_matrix')
    print(cm)
    scalar_metrics = (
        ('accuracy_score', accuracy),
        ('recall_score', recall),
        ('precision_score', precision),
        ('f1_score', f1),
        ('auc', auc),
    )
    for label, value in scalar_metrics:
        print(label, value)
    print('null_accuracy', (len(y_true)-sum(y_true))/len(y_true))

    return results
    

def get_scaler(column):
    """Fit a ``MinMaxScaler`` on a single column of values.

    Args:
        column: one-dimensional array-like exposing ``.shape`` (for example a
            pandas Series or a NumPy array).

    Returns:
        The fitted ``preprocessing.MinMaxScaler`` instance.
    """
    # The scaler is fitted on an (n_samples, 1) array; the original code built
    # this array but then fitted on the raw 1-D column instead.
    X = np.array(column).reshape(column.shape[0], 1)
    return preprocessing.MinMaxScaler().fit(X)

def scale(scaler, column):
    """Apply an already-fitted scaler to a single column of values.

    Args:
        scaler: object exposing ``transform``.
        column: one-dimensional array-like exposing ``.shape``.

    Returns:
        Whatever ``scaler.transform`` returns for the ``(n_rows, 1)`` input.
    """
    n_rows = column.shape[0]
    as_column_vector = np.array(column).reshape((n_rows, 1))
    return scaler.transform(as_column_vector)

In [2]:
# Load the tab-separated feature table, parsing `timestamp` into datetimes,
# and report how long the read takes.
%time df = pd.read_csv('features_temporal_full_year_with_weather_2500_final.tsv', sep='\t', parse_dates=['timestamp'])
# Replace the row index with each row's calendar month number so later cells
# can select whole months through the index.
# NOTE(review): the index carries the month only, not the year — presumably
# the file covers a single year; confirm.
df.index = df.timestamp.apply(lambda x: x.month)
df.head()

CPU times: user 2 s, sys: 72 ms, total: 2.08 s
Wall time: 2.07 s


Unnamed: 0_level_0,cell_range,timestamp,crime_freq,yelp_freq,police_freq,lat,long,police_factor,yelp_factor,prev_7_days_crime_freq,prev_day_crime_freq,PRCP,SNOW,TMAX,TMIN
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,"((41.5487, -88.3713), (41.560078, -88.345754))",2006-01-01,0,0,0,41.554389,-88.358527,7.041842999999999e-26,0.019742,,,0.8,0,7.2,-3.3
1,"((41.5487, -88.3713), (41.560078, -88.345754))",2006-01-02,0,0,0,41.554389,-88.358527,7.041842999999999e-26,0.019742,,0.0,9.4,0,5.6,4.4
1,"((41.5487, -88.3713), (41.560078, -88.345754))",2006-01-03,0,0,0,41.554389,-88.358527,7.041842999999999e-26,0.019742,,0.0,0.0,0,5.6,3.9
1,"((41.5487, -88.3713), (41.560078, -88.345754))",2006-01-04,0,0,0,41.554389,-88.358527,7.041842999999999e-26,0.019742,,0.0,0.5,0,6.1,2.2
1,"((41.5487, -88.3713), (41.560078, -88.345754))",2006-01-05,0,0,0,41.554389,-88.358527,7.041842999999999e-26,0.019742,,0.0,0.0,0,2.8,-0.6


In [3]:
# Drop, in place, every row that has a missing value in any column. In the
# preview above the lagged crime features (`prev_day_crime_freq`,
# `prev_7_days_crime_freq`) are empty for the earliest dates, so those rows go.
df.dropna(inplace=True)

In [4]:
# Preview of the expanding-window folds used for evaluation.
X = df
# Binary target: 1 when at least one crime was recorded for the row, else 0.
y = df['crime_freq'].apply(lambda count: int(count > 0))

# The frame is indexed by month number, so splitting its unique index values
# produces folds at month granularity.
tscv = TimeSeriesSplit(n_splits=11)

frames = []
# split() yields 0-based positions; the +1 turns them into month numbers,
# assuming the unique index values are 1..12 in order.
for train_index, test_index in tscv.split(X.index.unique()):
    print("TRAIN:", train_index+1, "TEST:", test_index+1)


TRAIN: [1] TEST: [2]
TRAIN: [1 2] TEST: [3]
TRAIN: [1 2 3] TEST: [4]
TRAIN: [1 2 3 4] TEST: [5]
TRAIN: [1 2 3 4 5] TEST: [6]
TRAIN: [1 2 3 4 5 6] TEST: [7]
TRAIN: [1 2 3 4 5 6 7] TEST: [8]
TRAIN: [1 2 3 4 5 6 7 8] TEST: [9]
TRAIN: [1 2 3 4 5 6 7 8 9] TEST: [10]
TRAIN: [ 1  2  3  4  5  6  7  8  9 10] TEST: [11]
TRAIN: [ 1  2  3  4  5  6  7  8  9 10 11] TEST: [12]


# Training and Testing Defined

In [5]:
def _classifier_name(classifier):
    
    name = type(classifier).__name__
    if name == 'SVC':
        return classifier.kernel + ' ' + name
    return name
    
def train_and_test(X_train, X_test, y_train, y_test, features=None, verbose=False, random_state = 4, classifiers=None):
    """Scale the chosen features, fit each classifier, save predictions and score them.

    Args:
        X_train, X_test: DataFrames holding at least the ``features`` columns;
            ``X_test`` must also hold ``lat`` and ``long``, which are written to
            the location files.
        y_train, y_test: labels for the two frames.
        features: list of column names used as model inputs (required).
        verbose: print a header per classifier and the metrics from
            ``classification_report``.
        random_state: seed used only for the default classifiers.
        classifiers: estimators to evaluate; defaults to a depth-5 decision
            tree and a 10-tree random forest, both with balanced class weights.

    Returns:
        list: one ``[name, accuracy, recall, precision, f1, auc]`` row per
        classifier.

    Raises:
        TypeError: if ``features`` is None.

    Side effects:
        Writes one predictions TSV per classifier and training size, plus
        true/predicted location CSVs per classifier, under
        ``predictions_full_year_2500_v2/``.
    """
    import os

    if features is None:
        raise TypeError("features must be a list of column names, got None")

    # Scale private copies so the caller's frames (typically slices of the
    # global DataFrame) are never modified.
    X_train = X_train.copy()
    X_test = X_test.copy()
    for column_name in features:
        # The scaler is fitted on the training data only and reused for the test data.
        scaler = get_scaler(X_train[column_name])
        X_train[column_name] = scale(scaler, X_train[column_name])
        X_test[column_name] = scale(scaler, X_test[column_name])

    # Full test frame (all columns, scaled features) used for the exported files.
    test = X_test.copy()
    test['label'] = y_test

    X_train = X_train[features]
    X_test = X_test[features]

    # Number of distinct index values in the training frame.
    training_size = len(X_train.index.unique())

    class_weight = "balanced"

    if classifiers is None:
        classifiers = [
            DecisionTreeClassifier(max_depth=5, class_weight=class_weight, random_state=random_state),
            RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, class_weight=class_weight, random_state=random_state),
        ]

    # Create the output folders up front so a fresh checkout does not fail on
    # the first to_csv call.
    os.makedirs('predictions_full_year_2500_v2/Locations', exist_ok=True)

    report = []
    for classifier in classifiers:
        if verbose:
            print('\n')
            print('#'*10, type(classifier).__name__, '#'*10)
        classifier.fit(X_train, y_train)
        y_test_pred = classifier.predict(X_test)

        test['pred_label'] = y_test_pred
        test.to_csv('predictions_full_year_2500_v2/%s_%s.tsv' % (_classifier_name(classifier), training_size), sep='\t', index=False)
        # NOTE(review): the location file names do not include training_size,
        # so every call overwrites the files of the previous call for the same
        # classifier — confirm this is intended.
        test[test.label == 1][['lat', 'long']].to_csv('predictions_full_year_2500_v2/Locations/%s_true_crime_locations.csv' % _classifier_name(classifier), index=False)
        test[test.pred_label == 1][['lat', 'long']].to_csv('predictions_full_year_2500_v2/Locations/%s_pred_crime_locations.csv' % _classifier_name(classifier), index=False)

        confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, auc = classification_report(y_test, y_test_pred, verbose=verbose)
        report.append([_classifier_name(classifier), accuracy_score, recall_score, precision_score, f1_score, auc])
    return report

# Time-series cross-validation split

In [None]:
def test_models(features, splits=2, verbose=False, classifiers=None):
    """Evaluate classifiers with an expanding-window time-series split over the index.

    The global ``df`` is grouped by its index values (month numbers, as set
    when the data was loaded). For each fold produced by ``TimeSeriesSplit``
    the earlier months form the training set and the following month(s) the
    test set, and ``train_and_test`` is run on the pair.

    Args:
        features: list of column names used as model inputs.
        splits: number of folds passed to ``TimeSeriesSplit``.
        verbose: print the feature list, the date range of each fold and the
            per-classifier metrics.
        classifiers: estimators forwarded to ``train_and_test``.

    Returns:
        pandas.DataFrame: one row per classifier and fold with the metric
        columns plus the fold's train/test date range and ``training_size``.
    """
    if verbose:
        print()
        print('#'*100)
        print('#'*10 + str(features) + '#'*10)
        print('#'*100)
        print()

    X = df
    y = df.crime_freq.apply(lambda x: 1 if x > 0 else 0)

    # Split over the actual index labels in chronological order instead of
    # assuming they are exactly position + 1; this stays correct when a month
    # is missing or the rows do not start with the first month.
    months = np.sort(X.index.unique())

    tscv = TimeSeriesSplit(n_splits=splits)

    frames = []
    for train_index, test_index in tscv.split(months):
        train_months, test_months = months[train_index], months[test_index]
        # y shares X's index, so the same boolean masks select matching rows.
        train_mask = X.index.isin(train_months)
        test_mask = X.index.isin(test_months)
        X_train, X_test = X[train_mask], X[test_mask]
        y_train, y_test = y[train_mask], y[test_mask]

        # min/max rather than first/last row, so the reported range does not
        # depend on how the rows happen to be ordered.
        train_dates = X_train.timestamp.values
        test_dates = X_test.timestamp.values
        train_start_date, train_end_date = train_dates.min(), train_dates.max()
        test_start_date, test_end_date = test_dates.min(), test_dates.max()
        training_size = len(train_months)

        if verbose:
            print()
            print('>'*100)
            print("Training: ", str(train_start_date), "to" , str(train_end_date))
            print("Test: ", str(test_start_date), "to" , str(test_end_date))
            print('>'*100)
            print()

        report = train_and_test(X_train, X_test, y_train, y_test, features=features, verbose=verbose, classifiers=classifiers)
        # Append the fold description to every classifier row.
        fold_info = [train_start_date, train_end_date, test_start_date, test_end_date, training_size]
        frames.extend(classifier_report + fold_info for classifier_report in report)

    return pd.DataFrame(frames, columns=['classifier',
                                         'accuracy_score',
                                         'recall_score',
                                         'precision_score',
                                         'f1_score',
                                         'auc',
                                         'train_start_date',
                                         'train_end_date', 'test_start_date', 'test_end_date', 'training_size'
                                         ])
        

In [None]:
# Experiment configuration: classifiers to compare and the feature columns fed
# to them. Commented-out entries are alternatives that are currently disabled.
class_weight = "balanced"
random_state = 4

classifiers = [
        KNeighborsClassifier(n_neighbors=5),
#         SVC(kernel="linear", C=0.025, class_weight=class_weight, random_state=random_state),
        SVC(gamma=2, C=1, class_weight=class_weight, random_state=random_state),
#         GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        DecisionTreeClassifier(max_depth=5, class_weight=class_weight, random_state=random_state),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, class_weight=class_weight, random_state=random_state),
#         MLPClassifier(alpha=1, random_state=random_state),
#         AdaBoostClassifier(random_state=random_state),
#         GaussianNB(),
#         QuadraticDiscriminantAnalysis(),
        LogisticRegression(C=1e5, class_weight=class_weight, random_state=random_state)
        ]

features = [
#     'PRCP', 
#     'SNOW', 
#     'TMAX', 
#     'TMIN',
#     'crime_freq', 
#      'yelp_freq', 
#      'police_freq', 
     'police_factor', 
#     'crime_factor', 
    'yelp_factor', 
    'prev_day_crime_freq',
    'prev_7_days_crime_freq'
]

# Run one fold per month after the first (11 expanding-window splits).
report_df = test_models(features, classifiers=classifiers, splits=11, verbose=True)

# Only these columns are averaged. Aggregating the whole frame would also try
# to average the string and date columns, which newer pandas versions refuse.
metric_columns = ['accuracy_score',
                  'recall_score',
                  'precision_score',
                  'f1_score',
                  'auc']

# Mean of every metric per classifier across all folds.
ax = report_df[['classifier'] + metric_columns].groupby(by=['classifier']).mean().plot(rot=90)

# One figure per classifier: metrics as a function of the training size.
for classifier in report_df.classifier.unique():
    per_classifier = report_df[report_df.classifier == classifier]
    ax = per_classifier.groupby(by=['training_size'])[metric_columns].mean().plot(title=classifier)
    ax.xaxis.set_ticks(report_df.training_size.unique())



####################################################################################################
##########['police_factor', 'yelp_factor', 'prev_day_crime_freq', 'prev_7_days_crime_freq']##########
####################################################################################################


>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Training:  2006-01-08T00:00:00.000000000 to 2006-01-31T00:00:00.000000000
Test:  2006-02-01T00:00:00.000000000 to 2006-02-28T00:00:00.000000000
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



########## KNeighborsClassifier ##########
confusion_matrix
[[63880   528]
 [  405  5187]]
accuracy_score 0.986671428571
recall_score 0.927575107296
precision_score 0.907611548556
f1_score 0.917484743964
auc 0.959688683942
null_accuracy 0.920114285714


########## SVC ##########
confusion_matrix
[[63675   733]
 [  171  5421]]
accuracy_score 0

In [None]:
# Display the per-classifier, per-fold metrics table built by the previous cell.
report_df
# Optional export of the table as TSV (disabled):
# report_df.to_csv('Jan_2_split_classification_report.tsv', sep='\t', index=False)