# Model Training

This notebook trains different models on the processed crime data set and compares their results.

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

## Methods

In [2]:
def createAndSaveLabelEncoder(data, column):
    '''
    Fit a LabelEncoder on one column, persist it, and encode that column.

    The fitted encoder is pickled to 'store/<column>_label_encoder.pkl'.
    The column in `data` is then overwritten in place with the encoder's
    transformed values, and the same `data` object is returned.
    '''
    encoder = LabelEncoder().fit(data[column])
    encoder_path = 'store/' + column + '_label_encoder.pkl'
    with open(encoder_path, 'wb') as out_file:
        pickle.dump(encoder, out_file)
    # Encode only after the encoder has been written to disk.
    data[column] = encoder.transform(data[column])
    return data

In [3]:
def loadLabelEncoder(column):
    '''
    Read back the pickled label encoder saved for `column`.

    Opens 'store/<column>_label_encoder.pkl' and returns the unpickled object.
    '''
    encoder_path = 'store/' + column + '_label_encoder.pkl'
    with open(encoder_path, 'rb') as in_file:
        return pickle.load(in_file)

In [4]:
def saveModel(model, name):
    '''
    Pickle `model` to 'store/<name>_model.pkl' and print a confirmation line.
    '''
    model_path = 'store/' + name + '_model.pkl'
    with open(model_path, 'wb') as out_file:
        pickle.dump(model, out_file)
    print('%s Model saved successfully.' % name)

## Model

Get the processed data.

In [5]:
# Load the processed crime data frame from its gzip-compressed pickle.
crime_data = pd.read_pickle(
    'store/crime_data.pkl',
    compression='gzip',
)

In [6]:
# Preview the first five rows.
crime_data.head(n=5)

Unnamed: 0,category,day,district,longitude,latitude,month,year,time_interval,resolved,label,near_facilities,near_private_spaces,near_colleges,near_public_open_spaces,near_commuter_stops,near_public_park,near_landmarks,near_schools
1,robbery,sunday,tenderloin,-122.414406,37.784191,february,2015,t4,0,high,1,1,1,1,1,1,1,1
2,assault,sunday,tenderloin,-122.414406,37.784191,february,2015,t4,0,high,1,1,1,1,1,1,1,1
4,vandalism,tuesday,northern,-122.431119,37.800469,january,2015,t5,0,high,1,0,1,1,1,1,1,1
7,vandalism,saturday,bayview,-122.374019,37.729203,january,2015,t6,0,high,1,0,0,1,0,1,1,1
8,burglary,saturday,central,-122.406568,37.787809,january,2015,t5,0,high,1,1,1,1,1,1,1,1


Drop the columns that are not needed by the classification model.

In [7]:
# Remove the columns the classifier does not use.
# errors='ignore' keeps this cell re-runnable: once the columns are gone,
# running it again is a no-op instead of raising KeyError.
crime_data = crime_data.drop(['category', 'year'], axis=1, errors='ignore')

Store the list of column names.

In [8]:
# Snapshot the current column names as a plain Python list.
# NOTE(review): this runs before 'label' is split off from the frame, so the
# list still contains the target column — confirm that is intended.
columns = crime_data.columns.tolist()

Label encode all the columns that require it.

In [9]:
# Columns whose values are replaced by label-encoder codes.
cols = ['day', 'district', 'month', 'time_interval', 'label']
for col in cols:
    # Re-running this cell on already-encoded data would refit each encoder on
    # the integer codes and overwrite the saved encoder in store/ with a
    # code -> code mapping, losing the original values. Only encode columns
    # that are not numeric yet so the cell is safe to run more than once.
    if pd.api.types.is_numeric_dtype(crime_data[col]):
        continue
    crime_data = createAndSaveLabelEncoder(crime_data, col)

Extract labels.

In [10]:
# Separate the target column from the rest of the frame.
labels = crime_data.loc[:, 'label']
crime_data = crime_data.drop(['label'], axis=1)

Split the data into training and test sets.

In [11]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    crime_data.values,
    labels,
    test_size=0.33,
    random_state=42,
)

Train the models.

In [12]:
# k-nearest-neighbours classifier with k = 3.
kNN_model = KNeighborsClassifier(n_neighbors=3)
# Kept as the cell's last expression so the fitted estimator is displayed.
kNN_model.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

## Tuning

## Evaluation

In [13]:
# Predict on the held-out set and report the weighted-average F1 score.
y_pred = kNN_model.predict(X_test)
score = f1_score(y_test, y_pred, average='weighted')
print("F1 Score: %f" % score)

F1 Score: 0.667100


## Save the winning model

In [15]:
# Display the column-name list stored earlier in the notebook.
columns

['day',
 'district',
 'longitude',
 'latitude',
 'month',
 'time_interval',
 'resolved',
 'label',
 'near_facilities',
 'near_private_spaces',
 'near_colleges',
 'near_public_open_spaces',
 'near_commuter_stops',
 'near_public_park',
 'near_landmarks',
 'near_schools']

In [14]:
# Disabled snippet: reloads the saved 'label' encoder and maps the encoded
# labels back to their original values.
# NOTE(review): no saveModel(...) call is visible in this section — confirm
# the chosen model is actually persisted.
# le = loadLabelEncoder('label')
# labels = le.inverse_transform(labels.values)