In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the `watermark` IPython extension and record environment details for
# reproducibility. Flags presumably mean: -v Python/IPython versions, -d date,
# -u "last updated" label, -p versions of the listed packages -- confirm
# against the watermark documentation.
%load_ext watermark
%watermark -v -d -u -p pyprind

# San Francisco Crime Classification Challenge

## Introduction:

Currently, supervised classification is becoming a very important problem with a wide range of applications, from sentiment analysis and cancer detection to fraud detection systems, from [authorship profiling systems](https://github.com/alonsopg/AuthorProfiling) to image recognition and loan prediction systems.

This [is my solution](http://alonsopg.com/) to [Kaggle's San Francisco Crime Classification Challenge](https://www.kaggle.com/c/sf-crime). My approach to this task is based on [XGBoost's gradient boosting implementation](https://arxiv.org/pdf/1603.02754v3.pdf).


## The task:

This competition's dataset provides nearly 12 years of crime reports from across all of San Francisco's neighborhoods. Given the date, location, district, address, description, and coordinates of historical records, the task is to predict the category of crime that occurred.

<img src="files/SF_pic.png">

## The dataset:

The dataset consists of two spreadsheet files with 1762309 instances (rows) of crime reports, which are divided into training and testing chunks. The reason for this "chunking" is that we will use the training data to train an algorithm that yields a model that learns to identify the category of new unlabeled (unclassified, unannotated) crimes given their other attributes (`Dates, DayOfWeek, PdDistrict, Address, X, and Y coordinates`).

### Training data:

With regard to the training data, the historical records consist of a `.csv` file very similar to a database table. Interestingly, these historical records have a `Category` column that holds the `label` (target), i.e. the category of the crime (`WARRANTS, OTHER OFFENSES, LARCENY/THEFT, VEHICLE THEFT, VANDALISM, NON-CRIMINAL, ROBBERY, ASSAULT, WEAPON LAWS, BURGLARY, SUSPICIOUS OCC, DRUNKENNESS, FORGERY/COUNTERFEITING, DRUG/NARCOTIC, STOLEN PROPERTY, SECONDARY CODES, TRESPASS, MISSING PERSON, FRAUD, KIDNAPPING, RUNAWAY, DRIVING UNDER THE INFLUENCE, SEX OFFENSES FORCIBLE, PROSTITUTION, DISORDERLY CONDUCT, ARSON, FAMILY OFFENSES, LIQUOR LAWS, BRIBERY, EMBEZZLEMENT, SUICIDE, LOITERING, SEX OFFENSES NON FORCIBLE, EXTORTION, GAMBLING, BAD CHECKS, TREA, RECOVERED VEHICLE, PORNOGRAPHY/OBSCENE MAT`).


<img src="files/pic2.png">


### Testing data:

On the other hand, the testing data doesn't have a category column (target).
<img src="files/pic1.png">

## Gradient Boosting Classification Trees:

The state of the art in supervised classification is gradient boosting.


As usual, the data is represented as a set of instances $\{(x_i, y_i)\}_{i=1}^{n}$, where $x_i$ is the feature vector of a crime report and $y_i$ is its category.


## Solution:

For my solution I'll use a Python programming environment (NumPy, pandas, sklearn, and XGBoost).

### Training data

In [1]:
# Load the labelled training set into a DataFrame and preview its last rows.
import pandas as pd

# NOTE(review): absolute, machine-specific path -- adjust to your environment.
TRAIN_CSV = '/Users/user/Jupyter/datasets/SFCCC_dataset/train 2.csv'
training_data = pd.read_csv(TRAIN_CSV)
training_data.tail(2)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607
878048,2003-01-06 00:01:00,FORGERY/COUNTERFEITING,"CHECKS, FORGERY (FELONY)",Monday,BAYVIEW,NONE,1800 Block of NEWCOMB AV,-122.394926,37.738212


Let's split the `Dates` column into two columns (`Dates` and `Time`):

In [2]:
# Split the "YYYY-MM-DD HH:MM:SS" strings in `Dates` into a date part (kept in
# `Dates`) and a time part (new `Time` column).
# The guard makes the cell safe to re-run: once `Time` exists, `Dates` holds
# only the date, and splitting it again cannot be unpacked into two columns.
if "Time" not in training_data.columns:
    date_and_time = training_data["Dates"].str.split(n=1, expand=True)
    training_data["Dates"] = date_and_time[0]
    training_data["Time"] = date_and_time[1]
training_data.tail(2)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Time
878047,2003-01-06,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,00:01:00
878048,2003-01-06,FORGERY/COUNTERFEITING,"CHECKS, FORGERY (FELONY)",Monday,BAYVIEW,NONE,1800 Block of NEWCOMB AV,-122.394926,37.738212,00:01:00


### Testing data

In [3]:
# Load the test set into a DataFrame and preview its last rows.
# NOTE(review): absolute, machine-specific path -- adjust to your environment.
TEST_CSV = '/Users/user/Desktop/neew.csv'
testing_data = pd.read_csv(TEST_CSV)
# The CSV's index column is kept as-is (it shows up as "Unnamed: 0").
testing_data.tail(2)

Unnamed: 0,Dates,Time,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Category
48,2015-05-13,15:58:00,STOLEN AND RECOVERED VEHICLE,Wednesday,NORTHERN,NONE,2400 Block of CALIFORNIA ST,-122.434689,37.788854,VEHICLE THEFT
49,2015-05-13,17:28:00,"DRIVERS LICENSE, SUSPENDED OR REVOKED",Wednesday,MISSION,"ARREST, BOOKED",23RD ST / VERMONT ST,-122.403525,37.754453,OTHER OFFENSES


Let's apply the same procedure to the testing data to keep both datasets consistent:

In [None]:
# Skipped for now, because the test file will be verified by hand.
# Split `Dates` into date and time only if that has not been done yet: the
# test CSV loaded earlier already contains a `Time` column, and splitting a
# date-only `Dates` column cannot be unpacked into two columns.
if "Time" not in testing_data.columns:
    date_and_time = testing_data["Dates"].str.split(n=1, expand=True)
    testing_data["Dates"] = date_and_time[0]
    testing_data["Time"] = date_and_time[1]
testing_data.tail(2)

## Feature Engineering

In [4]:
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion
from xgboost import XGBClassifier
from sklearn.cross_validation import KFold, train_test_split

# Categorical values
#
# Map each categorical column to integer codes. fit_transform returns the
# encoded array, so the *_le names below hold 1-D integer vectors (one code
# per training row), not LabelEncoder objects.
district_le = LabelEncoder().fit_transform(training_data['PdDistrict'].values)
dates_le = LabelEncoder().fit_transform(training_data['Dates'].values)
day_of_week_le = LabelEncoder().fit_transform(training_data['DayOfWeek'].values)


In [5]:
# Sanity check: the label-encoded DayOfWeek vector is 1-D, one entry per training row.
day_of_week_le.shape

(878049,)

In [6]:
# One-hot encode each integer-coded categorical vector. The encoder needs a
# 2-D (n_samples, 1) input and returns a sparse matrix, which is densified here.
district_feats = OneHotEncoder().fit_transform(district_le[:, None]).toarray()
dates_feats = OneHotEncoder().fit_transform(dates_le[:, None]).toarray()
day_of_week_feats = OneHotEncoder().fit_transform(day_of_week_le[:, None]).toarray()
# Bag-of-words token counts for the free-text description (kept sparse).
descript_feats = CountVectorizer().fit_transform(training_data['Descript'].values)

In [7]:
# Each one-hot block should have one row per report and one column per distinct value.
print(district_feats.shape, dates_feats.shape, day_of_week_feats.shape)

(878049, 10) (878049, 2249) (878049, 7)


In [8]:
# Inspect the one-hot encoded dates matrix (NumPy abbreviates the display).
dates_feats

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.]])

In [9]:
# Inspect the one-hot encoded police-district matrix. The commented-out lines
# are an alternative that prints all three one-hot blocks at once.
#print(district_feats,'\n', dates_feats,'\n', day_of_week_feats)
#print(10*'\n*\n')
print(district_feats)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]]


In [10]:
# `import scipy as sparse` bound the whole scipy package to the name `sparse`;
# import the sparse submodule itself instead.
from scipy import sparse
import numpy as np

# descript_feats is sparse, so .A turns it into a dense array that can be
# stacked column-wise with the dense one-hot blocks.
# NOTE(review): the result is a dense matrix with one row per report and
# thousands of columns, which is likely very memory hungry. sparse.hstack
# would avoid densifying, but the following cells expect an ndarray.
X_combined_features = np.column_stack((descript_feats.A,
                                       district_feats, dates_feats,
                                       day_of_week_feats))

# Integer-coded crime category: the classification target.
y = LabelEncoder().fit_transform(training_data['Category'].values.ravel())


In [11]:
# Confirm that column_stack produced a dense NumPy array.
type(X_combined_features)

numpy.ndarray

In [None]:
# Standardise every column of the combined feature matrix.
scaler = StandardScaler()
# fit_transform returns the scaled matrix. partial_fit only updates the
# scaler's statistics and returns the scaler itself, which would leave X bound
# to a StandardScaler object instead of to data.
X = scaler.fit_transform(X_combined_features)

In [None]:
# Shape of the combined feature matrix: (number of reports, number of features).
X_combined_features.shape


In [None]:
# Re-check the container type of the combined feature matrix.
type(X_combined_features)

In [None]:
# Re-check the shape of the combined feature matrix.
X_combined_features.shape

## Supervised classification pipeline

In [None]:
# Split the data into train and evaluation parts using the default proportions.
# The seed is now an explicit integer: the original passed the boolean False,
# which is presumably treated as the integer seed 0 -- confirm if exact
# reproduction of an earlier split matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Baseline pipeline: standardise the features, then fit gradient-boosted trees.
clf_pipeline = make_pipeline(StandardScaler(), XGBClassifier())

In [None]:
# Fit one LabelEncoder per categorical column before building the pipeline.
# The transform steps below need fitted encoder objects. The `*_le` names from
# the feature-engineering cells hold encoded integer arrays (the output of
# fit_transform), which have no .transform method, and no Resolution encoder
# exists yet at this point in the notebook.
# Fitting on the full column means every fold knows every label.
district_encoder = LabelEncoder().fit(training_data['PdDistrict'].values)
resolution_encoder = LabelEncoder().fit(training_data['Resolution'].values)
dates_encoder = LabelEncoder().fit(training_data['Dates'].values)
day_of_week_encoder = LabelEncoder().fit(training_data['DayOfWeek'].values)

# Each FeatureUnion branch selects one column of X by position (see the column
# order used to build X below) and turns it into numeric features.
clf = Pipeline([
        #Features
        ('features', FeatureUnion(transformer_list=[
                    # Column 0: free-text description -> binary bag of words.
                    ('descript', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,0], validate=False)),
                                ('tfidf', CountVectorizer(binary=True)),
                            ])),
                    # Columns 1-4: categorical value -> integer code, as an (n, 1) column.
                    ('district', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,[1]], validate=False)),
                                ('transform', FunctionTransformer(lambda X: district_encoder.transform(X.ravel()).reshape(-1,1), validate=False)),
                            ])),
                    ('resolution', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,[2]], validate=False)),
                                ('transform', FunctionTransformer(lambda X: resolution_encoder.transform(X.ravel()).reshape(-1,1), validate=False)),
                            ])),
                    ('dates', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,[3]], validate=False)),
                                ('transform', FunctionTransformer(lambda X: dates_encoder.transform(X.ravel()).reshape(-1, 1), validate=False)),
                            ])),
                    ('dayOfWeek', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,[4]], validate=False)),
                                ('transform', FunctionTransformer(lambda X: day_of_week_encoder.transform(X.ravel()).reshape(-1, 1), validate=False)),
                            ])),
                ])),
        #Estimator
        ('est', XGBClassifier(n_estimators=30, max_depth=2))])

# Raw feature columns; their order must match the positional selectors above.
X = training_data[['Descript', 'PdDistrict', 'Resolution', 'Dates', 'DayOfWeek']].values

# Integer-coded crime category. The encoder is kept so that predictions can be
# mapped back to category names.
categories_le = LabelEncoder()
y = categories_le.fit_transform(training_data['Category'].values.ravel())

# 10-fold cross-validation (no shuffling is requested).
kfold = KFold(n=len(y), n_folds=10, random_state=False)

results = cross_val_score(clf, X, y, cv=kfold)

In [None]:
# cross_val_score returns per-fold scores as fractions in [0, 1]; scale the
# mean by 100 so the number matches the percent sign in the message.
print("Accuracy: %.2f%% " % (results.mean() * 100))

In [None]:
# This cell repeats the five-feature cross-validation run.
#
# Fit one LabelEncoder per categorical column before building the pipeline.
# The transform steps below need fitted encoder objects. The `*_le` names from
# the feature-engineering cells hold encoded integer arrays (the output of
# fit_transform), which have no .transform method, and no Resolution encoder
# exists yet at this point in the notebook.
# Fitting on the full column means every fold knows every label.
district_encoder = LabelEncoder().fit(training_data['PdDistrict'].values)
resolution_encoder = LabelEncoder().fit(training_data['Resolution'].values)
dates_encoder = LabelEncoder().fit(training_data['Dates'].values)
day_of_week_encoder = LabelEncoder().fit(training_data['DayOfWeek'].values)

# Each FeatureUnion branch selects one column of X by position (see the column
# order used to build X below) and turns it into numeric features.
clf = Pipeline([
        #Features
        ('features', FeatureUnion(transformer_list=[
                    # Column 0: free-text description -> binary bag of words.
                    ('descript', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,0], validate=False)),
                                ('tfidf', CountVectorizer(binary=True)),
                            ])),
                    # Columns 1-4: categorical value -> integer code, as an (n, 1) column.
                    ('district', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,[1]], validate=False)),
                                ('transform', FunctionTransformer(lambda X: district_encoder.transform(X.ravel()).reshape(-1,1), validate=False)),
                            ])),
                    ('resolution', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,[2]], validate=False)),
                                ('transform', FunctionTransformer(lambda X: resolution_encoder.transform(X.ravel()).reshape(-1,1), validate=False)),
                            ])),
                    ('dates', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,[3]], validate=False)),
                                ('transform', FunctionTransformer(lambda X: dates_encoder.transform(X.ravel()).reshape(-1, 1), validate=False)),
                            ])),
                    ('dayOfWeek', Pipeline([
                                ('select', FunctionTransformer(lambda X: X[:,[4]], validate=False)),
                                ('transform', FunctionTransformer(lambda X: day_of_week_encoder.transform(X.ravel()).reshape(-1, 1), validate=False)),
                            ])),
                ])),
        #Estimator
        ('est', XGBClassifier(n_estimators=30, max_depth=2))])

# Raw feature columns; their order must match the positional selectors above.
X = training_data[['Descript', 'PdDistrict', 'Resolution', 'Dates', 'DayOfWeek']].values

# Integer-coded crime category. The encoder is kept so that predictions can be
# mapped back to category names.
categories_le = LabelEncoder()
y = categories_le.fit_transform(training_data['Category'].values.ravel())

# 10-fold cross-validation (no shuffling is requested).
kfold = KFold(n=len(y), n_folds=10, random_state=False)

results = cross_val_score(clf, X, y, cv=kfold)

In [None]:
# Run after the training data has been loaded.
# Reduced model that uses only Descript, PdDistrict and Resolution: first a
# 3-fold cross-validation, then a single hold-out fit and prediction.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from xgboost import XGBClassifier


def _encoded_column(index, encoder):
    """Build a pipeline that selects one column of X and label-encodes it.

    Parameters
    ----------
    index : int
        Position of the column in the 2-D feature array.
    encoder : LabelEncoder
        Already fitted encoder for that column's values.

    Returns
    -------
    Pipeline
        Transformer that outputs the integer codes as an (n_samples, 1) column.
    """
    return Pipeline([
        ('select', FunctionTransformer(lambda X: X[:, [index]], validate=False)),
        ('transform', FunctionTransformer(
            lambda X: encoder.transform(X.ravel()).reshape(-1, 1), validate=False)),
    ])


def build_crime_classifier(district_encoder, resolution_encoder):
    """Build the feature-union + XGBoost pipeline for the reduced feature set.

    Expects X with columns in the order (Descript, PdDistrict, Resolution).

    Parameters
    ----------
    district_encoder : LabelEncoder
        Fitted encoder for the PdDistrict column.
    resolution_encoder : LabelEncoder
        Fitted encoder for the Resolution column.

    Returns
    -------
    Pipeline
        Unfitted pipeline ending in an XGBClassifier.
    """
    return Pipeline([
        ('features', FeatureUnion(transformer_list=[
            # Column 0: free-text description -> binary bag of words.
            ('descript', Pipeline([
                ('select', FunctionTransformer(lambda X: X[:, 0], validate=False)),
                ('tfidf', CountVectorizer(binary=True)),
            ])),
            ('district', _encoded_column(1, district_encoder)),
            ('resolution', _encoded_column(2, resolution_encoder)),
        ])),
        ('est', XGBClassifier(n_estimators=30, max_depth=2)),
    ])


# Fit the encoders once on the full columns; both evaluation runs share them.
district_le = LabelEncoder().fit(training_data['PdDistrict'].values.ravel())
resolution_le = LabelEncoder().fit(training_data['Resolution'].values.ravel())
categories_le = LabelEncoder()
X = training_data[['Descript', 'PdDistrict', 'Resolution']].values
y = categories_le.fit_transform(training_data['Category'].values.ravel())

# 3-fold cross-validation of the reduced model.
clf = build_crime_classifier(district_le, resolution_le)
print(cross_val_score(clf, X, y, cv=3, verbose=3))


# Operate: fit on 75% of the data and predict the held-out 25%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = build_crime_classifier(district_le, resolution_le)
clf.fit(X_train, y_train)
# Predict once, then show both the integer codes and the category names.
y_pred = clf.predict(X_test)
print(y_pred)
print(categories_le.inverse_transform(y_pred))
