In [None]:
# Required for xgboost windows.
# See: https://www.ibm.com/developerworks/community/blogs/jfp/entry/Installing_XGBoost_For_Anaconda_on_Windows?lang=en
import os

mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-6.2.0-posix-seh-rt_v5-rev1\\mingw64\\bin'

os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm
import seaborn as sns
import re
from IPython.display import display # Allows the use of display() for DataFrames
# Show matplotlib plots inline (nicely formatted in the notebook)
%matplotlib inline



In [2]:
# Loads pickled dataset into pandas DataFrame
df_train = pd.read_pickle('preprocess_2.pickle')

display(df_train.head())

Unnamed: 0,Category,X,Y,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,...,PD_INGLESIDE,PD_MISSION,PD_NORTHERN,PD_PARK,PD_RICHMOND,PD_SOUTHERN,PD_TARAVAL,PD_TENDERLOIN,Time_Day,Time_Night
0,WARRANTS,4.807506,3.631637,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,OTHER OFFENSES,4.807506,3.631637,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,OTHER OFFENSES,4.807493,3.63232,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,LARCENY/THEFT,4.807515,3.632332,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,LARCENY/THEFT,4.807611,3.631556,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
# Label Encode the target variable (Category)
# This is for the XGBoost input later
# E.g. LARCENY/THEFT ... 1
# ASSAULT ... 2

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df_train['Category'])
df_train['Category'] = le.transform(df_train['Category'])

display(df_train.head())

Unnamed: 0,Category,X,Y,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,...,PD_INGLESIDE,PD_MISSION,PD_NORTHERN,PD_PARK,PD_RICHMOND,PD_SOUTHERN,PD_TARAVAL,PD_TENDERLOIN,Time_Day,Time_Night
0,31,4.807506,3.631637,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,18,4.807506,3.631637,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,18,4.807493,3.63232,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,13,4.807515,3.632332,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,13,4.807611,3.631556,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
# Split into train, test sets
from sklearn.model_selection import train_test_split

features = df_train.drop(['Category'], axis=1, inplace=False)
target = df_train['Category']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

print "Training set has {} samples.".format(X_train.shape[0])
print "Testing set has {} samples.".format(X_test.shape[0])

Training set has 613980 samples.
Testing set has 263135 samples.


In [None]:
# Search for optimal parameters for XGBoost
# Parameter doc: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
from sklearn.model_selection import GridSearchCV

# Parameters to do GridSearch on
cv_params = {
            'max_depth': [1, 3, 6], 
            'min_child_weight': [1, 2]
            }

# XGBoost parameters
ind_params = {
            'learning_rate': 0.3,
            'seed': 42,
            'subsample': 1,
            'objective': 'multi:softmax'
             }

# Initialize GridSearch with its parameters
optimized_GBM = GridSearchCV(estimator=xgb.XGBClassifier(**ind_params), 
                             param_grid=cv_params, 
                             scoring='accuracy',
                             cv=5,
                             n_jobs=-1)

optimized_GBM.fit(X_train[:300], y_train[:300])
optimized_GBM.grid_scores_

In [None]:
# XGBoost library
# Convert pandas DataFrame to DMatrix to make XGBoost more efficient
dmat = xgb.DMatrix(features, target)

# Initialize parameters
clf_params = {
              'eta': 0.3, 
              'seed': 42, 
              'subsample': 0.8, 
              'colsample_bytree': 0.8, 
              'objective': 'multi:softmax',
              'num_class': 33,
              'max_depth': 3, 
              'min_child_weight': 1
             }

# Apply early stopping CV
clf_cv = xgb.cv(params = clf_params, 
                dtrain = dmat, 
                num_boost_round = 3000, 
                nfold = 5,
                metrics = ['merror'],
                early_stopping_rounds = 100)

clf_cv.tail(5)

In [None]:
# Train final model
clf_params = {
              'eta': 0.1, 
              'seed':0, 
              'subsample': 0.8, 
              'colsample_bytree': 0.8, 
              'objective': 'binary:logistic', 
              'max_depth':3, 
              'min_child_weight':1
              } 

final_gb = xgb.train(clf_params, dmat, num_boost_round = 432)

In [None]:
# Plot the feature importance
xgb.plot_importance(final_gb)

In [8]:
# Search for optimal parameters
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier

# Parameters to do GridSearch on
cv_params = {
            'n_estimators': [10, 9, 8, 7, 6, 5]
            }

# XGBoost parameters
ind_params = {
            'random_state': 42,
            'n_jobs': -1,
             }

# Initialize GridSearch with its parameters
optimized_ET = GridSearchCV(estimator=ExtraTreesClassifier(**ind_params), 
                             param_grid=cv_params, 
                             scoring='f1_micro',
                             n_jobs=-1)

optimized_ET.fit(X_train[:10000], y_train[:10000])
#optimized_ET.cv_results_
optimized_ET.best_params_

{'n_estimators': 8}

In [3]:
# Train model
from time import time
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.linear_model import LogisticRegression


def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print "Trained model in {:.4f} seconds".format(end - start)

def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print "Made predictions in {:.4f} seconds.".format(end - start)
    return f1_score(target.values, y_pred, average='micro')
    #return accuracy_score(target.values, y_pred)
    #return classification_report(target.values, y_pred)


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print "Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    print "F1 score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train))
    print "F1 score for test set: {:.4f}.".format(predict_labels(clf, X_test, y_test))
    #print "accuracy_score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train))
    #print "accuracy_score test set: {:.4f}.".format(predict_labels(clf, X_test, y_test))
    #print "Report for testing set:", predict_labels(clf, X_test, y_test)
    
clf = RandomForestClassifier(random_state=42) #0.22
#clf = AdaBoostClassifier(random_state=42) #0.22
#clf = KNeighborsClassifier() #0.2235
#clf = GaussianNB()
#clf = GradientBoostingClassifier(max_features='auto')
#clf = ExtraTreesClassifier(n_jobs=-1) # 0.2578
#clf = SVC(random_state=42)
#clf = OneVsRestClassifier(RandomForestClassifier(random_state=42),n_jobs=-1)
#clf = OneVsOneClassifier(RandomForestClassifier(random_state=42),n_jobs=-1)
#clf = OutputCodeClassifier(RandomForestClassifier(random_state=42),n_jobs=-1)
#clf = LogisticRegression(random_state=42, solver='sag', multi_class='multinomial')
#clf = OneVsRestClassifier(SVC(kernel='linear', random_state=42),n_jobs=-1)

train_predict(clf, X_train, y_train, X_test, y_test)

NameError: name 'X_train' is not defined

In [6]:
# Load test dataset
df_test = pd.read_csv('test.csv')

display(df_test.head())

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [7]:
# Preprocess test dataset
def preprocess_data(df):
    # Remove outliers
    df = df[df.Y != 90]
    
    # Convert Dates column to the correct dtype
    df['Dates'] = df['Dates'].astype('datetime64[ns]')
    
    # Convert longitude and latitude into natural log to spread out the skewness
    df['X'] = df['X'].map(lambda x: np.log(-x))
    df['Y'] = df['Y'].map(lambda x: np.log(x))
    
    # Extract time from Dates and add a new column TimeOfDay representing Day/Night
    # Day: 6:00AM - 5:59PM, Night: 6:00PM - 5:59AM
    from datetime import datetime

    time_of_day = []
    start_t = datetime.strptime('06:00', '%H:%M').time()
    end_t = datetime.strptime('18:00', '%H:%M').time()

    for row in df['Dates']:
        if start_t <= row.time() < end_t:
            time_of_day.append('Day')
        else:
            time_of_day.append('Night')
        
    df['TimeOfDay'] = time_of_day
    df.drop(['Dates'], axis=1, inplace=True)
    
    # Create dummy variables for all categorical features
    df_X1 = pd.get_dummies(df['DayOfWeek'], prefix = 'Day')
    df_X2 = pd.get_dummies(df['PdDistrict'], prefix = 'PD')
    df_X3 = pd.get_dummies(df['TimeOfDay'], prefix = 'Time')

    df = df.join(df_X1)
    df = df.join(df_X2)
    df = df.join(df_X3)
    df.drop(['DayOfWeek', 'PdDistrict', 'TimeOfDay', 'Address'], axis = 1, inplace = True)
    
    return df

df_test = preprocess_data(df_test)
display(df_test.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Unnamed: 0,Id,X,Y,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,...,PD_INGLESIDE,PD_MISSION,PD_NORTHERN,PD_PARK,PD_RICHMOND,PD_SOUTHERN,PD_TARAVAL,PD_TENDERLOIN,Time_Day,Time_Night
0,0,4.807291,3.630589,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,4.807225,3.63052,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,4.807507,3.632103,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,4.8076,3.630228,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,4.8076,3.630228,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Predict model
ind = df_test['Id']
features_test = df_test.drop(['Id'], axis=1, inplace=False)

pred = clf.predict(features_test)

print pred[:5]

# Reverse transform the encoded labels of Categories to string values
y_pred  = le.inverse_transform(pred)

print y_pred[:5]

[30 18 13  1  1]
['VEHICLE THEFT' 'OTHER OFFENSES' 'LARCENY/THEFT' 'ASSAULT' 'ASSAULT']


In [10]:
# Output results into DataFrame
col_list = ['ARSON',
            'ASSAULT',
            'BAD CHECKS',
            'BRIBERY',
            'BURGLARY',
            'DISORDERLY CONDUCT',
            'DRIVING UNDER THE INFLUENCE',
            'DRUG/NARCOTIC',
            'DRUNKENNESS',
            'EMBEZZLEMENT',
            'EXTORTION',
            'FAMILY OFFENSES',
            'FORGERY/COUNTERFEITING',
            'FRAUD',
            'GAMBLING',
            'KIDNAPPING',
            'LARCENY/THEFT',
            'LIQUOR LAWS',
            'LOITERING',
            'MISSING PERSON',
            'NON-CRIMINAL',
            'OTHER OFFENSES',
            'PORNOGRAPHY/OBSCENE MAT',
            'PROSTITUTION',
            'RECOVERED VEHICLE',
            'ROBBERY',
            'RUNAWAY',
            'SECONDARY CODES',
            'SEX OFFENSES FORCIBLE',
            'SEX OFFENSES NON FORCIBLE',
            'STOLEN PROPERTY',
            'SUICIDE',
            'SUSPICIOUS OCC',
            'TREA',
            'TRESPASS',
            'VANDALISM',
            'VEHICLE THEFT',
            'WARRANTS',
            'WEAPON LAWS']

df_res = pd.DataFrame(data=0,index=ind, columns=col_list, dtype='int32')
        
def output_test_results(df, y_pred):
    for i in range(len(y_pred)):
        for col in list(df.columns.values):
            if y_pred[i] == col:
                df[col].loc[i] = 1
    return df
        
df_res = output_test_results(df_res, y_pred)
display(df_res.head())

Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df_res.to_csv('submission.csv')