In [1]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import xgboost as xgb

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

# Metrics 
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score

# Model Selection & Hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from skopt import BayesSearchCV
from skopt.space  import Real, Categorical, Integer


# Clustering
from sklearn.cluster import KMeans

# Mathematical Functions
import math

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [5]:
train_df.columns.values

array(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y'], dtype=object)

In [6]:
# set show nulls to True
train_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [7]:
# 878,049 instances in training set (or recorded crime instances in SF)
# 9 columns (8 potential features + 1 label (Category))

# 2 columns with float values
# 7 objects

# 0 null values

In [8]:
#look at category column 

train_df['Category'].value_counts()

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

In [9]:
#look at PD district column
train_df['PdDistrict'].value_counts()

SOUTHERN      157182
MISSION       119908
NORTHERN      105296
BAYVIEW        89431
CENTRAL        85460
TENDERLOIN     81809
INGLESIDE      78845
TARAVAL        65596
PARK           49313
RICHMOND       45209
Name: PdDistrict, dtype: int64

In [10]:
# which day of week crime was commited
train_df['DayOfWeek'].value_counts()

Friday       133734
Wednesday    129211
Saturday     126810
Thursday     125038
Tuesday      124965
Monday       121584
Sunday       116707
Name: DayOfWeek, dtype: int64

In [11]:
## Count number of observations for Resolution feature
train_df['Resolution'].value_counts()

NONE                                      526790
ARREST, BOOKED                            206403
ARREST, CITED                              77004
LOCATED                                    17101
PSYCHOPATHIC CASE                          14534
UNFOUNDED                                   9585
JUVENILE BOOKED                             5564
COMPLAINANT REFUSES TO PROSECUTE            3976
DISTRICT ATTORNEY REFUSES TO PROSECUTE      3934
NOT PROSECUTED                              3714
JUVENILE CITED                              3332
PROSECUTED BY OUTSIDE AGENCY                2504
EXCEPTIONAL CLEARANCE                       1530
JUVENILE ADMONISHED                         1455
JUVENILE DIVERTED                            355
CLEARED-CONTACT JUVENILE FOR MORE INFO       217
PROSECUTED FOR LESSER OFFENSE                 51
Name: Resolution, dtype: int64

In [12]:
#looking at latitudes and longitudes
train_df[['X','Y']].describe()

Unnamed: 0,X,Y
count,878049.0,878049.0
mean,-122.422616,37.77102
std,0.030354,0.456893
min,-122.513642,37.707879
25%,-122.432952,37.752427
50%,-122.41642,37.775421
75%,-122.406959,37.784369
max,-120.5,90.0


#  Data preprocessig


In [13]:
"""Data cleaning
> imputation or removal of outlier values
> Feature Engineering (Feature Creation)
> Feature Encoding
> Integer encode or label encode ordinal categorical features that maintain order (Year, Business Quarter, Block/Street Number)
Usually:
> One hot encode nominal categorical features (DayOfWeek, PdDistrict, StreetType, Category)
  mainly for logistic regression
> However, Random Forests & Boosting algorithms can handle nominal categorical features directly, so we just integer encode these features"""

'Data cleaning\n> imputation or removal of outlier values\n> Feature Engineering (Feature Creation)\n> Feature Encoding\n> Integer encode or label encode ordinal categorical features that maintain order (Year, Business Quarter, Block/Street Number)\nUsually:\n> One hot encode nominal categorical features (DayOfWeek, PdDistrict, StreetType, Category)\n  mainly for logistic regression\n> However, Random Forests & Boosting algorithms can handle nominal categorical features directly, so we just integer encode these features'

In [14]:
#data cleaning
#data removal
# data imputer

In [15]:
train_df[train_df['Y'] == train_df['Y'].max()]

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
660485,2005-12-30 17:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Friday,TENDERLOIN,NONE,5THSTNORTH ST / OFARRELL ST,-120.5,90.0
660711,2005-12-30 00:34:00,ASSAULT,INFLICT INJURY ON COHABITEE,Friday,BAYVIEW,"ARREST, BOOKED",JAMESLICKFREEWAY HY / SILVER AV,-120.5,90.0
660712,2005-12-30 00:34:00,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Friday,BAYVIEW,"ARREST, BOOKED",JAMESLICKFREEWAY HY / SILVER AV,-120.5,90.0
661106,2005-12-29 00:07:00,NON-CRIMINAL,"AIDED CASE, MENTAL DISTURBED",Thursday,TENDERLOIN,PSYCHOPATHIC CASE,5THSTNORTH ST / EDDY ST,-120.5,90.0
666430,2005-11-30 11:25:00,OTHER OFFENSES,TRAFFIC VIOLATION,Wednesday,TENDERLOIN,"ARREST, CITED",5THSTNORTH ST / ELLIS ST,-120.5,90.0
...,...,...,...,...,...,...,...,...,...
844995,2003-06-11 08:49:00,OTHER OFFENSES,"DRIVERS LICENSE, SUSPENDED OR REVOKED",Wednesday,INGLESIDE,"ARREST, CITED",JAMES LICK FREEWAY HY / CESAR CHAVEZ ST,-120.5,90.0
845842,2003-06-09 09:25:00,OTHER OFFENSES,"DRIVERS LICENSE, SUSPENDED OR REVOKED",Monday,INGLESIDE,"ARREST, CITED",JAMES LICK FREEWAY HY / CESAR CHAVEZ ST,-120.5,90.0
852880,2003-05-02 01:00:00,SEX OFFENSES FORCIBLE,"FORCIBLE RAPE, BODILY FORCE",Friday,SOUTHERN,COMPLAINANT REFUSES TO PROSECUTE,3RD ST / JAMES LICK FREEWAY HY,-120.5,90.0
857248,2003-04-14 16:30:00,ROBBERY,"ROBBERY ON THE STREET, STRONGARM",Monday,BAYVIEW,COMPLAINANT REFUSES TO PROSECUTE,GILMAN AV / FITCH ST,-120.5,90.0


In [16]:
train_df

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607


I notice that there are 108 rows with incorrect coordinates, and they seem to be the exact same two coordinates (90, -120.5). There are many ways to handle this. We need to do data imputation, which can be done several ways. For now, I will randomly sample from a normal distribution with the range of a standard deviation from the mean. However, I could use a linear regression model to predict the latitude and longitude values (based on other variables such as PD district?) and use that to impute the bad / inconsistent data points.

Another method is to completely remove this data. Since I already have a lot of data, and I do not want this incorrect data to affect my results, I could remove them. However, I will stick with data imputation.



In [17]:
train_df['Y'].replace(to_replace = train_df['Y'].max(),value = np.nan , inplace = True)
train_df['X'].replace(to_replace = train_df['X'].max(),value = np.nan , inplace = True)
test_df['Y'].replace(to_replace = test_df['Y'].max(),value = np.nan , inplace = True)
test_df['X'].replace(to_replace = test_df['X'].max(),value = np.nan , inplace = True)

In [18]:
train_df.isnull().sum()

Dates          0
Category       0
Descript       0
DayOfWeek      0
PdDistrict     0
Resolution     0
Address        0
X             67
Y             67
dtype: int64

In [19]:
test_df.isnull().sum()

Id             0
Dates          0
DayOfWeek      0
PdDistrict     0
Address        0
X             76
Y             76
dtype: int64

In [20]:
data = [train_df,test_df]
for dataset in data:
    mean_X = dataset["X"].mean()
    mean_Y =  dataset["Y"].mean()
    std_X  = dataset["X"].std()
    std_Y  = dataset["Y"].std()
    max_X  =  mean_X  + std_X
    min_X =  mean_X  - std_X
    max_Y  = mean_Y + std_Y
    min_Y  =  mean_Y - std_Y
    
    is_null = dataset['Y'].isnull().sum()
    random_X = (max_X - min_X) * np.random.randn(is_null) + min_X
    random_Y = (max_Y - min_Y) * np.random.randn(is_null) + min_Y
    
    X_slice = dataset['X'].copy()
    Y_slice = dataset['Y'].copy()
    X_slice[np.isnan(X_slice)] = random_X
    Y_slice[np.isnan(Y_slice)] = random_Y
    dataset['X'] = X_slice
    dataset['Y'] = Y_slice
    

In [21]:
train_df[["X","Y"]].describe()

Unnamed: 0,X,Y
count,878049.0,878049.0
mean,-122.422764,37.767033
std,0.025287,0.024168
min,-122.577784,37.604629
25%,-122.432952,37.752427
50%,-122.416422,37.775421
75%,-122.406959,37.784368
max,-122.344743,37.852808


# feature engineering

> Lets create some features from the data that exists in the current feature space
> There are couple categories of features 
 . Temporial features
 . spatial features

# Temporal features

We want to have a column for Time, so we must parse through the 'Dates' feature to create the 'Time' feature

In [22]:
# transform it into python datetime
train_df['Dates'] = pd.to_datetime(train_df["Dates"],format = "%Y-%m-%d %H:%M:%S")
test_df["Dates"] = pd.to_datetime(test_df["Dates"], format="%Y-%m-%d %H:%M:%S")

In [23]:
# make minute column
train_df['Minute']  = train_df["Dates"].map(lambda x : x.minute)
test_df['Minute']   = test_df["Dates"].map(lambda x: x.minute)

In [24]:
# make hour column
train_df['Hour']  = train_df["Dates"].map(lambda x : x.hour)
test_df['Hour']   = test_df["Dates"].map(lambda x: x.hour)

In [25]:
# make day columns
train_df["Day"] =  train_df["Dates"].map(lambda x:x.day)
test_df["Day"] = test_df["Dates"].map(lambda x: x.day)

In [26]:
# make month column
train_df["Month"] = train_df["Dates"].map(lambda x: x.month)
test_df["Month"] = test_df["Dates"].map(lambda x: x.month)

In [27]:
# Year
train_df["Year"] = train_df["Dates"].map(lambda x: x.year)
test_df["Year"] = test_df["Dates"].map(lambda x: x.year)

In [28]:
# Hour Zone 0 - Pass midnight, 1 - morning, 2 - afternoon, 3 - dinner / sun set, 4 - night
def get_hour_zone(hour):
    if hour >= 2 and hour < 8: 
        return 0
    elif hour >= 8 and hour < 12: 
        return 1
    elif hour >= 12 and hour < 18: 
        return 2
    elif hour >= 18 and hour < 22: 
        return 3
    elif hour < 2 or hour >= 22: 
        return 4
    
train_df["Hour_Zone"] = train_df["Hour"].map(get_hour_zone)
test_df["Hour_Zone"] = test_df["Hour"].map(get_hour_zone)

In [29]:
#Add week of the year
train_df["WeekOfYear"] = train_df["Dates"].map(lambda x: int(x.weekofyear / 2) - 1)
test_df["WeekOfYear"] = test_df["Dates"].map(lambda x: int(x.weekofyear / 2))

print(sorted(train_df['WeekOfYear'].unique()))
print(sorted(test_df['WeekOfYear'].unique()))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]


In [30]:
train_df.head(10)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Minute,Hour,Day,Month,Year,Hour_Zone,WeekOfYear
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,53,23,13,5,2015,4,9
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,53,23,13,5,2015,4,9
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,33,23,13,5,2015,4,9
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,30,23,13,5,2015,4,9
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,30,23,13,5,2015,4,9
5,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Wednesday,INGLESIDE,NONE,0 Block of TEDDY AV,-122.403252,37.713431,30,23,13,5,2015,4,9
6,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138,30,23,13,5,2015,4,9
7,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564,30,23,13,5,2015,4,9
8,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,RICHMOND,NONE,600 Block of 47TH AV,-122.508194,37.776601,0,23,13,5,2015,4,9
9,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,CENTRAL,NONE,JEFFERSON ST / LEAVENWORTH ST,-122.419088,37.807802,0,23,13,5,2015,4,9


# holiday feature
certain crime may be more apparent on holidays

In [31]:
# google this 
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# Training set
cal = calendar()
holidays = cal.holidays(start=train_df['Dates'].min(), end=train_df['Dates'].max())
train_df['Holiday'] = train_df['Dates'].dt.date.astype('datetime64').isin(holidays)

In [32]:
# Test set
cal = calendar()
holidays = cal.holidays(start=test_df['Dates'].min(), end=test_df['Dates'].max())
test_df['Holiday'] = test_df['Dates'].dt.date.astype('datetime64').isin(holidays)

In [33]:
len(train_df[train_df["Holiday"] == True])


25653

In [34]:
len(test_df[test_df['Holiday'] == True])

18316

In [35]:
train_df.head(10)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Minute,Hour,Day,Month,Year,Hour_Zone,WeekOfYear,Holiday
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,53,23,13,5,2015,4,9,False
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,53,23,13,5,2015,4,9,False
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,33,23,13,5,2015,4,9,False
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,30,23,13,5,2015,4,9,False
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,30,23,13,5,2015,4,9,False
5,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Wednesday,INGLESIDE,NONE,0 Block of TEDDY AV,-122.403252,37.713431,30,23,13,5,2015,4,9,False
6,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138,30,23,13,5,2015,4,9,False
7,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564,30,23,13,5,2015,4,9,False
8,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,RICHMOND,NONE,600 Block of 47TH AV,-122.508194,37.776601,0,23,13,5,2015,4,9,False
9,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,CENTRAL,NONE,JEFFERSON ST / LEAVENWORTH ST,-122.419088,37.807802,0,23,13,5,2015,4,9,False


# Businesss hour featurs

There should be an effect of business hours on the type of crime committed
Let's create a binary feature where
1 is typical business hours [8:00AM - 6:00PM]
0 is not business hours [6:01PM - 7:59 AM]

In [36]:
from datetime import datetime, time
def time_in_range(start, end , x):
    """Returns true if x is in the inclusive range [start,end]"""
    if start <= end:
        return start <= x <= end
    else :
        return start <= x or x <= end

    
    
def map_business_hours(date):
    # convert 24 hrs time in am and pm
    time_parsed = date.time()
    business_start  = time(8,0,0)
    business_end =  time(18,0,0)
    
    if time_in_range(business_start , business_end, time_parsed):
        return 1
    else:
        return 0


train_df['BusinessHour'] = train_df['Dates'].map(map_business_hours).astype('uint8')
test_df['BusinessHour'] = test_df['Dates'].map(map_business_hours).astype('uint8')   
    
    

In [37]:
train_df["BusinessHour"].value_counts()

1    455215
0    422834
Name: BusinessHour, dtype: int64

In [38]:
train_df.head(5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Minute,Hour,Day,Month,Year,Hour_Zone,WeekOfYear,Holiday,BusinessHour
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,53,23,13,5,2015,4,9,False,0
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,53,23,13,5,2015,4,9,False,0
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,33,23,13,5,2015,4,9,False,0
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,30,23,13,5,2015,4,9,False,0
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,30,23,13,5,2015,4,9,False,0


# season features
1  = winter , 2 = winter , 3 = summer , 4 = Fall

In [39]:
train_df['Season']  = (train_df['Month'] %12 +3 )//3
test_df['Season']  = (test_df['Month'] %12 +3 )//3

In [40]:
train_df.tail()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Minute,Hour,Day,Month,Year,Hour_Zone,WeekOfYear,Holiday,BusinessHour,Season
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,15,0,6,1,2003,4,0,False,0,1
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,1,0,6,1,2003,4,0,False,0,1
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.40339,37.780266,1,0,6,1,2003,4,0,False,0,1
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,1,0,6,1,2003,4,0,False,0,1
878048,2003-01-06 00:01:00,FORGERY/COUNTERFEITING,"CHECKS, FORGERY (FELONY)",Monday,BAYVIEW,NONE,1800 Block of NEWCOMB AV,-122.394926,37.738212,1,0,6,1,2003,4,0,False,0,1


# weekends
weekend may have an affect on what kind of crime are commited
weekday  = 0 ,  weekend   =1

In [41]:
days = {'Monday':0 ,'Tuesday':0 ,'Wednesday':0 ,'Thursday':0 ,'Friday':0, 'Saturday':1 ,'Sunday':1}
train_df["Weekend"]  = train_df["DayOfWeek"].replace(days)

test_df['Weekend'] = test_df['DayOfWeek'].replace(days)

In [42]:
train_df['Weekend'].value_counts()

0    634532
1    243517
Name: Weekend, dtype: int64

# street type 
street type can have an effect on crime that happend

In [43]:
train_df['Address'].value_counts().index

Index(['800 Block of BRYANT ST', '800 Block of MARKET ST',
       '2000 Block of MISSION ST', '1000 Block of POTRERO AV',
       '900 Block of MARKET ST', '0 Block of TURK ST', '0 Block of 6TH ST',
       '300 Block of ELLIS ST', '400 Block of ELLIS ST',
       '16TH ST / MISSION ST',
       ...
       '5TH AV / CALIFORNIA ST', 'HAZELWOOD AV / YERBABUENA AV',
       '0 Block of TROY AL', '900 Block of MARTIN LUTHER KING JR DR',
       'CLIPPER ST / PORTOLA DR', 'PRECITA AV / CESAR CHAVEZ ST',
       '0 Block of COLUSA PL', '2800 Block of KEITH ST',
       'CABRILLO ST / ARGUELLO BL', 'OFARRELL ST / CYRIL MAGNIN ST'],
      dtype='object', length=23228)

In [44]:
import re

    
def find_streets(address):
    street_types = ['AV', 'ST', 'CT', 'PZ', 'LN', 'DR', 'PL', 'HY', 
                    'FY', 'WY', 'TR', 'RD', 'BL', 'WAY', 'CR', 'AL', 'I-80',  
                    'RW', 'WK','EL CAMINO DEL MAR']
    street_pattern = '|'.join(street_types)
    streets = re.findall(street_pattern, address)
    if len(streets) == 0:
        # Debug
#         print(address)
        return 'OTHER'
    elif len(streets) == 1:
        return streets[0]
    else:
#         print(address)
        return 'INT'


train_df['StreetType']  =train_df['Address'].map(find_streets)
test_df['StreetType'] = test_df['Address'].map(find_streets)


In [45]:
train_df['StreetType'].value_counts()

INT                  389995
ST                   358797
AV                    92467
BL                    13074
DR                     8200
WY                     4063
RD                     2384
PZ                     2347
CT                     2059
LN                     1356
PL                      863
HY                      819
TR                      766
I-80                    322
CR                      291
AL                      150
WAY                      55
EL CAMINO DEL MAR        21
OTHER                    14
WK                        5
RW                        1
Name: StreetType, dtype: int64

In [46]:
# Check for null values
train_df['StreetType'].isnull().sum()

0

In [47]:
train_df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Minute,...,Day,Month,Year,Hour_Zone,WeekOfYear,Holiday,BusinessHour,Season,Weekend,StreetType
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,53,...,13,5,2015,4,9,False,0,2,0,INT
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,53,...,13,5,2015,4,9,False,0,2,0,INT
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,33,...,13,5,2015,4,9,False,0,2,0,INT
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,30,...,13,5,2015,4,9,False,0,2,0,INT
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,30,...,13,5,2015,4,9,False,0,2,0,ST


In [48]:
def find_block_number(address):
    block_num_pattern = '[0-9]+\s[Block]'
    block_num = re.search(block_num_pattern, address)
    if block_num:
#         print(address)
        num_pattern = '[0-9]+'
        block_no_pos = re.search(num_pattern, address)
        # Get integer of found regular expression
        block_no = int(block_no_pos.group())
        # Convert block number by dividing by 100 and adding 1 (0 = addresses with no block)
        block_map = (block_no // 100) + 1
#         print(block_map)
        return block_map
    else:
#         print(address)
        # 
        return 0


train_df['BlockNo'] = train_df['Address'].map(find_block_number)
test_df['BlockNo'] = test_df['Address'].map(find_block_number)

In [49]:
train_df['BlockNo'].value_counts()

0     260818
1      76325
2      51917
9      51718
3      38407
       ...  
82         7
79         5
81         4
84         4
80         3
Name: BlockNo, Length: 85, dtype: int64

# X, Y cordinates 
Normalize and scale x and y 

I use K-Means clustering to create a new feature for the longitude and latitude by grouping clusters of points based on Euclidean distances.

I also extract more spatial features from the X, Y coordinates by transforming them from the cartesian space to the polar space (Reference)

three variants of rotated Cartesian coordinates (rotated by 30, 45, 60 degree each)
Polar coordinates (i.e. the 'r' and the angle 'theta')

The approach makes some intuitive sense i.e. that having such features should help in extracting some more spatial information (than relying on the current x-y alone)


In [50]:
print(train_df['X'].nunique(), train_df["Y"].nunique())
"""StandardScaler() will normalize the features i.e. each column of X,
INDIVIDUALLY, so that each column/feature/variable will have μ = 0 and σ = 1. mean_ value =0 and std_value as 1"""
xy_scaler =  StandardScaler().fit(train_df[["X","Y"]])
train_df[["X","Y"]]  = xy_scaler.transform(train_df[["X","Y"]])
test_df[['X', 'Y']] = xy_scaler.transform(test_df[['X', 'Y']])

34309 34309


In [51]:
train_df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Minute,...,Month,Year,Hour_Zone,WeekOfYear,Holiday,BusinessHour,Season,Weekend,StreetType,BlockNo
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-0.123667,0.313049,53,...,5,2015,4,9,False,0,2,0,INT,0
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-0.123667,0.313049,53,...,5,2015,4,9,False,0,2,0,INT,0
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-0.063215,1.381244,33,...,5,2015,4,9,False,0,2,0,INT,0
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-0.167311,1.400208,30,...,5,2015,4,9,False,0,2,0,INT,16
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-0.631667,0.18654,30,...,5,2015,4,9,False,0,2,0,ST,2


In [52]:
# X-Y plane rotation and space transformation to extract more spatial information
# 2-dimensional rotation based on below functions:
# rotated x = xcos - ysin
# rotated y = xsin + ycos
# Conver from cartesian space -> polar space

cos_30 = math.cos(math.radians(30))
sin_30 = math.sin(math.radians(30))
cos_45 = math.cos(math.radians(45))
sin_45 = math.sin(math.radians(45))
cos_60 = math.cos(math.radians(60))
sin_60 = math.sin(math.radians(60))


train_df["Rot30_X"] = train_df['X'] * cos_30 - train_df['Y'] * sin_30 
train_df["Rot30_Y"] = train_df['X'] * sin_30 + train_df['Y'] * cos_30
train_df["Rot45_X"] = train_df['X'] * cos_45 - train_df['Y'] * sin_45  
train_df["Rot45_Y"] = train_df['X'] * sin_45 + train_df['Y'] * cos_45
train_df["Rot60_X"] = train_df['X'] * cos_60 - train_df['Y'] * sin_60  
train_df["Rot60_Y"] = train_df['X'] * sin_60 + train_df['Y'] * cos_60
train_df["Radius"] = np.sqrt(train_df['X'] ** 2 + train_df['Y'] ** 2)
train_df["Angle"] = np.arctan2(train_df['X'], train_df['Y'])

test_df["Rot30_X"] = test_df['X'] * cos_30 - test_df['Y'] * sin_30  
test_df["Rot30_Y"] = test_df['X'] * sin_30 + test_df['Y'] * cos_30
test_df["Rot45_X"] = test_df['X'] * cos_45 - test_df['Y'] * sin_45  
test_df["Rot45_Y"] = test_df['X'] * sin_45 + test_df['Y'] * cos_45
test_df["Rot60_X"] = test_df['X'] * cos_60 - test_df['Y'] * sin_60  
test_df["Rot60_Y"] = test_df['X'] * sin_60 + test_df['Y'] * cos_60
test_df["Radius"] = np.sqrt(test_df['X'] ** 2 + test_df['Y'] ** 2)
test_df["Angle"] = np.arctan2(test_df['X'], test_df['Y'])

In [53]:
train_df.describe()

Unnamed: 0,X,Y,Minute,Hour,Day,Month,Year,Hour_Zone,WeekOfYear,BusinessHour,...,Weekend,BlockNo,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle
count,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,...,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0
mean,6.803159e-13,2.256863e-13,20.155026,13.412655,15.570623,6.436509,2008.712046,2.21274,12.185458,0.518439,...,0.277339,7.743576,4.766802e-13,5.33989e-13,3.204876e-13,6.407877e-13,1.504479e-13,7.040754e-13,1.21928,0.278878
std,1.000001,1.000001,18.594915,6.549573,8.783005,3.428972,3.631194,1.214321,7.48294,0.49966,...,0.447685,10.233725,0.930852,1.064667,0.9196905,1.074324,0.930852,1.064667,0.716489,1.652417
min,-6.130357,-6.719908,0.0,0.0,1.0,1.0,2003.0,0.0,0.0,0.0,...,0.0,0.0,-4.850658,-6.521283,-4.273285,-5.744007,-3.404696,-5.964769,0.020063,-3.141529
25%,-0.4028553,-0.6043449,0.0,9.0,8.0,3.0,2006.0,1.0,6.0,0.0,...,0.0,0.0,-0.4145891,-0.5664325,-0.491059,-0.5402203,-0.584429,-0.4589185,0.742427,-1.065839
50%,0.2508282,0.3470658,19.0,14.0,16.0,6.0,2009.0,2.0,12.0,1.0,...,0.0,4.0,0.06283902,0.2143032,-0.09704231,0.2239784,-0.2373134,0.3123171,1.003706,0.508752
75%,0.6250505,0.7172717,33.0,19.0,23.0,9.0,2012.0,3.0,19.0,1.0,...,1.0,11.0,0.4804464,0.8071739,0.4085382,0.7867667,0.5063012,0.7512157,1.639694,1.17239
max,3.085404,3.549181,59.0,23.0,31.0,12.0,2015.0,4.0,25.0,1.0,...,1.0,84.0,3.466201,4.094484,4.077166,3.953283,5.24092,3.542673,6.864877,3.141199


In [54]:
# run KMeans separately on both the training set and test set
data = [train_df, test_df]
num_clusters = 40
for dataset in data:
    coordinates = dataset.loc[:,['Y','X']]
    kmeans = KMeans(n_clusters=num_clusters, random_state=1).fit(coordinates)
    id_labels=kmeans.labels_
#     print(kmeans.cluster_centers_)
    dataset['Cluster'] = id_labels

In [55]:
train_df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Minute,...,BlockNo,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle,Cluster
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-0.123667,0.313049,53,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-0.123667,0.313049,53,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-0.063215,1.381244,33,...,0,-0.745368,1.164585,-1.021387,0.931987,-1.2278,0.635876,1.38269,-0.045735,22
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-0.167311,1.400208,30,...,16,-0.845,1.12896,-1.108403,0.871789,-1.296271,0.555208,1.410168,-0.118927,22
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-0.631667,0.18654,30,...,2,-0.64031,-0.154286,-0.57856,-0.314753,-0.477382,-0.45377,0.658635,-1.283645,32


# Drop featureas 
we made a lot of features some have become useless

In [56]:
# Drop Address feature from both train and test set
train_df.drop(['Address'], axis=1, inplace=True)
test_df.drop(['Address'], axis=1, inplace=True)

In [57]:
# We don't need Dates column anymore
train_df.drop(['Dates'], axis=1, inplace=True)
test_df.drop(['Dates'], axis=1, inplace=True)

In [58]:
# Drop Resolution column since test set does not have this column
train_df.drop(['Resolution'], axis=1, inplace=True)

In [59]:
# Drop Descript column since test set does not have this column
train_df.drop(['Descript'], axis=1, inplace=True)


In [60]:
train_df.head(10)

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Minute,Hour,Day,Month,Year,...,BlockNo,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle,Cluster
0,WARRANTS,Wednesday,NORTHERN,-0.123667,0.313049,53,23,13,5,2015,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
1,OTHER OFFENSES,Wednesday,NORTHERN,-0.123667,0.313049,53,23,13,5,2015,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
2,OTHER OFFENSES,Wednesday,NORTHERN,-0.063215,1.381244,33,23,13,5,2015,...,0,-0.745368,1.164585,-1.021387,0.931987,-1.2278,0.635876,1.38269,-0.045735,22
3,LARCENY/THEFT,Wednesday,NORTHERN,-0.167311,1.400208,30,23,13,5,2015,...,16,-0.845,1.12896,-1.108403,0.871789,-1.296271,0.555208,1.410168,-0.118927,22
4,LARCENY/THEFT,Wednesday,PARK,-0.631667,0.18654,30,23,13,5,2015,...,2,-0.64031,-0.154286,-0.57856,-0.314753,-0.477382,-0.45377,0.658635,-1.283645,32
5,LARCENY/THEFT,Wednesday,INGLESIDE,0.771618,-2.217937,30,23,13,5,2015,...,1,1.777209,-1.534981,2.113935,-1.022702,2.306599,-0.440727,2.348327,2.806791,36
6,VEHICLE THEFT,Wednesday,INGLESIDE,-0.022244,-1.733514,30,23,13,5,2015,...,0,0.847493,-1.51239,1.210051,-1.241509,1.490145,-0.886021,1.733657,-3.128761,2
7,VEHICLE THEFT,Wednesday,BAYVIEW,2.036209,-1.633131,30,23,13,5,2015,...,0,2.579974,-0.396228,2.594615,0.285019,2.432437,0.946843,2.610223,2.246783,6
8,LARCENY/THEFT,Wednesday,RICHMOND,-3.378362,0.395914,0,23,13,5,2015,...,7,-3.123705,-1.346309,-2.668816,-2.108909,-2.032053,-2.72779,3.401482,-1.454137,8
9,LARCENY/THEFT,Wednesday,CENTRAL,0.145401,1.68691,0,23,13,5,2015,...,0,-0.717534,1.533608,-1.090011,1.29564,-1.388206,0.969376,1.693165,0.085981,13


In [61]:
train_df["PdDistrict"].value_counts().index

Index(['SOUTHERN', 'MISSION', 'NORTHERN', 'BAYVIEW', 'CENTRAL', 'TENDERLOIN',
       'INGLESIDE', 'TARAVAL', 'PARK', 'RICHMOND'],
      dtype='object')

In [62]:
pd_districts = {'SOUTHERN':0 , 'MISSION':1 , 'NORTHERN':2, 'BAYVIEW':3, 'CENTRAL':4, 'TENDERLOIN':5,
              'INGLESIDE':6, 'TARAVAL':7, 'PARK':8, 'RICHMOND':9}
train_df['PdDistrict'].replace(pd_districts,inplace = True)
test_df['PdDistrict'].replace(pd_districts, inplace=True)

In [63]:
train_df.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Minute,Hour,Day,Month,Year,...,BlockNo,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle,Cluster
0,WARRANTS,Wednesday,2,-0.123667,0.313049,53,23,13,5,2015,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
1,OTHER OFFENSES,Wednesday,2,-0.123667,0.313049,53,23,13,5,2015,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
2,OTHER OFFENSES,Wednesday,2,-0.063215,1.381244,33,23,13,5,2015,...,0,-0.745368,1.164585,-1.021387,0.931987,-1.2278,0.635876,1.38269,-0.045735,22
3,LARCENY/THEFT,Wednesday,2,-0.167311,1.400208,30,23,13,5,2015,...,16,-0.845,1.12896,-1.108403,0.871789,-1.296271,0.555208,1.410168,-0.118927,22
4,LARCENY/THEFT,Wednesday,8,-0.631667,0.18654,30,23,13,5,2015,...,2,-0.64031,-0.154286,-0.57856,-0.314753,-0.477382,-0.45377,0.658635,-1.283645,32


In [64]:
data = [train_df,test_df]
for dataset in data:
    year_le = LabelEncoder()
    year_le.fit(dataset['Year'].unique())
    print(list(year_le.classes_))
    dataset['Year']=year_le.transform(dataset['Year']) 

[2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
[2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]


In [65]:
train_df["Year"].unique()

array([12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0], dtype=int64)

In [66]:
dict(zip(year_le.classes_, year_le.transform(year_le.classes_)))

{2003: 0,
 2004: 1,
 2005: 2,
 2006: 3,
 2007: 4,
 2008: 5,
 2009: 6,
 2010: 7,
 2011: 8,
 2012: 9,
 2013: 10,
 2014: 11,
 2015: 12}

In [67]:
train_df.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Minute,Hour,Day,Month,Year,...,BlockNo,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle,Cluster
0,WARRANTS,Wednesday,2,-0.123667,0.313049,53,23,13,5,12,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
1,OTHER OFFENSES,Wednesday,2,-0.123667,0.313049,53,23,13,5,12,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
2,OTHER OFFENSES,Wednesday,2,-0.063215,1.381244,33,23,13,5,12,...,0,-0.745368,1.164585,-1.021387,0.931987,-1.2278,0.635876,1.38269,-0.045735,22
3,LARCENY/THEFT,Wednesday,2,-0.167311,1.400208,30,23,13,5,12,...,16,-0.845,1.12896,-1.108403,0.871789,-1.296271,0.555208,1.410168,-0.118927,22
4,LARCENY/THEFT,Wednesday,8,-0.631667,0.18654,30,23,13,5,12,...,2,-0.64031,-0.154286,-0.57856,-0.314753,-0.477382,-0.45377,0.658635,-1.283645,32


In [68]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 27 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Category      878049 non-null  object 
 1   DayOfWeek     878049 non-null  object 
 2   PdDistrict    878049 non-null  int64  
 3   X             878049 non-null  float64
 4   Y             878049 non-null  float64
 5   Minute        878049 non-null  int64  
 6   Hour          878049 non-null  int64  
 7   Day           878049 non-null  int64  
 8   Month         878049 non-null  int64  
 9   Year          878049 non-null  int64  
 10  Hour_Zone     878049 non-null  int64  
 11  WeekOfYear    878049 non-null  int64  
 12  Holiday       878049 non-null  bool   
 13  BusinessHour  878049 non-null  uint8  
 14  Season        878049 non-null  int64  
 15  Weekend       878049 non-null  int64  
 16  StreetType    878049 non-null  object 
 17  BlockNo       878049 non-null  int64  
 18  Rot3

# Day of week 
we are going to use sklearn's LabelEncoder to encode the categorical data to numeric

In [69]:
data = [train_df,test_df]
for dataset in data:
    dow_le = LabelEncoder()
    dow_le.fit(dataset['DayOfWeek'].unique())
    print(list(dow_le.classes_))
    dataset['DayOfWeek'] = dow_le.transform(dataset['DayOfWeek'])
    

['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']


In [70]:
train_df['DayOfWeek'].unique()

array([6, 5, 1, 3, 2, 0, 4])

In [71]:
# So we know the mapping (important)
dict(zip(dow_le.classes_, dow_le.transform(dow_le.classes_)))

{'Friday': 0,
 'Monday': 1,
 'Saturday': 2,
 'Sunday': 3,
 'Thursday': 4,
 'Tuesday': 5,
 'Wednesday': 6}

In [72]:
train_df.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Minute,Hour,Day,Month,Year,...,BlockNo,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle,Cluster
0,WARRANTS,6,2,-0.123667,0.313049,53,23,13,5,12,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
1,OTHER OFFENSES,6,2,-0.123667,0.313049,53,23,13,5,12,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
2,OTHER OFFENSES,6,2,-0.063215,1.381244,33,23,13,5,12,...,0,-0.745368,1.164585,-1.021387,0.931987,-1.2278,0.635876,1.38269,-0.045735,22
3,LARCENY/THEFT,6,2,-0.167311,1.400208,30,23,13,5,12,...,16,-0.845,1.12896,-1.108403,0.871789,-1.296271,0.555208,1.410168,-0.118927,22
4,LARCENY/THEFT,6,8,-0.631667,0.18654,30,23,13,5,12,...,2,-0.64031,-0.154286,-0.57856,-0.314753,-0.477382,-0.45377,0.658635,-1.283645,32


In [73]:
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 27 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Category      878049 non-null  object 
 1   DayOfWeek     878049 non-null  int32  
 2   PdDistrict    878049 non-null  int64  
 3   X             878049 non-null  float64
 4   Y             878049 non-null  float64
 5   Minute        878049 non-null  int64  
 6   Hour          878049 non-null  int64  
 7   Day           878049 non-null  int64  
 8   Month         878049 non-null  int64  
 9   Year          878049 non-null  int64  
 10  Hour_Zone     878049 non-null  int64  
 11  WeekOfYear    878049 non-null  int64  
 12  Holiday       878049 non-null  bool   
 13  BusinessHour  878049 non-null  uint8  
 14  Season        878049 non-null  int64  
 15  Weekend       878049 non-null  int64  
 16  StreetType    878049 non-null  object 
 17  BlockNo       878049 non-null  int64  
 18  Rot3

# street type 
we use Label Encoder here


In [74]:
data = [train_df, test_df]

for dataset in data:
    st_le = LabelEncoder()
    st_le.fit(dataset['StreetType'].unique())
    print(list(st_le.classes_))
    dataset['StreetType']=st_le.transform(dataset['StreetType'])

['AL', 'AV', 'BL', 'CR', 'CT', 'DR', 'EL CAMINO DEL MAR', 'HY', 'I-80', 'INT', 'LN', 'OTHER', 'PL', 'PZ', 'RD', 'RW', 'ST', 'TR', 'WAY', 'WK', 'WY']
['AL', 'AV', 'BL', 'CR', 'CT', 'DR', 'EL CAMINO DEL MAR', 'HY', 'I-80', 'INT', 'LN', 'OTHER', 'PL', 'PZ', 'RD', 'RW', 'ST', 'TR', 'WAY', 'WK', 'WY']


In [75]:
train_df['StreetType'].unique()

array([ 9, 16,  1, 20,  4,  5, 17,  7, 14, 13,  2, 12, 10, 18,  3,  0, 11,
       19, 15,  6,  8])

In [76]:
train_df.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Minute,Hour,Day,Month,Year,...,BlockNo,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle,Cluster
0,WARRANTS,6,2,-0.123667,0.313049,53,23,13,5,12,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
1,OTHER OFFENSES,6,2,-0.123667,0.313049,53,23,13,5,12,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
2,OTHER OFFENSES,6,2,-0.063215,1.381244,33,23,13,5,12,...,0,-0.745368,1.164585,-1.021387,0.931987,-1.2278,0.635876,1.38269,-0.045735,22
3,LARCENY/THEFT,6,2,-0.167311,1.400208,30,23,13,5,12,...,16,-0.845,1.12896,-1.108403,0.871789,-1.296271,0.555208,1.410168,-0.118927,22
4,LARCENY/THEFT,6,8,-0.631667,0.18654,30,23,13,5,12,...,2,-0.64031,-0.154286,-0.57856,-0.314753,-0.477382,-0.45377,0.658635,-1.283645,32


In [77]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 27 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Category      878049 non-null  object 
 1   DayOfWeek     878049 non-null  int32  
 2   PdDistrict    878049 non-null  int64  
 3   X             878049 non-null  float64
 4   Y             878049 non-null  float64
 5   Minute        878049 non-null  int64  
 6   Hour          878049 non-null  int64  
 7   Day           878049 non-null  int64  
 8   Month         878049 non-null  int64  
 9   Year          878049 non-null  int64  
 10  Hour_Zone     878049 non-null  int64  
 11  WeekOfYear    878049 non-null  int64  
 12  Holiday       878049 non-null  bool   
 13  BusinessHour  878049 non-null  uint8  
 14  Season        878049 non-null  int64  
 15  Weekend       878049 non-null  int64  
 16  StreetType    878049 non-null  int32  
 17  BlockNo       878049 non-null  int64  
 18  Rot3

# HOliday
false  = 0 
True  =1


In [78]:
train_df['Holiday'].replace(True,1,inplace = True)
train_df['Holiday'].replace(False, 0, inplace=False)
test_df['Holiday'].replace(True,1,inplace = True)
test_df['Holiday'].replace(False, 0, inplace=False)
train_df['Holiday'] = train_df['Holiday'].astype('uint8')
train_df['Holiday'] = train_df['Holiday'].astype('uint8')


# Category

In [79]:
data = [train_df]

for dataset in data:
    cat_le = LabelEncoder()
    cat_le.fit(dataset['Category'].unique())
    print(list(cat_le.classes_))
    dataset['Category']=cat_le.transform(dataset['Category'])

['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [80]:
len(train_df['Category'].unique())

39

In [81]:
# So we know the mapping (important)
dict(zip(cat_le.classes_, cat_le.transform(cat_le.classes_)))

{'ARSON': 0,
 'ASSAULT': 1,
 'BAD CHECKS': 2,
 'BRIBERY': 3,
 'BURGLARY': 4,
 'DISORDERLY CONDUCT': 5,
 'DRIVING UNDER THE INFLUENCE': 6,
 'DRUG/NARCOTIC': 7,
 'DRUNKENNESS': 8,
 'EMBEZZLEMENT': 9,
 'EXTORTION': 10,
 'FAMILY OFFENSES': 11,
 'FORGERY/COUNTERFEITING': 12,
 'FRAUD': 13,
 'GAMBLING': 14,
 'KIDNAPPING': 15,
 'LARCENY/THEFT': 16,
 'LIQUOR LAWS': 17,
 'LOITERING': 18,
 'MISSING PERSON': 19,
 'NON-CRIMINAL': 20,
 'OTHER OFFENSES': 21,
 'PORNOGRAPHY/OBSCENE MAT': 22,
 'PROSTITUTION': 23,
 'RECOVERED VEHICLE': 24,
 'ROBBERY': 25,
 'RUNAWAY': 26,
 'SECONDARY CODES': 27,
 'SEX OFFENSES FORCIBLE': 28,
 'SEX OFFENSES NON FORCIBLE': 29,
 'STOLEN PROPERTY': 30,
 'SUICIDE': 31,
 'SUSPICIOUS OCC': 32,
 'TREA': 33,
 'TRESPASS': 34,
 'VANDALISM': 35,
 'VEHICLE THEFT': 36,
 'WARRANTS': 37,
 'WEAPON LAWS': 38}

In [82]:
train_df.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Minute,Hour,Day,Month,Year,...,BlockNo,Rot30_X,Rot30_Y,Rot45_X,Rot45_Y,Rot60_X,Rot60_Y,Radius,Angle,Cluster
0,37,6,2,-0.123667,0.313049,53,23,13,5,12,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
1,21,6,2,-0.123667,0.313049,53,23,13,5,12,...,0,-0.263623,0.209275,-0.308805,0.133913,-0.332942,0.049426,0.33659,-0.376223,32
2,21,6,2,-0.063215,1.381244,33,23,13,5,12,...,0,-0.745368,1.164585,-1.021387,0.931987,-1.2278,0.635876,1.38269,-0.045735,22
3,16,6,2,-0.167311,1.400208,30,23,13,5,12,...,16,-0.845,1.12896,-1.108403,0.871789,-1.296271,0.555208,1.410168,-0.118927,22
4,16,6,8,-0.631667,0.18654,30,23,13,5,12,...,2,-0.64031,-0.154286,-0.57856,-0.314753,-0.477382,-0.45377,0.658635,-1.283645,32


In [83]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 27 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Category      878049 non-null  int32  
 1   DayOfWeek     878049 non-null  int32  
 2   PdDistrict    878049 non-null  int64  
 3   X             878049 non-null  float64
 4   Y             878049 non-null  float64
 5   Minute        878049 non-null  int64  
 6   Hour          878049 non-null  int64  
 7   Day           878049 non-null  int64  
 8   Month         878049 non-null  int64  
 9   Year          878049 non-null  int64  
 10  Hour_Zone     878049 non-null  int64  
 11  WeekOfYear    878049 non-null  int64  
 12  Holiday       878049 non-null  uint8  
 13  BusinessHour  878049 non-null  uint8  
 14  Season        878049 non-null  int64  
 15  Weekend       878049 non-null  int64  
 16  StreetType    878049 non-null  int32  
 17  BlockNo       878049 non-null  int64  
 18  Rot3

as we can see  all categorical values has been changed to numeric


 Convert all to 16 bit integers so less memory and will train faster (no loss in data since our integers dont reach

In [84]:
columns_to_convert = ['DayOfWeek', 'PdDistrict', 'Minute', 'Hour', 'Day', 'Month', 'Year', 
                      'Hour_Zone', 'WeekOfYear', 'Season', 'StreetType', 'BlockNo', 'Cluster']
train_df[columns_to_convert]  = train_df[columns_to_convert].astype('int16')
test_df[columns_to_convert] = test_df[columns_to_convert].astype('int16')

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 27 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Category      878049 non-null  int32  
 1   DayOfWeek     878049 non-null  int16  
 2   PdDistrict    878049 non-null  int16  
 3   X             878049 non-null  float64
 4   Y             878049 non-null  float64
 5   Minute        878049 non-null  int16  
 6   Hour          878049 non-null  int16  
 7   Day           878049 non-null  int16  
 8   Month         878049 non-null  int16  
 9   Year          878049 non-null  int16  
 10  Hour_Zone     878049 non-null  int16  
 11  WeekOfYear    878049 non-null  int16  
 12  Holiday       878049 non-null  uint8  
 13  BusinessHour  878049 non-null  uint8  
 14  Season        878049 non-null  int16  
 15  Weekend       878049 non-null  int64  
 16  StreetType    878049 non-null  int16  
 17  BlockNo       878049 non-null  int16  
 18  Rot3

 # Building Machine Learning Models
 Lets train few models
 
 Models:
    Stochastic Gradient Descent (with elastic net regularization)
    Gaussian Naive Bayes
    K Nearest Neighbors
    Logistic Regression (with L1 regularization)
    Random Forest
    XGBoost
    
   Almost all the default scikit-learn ML algorithm hyperparameters exhibit bad performance
   Researched online & read literature to determine some more ideal default hyperparameters
   
 Couple things to note:
            Decision tree models including Ensemble methods (Random Forest & XGBoost) can handle categorical variables without one-hot encoding them.
        Linear models (SGD & Logistic Regression) cannot handle categorical features & need features to be OHE before training
    Always OneHotEncode before you split data up to training/dev/test so that all features & classes will be represented

In [85]:
X_train =  train_df.drop("Category",axis = 1).copy()
Y_train = train_df["Category"].copy()

X_test = test_df.drop("Id", axis=1).copy()

In [86]:
def one_hot_encode(train_data):
    '''One Hot Encode the categorical features'''
    encoded_train_data = train_data

    encoded_train_data = pd.concat([encoded_train_data, 
                                    pd.get_dummies(pd.Series(encoded_train_data['PdDistrict']), prefix='PdDistrict')], axis=1)
    encoded_train_data = pd.concat([encoded_train_data, 
                                    pd.get_dummies(pd.Series(encoded_train_data['DayOfWeek']), prefix='DayOfWeek')], axis=1)
    encoded_train_data = pd.concat([encoded_train_data, 
                                    pd.get_dummies(pd.Series(encoded_train_data['StreetType']), prefix='StreetType')], axis=1)
    encoded_train_data = pd.concat([encoded_train_data, 
                                    pd.get_dummies(pd.Series(encoded_train_data['Season']), prefix='Season')], axis=1)
    encoded_train_data = pd.concat([encoded_train_data, 
                                    pd.get_dummies(pd.Series(encoded_train_data['Hour_Zone']), prefix='Hour_Zone')], axis=1)
    encoded_train_data = pd.concat([encoded_train_data, 
                                    pd.get_dummies(pd.Series(encoded_train_data['Cluster']), prefix='Cluster')], axis=1)
    encoded_train_data = encoded_train_data.drop(['Cluster','StreetType', 'Season', 'Hour_Zone', 'DayOfWeek', 'PdDistrict'], axis=1)

    return encoded_train_data

In [87]:
X_encoded_train = one_hot_encode(X_train)

In [88]:
# Use these for ML algorithms that can't handle categorical data (Logistic Regression, Linear Models)
mini_encoded_train_data, mini_encoded_dev_data, mini_train_labels, mini_dev_labels = train_test_split(X_encoded_train, 
                                                                                      Y_train,
                                                                                      stratify=Y_train,
                                                                                      test_size=0.5,
                                                                                      random_state=1)

In [89]:
# Use these for ML algorithms that can handle categorical data without OHE
mini_train_data, mini_dev_data, mini_train_labels, mini_dev_labels = train_test_split(X_train, 
                                                                                      Y_train,
                                                                                      stratify=Y_train,
                                                                                      test_size=0.5,
                                                                                      random_state=1)

In [90]:
# K Neighbors

"""knn = KNeighborsClassifier()
knn.fit(mini_train_data, mini_train_labels)
pred_probs = knn.predict_proba(mini_dev_data)
knn_loss = log_loss(mini_dev_labels, pred_probs)


print('KNN Validation Log Loss: ', knn_loss)"""

"knn = KNeighborsClassifier()\nknn.fit(mini_train_data, mini_train_labels)\npred_probs = knn.predict_proba(mini_dev_data)\nknn_loss = log_loss(mini_dev_labels, pred_probs)\n\n\nprint('KNN Validation Log Loss: ', knn_loss)"

In [92]:
mini_dev_data.shape

(439025, 26)

In [93]:
# Logistic Regression
logreg = LogisticRegression(penalty='l1', C=1.5, solver='saga', multi_class='multinomial', 
                            tol=0.0001, max_iter=1000, verbose=3, n_jobs=3, random_state=1)

logreg.fit(mini_encoded_train_data, mini_train_labels)
pred_probs = logreg.predict_proba(mini_encoded_dev_data)

logreg_loss = log_loss(mini_dev_labels, pred_probs)


print('Logistic Regression Validation Log Loss: ', logreg_loss)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


max_iter reached after 16440 seconds


[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed: 274.0min finished


Logistic Regression Validation Log Loss:  2.472794078896379
