In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data Processing

In [14]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [15]:
# Show 5 random rows from dataset
train_df.sample(5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
2451,2015-04-30 15:38:00,PROSTITUTION,LOITERING FOR PURPOSE OF PROSTITUTION,Thursday,NORTHERN,"ARREST, BOOKED",1500 Block of VANNESS AV,-122.422063,37.78992
715889,2005-03-23 21:30:00,BURGLARY,"BURGLARY, FORCIBLE ENTRY",Wednesday,PARK,NONE,500 Block of DIVISADERO ST,-122.437823,37.774522
642712,2006-04-04 00:39:00,OTHER OFFENSES,"DRIVERS LICENSE, SUSPENDED OR REVOKED",Tuesday,CENTRAL,"ARREST, CITED",BROADWAY ST / STOCKTON ST,-122.408623,37.797627
621784,2006-07-16 09:00:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM",Sunday,SOUTHERN,NONE,800 Block of BRYANT ST,-122.403405,37.775421
721235,2005-02-24 08:43:00,OTHER OFFENSES,"DRIVERS LICENSE, SUSPENDED OR REVOKED",Thursday,INGLESIDE,"ARREST, CITED",VIRGINIA AV / MISSION ST,-122.421489,37.743122


## Preprocessing & Feature Engineering

In [16]:
train_df = train_df.drop('Resolution', axis=1)

In [17]:
train_df.Dates.dtype

dtype('O')

### Missing Values

In [18]:
assert train_df.Dates.isnull().any() == False
assert test_df.Dates.isnull().any() == False

In [19]:
assert train_df.Dates.str.match('\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d').all() == True
assert test_df.Dates.str.match('\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d').all() == True

In [20]:
train_df['Date'] = pd.to_datetime(train_df.Dates)
test_df['Date'] = pd.to_datetime(test_df.Dates)

train_df = train_df.drop('Dates', axis=1)
test_df = test_df.drop('Dates', axis=1)
train_df.sample(1)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date
682710,LARCENY/THEFT,PETTY THEFT FROM A BUILDING,Friday,TARAVAL,1100 Block of JUNIPERO SERRA BL,-122.472712,37.717868,2005-09-09 08:29:00


In [21]:
# Confirm that it was parsed to Datetime
train_df.Date.dtype

dtype('<M8[ns]')

## Time of Day

In [22]:
train_df['IsDay'] = 0
train_df.loc[ (train_df.Date.dt.hour > 6) & (train_df.Date.dt.hour < 20), 'IsDay' ] = 1
test_df['IsDay'] = 0
test_df.loc[ (test_df.Date.dt.hour > 6) & (test_df.Date.dt.hour < 20), 'IsDay' ] = 1

train_df.sample(3)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay
703924,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Saturday,TARAVAL,38TH AV / RIVERA ST,-122.49631,37.745878,2005-05-21 00:01:00,0
275945,ASSAULT,AGGRAVATED ASSAULT WITH A KNIFE,Friday,RICHMOND,3900 Block of GEARY BL,-122.46162,37.78126,2011-08-12 01:25:00,0
129279,KIDNAPPING,FALSE IMPRISONMENT,Friday,MISSION,2800 Block of 23RD ST,-122.409742,37.754424,2013-08-23 19:09:00,1


In [23]:
days_to_int_dic = {
        'Monday': 1,
        'Tuesday': 2,
        'Wednesday': 3,
        'Thursday': 4,
        'Friday': 5,
        'Saturday': 6,
        'Sunday': 7,
}
train_df['DayOfWeek'] = train_df['DayOfWeek'].map(days_to_int_dic)
test_df ['DayOfWeek'] = test_df ['DayOfWeek'].map(days_to_int_dic)

train_df.DayOfWeek.unique()

array([3, 2, 1, 7, 6, 5, 4])

In [24]:
train_df['Hour'] = train_df.Date.dt.hour
train_df['Month'] = train_df.Date.dt.month
train_df['Year'] = train_df.Date.dt.year
train_df['Year'] = train_df['Year'] - 2000 # The Algorithm doesn't know the difference. It's just easier to work like that

test_df['Hour'] = test_df.Date.dt.hour
test_df['Month'] = test_df.Date.dt.month
test_df['Year'] = test_df.Date.dt.year
test_df['Year'] = test_df['Year'] - 2000 # The Algorithm doesn't know the difference. It's just easier to work like that

train_df.sample(1)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay,Hour,Month,Year
538194,LARCENY/THEFT,PETTY THEFT WITH PRIOR,6,SOUTHERN,700 Block of MARKET ST,-122.405359,37.785977,2007-10-06 13:10:00,1,13,10,7


In [25]:
train_df['HourCos'] = np.cos((train_df['Hour']*2*np.pi)/24 )
train_df['DayOfWeekCos'] = np.cos((train_df['DayOfWeek']*2*np.pi)/7 )
train_df['MonthCos'] = np.cos((train_df['Month']*2*np.pi)/12 )

test_df['HourCos'] = np.cos((test_df['Hour']*2*np.pi)/24 )
test_df['DayOfWeekCos'] = np.cos((test_df['DayOfWeek']*2*np.pi)/7 )
test_df['MonthCos'] = np.cos((test_df['Month']*2*np.pi)/12 )

train_df.sample(1)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay,Hour,Month,Year,HourCos,DayOfWeekCos,MonthCos
161135,FRAUD,DEFRAUDING AN INNKEEPER,6,SOUTHERN,1200 Block of MARKET ST,-122.415449,37.778294,2013-03-23 23:55:00,0,23,3,13,0.965926,0.62349,6.123234000000001e-17


In [26]:
train_df = pd.get_dummies(train_df, columns=['PdDistrict'])
test_df  = pd.get_dummies(test_df,  columns=['PdDistrict'])
train_df.sample(2)

Unnamed: 0,Category,Descript,DayOfWeek,Address,X,Y,Date,IsDay,Hour,Month,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
446062,PROSTITUTION,SOLICITS TO VISIT HOUSE OF PROSTITUTION,2,17TH ST / SHOTWELL ST,-122.41623,37.763634,2009-02-03 09:08:00,1,9,2,...,0,0,0,1,0,0,0,0,0,0
506332,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,6,1700 Block of PACIFIC AV,-122.424076,37.794765,2008-03-22 22:00:00,0,22,3,...,0,0,0,0,1,0,0,0,0,0


### Label Encoding

In [27]:
from sklearn.preprocessing import LabelEncoder

cat_le = LabelEncoder()
train_df['CategoryInt'] = pd.Series(cat_le.fit_transform(train_df.Category))
train_df.sample(5)
#cat_le.classes_

Unnamed: 0,Category,Descript,DayOfWeek,Address,X,Y,Date,IsDay,Hour,Month,...,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,CategoryInt
826582,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,6,ALABAMA ST / 20TH ST,-122.411581,37.759083,2003-09-06 23:00:00,0,23,9,...,0,0,1,0,0,0,0,0,0,16
680176,ASSAULT,BATTERY,4,600 Block of OFARRELL ST,-122.415972,37.785357,2005-09-22 18:35:00,1,18,9,...,0,0,0,0,0,0,0,0,1,1
704771,LARCENY/THEFT,ATTEMPTED PETTY THEFT OF PROPERTY,4,700 Block of JONES ST,-122.413547,37.788278,2005-05-19 06:30:00,0,6,5,...,1,0,0,0,0,0,0,0,0,16
275503,OTHER OFFENSES,PAROLE VIOLATION,6,IRVING ST / 9TH AV,-122.46631,37.764037,2011-08-13 07:00:00,1,7,8,...,0,0,0,0,0,0,0,1,0,21
102199,DRUG/NARCOTIC,POSSESSION OF HEROIN FOR SALES,1,GOLDEN GATE AV / LEAVENWORTH ST,-122.41387,37.781862,2014-01-06 12:55:00,1,12,1,...,0,0,0,0,0,0,0,0,1,7


In [28]:
train_df['InIntersection'] = 1
train_df.loc[train_df.Address.str.contains('Block'), 'InIntersection'] = 0

test_df['InIntersection'] = 1
test_df.loc[test_df.Address.str.contains('Block'), 'InIntersection'] = 0

In [29]:
train_df.sample(10)

Unnamed: 0,Category,Descript,DayOfWeek,Address,X,Y,Date,IsDay,Hour,Month,...,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,CategoryInt,InIntersection
104929,ASSAULT,THREATS AGAINST LIFE,6,14TH ST / MISSION ST,-122.419983,37.768273,2013-12-14 12:50:00,1,12,12,...,0,1,0,0,0,0,0,0,1,1
400901,OTHER OFFENSES,MISCELLANEOUS INVESTIGATION,7,700 Block of JERROLD AV,-122.373456,37.72968,2009-09-20 13:32:00,1,13,9,...,0,0,0,0,0,0,0,0,21,0
844591,DRUG/NARCOTIC,UNDER THE INFLUENCE OF CONTROLLED SUBSTANCES,4,200 Block of SHOTWELL ST,-122.416375,37.764423,2003-06-12 08:45:00,1,8,6,...,0,1,0,0,0,0,0,0,7,0
521225,NON-CRIMINAL,IMPOUNDED VEHICLE,3,GOLDEN GATE AV / HYDE ST,-122.415508,37.781654,2008-01-09 01:37:00,0,1,1,...,0,0,0,0,0,0,0,1,20,1
261970,NON-CRIMINAL,FOUND PROPERTY,1,700 Block of VALLEJO ST,-122.409792,37.798508,2011-10-31 09:20:00,1,9,10,...,0,0,0,0,0,0,0,0,20,0
802463,VEHICLE THEFT,TAMPERING WITH A VEHICLE,5,400 Block of GOETTINGEN ST,-122.406379,37.72623,2004-01-09 00:01:00,0,0,1,...,0,0,0,0,0,0,0,0,36,0
856248,NON-CRIMINAL,LOST PROPERTY,3,900 Block of KEARNY ST,-122.405284,37.796551,2003-04-16 22:00:00,0,22,4,...,0,0,0,0,0,0,0,0,20,0
779972,DRUG/NARCOTIC,POSSESSION OF HEROIN,2,400 Block of EDDY ST,-122.415067,37.783563,2004-04-27 10:10:00,1,10,4,...,0,0,0,0,0,0,0,1,7,0
716669,ASSAULT,THREATS AGAINST LIFE,2,1800 Block of JACKSON ST,-122.425519,37.793565,2005-03-22 00:01:00,0,0,3,...,0,0,1,0,0,0,0,0,1,0
485268,DRUG/NARCOTIC,POSSESSION OF NARCOTICS PARAPHERNALIA,6,BERNALHTS BL / ANDERSON ST,-122.415017,37.742184,2008-07-12 01:08:00,0,1,7,...,1,0,0,0,0,0,0,0,7,1


# Feature Selection <a name="feature-selection"></a>

In [30]:
train_df.columns

Index(['Category', 'Descript', 'DayOfWeek', 'Address', 'X', 'Y', 'Date',
       'IsDay', 'Hour', 'Month', 'Year', 'HourCos', 'DayOfWeekCos', 'MonthCos',
       'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE',
       'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK',
       'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL',
       'PdDistrict_TENDERLOIN', 'CategoryInt', 'InIntersection'],
      dtype='object')

In [31]:
feature_cols = ['X', 'Y', 'IsDay', 'DayOfWeek', 'Month', 'Hour', 'Year', 'InIntersection',
                'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE',
                'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK',
                'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN']
target_col = 'CategoryInt'

train_x = train_df[feature_cols]
train_y = train_df[target_col]

test_ids = test_df['Id']
test_x = test_df[feature_cols]

In [32]:
type(train_x), type(train_y)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

## Create Model

In [33]:
import xgboost as xgb
train_xgb = xgb.DMatrix(train_x, label=train_y)
test_xgb  = xgb.DMatrix(test_x)

## Play with the parameters and do Cross-Validation

In [34]:
params = {
    'max_depth': 4,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 39,
}

In [35]:
CROSS_VAL = False
if CROSS_VAL:
    print('Doing Cross-validation ...')
    cv = xgb.cv(params, train_xgb, nfold=3, early_stopping_rounds=10, metrics='mlogloss', verbose_eval=True)
    cv