In [1]:
import pandas  as pd
import datetime
import time

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df.shape

(878049, 9)

In [4]:
train_df = train_df[abs(train_df['Y'])< 90]

In [5]:
train_df.shape

(877982, 9)

In [6]:
train_df['Dates'] = pd.to_datetime(train_df['Dates'])
test_df['Dates'] = pd.to_datetime(test_df['Dates'])

In [7]:
train_df.head()
test_df.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [8]:
train_df['IsDay'] = 0
train_df.loc[(train_df['Dates'].dt.hour > 6) & (train_df['Dates'].dt.hour < 20) , 'IsDay'] = 1


test_df['IsDay'] = 0
test_df.loc[(test_df['Dates'].dt.hour > 6) & (test_df['Dates'].dt.hour < 20) , 'IsDay'] = 1

In [9]:
hour_of_day_train = train_df['Dates'].dt.hour


hour_of_day_test = test_df['Dates'].dt.hour

In [10]:
month_train = pd.get_dummies(train_df['Dates'].dt.month, drop_first = True)

month_test = pd.get_dummies(test_df['Dates'].dt.month, drop_first = True)

In [11]:
year_train = pd.get_dummies(train_df['Dates'].dt.year, drop_first = True)

year_test = pd.get_dummies(test_df['Dates'].dt.year, drop_first = True)

In [12]:
day_of_week_train = pd.get_dummies(train_df['DayOfWeek'], drop_first = True)

day_of_week_test = pd.get_dummies(test_df['DayOfWeek'], drop_first = True)

In [13]:
police_district_train = pd.get_dummies(train_df['PdDistrict'] , drop_first = True)

police_district_test = pd.get_dummies(test_df['PdDistrict'] , drop_first = True)

In [14]:
# is the address an intersection
train_df['Intersection'] = 1
train_df.loc[train_df['Address'].str.contains('Block'), 'Intersection'] = 0

test_df['Intersection'] = 1
test_df.loc[test_df['Address'].str.contains('Block'), 'Intersection'] = 0

In [15]:
X_train = pd.DataFrame(columns = ['hour','long', 'lat','isday', 'intersection'])

X_test = pd.DataFrame(columns = ['hour','long', 'lat','isday', 'intersection'])

In [16]:
X_train['hour'] = hour_of_day_train
X_train['long'] = train_df['X']
X_train['lat'] = train_df['Y']
X_train['isday'] =  train_df['IsDay']
X_train['intersection'] = train_df['Intersection']


X_test['hour'] = hour_of_day_test
X_test['long'] = test_df['X']
X_test['lat'] = test_df['Y']
X_test['isday'] =  test_df['IsDay']
X_test['intersection'] = test_df['Intersection']

In [17]:
X_train.head()

X_test.head()

Unnamed: 0,hour,long,lat,isday,intersection
0,23,-122.399588,37.735051,0,0
1,23,-122.391523,37.732432,0,1
2,23,-122.426002,37.792212,0,0
3,23,-122.437394,37.721412,0,0
4,23,-122.437394,37.721412,0,0


In [18]:
X_train = pd.concat([X_train , day_of_week_train, month_train, year_train, police_district_train], axis = 1)

X_test = pd.concat([X_test , day_of_week_test, month_test, year_test, police_district_test], axis = 1)

In [19]:
X_train.head()

X_test.head()

Unnamed: 0,hour,long,lat,isday,intersection,Monday,Saturday,Sunday,Thursday,Tuesday,...,2015,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,23,-122.399588,37.735051,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,23,-122.391523,37.732432,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,23,-122.426002,37.792212,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
3,23,-122.437394,37.721412,0,0,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,0
4,23,-122.437394,37.721412,0,0,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,0


In [20]:
X_train.shape

X_test.shape

(884262, 43)

In [21]:
from sklearn.preprocessing import  LabelEncoder

category_le = LabelEncoder()
y_train = category_le.fit_transform(train_df['Category'])

In [22]:
y_train

array([37, 21, 21, ..., 16, 35, 12])

In [23]:
#XgBoost training
import xgboost as xgb

In [24]:
train_xgb = xgb.DMatrix(X_train, label = y_train)

test_xgb = xgb.DMatrix(X_test)

In [25]:
params = {
    'max_depth': 4,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 39,
}

In [26]:
cv = xgb.cv(params , train_xgb,nfold = 3, early_stopping_rounds = 10, metrics = 'mlogloss', verbose_eval= True)

[0]	train-mlogloss:3.1441+0.0016515	test-mlogloss:3.14536+0.000493667
[1]	train-mlogloss:2.9649+0.00301572	test-mlogloss:2.96666+0.0019943
[2]	train-mlogloss:2.8473+0.00234002	test-mlogloss:2.84951+0.00116325
[3]	train-mlogloss:2.7622+0.00278796	test-mlogloss:2.76474+0.00132823
[4]	train-mlogloss:2.69858+0.00171388	test-mlogloss:2.70167+0.000741629
[5]	train-mlogloss:2.64952+0.0013369	test-mlogloss:2.65308+0.000751545
[6]	train-mlogloss:2.61142+0.000869656	test-mlogloss:2.61541+0.000815559
[7]	train-mlogloss:2.58079+0.0011718	test-mlogloss:2.58517+0.00077722
[8]	train-mlogloss:2.55541+0.00119387	test-mlogloss:2.56014+0.000451995
[9]	train-mlogloss:2.53463+0.00124977	test-mlogloss:2.53971+0.000492118


In [27]:
cv

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,3.144105,0.001651,3.145363,0.000494
1,2.964899,0.003016,2.96666,0.001994
2,2.847303,0.00234,2.849511,0.001163
3,2.762204,0.002788,2.764742,0.001328
4,2.698577,0.001714,2.701666,0.000742
5,2.649517,0.001337,2.65308,0.000752
6,2.611415,0.00087,2.615412,0.000816
7,2.580793,0.001172,2.585175,0.000777
8,2.555414,0.001194,2.560135,0.000452
9,2.534626,0.00125,2.539711,0.000492


In [28]:
model = xgb.train(params,train_xgb,10)

In [29]:
model

<xgboost.core.Booster at 0x29496c06dd8>

In [31]:
train_pred = model.predict(xgb.DMatrix(X_train))

In [32]:
test_pred = model.predict(test_xgb)

In [51]:
cols = category_le.inverse_transform(y_train)

In [55]:
category_le.classes_

array(['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype=object)

In [56]:
train_df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,IsDay,Intersection
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,0,1
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,0,1
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,0,1
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,0,0
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,0,0


In [58]:
final_df = pd.DataFrame(columns = category_le.classes_ , data = test_pred)

In [60]:
final_df.insert(0,'Id', test_df['Id'])

In [61]:
final_df.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.006394,0.122599,0.005104,0.005457,0.047023,0.006659,0.006757,0.025971,0.007343,...,0.005181,0.007806,0.005641,0.034098,0.00501,0.010086,0.072387,0.08622,0.037818,0.017941
1,1,0.005059,0.059858,0.004227,0.004552,0.005016,0.005555,0.0075,0.071687,0.0062,...,0.004322,0.006512,0.004217,0.022881,0.004179,0.004903,0.039192,0.055835,0.047279,0.016869
2,2,0.00497,0.06189,0.003967,0.004011,0.087556,0.005176,0.005252,0.013211,0.007945,...,0.003953,0.007695,0.004288,0.025625,0.003894,0.009882,0.057149,0.055947,0.01897,0.006557
3,3,0.006673,0.106737,0.005046,0.005395,0.040559,0.006584,0.006681,0.02426,0.00708,...,0.005122,0.007717,0.005577,0.032485,0.004953,0.008999,0.073266,0.126713,0.026852,0.01544
4,4,0.006673,0.106737,0.005046,0.005395,0.040559,0.006584,0.006681,0.02426,0.00708,...,0.005122,0.007717,0.005577,0.032485,0.004953,0.008999,0.073266,0.126713,0.026852,0.01544


In [62]:
final_df.shape

(884262, 40)

In [63]:
final_df.to_csv('submission_v2.csv', index = False)