In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss


In [2]:
train = pd.read_csv("C:\\Users\\IRC-240\\Desktop\\LHL Bootcamo\\sf-crime\\test.csv\\test.csv")
test = pd.read_csv("C:\\Users\\IRC-240\\Desktop\\LHL Bootcamo\\sf-crime\\train.csv\\train.csv")

In [3]:
train.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [4]:
test.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [6]:
#combine both dfs and drop columns that are not in boths dfs, and drop address column
combined_df =  pd.concat([test.drop(['Category', 'Descript', 'Resolution', 'Address'], axis=1), train.drop(['Id', 'Address'], axis=1)], axis=0)

combined_df.head()

Unnamed: 0,Dates,DayOfWeek,PdDistrict,X,Y
0,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,Wednesday,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,Wednesday,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,Wednesday,PARK,-122.438738,37.771541


In [7]:
#make a list of a crime categories 
crime_list = list(test['Category'].unique())

crime_list

['WARRANTS',
 'OTHER OFFENSES',
 'LARCENY/THEFT',
 'VEHICLE THEFT',
 'VANDALISM',
 'NON-CRIMINAL',
 'ROBBERY',
 'ASSAULT',
 'WEAPON LAWS',
 'BURGLARY',
 'SUSPICIOUS OCC',
 'DRUNKENNESS',
 'FORGERY/COUNTERFEITING',
 'DRUG/NARCOTIC',
 'STOLEN PROPERTY',
 'SECONDARY CODES',
 'TRESPASS',
 'MISSING PERSON',
 'FRAUD',
 'KIDNAPPING',
 'RUNAWAY',
 'DRIVING UNDER THE INFLUENCE',
 'SEX OFFENSES FORCIBLE',
 'PROSTITUTION',
 'DISORDERLY CONDUCT',
 'ARSON',
 'FAMILY OFFENSES',
 'LIQUOR LAWS',
 'BRIBERY',
 'EMBEZZLEMENT',
 'SUICIDE',
 'LOITERING',
 'SEX OFFENSES NON FORCIBLE',
 'EXTORTION',
 'GAMBLING',
 'BAD CHECKS',
 'TREA',
 'RECOVERED VEHICLE',
 'PORNOGRAPHY/OBSCENE MAT']

In [8]:
#use label encoder on crime category 
encoder = LabelEncoder()
y_train = encoder.fit_transform(test['Category'])

In [9]:
# feature engineer dates and times to use in model first convert to date time and then create columns year month day week and hour 

combined_df['Dates'] = pd.to_datetime(combined_df['Dates'])


combined_df['Year'] = combined_df['Dates'].dt.year
combined_df['Month'] = combined_df['Dates'].dt.month
combined_df['Day'] = combined_df['Dates'].dt.day
combined_df['Weekday'] = combined_df['Dates'].dt.weekday
combined_df['Hour'] = combined_df['Dates'].dt.hour

In [10]:
#use one hot encoder to encode policat department districts and drop non numeric columns 
combined_df= pd.concat([combined_df, pd.get_dummies(combined_df['PdDistrict'])], axis=1)



In [11]:
combined_df.drop(['Dates', 'DayOfWeek', 'PdDistrict'], axis=1, inplace=True)

combined_df

Unnamed: 0,X,Y,Year,Month,Day,Weekday,Hour,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,-122.425892,37.774599,2015,5,13,2,23,False,False,False,False,True,False,False,False,False,False
1,-122.425892,37.774599,2015,5,13,2,23,False,False,False,False,True,False,False,False,False,False
2,-122.424363,37.800414,2015,5,13,2,23,False,False,False,False,True,False,False,False,False,False
3,-122.426995,37.800873,2015,5,13,2,23,False,False,False,False,True,False,False,False,False,False
4,-122.438738,37.771541,2015,5,13,2,23,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,-122.408983,37.751987,2003,1,1,2,0,False,False,False,True,False,False,False,False,False,False
884258,-122.425342,37.792681,2003,1,1,2,0,False,False,False,False,True,False,False,False,False,False
884259,-122.445418,37.712075,2003,1,1,2,0,False,False,True,False,False,False,False,False,False,False
884260,-122.387394,37.739479,2003,1,1,2,0,True,False,False,False,False,False,False,False,False,False


In [12]:
#split the test and train data back into their seperate dataframes 

test_formatted = combined_df.iloc[:len(train)]
train_formatted = combined_df.iloc[len(train):]


In [14]:
# use xgb classification 
model = xgb.XGBClassifier(objective= 'multi:softprob', num_class=len(crime_list))
model.fit(train_formatted, y_train)


In [16]:
# predict using the test data 
y_predict = model.predict_proba(test_formatted)
y_predict_df = pd.DataFrame(y_predict, columns=crime_list)


In [17]:
y_predict_df

Unnamed: 0,WARRANTS,OTHER OFFENSES,LARCENY/THEFT,VEHICLE THEFT,VANDALISM,NON-CRIMINAL,ROBBERY,ASSAULT,WEAPON LAWS,BURGLARY,...,EMBEZZLEMENT,SUICIDE,LOITERING,SEX OFFENSES NON FORCIBLE,EXTORTION,GAMBLING,BAD CHECKS,TREA,RECOVERED VEHICLE,PORNOGRAPHY/OBSCENE MAT
0,0.003084,0.075740,0.000084,0.000098,0.038578,0.001812,0.000921,0.027104,0.006359,0.000249,...,0.000040,0.005824,0.000392,0.031619,0.000002,0.005941,0.048155,0.068504,0.047413,0.009732
1,0.003084,0.075740,0.000084,0.000098,0.038578,0.001812,0.000921,0.027104,0.006359,0.000249,...,0.000040,0.005824,0.000392,0.031619,0.000002,0.005941,0.048155,0.068504,0.047413,0.009732
2,0.001100,0.071392,0.000139,0.000182,0.036074,0.000288,0.000717,0.042638,0.004463,0.000019,...,0.000065,0.023504,0.000888,0.021702,0.000002,0.006010,0.039790,0.098576,0.086553,0.006211
3,0.002112,0.075227,0.000164,0.000181,0.043198,0.000286,0.000782,0.032120,0.004290,0.000026,...,0.000084,0.013879,0.000155,0.024963,0.000002,0.007048,0.042347,0.074916,0.102662,0.007173
4,0.002614,0.064620,0.000101,0.000239,0.054807,0.000730,0.002601,0.031302,0.006009,0.000284,...,0.000020,0.007574,0.000260,0.030864,0.000001,0.009713,0.049434,0.057253,0.042165,0.008998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,0.003596,0.054720,0.000055,0.001488,0.038824,0.000978,0.003028,0.040564,0.002784,0.000296,...,0.000295,0.007270,0.000161,0.033339,0.000007,0.008253,0.047883,0.074968,0.035242,0.006636
884258,0.000993,0.048276,0.000027,0.000689,0.032700,0.000393,0.000547,0.056761,0.000811,0.000225,...,0.000156,0.008805,0.000077,0.037090,0.000005,0.011649,0.070632,0.042581,0.032863,0.019795
884259,0.001282,0.074561,0.000015,0.000718,0.030251,0.002050,0.001047,0.032569,0.002797,0.000189,...,0.000045,0.007030,0.000197,0.034857,0.000003,0.009786,0.069592,0.066329,0.040958,0.008352
884260,0.003059,0.063482,0.000013,0.000160,0.029177,0.001393,0.000969,0.041766,0.003107,0.000359,...,0.000145,0.009398,0.000097,0.039589,0.000012,0.010102,0.048299,0.055250,0.031017,0.010399


In [19]:
#combine ID with the model data 

output = pd.concat([train[['Id']], y_predict_df], axis=1)

In [20]:
#export to csv 
output.to_csv('probability for each class of crime', index=False)

