San Francisco Crime Classification

https://www.kaggle.com/competitions/sf-crime/overview

In [191]:
#=====import libraries=====#
import pandas  as pd
import numpy   as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, LabelBinarizer
import seaborn as sns
import warnings
from xgboost import XGBClassifier

# Blanket filter: suppresses every Python warning raised after this point,
# including deprecation and convergence warnings from the libraries above.
warnings.filterwarnings("ignore")

### EDA

In [175]:
# Read the training and test CSV files from the local data/ folder.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

# Keep both raw frames together in a list as well.
data = [train_data, test_data]

In [176]:
# Preview the first five rows of the raw training data.
train_data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [177]:
# Print column dtypes, non-null counts and memory usage of the training data.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [178]:
# Cardinality check: print each column name next to its number of distinct values.
for col_name, n_unique in train_data.nunique().items():
    print(col_name, n_unique)

Dates 389257
Category 39
Descript 879
DayOfWeek 7
PdDistrict 10
Resolution 17
Address 23228
X 34243
Y 34243


In [179]:
#instantiating encoders
# Shared LabelEncoder: every fit_transform call refits it, so its classes_
# attribute only ever reflects the most recent call.
label_encoder = LabelEncoder()
# LabelBinarizer instance; NOTE(review): not referenced again in the visible cells.
binary_enc = LabelBinarizer()

In [181]:
def preprocess(df):
    """Turn a raw crime-report frame into a model-ready feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Dates', 'DayOfWeek', 'PdDistrict' and
        'Address'. 'Descript' and 'Resolution' are optional and are removed
        when present. Every other column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the pass-through columns followed by 'hour',
        'month', 'year', 'dow_enc' and one float 0.0/1.0 column per district
        named 'PdDistrict_<district>'. The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If 'DayOfWeek' contains anything other than the seven English day names.

    Notes
    -----
    One-hot columns are created only for the districts that occur in ``df``,
    so two frames with different district sets yield different columns.
    """
    # Alphabetical order gives the same integer codes a label encoder assigns
    # when all seven names are present (e.g. Sunday -> 3, Wednesday -> 6), but
    # the mapping is fixed: the codes no longer depend on which days happen to
    # occur in this particular frame, and no shared encoder state is touched.
    day_codes = {
        name: code
        for code, name in enumerate(sorted([
            'Monday', 'Tuesday', 'Wednesday', 'Thursday',
            'Friday', 'Saturday', 'Sunday',
        ]))
    }

    # Work on a copy so the caller's frame never gains extra columns.
    out = df.copy()

    # Parse the timestamp column once and derive the calendar features from it.
    timestamps = pd.to_datetime(out['Dates'])
    out['hour'] = timestamps.dt.hour
    out['month'] = timestamps.dt.month
    out['year'] = timestamps.dt.year

    # Encode the day of week as an integer; fail loudly on unknown labels
    # instead of silently producing NaN codes.
    dow = out['DayOfWeek'].map(day_codes)
    if dow.isna().any():
        unknown = sorted(set(out.loc[dow.isna(), 'DayOfWeek'].astype(str)))
        raise ValueError(f"unexpected DayOfWeek value(s): {unknown}")
    out['dow_enc'] = dow.astype('int64')

    # One-hot encode the police district as float indicator columns; the
    # result keeps the frame's index, so the column-wise concat lines up.
    district_flags = pd.get_dummies(out['PdDistrict'], prefix='PdDistrict', dtype=float)
    out = pd.concat([out, district_flags], axis=1)

    # Remove the raw columns that have been replaced by features, plus the
    # free-text address, which is not used as a feature.
    out = out.drop(columns=['Dates', 'DayOfWeek', 'PdDistrict', 'Address'])

    # 'Descript' and 'Resolution' may be absent from the frame; drop whichever
    # is present without masking unrelated errors.
    out = out.drop(columns=['Descript', 'Resolution'], errors='ignore')

    return out

In [182]:
# Run the preprocessing function on the raw training frame.
df_train = preprocess(train_data)

# Check the function was applied properly and the dataframe is in the expected format.
df_train.head()

Unnamed: 0,Category,X,Y,hour,month,year,dow_enc,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,WARRANTS,-122.425892,37.774599,23,5,2015,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,OTHER OFFENSES,-122.425892,37.774599,23,5,2015,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,OTHER OFFENSES,-122.424363,37.800414,23,5,2015,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,LARCENY/THEFT,-122.426995,37.800873,23,5,2015,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,LARCENY/THEFT,-122.438738,37.771541,23,5,2015,6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [183]:
# Run the same preprocessing function on the raw test frame.
df_test = preprocess(test_data)

# Check the result is in the expected format, as for the training frame.
df_test.head()

Unnamed: 0,Id,X,Y,hour,month,year,dow_enc,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,0,-122.399588,37.735051,23,5,2015,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,-122.391523,37.732432,23,5,2015,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,-122.426002,37.792212,23,5,2015,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3,-122.437394,37.721412,23,5,2015,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,-122.437394,37.721412,23,5,2015,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [185]:
# Separate the preprocessed training frame into target and features.
# The nominal crime categories are mapped to integer class ids with the shared
# label encoder; XGBoost handles that encoding well despite the data being nominal.
y_train = label_encoder.fit_transform(df_train['Category'])
X_train = df_train.drop(columns=['Category'])

In [189]:
# Inspect the test frame's column labels to check for duplicated columns.
df_test.columns

Index(['Id', 'X', 'Y', 'hour', 'month', 'year', 'dow_enc',
       'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE',
       'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK',
       'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL',
       'PdDistrict_TENDERLOIN'],
      dtype='object')

In [199]:
# Alias the preprocessed test frame as X_test (more standard nomenclature).
# This binds a second name to the same object as df_test; it is not a copy.
X_test = df_test
# Display the frame to check its format.
X_test

Unnamed: 0,Id,X,Y,hour,month,year,dow_enc,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,0,-122.399588,37.735051,23,5,2015,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,-122.391523,37.732432,23,5,2015,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,-122.426002,37.792212,23,5,2015,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3,-122.437394,37.721412,23,5,2015,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,-122.437394,37.721412,23,5,2015,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,884257,-122.408983,37.751987,0,1,2003,6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
884258,884258,-122.425342,37.792681,0,1,2003,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
884259,884259,-122.445418,37.712075,0,1,2003,6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
884260,884260,-122.387394,37.739479,0,1,2003,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [186]:
# Inspect the training feature columns.
X_train.columns

Index(['X', 'Y', 'hour', 'month', 'year', 'dow_enc', 'PdDistrict_BAYVIEW',
       'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE', 'PdDistrict_MISSION',
       'PdDistrict_NORTHERN', 'PdDistrict_PARK', 'PdDistrict_RICHMOND',
       'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN'],
      dtype='object')

In [200]:
# Instantiate the classifier, fit the model, and generate class probabilities
# for the test set.
# The estimator is bound to `model` rather than `xgb`, so the `xgboost as xgb`
# module alias imported at the top of the notebook is not overwritten.
model = XGBClassifier()
model.fit(X_train, y_train)

# Select the test features by the training column list: this leaves out 'Id',
# guarantees the same column order as in training, and raises a KeyError if a
# training feature is missing from the test frame.
y_pred = model.predict_proba(X_test[X_train.columns])

In [201]:
# Prepare the submission document: the test-row Id followed by one probability
# column per crime category.
# Column i of y_pred belongs to integer label i, and label_encoder.classes_[i]
# is the category that was encoded as i when y_train was built, so the headers
# are taken from the encoder instead of a hand-maintained list.
class_names = list(label_encoder.classes_)

# Guard against the shared encoder having been refit on something other than
# the target since the model was trained.
if y_pred.shape[1] != len(class_names):
    raise ValueError(
        f"y_pred has {y_pred.shape[1]} columns but the label encoder holds "
        f"{len(class_names)} classes"
    )

# Reuse the test frame's index so the column-wise concat aligns row for row.
probabilities = pd.DataFrame(y_pred, columns=class_names, index=X_test.index)
submission = pd.concat([X_test[['Id']], probabilities], axis=1)

In [202]:
# Final check of the submission document format (first five rows).
submission.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.001652,0.120603,2.7e-05,0.000406,0.014256,0.000708,0.025611,0.017056,0.002247,...,8.7e-05,0.006716,0.000246,0.035789,8.766382e-07,0.007196,0.067763,0.176164,0.031455,0.033143
1,1,0.00239,0.132424,1.1e-05,0.001623,0.007563,0.001402,0.02741,0.041931,0.003982,...,0.000343,0.009356,0.000996,0.031987,4.522529e-06,0.005603,0.041029,0.121929,0.074804,0.033104
2,2,0.002292,0.051441,3.7e-05,1e-05,0.05328,0.000758,0.000673,0.014361,0.004003,...,2.5e-05,0.006272,0.000107,0.026387,5.264191e-07,0.007803,0.065006,0.13861,0.017792,0.002397
3,3,0.001565,0.110652,1.2e-05,0.005115,0.011116,0.001773,0.003829,0.01482,0.008883,...,0.000229,0.012675,0.000156,0.036622,1.412628e-06,0.003049,0.066452,0.166891,0.028969,0.013213
4,4,0.001565,0.110652,1.2e-05,0.005115,0.011116,0.001773,0.003829,0.01482,0.008883,...,0.000229,0.012675,0.000156,0.036622,1.412628e-06,0.003049,0.066452,0.166891,0.028969,0.013213


In [203]:
# Save the submission document as submission.csv in the working directory,
# without writing the index as a column.
submission.to_csv("submission.csv", index=False)