In [21]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [8]:
train = pd.read_csv('train.csv', parse_dates=['Dates'], index_col=False)
test = pd.read_csv('test.csv', parse_dates=['Dates'], index_col=False)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   Dates       878049 non-null  datetime64[ns]
 1   Category    878049 non-null  object        
 2   Descript    878049 non-null  object        
 3   DayOfWeek   878049 non-null  object        
 4   PdDistrict  878049 non-null  object        
 5   Resolution  878049 non-null  object        
 6   Address     878049 non-null  object        
 7   X           878049 non-null  float64       
 8   Y           878049 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(6)
memory usage: 60.3+ MB


In [10]:
train = train.drop(['Descript', 'Resolution', 'Address'], axis = 1)

In [11]:
test = test.drop(['Address'], axis = 1)

In [12]:
def feature_engineering(data):
    data['Day'] = data['Dates'].dt.day
    data['Month'] = data['Dates'].dt.month
    data['Year'] = data['Dates'].dt.year
    data['Hour'] = data['Dates'].dt.hour
    data['Minute'] = data['Dates'].dt.minute
    data['DayOfWeek'] = data['Dates'].dt.dayofweek
    data['WeekOfYear'] = data['Dates'].dt.weekofyear
    return data

In [13]:
train = feature_engineering(train)

  data['WeekOfYear'] = data['Dates'].dt.weekofyear


In [14]:
test = feature_engineering(test)

  data['WeekOfYear'] = data['Dates'].dt.weekofyear


In [16]:
enc = LabelEncoder()
train['PdDistrict'] = enc.fit_transform(train['PdDistrict'])

In [17]:
category_encoder = LabelEncoder()
category_encoder.fit(train['Category'])
train['CategoryEncoded'] = category_encoder.transform(train['Category'])
print(category_encoder.classes_)

['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS'
 'EMBEZZLEMENT' 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING'
 'FRAUD' 'GAMBLING' 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING'
 'MISSING PERSON' 'NON-CRIMINAL' 'OTHER OFFENSES'
 'PORNOGRAPHY/OBSCENE MAT' 'PROSTITUTION' 'RECOVERED VEHICLE' 'ROBBERY'
 'RUNAWAY' 'SECONDARY CODES' 'SEX OFFENSES FORCIBLE'
 'SEX OFFENSES NON FORCIBLE' 'STOLEN PROPERTY' 'SUICIDE' 'SUSPICIOUS OCC'
 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT' 'WARRANTS' 'WEAPON LAWS']


In [18]:
enc = LabelEncoder()
test['PdDistrict'] = enc.fit_transform(test['PdDistrict'])

In [19]:
print(train.columns)
print(test.columns)

Index(['Dates', 'Category', 'DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day',
       'Month', 'Year', 'Hour', 'Minute', 'WeekOfYear', 'CategoryEncoded'],
      dtype='object')
Index(['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month',
       'Year', 'Hour', 'Minute', 'WeekOfYear'],
      dtype='object')


In [20]:
x_cols = list(train.columns[2:12].values)
x_cols.remove('Minute')
print(x_cols)

['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month', 'Year', 'Hour', 'WeekOfYear']


In [22]:
clf = RandomForestClassifier(n_estimators = 10)

In [23]:
clf.fit(train[x_cols], train['CategoryEncoded'])

In [24]:
test['predictions'] = clf.predict(test[x_cols])

In [25]:
def field_to_columns(data, field, new_columns):
    for i in range(len(new_columns)):
        data[new_columns[i]] = (data[field] == new_columns[i]).astype(int)
    return data

In [26]:
test['Category'] = category_encoder.inverse_transform(test['predictions'])

In [27]:
categories = list(category_encoder.classes_)

In [28]:
test = field_to_columns(test, 'Category', categories)

In [29]:
import time
PREDICTIONS_FILENAME_PREFIX = 'predictions_'
PREDICTIONS_FILENAME = PREDICTIONS_FILENAME_PREFIX + time.strftime('%Y%m%d-%H%M%S') + '.csv'

In [30]:
print(test.columns)

Index(['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month',
       'Year', 'Hour', 'Minute', 'WeekOfYear', 'predictions', 'Category',
       'ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC',
       'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES',
       'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING',
       'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
       'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT',
       'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY',
       'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE',
       'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS',
       'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS'],
      dtype='object')


In [31]:
submission_cols = [test.columns[0]]+list(test.columns[14:])
print(submission_cols)

['Id', 'ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [32]:
print(PREDICTIONS_FILENAME)
test[submission_cols].to_csv(PREDICTIONS_FILENAME, index = False)

predictions_20220827-124125.csv
