# Imports

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Since dataset is 413mb / 4.5 million rows, getting only 1000rows

In [6]:
df = pd.read_csv('NYPD_Arrests_Data__Historic.csv', nrows=1000)

# Converting date in string to date type

In [7]:
df['ARREST_DATE']=pd.to_datetime(df.ARREST_DATE)
df.head()

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude
0,82422509,2012-01-01,203,"TRESPASS 3, CRIMINAL",352,CRIMINAL TRESPASS,PL 1401000,M,K,70,0,18-24,M,BLACK,997873,169819,40.632788,-73.95092
1,82422945,2012-01-01,905,"INTOXICATED DRIVING,ALCOHOL",347,INTOXICATED & IMPAIRED DRIVING,VTL11920U3,M,Q,104,0,45-64,M,WHITE,1009994,204314,40.727442,-73.907119
2,82422481,2012-01-01,109,"ASSAULT 2,1,UNCLASSIFIED",106,FELONY ASSAULT,PL 120052H,F,K,73,0,45-64,F,BLACK,1009491,177733,40.654485,-73.909033
3,82422462,2012-01-01,782,"WEAPONS, POSSESSION, ETC",236,DANGEROUS WEAPONS,PL 2650101,M,K,73,0,18-24,M,BLACK,1008630,181082,40.66368,-73.912124
4,82422567,2012-01-01,269,"MISCHIEF,CRIMINAL, UNCL 2ND DEG 3RD DEG",121,CRIMINAL MISCHIEF & RELATED OFFENSES,PL 1450500,F,M,23,0,25-44,M,WHITE HISPANIC,1000700,227655,40.791529,-73.940594


# Exploring data

In [8]:
df.LAW_CODE.nunique()

145

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
ARREST_KEY           1000 non-null int64
ARREST_DATE          1000 non-null datetime64[ns]
PD_CD                1000 non-null int64
PD_DESC              1000 non-null object
KY_CD                1000 non-null int64
OFNS_DESC            1000 non-null object
LAW_CODE             1000 non-null object
LAW_CAT_CD           999 non-null object
ARREST_BORO          1000 non-null object
ARREST_PRECINCT      1000 non-null int64
JURISDICTION_CODE    1000 non-null int64
AGE_GROUP            1000 non-null object
PERP_SEX             1000 non-null object
PERP_RACE            1000 non-null object
X_COORD_CD           1000 non-null int64
Y_COORD_CD           1000 non-null int64
Latitude             1000 non-null float64
Longitude            1000 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(7), object(8)
memory usage: 140.8+ KB


# Dropping redundant columns
- X_COORD_CD, Y_COORD_CD is similar to Latitude, Longitude so dropping it
- PD_CD, KY_CD don't what it can be useful for
- ARREST_KEY unique key no use as a feature
- PD_DESC : similar to OFNS_DESC
- LAW_CAT_CD : similar to prep_sex

In [10]:
df.drop(columns=['X_COORD_CD','Y_COORD_CD','ARREST_KEY','PD_CD','PD_DESC','KY_CD','LAW_CAT_CD'], inplace=True)

# Deriving feature
- Since inital 3 char of law code is generalizing the crime

In [11]:
df['LAW_CODE'] = df.LAW_CODE.apply(lambda x: x[:3])
df['day']= df.ARREST_DATE.dt.day
df['month']= df.ARREST_DATE.dt.month
df['dayofweek']= df.ARREST_DATE.dt.dayofweek
df.drop(columns=['ARREST_DATE'], inplace=True)
df.head()

Unnamed: 0,OFNS_DESC,LAW_CODE,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,day,month,dayofweek
0,CRIMINAL TRESPASS,PL,K,70,0,18-24,M,BLACK,40.632788,-73.95092,1,1,6
1,INTOXICATED & IMPAIRED DRIVING,VTL,Q,104,0,45-64,M,WHITE,40.727442,-73.907119,1,1,6
2,FELONY ASSAULT,PL,K,73,0,45-64,F,BLACK,40.654485,-73.909033,1,1,6
3,DANGEROUS WEAPONS,PL,K,73,0,18-24,M,BLACK,40.66368,-73.912124,1,1,6
4,CRIMINAL MISCHIEF & RELATED OFFENSES,PL,M,23,0,25-44,M,WHITE HISPANIC,40.791529,-73.940594,1,1,6


In [12]:
# Label Encoding categorical values to make it numerical
from sklearn.preprocessing import LabelEncoder

def label_encoding_columns(columns_list):
    le1 = LabelEncoder()
    for column in columns_list:
        df[column] = le1.fit_transform(df[column])
        
label_encoding_columns(['OFNS_DESC', 'PERP_SEX', 'ARREST_BORO', 'AGE_GROUP','PERP_RACE','LAW_CODE'])

df.head()

Unnamed: 0,OFNS_DESC,LAW_CODE,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,day,month,dayofweek
0,4,5,1,70,0,0,1,2,40.632788,-73.95092,1,1,6
1,16,7,3,104,0,2,1,5,40.727442,-73.907119,1,1,6
2,9,5,1,73,0,2,0,2,40.654485,-73.909033,1,1,6
3,6,5,1,73,0,0,1,2,40.66368,-73.912124,1,1,6
4,3,5,2,23,0,1,1,6,40.791529,-73.940594,1,1,6


In [13]:
y = df.pop('OFNS_DESC')
X = df

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [26]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score

forest_model = RandomForestClassifier(random_state=1)
forest_model.fit(X_train, y_train)
melb_preds = forest_model.predict(X_test)
print(mean_absolute_error(y_test, melb_preds))

7.64




In [21]:
melb_preds[:10]

array([ 9,  1,  1,  9, 29, 30,  7,  3, 27, 16])

In [22]:
y_test.head(10)

237     9
609     6
750     1
362     6
679    36
790     9
741     7
720     3
302    27
194    16
Name: OFNS_DESC, dtype: int64

In [28]:
confusion_matrix(y_test, melb_preds)
average_precision = average_precision_score(y_test, melb_preds)

ValueError: multiclass format is not supported