In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import OneHotEncoder

pd.options.mode.chained_assignment = None

In [2]:
df = pd.read_csv("ripa_stops_datasd.csv", low_memory=False)

In [3]:
# df 

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464044 entries, 0 to 464043
Data columns (total 29 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   stop_id                    464044 non-null  int64  
 1   ori                        464044 non-null  object 
 2   agency                     464044 non-null  object 
 3   exp_years                  464044 non-null  int64  
 4   date_stop                  464044 non-null  object 
 5   time_stop                  464044 non-null  object 
 6   stopduration               464044 non-null  int64  
 7   stop_in_response_to_cfs    464044 non-null  int64  
 8   officer_assignment_key     464044 non-null  int64  
 9   assignment                 464044 non-null  object 
 10  intersection               43743 non-null   object 
 11  address_block              417070 non-null  float64
 12  land_mark                  56 non-null      object 
 13  address_street             44

In [5]:
df[df['gend'] == 0]['gend']

270       0
3945      0
4508      0
4686      0
5053      0
         ..
449138    0
450614    0
457491    0
457493    0
460214    0
Name: gend, Length: 131, dtype: int64

In [6]:
address_st = df['address_street']
address_st[address_st.notnull()]

0            Grand Avenue
1             NOBEL DRIVE
2             59th Street
3             59th Street
4             NIAGARA AVE
               ...       
464039           Field St
464040       Sea World Dr
464041       sea world dr
464042    pacific highway
464043              Alice
Name: address_street, Length: 444706, dtype: object

In [7]:
address_st.apply(lambda x: str(x).lower())

0            grand avenue
1             nobel drive
2             59th street
3             59th street
4             niagara ave
               ...       
464039           field st
464040       sea world dr
464041       sea world dr
464042    pacific highway
464043              alice
Name: address_street, Length: 464044, dtype: object

In [8]:
address_bl = df['address_block']
address_bl[address_bl.notnull()]

0          700.0
2         4400.0
3         4400.0
4         4800.0
5         4500.0
           ...  
464039    4900.0
464040     500.0
464041     500.0
464042    4500.0
464043    4500.0
Name: address_block, Length: 417070, dtype: float64

## Getting columns we're interested in

In [9]:
rdf= df[['stop_id',
         'beat',
         'isschool',
         'date_stop', 
         'time_stop', 
         'stopduration',
         'stop_in_response_to_cfs', # was stope made in response to a call for service
#          'officer_assignment_key',
         'assignment',
         'exp_years', 
         'pid',
         'isstudent',
         'perceived_limited_english',
         'perceived_age',
         'perceived_gender']]

In [10]:
# rdf

In [11]:
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464044 entries, 0 to 464043
Data columns (total 14 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   stop_id                    464044 non-null  int64 
 1   beat                       464044 non-null  int64 
 2   isschool                   464044 non-null  int64 
 3   date_stop                  464044 non-null  object
 4   time_stop                  464044 non-null  object
 5   stopduration               464044 non-null  int64 
 6   stop_in_response_to_cfs    464044 non-null  int64 
 7   assignment                 464044 non-null  object
 8   exp_years                  464044 non-null  int64 
 9   pid                        464044 non-null  int64 
 10  isstudent                  464044 non-null  int64 
 11  perceived_limited_english  464044 non-null  int64 
 12  perceived_age              464044 non-null  int64 
 13  perceived_gender           463913 non-null  

## Perceived Gender

In [12]:
rdf['perceived_gender'].unique()

array(['Male', 'Female', 'Transgender woman/girl', nan,
       'Transgender man/boy'], dtype=object)

In [13]:
rdf['perceived_gender'].replace(['Transgender woman/girl', 'Transgender man/boy', np.nan], 'Other', inplace=True)

In [14]:
rdf['perceived_gender'] = rdf['perceived_gender'].apply(lambda x: 'gender_' + x.lower())

In [15]:
g_encoder =  OneHotEncoder().fit(rdf[['perceived_gender']])
g_encoder.categories_

[array(['gender_female', 'gender_male', 'gender_other'], dtype=object)]

In [16]:
g_transformed = g_encoder.transform(rdf[['perceived_gender']]).toarray()

In [17]:
for index, category in enumerate(np.concatenate(g_encoder.categories_)):
    rdf[category] = g_transformed[:,index]

In [18]:
rdf = rdf.drop(['perceived_gender'], axis=1)

In [19]:
rdf.head()

Unnamed: 0,stop_id,beat,isschool,date_stop,time_stop,stopduration,stop_in_response_to_cfs,assignment,exp_years,pid,isstudent,perceived_limited_english,perceived_age,gender_female,gender_male,gender_other
0,2443,122,0,2018-07-01,00:01:37,30,0,"Patrol, traffic enforcement, field operations",10,1,0,0,25,0.0,1.0,0.0
1,2444,121,0,2018-07-01,00:03:34,10,0,"Patrol, traffic enforcement, field operations",18,1,0,0,25,0.0,1.0,0.0
2,2447,822,0,2018-07-01,00:05:43,15,1,Other,1,1,0,0,30,0.0,1.0,0.0
3,2447,822,0,2018-07-01,00:05:43,15,1,Other,1,2,0,0,30,1.0,0.0,0.0
4,2448,614,0,2018-07-01,00:19:06,5,0,"Patrol, traffic enforcement, field operations",3,1,0,0,23,0.0,1.0,0.0


# Assignment

In [20]:
rdf['assignment'].unique()

array(['Patrol, traffic enforcement, field operations', 'Other',
       'Gang enforcement', 'Roadblock or DUI sobriety checkpoint',
       'Investigative/detective', 'Special events', 'Task force',
       'Narcotics/vice',
       'K1-12 public school inlcuding school resource officer or school police officer',
       'Compliance check'], dtype=object)

In [21]:
assign_dict = {'Patrol, traffic enforcement, field operations': 'assignment_A',
               'Gang enforcement': 'assignment_B',
               'Compliance check': 'assignment_C',
               'Special events': 'assignment_D',
               'Roadblock or DUI sobriety checkpoint': 'assignmen_E',
               'Narcotics/vice': 'assignment_F',
               'Task force': 'assignment_G',
               'K1-12 public school inlcuding school resource officer or school police officer': 'assignent_H',
               'Investigative/detective': 'assignment_I',
               'Other': 'assignment_J'}

In [22]:
rdf['assignment'].replace(assign_dict, inplace=True)

In [23]:
a_encoder =  OneHotEncoder().fit(rdf[['assignment']])
a_encoder.categories_

[array(['assignent_H', 'assignmen_E', 'assignment_A', 'assignment_B',
        'assignment_C', 'assignment_D', 'assignment_F', 'assignment_G',
        'assignment_I', 'assignment_J'], dtype=object)]

In [24]:
a_transformed = a_encoder.transform(rdf[['assignment']]).toarray()

In [25]:
for index, category in enumerate(np.concatenate(a_encoder.categories_)):
    rdf[category] = a_transformed[:,index]

In [26]:
rdf = rdf.drop(['assignment'], axis=1)

In [27]:
rdf.head()

Unnamed: 0,stop_id,beat,isschool,date_stop,time_stop,stopduration,stop_in_response_to_cfs,exp_years,pid,isstudent,...,assignent_H,assignmen_E,assignment_A,assignment_B,assignment_C,assignment_D,assignment_F,assignment_G,assignment_I,assignment_J
0,2443,122,0,2018-07-01,00:01:37,30,0,10,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2444,121,0,2018-07-01,00:03:34,10,0,18,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2447,822,0,2018-07-01,00:05:43,15,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2447,822,0,2018-07-01,00:05:43,15,1,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2448,614,0,2018-07-01,00:19:06,5,0,3,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Stop time

In [28]:
# Really weird occurrences with ancient years in time_stop!
weird_ids = rdf['time_stop'].apply(lambda x: (len(x.split()) == 2))
rdf[weird_ids]

Unnamed: 0,stop_id,beat,isschool,date_stop,time_stop,stopduration,stop_in_response_to_cfs,exp_years,pid,isstudent,...,assignent_H,assignmen_E,assignment_A,assignment_B,assignment_C,assignment_D,assignment_F,assignment_G,assignment_I,assignment_J
97194,92205,524,0,2019-01-18,1900-01-01 04:48:00,30,1,10,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113838,109883,313,0,2019-02-22,1900-01-01 02:53:09,15,0,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124771,120608,722,0,2019-03-16,1900-01-01 05:47:19,30,0,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127568,123099,512,0,2019-03-22,1900-01-01 01:56:00,25,0,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138748,132994,446,0,2019-04-11,1900-01-01 03:33:24,45,1,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438827,397980,723,0,2021-01-29,1899-12-30 00:00:00,180,0,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
443226,402207,124,0,2021-02-10,1899-12-30 00:00:00,10,0,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
443281,402252,836,0,2021-02-10,1899-12-30 00:00:00,90,0,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
453554,411242,834,0,2021-03-06,1899-12-30 00:00:00,30,0,1,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
def make_timestamp(row):
    if len(row.time_stop.split()) == 2:
        row.time_stop = row.time_stop.split()[1]
    
    timestr = row.date_stop + ' ' + row.time_stop
    
    return datetime.strptime(timestr, '%Y-%m-%d %H:%M:%S')

In [30]:
# Cut out the date and assign fixed column (Uncomment to create new time_stop_fixed column)
# rdf['time_stop_fixed'] = rdf['time_stop'].apply(lambda x: x.split()[1] if len(x.split()) == 2 else x)

In [31]:
# Get create timestamp column
rdf['timestamp'] = rdf.apply(lambda row: make_timestamp(row), axis=1)

In [32]:
min_ts = min(rdf['timestamp'])
max_ts = max(rdf['timestamp'])

In [33]:
# play around with timestamp
print(max(rdf['timestamp']))
print(max(rdf['timestamp']) - min(rdf['timestamp']))
print((max_ts - min_ts).total_seconds())

2021-03-31 23:39:35
1004 days 23:37:58
86830678.0


In [34]:
rdf

Unnamed: 0,stop_id,beat,isschool,date_stop,time_stop,stopduration,stop_in_response_to_cfs,exp_years,pid,isstudent,...,assignmen_E,assignment_A,assignment_B,assignment_C,assignment_D,assignment_F,assignment_G,assignment_I,assignment_J,timestamp
0,2443,122,0,2018-07-01,00:01:37,30,0,10,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-07-01 00:01:37
1,2444,121,0,2018-07-01,00:03:34,10,0,18,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-07-01 00:03:34
2,2447,822,0,2018-07-01,00:05:43,15,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2018-07-01 00:05:43
3,2447,822,0,2018-07-01,00:05:43,15,1,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2018-07-01 00:05:43
4,2448,614,0,2018-07-01,00:19:06,5,0,3,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-07-01 00:19:06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464039,420325,116,0,2021-03-31,12:15:00,15,0,1,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-03-31 12:15:00
464040,420349,614,0,2021-03-31,09:15:00,25,0,1,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-03-31 09:15:00
464041,420354,614,0,2021-03-31,09:53:00,45,0,1,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-03-31 09:53:00
464042,420366,625,0,2021-03-31,11:27:40,10,0,1,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-03-31 11:27:40
