# Cleaning, Preparation and Modeling

In [71]:
# load pandas for data analysis
import pandas as pd

# read the August 2018 parking-violations sample CSV into a DataFrame
violations_csv = '../data/sample/Parking_Violations_Issued_in_August_2018.csv'
parking_violations = pd.read_csv(violations_csv)

In [72]:
# preview the first five rows of the raw dataset
parking_violations.head(n=5)

Unnamed: 0,OBJECTID,ISSUE_DATE,ISSUE_TIME,ISSUING_AGENCY_CODE,ISSUING_AGENCY_NAME,ISSUING_AGENCY_SHORT,VIOLATION_CODE,VIOLATION_PROC_DESC,LOCATION,PLATE_STATE,...,PENALTY_2,PENALTY_3,PENALTY_4,PENALTY_5,XCOORD,YCOORD,LATITUDE,LONGITUDE,MAR_ID,GIS_LAST_MOD_DTTM
0,290490,2018/08/24 04:00:00+00,01:44 AM,7.0,METROPOLITAN POLICE DPT-DISTRICT 7,MPD-7D,P344,VEHICLE ON PRIVATE/PUBLIC PROPERTY WITHOUT CON...,4205 4TH ST SE,,...,,1.0,,,399920.0,129047.0,38.829,-77.001,147401.0,2019/05/29 04:16:08+00
1,290491,2018/08/24 04:00:00+00,01:46 PM,7.0,METROPOLITAN POLICE DPT-DISTRICT 7,MPD-7D,P344,VEHICLE ON PRIVATE/PUBLIC PROPERTY WITHOUT CON...,4205 4TH ST SE,,...,,,,,399920.0,129047.0,38.829,-77.001,147401.0,2019/05/29 04:16:08+00
2,290492,2018/08/24 04:00:00+00,02:20 AM,7.0,METROPOLITAN POLICE DPT-DISTRICT 7,MPD-7D,P344,VEHICLE ON PRIVATE/PUBLIC PROPERTY WITHOUT CON...,4329 4TH ST SE,,...,,1.0,,,399867.0,128923.0,38.828,-77.002,147406.0,2019/05/29 04:16:08+00
3,290493,2018/08/24 04:00:00+00,02:25 AM,7.0,METROPOLITAN POLICE DPT-DISTRICT 7,MPD-7D,P344,VEHICLE ON PRIVATE/PUBLIC PROPERTY WITHOUT CON...,4221 4TH ST SE,,...,,1.0,,,399923.0,128963.0,38.828,-77.001,147405.0,2019/05/29 04:16:08+00
4,290494,2018/08/24 04:00:00+00,03:20 AM,7.0,METROPOLITAN POLICE DPT-DISTRICT 7,MPD-7D,P344,VEHICLE ON PRIVATE/PUBLIC PROPERTY WITHOUT CON...,4337 4TH ST SE,,...,,,,,399860.0,128878.0,38.828,-77.002,147392.0,2019/05/29 04:16:08+00


In [73]:
# dataset info: row count plus per-column non-null counts and dtypes
parking_violations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136522 entries, 0 to 136521
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   OBJECTID              136522 non-null  int64  
 1   ISSUE_DATE            136522 non-null  object 
 2   ISSUE_TIME            136436 non-null  object 
 3   ISSUING_AGENCY_CODE   136498 non-null  float64
 4   ISSUING_AGENCY_NAME   136498 non-null  object 
 5   ISSUING_AGENCY_SHORT  136498 non-null  object 
 6   VIOLATION_CODE        136522 non-null  object 
 7   VIOLATION_PROC_DESC   131860 non-null  object 
 8   LOCATION              136505 non-null  object 
 9   PLATE_STATE           136522 non-null  object 
 10  VEHICLE_TYPE          0 non-null       float64
 11  MULTI_OWNER_NUMBER    6660 non-null    float64
 12  DISPOSITION_CODE      57444 non-null   float64
 13  DISPOSITION_TYPE      136522 non-null  object 
 14  DISPOSITION_DESC      57444 non-null   object 
 15  

## Observations

Null fields:
* VEHICLE_TYPE
* PENALTY_1
* PENALTY_2
* PENALTY_3
* PENALTY_4
* PENALTY_5

Disposition fields: determine the significance of DISPOSITION_CODE, DISPOSITION_TYPE, DISPOSITION_DESC, and DISPOSITION_DATE.
    
Date fields that require additional review:
* ISSUE_DATE - check the format
* GIS_LAST_MOD_DTTM - check the format; determine its unique values and overall relevance
    

In [74]:
# remove the columns identified as null in the observations
null_columns = [
    'VEHICLE_TYPE',
    'PENALTY_1',
    'PENALTY_2',
    'PENALTY_3',
    'PENALTY_4',
    'PENALTY_5',
]
parking_violations.drop(columns=null_columns, inplace=True)

In [75]:
# dispositions are dismissed tickets
# keep only the records that have no disposition description
no_disposition = parking_violations['DISPOSITION_DESC'].isna()

# .copy() gives an independent frame: a boolean-mask subset is otherwise
# tracked by pandas as a possible copy of the original, and the in-place
# drops / column assignments applied to it afterwards would trigger
# SettingWithCopyWarning
parking_violations = parking_violations[no_disposition].copy()

# drop disposition columns
parking_violations.drop(
    columns=['DISPOSITION_CODE', 'DISPOSITION_TYPE', 'DISPOSITION_DESC', 'DISPOSITION_DATE'],
    inplace=True,
)

In [76]:
# list the distinct MULTI_OWNER_NUMBER values
multi_owner_number = parking_violations['MULTI_OWNER_NUMBER']
multi_owner_number.unique()

array([           nan, 9.50368779e+08, 2.07110000e+07, ...,
       9.50152495e+08, 9.50361956e+08, 9.50334466e+08])

In [77]:
# NOTE(review): the reason for discarding this column is not recorded — confirm
parking_violations.drop(columns=['MULTI_OWNER_NUMBER'], inplace=True)

In [78]:
# list the distinct GIS_LAST_MOD_DTTM values
gis_last_modified = parking_violations['GIS_LAST_MOD_DTTM']
gis_last_modified.unique()

array(['2019/05/29 04:16:08+00'], dtype=object)

In [79]:
# every row holds the same value, so the column is removed
parking_violations.drop(columns=['GIS_LAST_MOD_DTTM'], inplace=True)

In [80]:
# list the distinct PLATE_STATE values
plate_state = parking_violations['PLATE_STATE']
plate_state.unique()

array([' ', 'VA', 'MD'], dtype=object)

In [81]:
# NOTE(review): the reason for discarding PLATE_STATE is not recorded — confirm
parking_violations.drop(columns=['PLATE_STATE'], inplace=True)

In [82]:
# NOTE(review): presumably VIOLATION_CODE alone identifies the violation — confirm
parking_violations.drop(columns=['VIOLATION_PROC_DESC'], inplace=True)

In [83]:
# NOTE(review): presumably not needed once LATITUDE/LONGITUDE are kept — confirm
unused_columns = ['XCOORD', 'YCOORD', 'MAR_ID', 'OBJECTID']
parking_violations.drop(columns=unused_columns, inplace=True)

In [84]:
# helpers that turn the raw date/time text into date and time objects
from dateutil.parser import parse


def format_date(date_str):
    """Parse a date/time string and return only its calendar date.

    Args:
        date_str: text accepted by ``dateutil.parser.parse``.

    Returns:
        The ``date`` part of the parsed datetime.
    """
    parsed = parse(date_str)
    return parsed.date()


def format_time(time_str):
    """Parse a date/time string and return only its time of day.

    Args:
        time_str: text accepted by ``dateutil.parser.parse``.

    Returns:
        The ``time`` part of the parsed datetime.
    """
    parsed = parse(time_str)
    return parsed.time()

In [85]:
# parse every ISSUE_DATE value into a date object; Series.map works on the one
# column directly instead of materialising a full row object per record, and
# it still returns a Series when the frame is empty
parking_violations['FORMAT_DATE'] = parking_violations['ISSUE_DATE'].map(format_date)

In [86]:
# remove rows that have no ISSUE_TIME before that column is parsed
time_columns = ['ISSUE_TIME']
parking_violations.dropna(subset=time_columns, inplace=True)

In [87]:
# parse every ISSUE_TIME value into a time object; Series.map works on the one
# column directly instead of materialising a full row object per record, and
# it still returns a Series when the frame is empty
parking_violations['FORMAT_TIME'] = parking_violations['ISSUE_TIME'].map(format_time)

In [88]:
# discard rows missing any of the key dimensions (agency code or coordinates)
required_columns = ['ISSUING_AGENCY_CODE', 'LATITUDE', 'LONGITUDE']
parking_violations.dropna(subset=required_columns, inplace=True)

In [89]:
# store the agency code as an integer (it was loaded as float64)
agency_code = parking_violations['ISSUING_AGENCY_CODE']
parking_violations['ISSUING_AGENCY_CODE'] = agency_code.astype('int')

In [90]:
# cast FINE_AMOUNT to int
# Missing fines are replaced with 0 first, since NaN cannot be cast to int.
# The filled Series is assigned back explicitly: calling
# fillna(..., inplace=True) on parking_violations['FINE_AMOUNT'] is a chained
# in-place assignment, which pandas does not guarantee will update the
# DataFrame (it has no effect under Copy-on-Write).
fine_amount = parking_violations['FINE_AMOUNT'].fillna(0)
parking_violations['FINE_AMOUNT'] = fine_amount.astype('int')

In [91]:
# preview the first five rows after cleaning
parking_violations.head(n=5)

Unnamed: 0,ISSUE_DATE,ISSUE_TIME,ISSUING_AGENCY_CODE,ISSUING_AGENCY_NAME,ISSUING_AGENCY_SHORT,VIOLATION_CODE,LOCATION,FINE_AMOUNT,TOTAL_PAID,LATITUDE,LONGITUDE,FORMAT_DATE,FORMAT_TIME
1,2018/08/24 04:00:00+00,01:46 PM,7,METROPOLITAN POLICE DPT-DISTRICT 7,MPD-7D,P344,4205 4TH ST SE,250,250,38.829,-77.001,2018-08-24,13:46:00
9,2018/08/09 04:00:00+00,07:13 AM,25,SPECIAL OPERATION DIV & TRAFFIC DIV,MPD-SOD,P014,700 F ST NW,0,0,38.897,-77.022,2018-08-09,07:13:00
14,2018/08/26 04:00:00+00,08:55 AM,59,US. BUREAU OF ENGRAVING AND PRINTNG,BEP,P170,1200 BLK D ST SW,100,100,38.885,-77.029,2018-08-26,08:55:00
20,2018/08/07 04:00:00+00,02:45 AM,2,METROPOLITAN POLICE DPT-DISTRICT 2,MPD-2D,P344,100 MICHIGAN AVE NE,250,250,38.927,-77.006,2018-08-07,02:45:00
21,2018/08/07 04:00:00+00,02:50 AM,2,METROPOLITAN POLICE DPT-DISTRICT 2,MPD-2D,P344,100 MICHIGAN AVE NE,250,500,38.927,-77.006,2018-08-07,02:50:00


In [92]:
# check the remaining columns, their dtypes and non-null counts after cleaning
parking_violations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76888 entries, 1 to 136517
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ISSUE_DATE            76888 non-null  object 
 1   ISSUE_TIME            76888 non-null  object 
 2   ISSUING_AGENCY_CODE   76888 non-null  int64  
 3   ISSUING_AGENCY_NAME   76888 non-null  object 
 4   ISSUING_AGENCY_SHORT  76888 non-null  object 
 5   VIOLATION_CODE        76888 non-null  object 
 6   LOCATION              76888 non-null  object 
 7   FINE_AMOUNT           76888 non-null  int64  
 8   TOTAL_PAID            76888 non-null  int64  
 9   LATITUDE              76888 non-null  float64
 10  LONGITUDE             76888 non-null  float64
 11  FORMAT_DATE           76888 non-null  object 
 12  FORMAT_TIME           76888 non-null  object 
dtypes: float64(2), int64(3), object(8)
memory usage: 8.2+ MB
