In [1]:
import pandas as pd

In [2]:
# Load the raw ticket data. low_memory=False makes pandas parse the file in a
# single pass rather than in internal chunks.
RAW_TICKETS_CSV = "Data/NYPD_Only.csv"
df = pd.read_csv(RAW_TICKETS_CSV, low_memory=False)

In [3]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,Ticket Number,Violation Date,Violation Time,Issuing Agency,Respondent First Name,Respondent Last Name,Balance Due,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),...,Charge #8: Code Description,Charge #8: Infraction Amount,Charge #9: Code,Charge #9: Code Section,Charge #9: Code Description,Charge #9: Infraction Amount,Charge #10: Code,Charge #10: Code Section,Charge #10: Code Description,Charge #10: Infraction Amount
0,198948842,02/13/2020,03:15:00,POLICE DEPARTMENT,JUAN,FLORES,112.0,QUEENS,1769.0,55.0,...,,,,,,,,,,
1,198984253,02/12/2020,22:45:00,POLICE DEPARTMENT,ERIC A,NICASIO,112.0,QUEENS,,,...,,,,,,,,,,
2,199274121,02/10/2020,23:15:00,POLICE DEPARTMENT,ELIAS,SULUSA,25.0,QUEENS,1482.0,11.0,...,,,,,,,,,,
3,199009773,02/11/2020,19:33:00,POLICE DEPARTMENT,YOON S,CHO,0.0,QUEENS,,,...,,,,,,,,,,
4,199417525,12/30/2019,17:25:00,POLICE DEPARTMENT,STAVROULA,KOKKOROS,0.0,QUEENS,,,...,,,,,,,,,,


In [4]:
# (rows, columns) of the freshly loaded frame.
df.shape

(766394, 78)

In [5]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 766394 entries, 0 to 766393
Data columns (total 78 columns):
 #   Column                                                           Non-Null Count   Dtype  
---  ------                                                           --------------   -----  
 0   Ticket Number                                                    766394 non-null  object 
 1   Violation Date                                                   763596 non-null  object 
 2   Violation Time                                                   764415 non-null  object 
 3   Issuing Agency                                                   766394 non-null  object 
 4   Respondent First Name                                            498874 non-null  object 
 5   Respondent Last Name                                             762112 non-null  object 
 6   Balance Due                                                      313916 non-null  float64
 7   Violation Location (Borough) 

In [62]:
# Parse the violation dates so the earliest and latest tickets can be found.
df["Violation Date"] = pd.to_datetime(df["Violation Date"])

oldest_ticket = df["Violation Date"].min()
latest_ticket = df["Violation Date"].max()

print("Oldest ticket from the dataset is on {}".format(oldest_ticket))
print("Latest ticket from the dataset is on {}".format(latest_ticket))

Oldest ticket from the dataset is on 1963-02-09 00:00:00
Latest ticket from the dataset is on 2021-09-11 00:00:00


# Identify the target variable "Hearing Result", drop nulls, and relabel.

In [6]:
# Keep only the rows that have a value in the target column.
df = df.dropna(subset=["Hearing Result"])

In [7]:
# (rows, columns) after dropping rows with a null "Hearing Result".
df.shape

(611757, 78)

In [8]:
# Count how many tickets carry each hearing-result label.
df["Hearing Result"].value_counts()

WRITTEN OFF     263475
DEFAULTED       120361
DISMISSED       109692
IN VIOLATION     85387
DEFAULT          21334
SETTL IN-VIO     10815
ADJOURNED          411
POP IN-VIOL        123
POP/IN-VIOL        117
NONE                24
STIPULATED          11
COSRV INVIO          7
Name: Hearing Result, dtype: int64

In [9]:
# Fold the listed labels into "IN VIOLATION" / "DEFAULTED".
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df["Hearing Result"]: that form is chained
# assignment, which pandas warns about and which leaves df unchanged when
# Copy-on-Write is enabled.
hearing_result_aliases = {
    "POP IN-VIOL": "IN VIOLATION",
    "POP/IN-VIOL": "IN VIOLATION",
    "SETTL IN-VIO": "IN VIOLATION",
    "COSRV INVIO": "IN VIOLATION",
    "DEFAULT": "DEFAULTED",
}
df["Hearing Result"] = df["Hearing Result"].replace(hearing_result_aliases)

In [10]:
# Remove the hearing results that are not kept as target classes.
# .copy() makes the filtered frame independent of the original, so the column
# assignment in the following cells does not raise SettingWithCopyWarning.
excluded_results = ["NONE", "ADJOURNED", "STIPULATED"]
df = df.loc[~df["Hearing Result"].isin(excluded_results)].copy()

In [11]:
# Re-check the label counts after relabelling and filtering.
df["Hearing Result"].value_counts()

WRITTEN OFF     263475
DEFAULTED       141695
DISMISSED       109692
IN VIOLATION     96449
Name: Hearing Result, dtype: int64

In [12]:
# Encode each remaining hearing result as an integer class label.
hearing_result_codes = {
    "WRITTEN OFF": 0,
    "DEFAULTED": 1,
    "DISMISSED": 2,
    "IN VIOLATION": 3,
}
df["Hearing Result"] = df["Hearing Result"].map(hearing_result_codes)

In [13]:
# (rows, columns) after filtering and encoding the target column.
df.shape

(611311, 78)

## EDA and Cleaning Columns

I only want to focus on respondents whose address is in New York, because I will merge this dataframe with neighborhood data later on.

In [14]:
# Count tickets by the state in the respondent's address.
df["Respondent Address (State Name)"].value_counts()

NEW YORK               568515
NEW JERSEY              20209
PENNSYLVANIA             1194
FLORIDA                   920
CONNECTICUT               694
MASSACHUSETTS             537
NORTH CAROLINA            358
VIRGINIA                  281
MARYLAND                  280
CALIFORNIA                238
GEORGIA                   178
ILLINOIS                  177
SOUTH CAROLINA            163
WASHINGTON                144
TEXAS                     138
MISSOURI                  111
OHIO                      108
RHODE ISLAND              107
MICHIGAN                  101
OTHER                      56
DELAWARE                   54
TENNESSEE                  45
OKLAHOMA                   42
ALABAMA                    40
LOUISIANA                  36
DIST COLUMBIA              34
NEBRASKA                   31
ARIZONA                    29
COLORADO                   28
MAINE                      25
OREGON                     23
INDIANA                    21
KENTUCKY                   20
WISCONSIN 

In [15]:
# Keep only respondents whose address state is NEW YORK.
# .copy() makes the filtered frame independent, so the in-place edits and new
# columns added in later cells do not raise SettingWithCopyWarning.
is_new_york = df["Respondent Address (State Name)"] == "NEW YORK"
df = df.loc[is_new_york].copy()

## Drop columns that contain only null values

In [29]:
# Snapshot of the current column names as a plain list.
column_list = df.columns.tolist()

In [30]:
# Collect every column in column_list whose values are all null, then drop
# them in a single call. The frame is reassigned rather than modified in place
# so the cell does not mutate a filtered slice.
empty_columns = [col for col in column_list if df[col].isnull().all()]

df = df.drop(columns=empty_columns)

In [31]:
# (rows, columns) after dropping the all-null columns.
df.shape

(568515, 48)

In [32]:
# List the columns that remain.
df.columns

Index(['Ticket Number', 'Violation Date', 'Violation Time', 'Issuing Agency',
       'Respondent First Name', 'Respondent Last Name', 'Balance Due',
       'Violation Location (Borough)', 'Violation Location (Block No.)',
       'Violation Location (Lot No.)', 'Violation Location (House #)',
       'Violation Location (Street Name)', 'Violation Location (City)',
       'Violation Location (Zip Code)', 'Violation Location (State Name)',
       'Respondent Address (Borough)', 'Respondent Address (House #)',
       'Respondent Address (Street Name)', 'Respondent Address (City)',
       'Respondent Address (Zip Code)', 'Respondent Address (State Name)',
       'Hearing Status', 'Hearing Result', 'Scheduled Hearing Location',
       'Hearing Date', 'Hearing Time', 'Decision Location (Borough)',
       'Decision Date', 'Total Violation Amount', 'Violation Details',
       'Date Judgment Docketed',
       'Respondent Address or Facility Number(For FDNY and DOB Tickets)',
       'Penalty Impos

In [37]:
# Columns to remove from the working frame.
columns_to_drop = [
    "Violation Time",
    "Balance Due",
    "Hearing Status",
    "Scheduled Hearing Location",
    "Hearing Date",
    "Hearing Time",
    "Decision Date",
    "Total Violation Amount",
    "Violation Details",
    "Date Judgment Docketed",
    "Respondent Address or Facility Number(For FDNY and DOB Tickets)",
    "Additional Penalties or Late Fees",
    "Compliance Status",
]

df = df.drop(columns=columns_to_drop)
df.shape

(568515, 35)

In [19]:
# Optional pandas display settings, left commented out (not executed).
# pd.options.display.max_colwidth = 1000000
# pd.set_option('display.max_columns', 2000000000)
# pd.set_option('display.max_rows', 1000000000)
# pd.set_option('display.expand_frame_repr', True)

### Feature engineering the violation location and respondent's address columns

In [43]:
# Drop rows that are missing either the violation or the respondent zip code.
zip_code_columns = [
    "Violation Location (Zip Code)",
    "Respondent Address (Zip Code)",
]
df = df.dropna(subset=zip_code_columns)

In [45]:
# Columns that together describe where the violation happened.
viol_cols = ['Violation Location (Borough)', 'Violation Location (Block No.)',
       'Violation Location (Lot No.)', 'Violation Location (House #)',
       'Violation Location (Street Name)',
       'Violation Location (City)', 'Violation Location (Zip Code)',
       'Violation Location (State Name)']

# Columns that together describe the respondent's address.
resp_cols = ['Respondent Address (Borough)',
       'Respondent Address (House #)', 'Respondent Address (Street Name)',
       'Respondent Address (City)', 'Respondent Address (Zip Code)',
       'Respondent Address (State Name)']


def _join_address_parts(frame, columns):
    """Join ``columns`` of ``frame`` into one space-separated string per row.

    Each value is converted with ``astype(str)``; cells that were missing in
    ``frame`` contribute an empty string. Missing cells are blanked by
    position rather than by deleting the text "nan" from the joined string,
    so a real value that happens to contain "nan" is left intact.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the address columns.
    columns : list of str
        Column names to join, in output order.

    Returns
    -------
    pandas.Series
        One joined string per row, indexed like ``frame``.
    """
    parts = frame[columns]
    # Keep the string form where the original cell was present, else "".
    as_text = parts.astype(str).where(parts.notna(), "")
    return as_text.apply(" ".join, axis=1)


df['complete violation location'] = _join_address_parts(df, viol_cols)
df['complete respondent location'] = _join_address_parts(df, resp_cols)

In [51]:
# Confirm that no missing zip codes remain in either column.
violation_zip_nulls = df['Violation Location (Zip Code)'].isnull().sum()
respondent_zip_nulls = df['Respondent Address (Zip Code)'].isnull().sum()

print("Violation Location (Zip Code) has {} nulls".format(violation_zip_nulls))
print("Respondent Address (Zip Code) has {} nulls".format(respondent_zip_nulls))

Violation Location (Zip Code) has 0 nulls
Respondent Address (Zip Code) has 0 nulls


In [52]:
# (rows, columns) after the zip-code filtering and the two new location columns.
df.shape

(284325, 37)

In [57]:

# Count nulls in the combined violation-location column.
df["complete violation location"].isnull().sum()

0

# Now that the dataframe is cut down, I will save it to a new file so that further work and analysis run faster.

In [55]:
# Write the cleaned frame to disk for later use.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# unnamed first column; pass index=False if downstream readers don't need it.
df.to_csv('cleaned_dataframe.csv')