In [2]:
import pandas as pd

In [3]:
# S3 bucket and object key of the source CSV, combined into one s3:// URL.
bucket = "capstoneprojectoath"
data_key = "NYPD_Only.csv"
data_location = f"s3://{bucket}/{data_key}"

In [4]:
# Load the CSV from S3 into a DataFrame.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines="warn"` is its equivalent: malformed rows are skipped and a
# warning is emitted for each one, so dropped rows do not disappear silently.
df = pd.read_csv(data_location, low_memory=False, on_bad_lines="warn")

In [16]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(766394, 78)

### Identify the target variable "Hearing Result", drop nulls, and relabel.

In [5]:
# Keep only rows that have a value in the target column.
df = df.dropna(subset=["Hearing Result"])

In [6]:
# Shape after removing rows with a missing hearing result.
df.shape

(611757, 78)

In [7]:
# Frequency of each raw hearing-result label, to see which labels need merging or removal.
df["Hearing Result"].value_counts()

WRITTEN OFF     263475
DEFAULTED       120361
DISMISSED       109692
IN VIOLATION     85387
DEFAULT          21334
SETTL IN-VIO     10815
ADJOURNED          411
POP IN-VIOL        123
POP/IN-VIOL        117
NONE                24
STIPULATED          11
COSRV INVIO          7
Name: Hearing Result, dtype: int64

In [8]:
# Merge variant labels into their canonical class.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: in-place mutation of a selected column
# is chained assignment, which warns on pandas 2.x and leaves `df` unchanged
# under Copy-on-Write.
hearing_result_aliases = {
    "POP IN-VIOL": "IN VIOLATION",
    "POP/IN-VIOL": "IN VIOLATION",
    "SETTL IN-VIO": "IN VIOLATION",
    "COSRV INVIO": "IN VIOLATION",
    "DEFAULT": "DEFAULTED",
}
df["Hearing Result"] = df["Hearing Result"].replace(hearing_result_aliases)

In [9]:
# Discard the outcomes that are not kept as target classes.
excluded_results = ["NONE", "ADJOURNED", "STIPULATED"]
df = df.loc[~df["Hearing Result"].isin(excluded_results)]

In [10]:
# Label counts after merging the variant labels and removing the excluded outcomes.
df["Hearing Result"].value_counts()

WRITTEN OFF     263475
DEFAULTED       141695
DISMISSED       109692
IN VIOLATION     96449
Name: Hearing Result, dtype: int64

In [41]:
# Shape check after relabelling and filtering the target.
# NOTE(review): this cell's execution count (41) is out of sequence with the
# cells around it, so its saved output may be stale - re-run top to bottom.
df.shape

(568515, 66)

### EDA and Cleaning Columns

We only want to focus on respondents residing in New York.

In [11]:
# Distribution of respondents' states, to see how many rows fall outside New York.
df["Respondent Address (State Name)"].value_counts()

NEW YORK               568515
NEW JERSEY              20209
PENNSYLVANIA             1194
FLORIDA                   920
CONNECTICUT               694
MASSACHUSETTS             537
NORTH CAROLINA            358
VIRGINIA                  281
MARYLAND                  280
CALIFORNIA                238
GEORGIA                   178
ILLINOIS                  177
SOUTH CAROLINA            163
WASHINGTON                144
TEXAS                     138
MISSOURI                  111
OHIO                      108
RHODE ISLAND              107
MICHIGAN                  101
OTHER                      56
DELAWARE                   54
TENNESSEE                  45
OKLAHOMA                   42
ALABAMA                    40
LOUISIANA                  36
DIST COLUMBIA              34
NEBRASKA                   31
ARIZONA                    29
COLORADO                   28
MAINE                      25
OREGON                     23
INDIANA                    21
WISCONSIN                  20
KENTUCKY  

In [12]:
# Restrict the data to respondents whose address is in New York State.
lives_in_new_york = df["Respondent Address (State Name)"].eq("NEW YORK")
df = df.loc[lives_in_new_york]

In [13]:
# List the column names currently in the frame.
df.columns

Index(['Ticket Number', 'Violation Date', 'Violation Time', 'Issuing Agency',
       'Respondent First Name', 'Respondent Last Name', 'Balance Due',
       'Violation Location (Borough)', 'Violation Location (Block No.)',
       'Violation Location (Lot No.)', 'Violation Location (House #)',
       'Violation Location (Street Name)', 'Violation Location (Floor)',
       'Violation Location (City)', 'Violation Location (Zip Code)',
       'Violation Location (State Name)', 'Respondent Address (Borough)',
       'Respondent Address (House #)', 'Respondent Address (Street Name)',
       'Respondent Address (City)', 'Respondent Address (Zip Code)',
       'Respondent Address (State Name)', 'Hearing Status', 'Hearing Result',
       'Scheduled Hearing Location', 'Hearing Date', 'Hearing Time',
       'Decision Location (Borough)', 'Decision Date',
       'Total Violation Amount', 'Violation Details', 'Date Judgment Docketed',
       'Respondent Address or Facility Number(For FDNY and DOB Ti

In [15]:
# Names of the columns to remove from df (the drop itself happens in the next cell).
columns_to_drop = [
    "Violation Time",
    "Respondent First Name",
    "Respondent Last Name",
    "Balance Due",
    "Hearing Status",
    "Hearing Date",
    "Hearing Time",
    "Decision Date",
    "Date Judgment Docketed",
    "Respondent Address or Facility Number(For FDNY and DOB Tickets)",
    "Additional Penalties or Late Fees",
    "Compliance Status",
]

In [16]:
# Remove the listed columns from df in place.
df.drop(columns=columns_to_drop, inplace=True)

In [17]:
# Shape after dropping the columns listed in columns_to_drop.
df.shape

(568515, 66)

In [18]:
# Display configuration for inspecting wide frames with long text cells.
# `None` means "no limit" for cell width and column count, replacing the
# arbitrary huge integers used before. The row limit is deliberately bounded:
# an effectively unlimited max_rows makes any bare frame or series cell try to
# render every row of a frame with hundreds of thousands of rows.
MAX_DISPLAY_ROWS = 500

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", MAX_DISPLAY_ROWS)
pd.set_option("display.expand_frame_repr", True)

### Feature engineering: the violation location and respondent address columns

In [29]:
# Address components that are combined into a single free-text location string.
viol_cols = ['Violation Location (Borough)', 'Violation Location (Block No.)',
       'Violation Location (Lot No.)', 'Violation Location (House #)',
       'Violation Location (Street Name)', 'Violation Location (Floor)',
       'Violation Location (City)', 'Violation Location (Zip Code)',
       'Violation Location (State Name)']


resp_cols = ['Respondent Address (Borough)',
       'Respondent Address (House #)', 'Respondent Address (Street Name)',
       'Respondent Address (City)', 'Respondent Address (Zip Code)',
       'Respondent Address (State Name)']


def join_address_parts(frame, columns):
    """Build one space-separated string per row from the given columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the address component columns.
    columns : list of str
        Column names to concatenate, in output order.

    Returns
    -------
    pandas.Series
        Object-dtype series indexed like ``frame``. Missing parts (NaN/None)
        are skipped; a row whose parts are all missing yields ``""``.

    Notes
    -----
    Missing parts are skipped before joining rather than stringified and then
    stripped with ``str.replace("nan", "")``. That earlier approach also removed
    the letters "nan" from inside real values and left stray spaces where a
    part was absent.
    """
    joined = [
        " ".join(str(part) for part in parts if pd.notna(part))
        for parts in frame[columns].itertuples(index=False, name=None)
    ]
    return pd.Series(joined, index=frame.index, dtype=object)


df['complete violation location'] = join_address_parts(df, viol_cols)
df['complete respondent location'] = join_address_parts(df, resp_cols)

In [None]:
# Preview the first rows only; a bare `df` would try to render the whole frame
# under the unlimited display settings configured above.
df.head()

In [28]:
# Number of rows with no violation zip code.
df['Violation Location (Zip Code)'].isnull().sum()

276180

In [27]:
# Number of rows with no respondent zip code.
df["Respondent Address (Zip Code)"].isnull().sum()

24201

In [21]:

# Null check on the combined column. It is built by joining strings, so it
# holds text even where individual address parts are missing.
df["complete violation location"].isnull().sum()

0

In [None]:
# Preview the first few combined respondent addresses instead of printing the whole column.
df['complete respondent location'].head()

In [None]:
# Preview the first few violation dates instead of printing the whole column.
df["Violation Date"].head()

In [None]:
# Convert the column to pandas datetimes so the min/max checks below compare dates.
df["Violation Date"] = pd.to_datetime(df["Violation Date"])

In [None]:
# Earliest violation date in the data.
df["Violation Date"].min()

In [None]:
# Latest violation date in the data.
df["Violation Date"].max()

In [None]:
# Number of rows per violation date, most frequent dates first.
df["Violation Date"].value_counts()

In [None]:
# Absolute class counts of the target on the current (New York only) frame.
df["Hearing Result"].value_counts()

In [None]:
# Earliest violation date (repeats the check made a few cells earlier).
df["Violation Date"].min()

In [None]:
# Class balance of the target as proportions rather than counts.
df["Hearing Result"].value_counts(normalize=True)

In [None]:
# Sanity check: count of missing values remaining in the target column.
df["Hearing Result"].isnull().sum()

In [None]:
# Number of rows per issuing agency.
df["Issuing Agency"].value_counts()

In [None]:
# Number of rows per violation-location state.
df["Violation Location (State Name)"].value_counts()

In [None]:
# Show up to 500 columns, then display the rows whose violation location state is Colorado.
pd.set_option("display.max_columns", 500)
is_colorado = df["Violation Location (State Name)"] == "COLORADO"
df.loc[is_colorado]