In [350]:
import pandas as pd
import numpy as np


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder , StandardScaler 
from sklearn.compose import ColumnTransformer

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, GridSearchCV


from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report

In [233]:
df = pd.read_csv("df_train.csv", low_memory = False)
df['Respondent Address (Zip Code)'] = df['Respondent Address (Zip Code)'].astype(str)

In [234]:
df.shape

(213243, 38)

In [235]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result
0,545274,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,,,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,SAU: MANH,0.0,0.0,,,,,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMATIC TERMINATION,280.0,,,,,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 NEW YORK,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,2
1,25788,112647160,2003-11-23,POLICE DEPARTMENT,JOHN,PEZZULLO,BROOKLYN,989.0,42.0,256,7 AVENUE,BROOKLYN,11215,NEW YORK,BROOKLYN,359,9 AVE,BROOKLYN,11215,NEW YORK,,50.0,,AG13,20-465(C),STAND OR GOODS TOUCHING/LEANING AGAINST BUILDING,100.0,,,,,,,,,BROOKLYN 989.0 42.0 256 7 AVENUE BROOKLYN 11215 NEW YORK,BROOKLYN 359 9 AVE BROOKLYN 11215 NEW YORK,0


In [236]:
df = df.drop("Unnamed: 0", axis=1)

In [237]:
# Create a function to fill in UNKNOWN for all NAs in the dataframe
def fill_na(column):
    
    for x in column:
        df[x].fillna('UNKNOWN', inplace=True)
    
    return df.head(2)

In [238]:
print(list(df.columns))

columns_to_fill = ['Ticket Number', 'Violation Date', 'Issuing Agency', 'Respondent First Name', 'Respondent Last Name', 'Violation Location (Borough)', 'Violation Location (Block No.)', 'Violation Location (Lot No.)', 'Violation Location (House #)', 'Violation Location (Street Name)', 'Violation Location (City)', 'Violation Location (Zip Code)', 'Violation Location (State Name)', 'Respondent Address (Borough)', 'Respondent Address (House #)', 'Respondent Address (Street Name)', 'Respondent Address (City)', 'Respondent Address (Zip Code)', 'Respondent Address (State Name)', 'Decision Location (Borough)','Charge #1: Code', 'Charge #1: Code Section', 'Charge #1: Code Description', 'Charge #1: Infraction Amount', 'Charge #2: Code', 'Charge #2: Code Section', 'Charge #2: Code Description', 'Charge #2: Infraction Amount', 'Charge #3: Code', 'Charge #3: Code Section', 'Charge #3: Code Description', 'Charge #3: Infraction Amount', 'complete violation location', 'complete respondent location', 'Hearing Result']

['Ticket Number', 'Violation Date', 'Issuing Agency', 'Respondent First Name', 'Respondent Last Name', 'Violation Location (Borough)', 'Violation Location (Block No.)', 'Violation Location (Lot No.)', 'Violation Location (House #)', 'Violation Location (Street Name)', 'Violation Location (City)', 'Violation Location (Zip Code)', 'Violation Location (State Name)', 'Respondent Address (Borough)', 'Respondent Address (House #)', 'Respondent Address (Street Name)', 'Respondent Address (City)', 'Respondent Address (Zip Code)', 'Respondent Address (State Name)', 'Decision Location (Borough)', 'Penalty Imposed', 'Paid Amount', 'Charge #1: Code', 'Charge #1: Code Section', 'Charge #1: Code Description', 'Charge #1: Infraction Amount', 'Charge #2: Code', 'Charge #2: Code Section', 'Charge #2: Code Description', 'Charge #2: Infraction Amount', 'Charge #3: Code', 'Charge #3: Code Section', 'Charge #3: Code Description', 'Charge #3: Infraction Amount', 'complete violation location', 'complete resp

In [239]:
column_names = list(columns_to_fill)
fill_na(column_names)

Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result
0,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,UNKNOWN,UNKNOWN,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,SAU: MANH,0.0,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMATIC TERMINATION,280.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 NEW YORK,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,2
1,112647160,2003-11-23,POLICE DEPARTMENT,JOHN,PEZZULLO,BROOKLYN,989.0,42.0,256,7 AVENUE,BROOKLYN,11215,NEW YORK,BROOKLYN,359,9 AVE,BROOKLYN,11215,NEW YORK,UNKNOWN,50.0,,AG13,20-465(C),STAND OR GOODS TOUCHING/LEANING AGAINST BUILDING,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 989.0 42.0 256 7 AVENUE BROOKLYN 11215 NEW YORK,BROOKLYN 359 9 AVE BROOKLYN 11215 NEW YORK,0


In [240]:
# check to see that is the case
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213243 entries, 0 to 213242
Data columns (total 37 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   Ticket Number                     213243 non-null  object 
 1   Violation Date                    213243 non-null  object 
 2   Issuing Agency                    213243 non-null  object 
 3   Respondent First Name             213243 non-null  object 
 4   Respondent Last Name              213243 non-null  object 
 5   Violation Location (Borough)      213243 non-null  object 
 6   Violation Location (Block No.)    213243 non-null  object 
 7   Violation Location (Lot No.)      213243 non-null  object 
 8   Violation Location (House #)      213243 non-null  object 
 9   Violation Location (Street Name)  213243 non-null  object 
 10  Violation Location (City)         213243 non-null  object 
 11  Violation Location (Zip Code)     213243 non-null  o

In [241]:
df["Charge #2: Infraction Amount"].value_counts().head(3)

UNKNOWN    209908
0.0          2447
1000.0        157
Name: Charge #2: Infraction Amount, dtype: int64

## Bring in neighborhood level income data from the census 

1. main problem is USPS zip code (which is what the main df has) is different from census's zip code tabulation area number (thought a signficant portion of them match up identically). For example, one's USPS zip code could be 11333 but its zip code tabulation area could be 11332

2. Need to go through the following steps to get the census data to match up with the main dataframe 

     A. add a new column with ZIP Code Tabulation Areas (ZCTAs) so to pull census data using ZCTA
     
     B. web scrape census to get the list of ZCTA based on the dataframe 
     
     C. merge the dataframes together so each row contain neighborhood level income data 

In [242]:
nyc_zip_codes = ["10001", "10002", "10003", "10004", "10005", "10006",
                 "10007","10009","10010","10011","10012","10013","10014",
                 "10015","10016","10017","10018","10019","10020","10021",
                 "10022","10023","10024","10025","10026","10027","10028",
                 "10029","10030","10031","10032","10033","10034","10035",
                 "10036","10037","10038","10039","10040","10041","10044",
                 "10045","10048","10055","10060","10069","10090","10095",
                 "10098","10099","10103","10104","10105","10106","10107",
                 "10110","10111","10112","10115","10118","10119","10120",
                 "10121","10122","10123","10128","10151","10152","10153",
                 "10154","10155","10158","10161","10162","10165","10166",
                 "10167","10168","10169","10170","10171","10172","10173",
                 "10174","10175","10176","10177","10178","10199","10270",
                 "10271","10278","10279","10280","10281","10282","10301",
                 "10302","10303","10304","10305","10306","10307","10308",
                 "10309","10310","10311","10312","10314","10451","10452",
                 "10453","10454","10455","10456","10457","10458","10459",
                 "10460","10461","10462","10463","10464","10465","10466",
                 "10467","10468","10469","10470","10471","10472","10473",
                 "10474","10475","11004","11101","11102","11103","11104",
                 "11105","11106","11109","11201","11203","11204","11205",
                 "11206","11207","11208","11209","11210","11211","11212",
                 "11213","11214","11215","11216","11217","11218","11219",
                 "11220","11221","11222","11223","11224","11225","11226",
                 "11228","11229","11230","11231","11232","11233","11234",
                 "11235","11236","11237","11238","11239","11241","11242",
                 "11243","11249","11252","11256","11351","11354","11355",
                 "11356","11357","11358","11359","11360","11361","11362",
                 "11363","11364","11365","11366","11367","11368","11369",
                 "11370","11371","11372","11373","11374","11375","11377",
                 "11378","11379","11385","11411","11412","11413","11414",
                 "11415","11416","11417","11418","11419","11420","11421",
                 "11422","11423","11426","11427","11428","11429","11430",
                 "11432","11433","11434","11435","11436","11691","11692",
                 "11693","11694","11697"]


In [243]:
# this dataset contains USPS zip_code and ZCTA zip code for several states. 
# itindicates whether some a USPS zip code matches with a ZCTA code
# if certain ones doesn't match up, it indicates the equivalence of that


ZiptoZcta_Crosswalk_2021 = pd.read_excel("ZiptoZcta_Crosswalk_2021.xlsx")
ZiptoZcta_Crosswalk_2021["ZIP_CODE"] = ZiptoZcta_Crosswalk_2021["ZIP_CODE"].astype(str)
df['Respondent Address (Zip Code)'] = df['Respondent Address (Zip Code)'].astype(str)

# narrown down the df to only pull out zip codes that matches the nyc zipcode list above
ZiptoZcta_Crosswalk_2021["ZIP_CODE_NYC"] = np.where(ZiptoZcta_Crosswalk_2021["ZIP_CODE"].isin(nyc_zip_codes), "NYC", "Other")

In [244]:
print(ZiptoZcta_Crosswalk_2021["ZIP_CODE_NYC"].value_counts())

ZiptoZcta_Crosswalk_2021_NYC = ZiptoZcta_Crosswalk_2021.loc[ZiptoZcta_Crosswalk_2021["ZIP_CODE_NYC"] == "NYC"]
ZiptoZcta_Crosswalk_2021_NYC

Other    40873
NYC        234
Name: ZIP_CODE_NYC, dtype: int64


Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type,ZIP_CODE_NYC
0,10001,New York,NY,Zip Code Area,10001,Zip matches ZCTA,NYC
1,10002,New York,NY,Zip Code Area,10002,Zip matches ZCTA,NYC
3190,10003,New York,NY,Zip Code Area,10003,Zip matches ZCTA,NYC
3191,10004,New York,NY,Zip Code Area,10004,Zip matches ZCTA,NYC
3192,10005,New York,NY,Zip Code Area,10005,Zip matches ZCTA,NYC
...,...,...,...,...,...,...,...
5087,11242,Brooklyn,NY,Post Office or large volume customer,11201,Spatial join to ZCTA,NYC
5088,11243,Brooklyn,NY,Post Office or large volume customer,11217,Spatial join to ZCTA,NYC
5091,11249,Brooklyn,NY,Zip Code Area,11211,Spatial join to ZCTA,NYC
5093,11252,Brooklyn,NY,Zip Code Area,11209,Spatial join to ZCTA,NYC


In [245]:
ZiptoZcta_Crosswalk_2021_NYC.to_csv("NYC_Only_ZiptoZcta_Crosswalk_2021.csv")

In [246]:
#create a dic that has the usps zip code on the left and ZCTA code on the right so we can map it 
ZiptoZcta_Crosswalk_2021_NYC_dict = dict(zip(ZiptoZcta_Crosswalk_2021_NYC.ZIP_CODE, ZiptoZcta_Crosswalk_2021_NYC.ZCTA))
ZiptoZcta_Crosswalk_2021_NYC_dict

{'10001': '10001',
 '10002': '10002',
 '10003': '10003',
 '10004': '10004',
 '10005': '10005',
 '10006': '10006',
 '10007': '10007',
 '10009': '10009',
 '10010': '10010',
 '10011': '10011',
 '10012': '10012',
 '10013': '10013',
 '10014': '10014',
 '10016': '10016',
 '10017': '10017',
 '10018': '10018',
 '10019': '10019',
 '10020': '10020',
 '10021': '10021',
 '10022': '10022',
 '10023': '10023',
 '10024': '10024',
 '10025': '10025',
 '10026': '10026',
 '10027': '10027',
 '10028': '10028',
 '10029': '10029',
 '10030': '10030',
 '10031': '10031',
 '10032': '10032',
 '10033': '10033',
 '10034': '10034',
 '10035': '10035',
 '10036': '10036',
 '10037': '10037',
 '10038': '10038',
 '10039': '10039',
 '10040': '10040',
 '10044': '10044',
 '10069': '10069',
 '10103': '10103',
 '10110': '10110',
 '10111': '10111',
 '10112': '10112',
 '10115': '10115',
 '10119': '10119',
 '10128': '10128',
 '10152': '10152',
 '10153': '10153',
 '10154': '10154',
 '10162': '10162',
 '10165': '10165',
 '10167': '1

In [247]:
#using the key above, if the respondent address column matches the key, then the new column will match it with the value
df['Respondent ZCTA'] = df['Respondent Address (Zip Code)'].map(ZiptoZcta_Crosswalk_2021_NYC_dict)

In [248]:
df["Respondent ZCTA"].isnull().value_counts()

False    208967
True       4276
Name: Respondent ZCTA, dtype: int64

In [249]:
df['Respondent ZCTA'] = df["Respondent ZCTA"].astype(str)

print("Respondent ZCTA column contains a total {} unique zip codes. I will use this list to do web scraping\
to get the relevant census files".format(len(df['Respondent ZCTA'].unique())))

Respondent_ZCTA_list = list(set(df['Respondent ZCTA']))
print(Respondent_ZCTA_list)

Respondent ZCTA column contains a total 189 unique zip codes. I will use this list to do web scrapingto get the relevant census files
['10033', '11211', '10103', '11214', '11412', '11239', '10465', '10170', '11235', '11355', '10021', '10024', '10013', '10308', '10455', '10016', '10451', '10464', '11004', '11210', '11691', '11207', '10009', '10472', '10014', '11411', '11697', '11428', '10473', '11104', '10467', '10037', '11219', '11426', '10032', '11360', '11429', '10003', '10031', '11203', '11225', '10165', '11215', '10271', '11204', '10030', '10307', '10019', '11414', '11413', '11368', '10026', '10305', '10006', '11693', '10018', '10312', '11385', '10471', '10280', '11369', '10044', '11364', '11356', '10463', '11372', '11230', '11366', '10012', '11367', '11417', '10304', '10119', '10303', '10466', '10461', '10020', '11374', '11419', '11201', '11232', '11694', '10027', '10040', '11229', '10453', '11415', '11223', '10474', 'nan', '11226', '11436', '10111', '11220', '11421', '10456', '11

# Web Scraping

In [None]:
pip install selenium

In [None]:
pip install chromedriver-binary

In [None]:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import chromedriver_binary
#import get to call a get request on the site
from requests import get
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import chromedriver_binary

driver = webdriver.Chrome("/Users/allisongao/Downloads/chromedriver 4")

In [None]:
ZCTA_url =[]

for x in Respondent_ZCTA_list:
    ZCTA_url.append("https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US" + x + "&tid=ACSST5Y2019.S1901&hidePreview=true")
    
    
ZCTA_url

In [None]:
#code to auto click on the download button to get all the census data
import time
# #importing webdriver from selenium
# from selenium import webdriver
 
# # Here Chrome  will be used
# driver = webdriver.Chrome("/Users/allisongao/Downloads/chromedriver 4")
 
# firs_group = ['https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11229&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11215&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11435&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11208&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11367&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11206&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10458&tid=ACSST5Y2019.S1901&hidePreview=true']
    
# # URL of website

# for url in firs_group:
#     driver.get(url)
    
#     downloadBtn = driver.find_element_by_xpath("//*[text() = 'Download']")
#     downloadBtn.click()
#     time.sleep(5)
#     downloadBtn2 = driver.find_element_by_xpath("//button[@class='aqua-button mt-5']")
#     downloadBtn2.click()
#     time.sleep(2)
#     downloadBtn3=driver.find_element_by_xpath('//*[@id="table-download-now-button"]')
#     downloadBtn3.click()


for x in Respondent_ZCTA_list_39:
    
    link="https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US" + x + "&tid=ACSST5Y2019.S1901&hidePreview=true"
#     link= x
    driver.get(link)
    downloadBtn = driver.find_element_by_xpath("//*[text() = 'Download']")
    downloadBtn.click()
    time.sleep(60)
    downloadBtn2 = driver.find_element_by_xpath("//button[@class='aqua-button mt-5']")
    downloadBtn2.click()
    time.sleep(60)
    downloadBtn3=driver.find_element_by_xpath('//*[@id="table-download-now-button"]')
    downloadBtn3.click()
    time.sleep(60)

# load census data

In [250]:
# load all the census data by zip code and concat it into a single dataframe

zcta_df = []

for x in range(1, 190):
    data = pd.read_csv(str(x) + "ACS.csv")
    
    zcta_df.append(data)
    
#merge them all horizontally 
zcta_df = pd.concat(zcta_df)   


#drop duplicated rows
zcta_df.drop_duplicates(keep='first', inplace=True)

zcta_df = zcta_df.reset_index(drop=True)

#second row is an observation for the entire country, let's drop it
zcta_df.drop(2, inplace=True)

 #change the first row for the header
new_header = zcta_df.iloc[0]
zcta_df = zcta_df[1:]
zcta_df.columns = new_header 

#make sure df is in good shape
zcta_df.head(5)

Unnamed: 0,id,Geographic Area Name,Estimate!!Households!!Total,Margin of Error!!Households!!Total,"Estimate!!Households!!Total!!Less than $10,000","Margin of Error!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Margin of Error!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Margin of Error!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Margin of Error!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Margin of Error!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999","Margin of Error!!Households!!Total!!$50,000 to $74,999","Estimate!!Households!!Total!!$75,000 to $99,999","Margin of Error!!Households!!Total!!$75,000 to $99,999","Estimate!!Households!!Total!!$100,000 to $149,999","Margin of Error!!Households!!Total!!$100,000 to $149,999","Estimate!!Households!!Total!!$150,000 to $199,999","Margin of Error!!Households!!Total!!$150,000 to $199,999","Estimate!!Households!!Total!!$200,000 or more","Margin of Error!!Households!!Total!!$200,000 or more",Estimate!!Households!!Median income (dollars),Margin of Error!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),Margin of Error!!Households!!Mean income (dollars),Estimate!!Households!!PERCENT ALLOCATED!!Household income in the past 12 months,Margin of Error!!Households!!PERCENT ALLOCATED!!Household income in the past 12 months,Estimate!!Households!!PERCENT ALLOCATED!!Family income in the past 12 months,Margin of Error!!Households!!PERCENT ALLOCATED!!Family income in the past 12 months,Estimate!!Households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Estimate!!Families!!Total,Margin of Error!!Families!!Total,"Estimate!!Families!!Total!!Less than $10,000","Margin of Error!!Families!!Total!!Less than $10,000","Estimate!!Families!!Total!!$10,000 to $14,999","Margin of Error!!Families!!Total!!$10,000 to $14,999","Estimate!!Families!!Total!!$15,000 to $24,999","Margin of Error!!Families!!Total!!$15,000 to $24,999","Estimate!!Families!!Total!!$25,000 to $34,999","Margin of Error!!Families!!Total!!$25,000 to $34,999","Estimate!!Families!!Total!!$35,000 to $49,999","Margin of Error!!Families!!Total!!$35,000 to $49,999","Estimate!!Families!!Total!!$50,000 to $74,999","Margin of Error!!Families!!Total!!$50,000 to $74,999","Estimate!!Families!!Total!!$75,000 to $99,999","Margin of Error!!Families!!Total!!$75,000 to $99,999","Estimate!!Families!!Total!!$100,000 to $149,999","Margin of Error!!Families!!Total!!$100,000 to $149,999","Estimate!!Families!!Total!!$150,000 to $199,999","Margin of Error!!Families!!Total!!$150,000 to $199,999","Estimate!!Families!!Total!!$200,000 or more","Margin of Error!!Families!!Total!!$200,000 or more",Estimate!!Families!!Median income (dollars),Margin of Error!!Families!!Median income (dollars),Estimate!!Families!!Mean income (dollars),Margin of Error!!Families!!Mean income (dollars),Estimate!!Families!!PERCENT ALLOCATED!!Household income in the past 12 months,Margin of Error!!Families!!PERCENT ALLOCATED!!Household income in the past 12 months,Estimate!!Families!!PERCENT ALLOCATED!!Family income in the past 12 months,Margin of Error!!Families!!PERCENT ALLOCATED!!Family income in the past 12 months,Estimate!!Families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Estimate!!Married-couple families!!Total,Margin of Error!!Married-couple families!!Total,"Estimate!!Married-couple families!!Total!!Less than $10,000","Margin of Error!!Married-couple families!!Total!!Less than $10,000","Estimate!!Married-couple families!!Total!!$10,000 to $14,999","Margin of Error!!Married-couple families!!Total!!$10,000 to $14,999","Estimate!!Married-couple families!!Total!!$15,000 to $24,999","Margin of Error!!Married-couple families!!Total!!$15,000 to $24,999","Estimate!!Married-couple families!!Total!!$25,000 to $34,999","Margin of Error!!Married-couple families!!Total!!$25,000 to $34,999","Estimate!!Married-couple families!!Total!!$35,000 to $49,999","Margin of Error!!Married-couple families!!Total!!$35,000 to $49,999","Estimate!!Married-couple families!!Total!!$50,000 to $74,999","Margin of Error!!Married-couple families!!Total!!$50,000 to $74,999","Estimate!!Married-couple families!!Total!!$75,000 to $99,999","Margin of Error!!Married-couple families!!Total!!$75,000 to $99,999","Estimate!!Married-couple families!!Total!!$100,000 to $149,999","Margin of Error!!Married-couple families!!Total!!$100,000 to $149,999","Estimate!!Married-couple families!!Total!!$150,000 to $199,999","Margin of Error!!Married-couple families!!Total!!$150,000 to $199,999","Estimate!!Married-couple families!!Total!!$200,000 or more","Margin of Error!!Married-couple families!!Total!!$200,000 or more",Estimate!!Married-couple families!!Median income (dollars),Margin of Error!!Married-couple families!!Median income (dollars),Estimate!!Married-couple families!!Mean income (dollars),Margin of Error!!Married-couple families!!Mean income (dollars),Estimate!!Married-couple families!!PERCENT ALLOCATED!!Household income in the past 12 months,Margin of Error!!Married-couple families!!PERCENT ALLOCATED!!Household income in the past 12 months,Estimate!!Married-couple families!!PERCENT ALLOCATED!!Family income in the past 12 months,Margin of Error!!Married-couple families!!PERCENT ALLOCATED!!Family income in the past 12 months,Estimate!!Married-couple families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Married-couple families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Estimate!!Nonfamily households!!Total,Margin of Error!!Nonfamily households!!Total,"Estimate!!Nonfamily households!!Total!!Less than $10,000","Margin of Error!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Margin of Error!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Margin of Error!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Margin of Error!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Margin of Error!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999","Margin of Error!!Nonfamily households!!Total!!$50,000 to $74,999","Estimate!!Nonfamily households!!Total!!$75,000 to $99,999","Margin of Error!!Nonfamily households!!Total!!$75,000 to $99,999","Estimate!!Nonfamily households!!Total!!$100,000 to $149,999","Margin of Error!!Nonfamily households!!Total!!$100,000 to $149,999","Estimate!!Nonfamily households!!Total!!$150,000 to $199,999","Margin of Error!!Nonfamily households!!Total!!$150,000 to $199,999","Estimate!!Nonfamily households!!Total!!$200,000 or more","Margin of Error!!Nonfamily households!!Total!!$200,000 or more",Estimate!!Nonfamily households!!Median income (dollars),Margin of Error!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),Margin of Error!!Nonfamily households!!Mean income (dollars),Estimate!!Nonfamily households!!PERCENT ALLOCATED!!Household income in the past 12 months,Margin of Error!!Nonfamily households!!PERCENT ALLOCATED!!Household income in the past 12 months,Estimate!!Nonfamily households!!PERCENT ALLOCATED!!Family income in the past 12 months,Margin of Error!!Nonfamily households!!PERCENT ALLOCATED!!Family income in the past 12 months,Estimate!!Nonfamily households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Nonfamily households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months
1,8600000US10005,ZCTA5 10005,4363,396,4.5,2.3,0.3,0.5,2.6,2.0,0.3,0.4,1.7,1.4,6.2,3.2,5.2,2.9,16.3,5.3,16.4,5.2,46.5,6.6,189702,15832,225453,25771,21.8,(X),(X),(X),(X),(X),1422,296,0.9,1.5,0.0,2.3,2.0,3.3,0.8,1.2,0.2,0.3,2.7,3.9,4.4,4.5,15.3,9.9,8.1,5.7,65.5,11.7,241094,37175,312806,57275,(X),(X),21.3,(X),(X),(X),1268,279,1.0,1.7,0.0,2.5,0.0,2.5,0.0,2.5,0.2,0.4,3.1,4.4,5.0,5.0,17.1,11.1,8.0,6.3,65.6,12.4,239800,34372,N,N,(X),(X),(X),(X),(X),(X),2941,432,6.3,3.4,0.5,0.8,2.9,2.6,0.0,1.1,2.4,2.1,7.9,4.2,5.6,3.8,16.9,6.8,20.7,7.1,36.9,8.1,180389,15320,180629,19749,(X),(X),(X),(X),22.0,(X)
3,8600000US10002,ZCTA5 10002,33459,574,14.7,1.7,11.8,1.3,13.3,1.7,8.6,1.2,10.9,1.6,11.0,1.4,8.6,1.3,9.7,1.2,4.3,0.7,7.1,0.8,36982,2384,69323,3102,32.0,(X),(X),(X),(X),(X),16723,775,8.5,1.9,10.2,2.0,15.6,2.5,10.8,1.9,14.4,2.1,12.8,2.0,7.6,1.4,8.8,1.6,4.1,1.0,7.2,1.1,40045,3383,72064,4340,(X),(X),36.6,(X),(X),(X),9865,727,4.5,1.6,11.5,2.6,14.4,3.0,11.1,2.6,13.6,3.0,11.6,2.4,7.8,2.1,9.5,2.4,5.3,1.7,10.7,1.9,43952,5337,86881,6899,(X),(X),(X),(X),(X),(X),16736,868,20.9,2.6,13.5,2.1,11.2,2.4,6.2,1.3,7.8,2.1,9.4,2.0,9.5,1.8,10.6,1.9,4.2,1.0,6.7,1.1,31767,4138,65351,4947,(X),(X),(X),(X),27.1,(X)
4,8600000US10003,ZCTA5 10003,26043,638,5.8,1.3,2.5,1.0,4.8,1.2,4.9,1.3,6.1,1.5,12.5,2.1,8.8,1.7,13.4,1.8,11.1,1.6,30.2,2.4,118161,10235,196931,12709,31.2,(X),(X),(X),(X),(X),7504,569,1.4,1.1,1.5,1.0,1.2,1.2,3.0,1.6,5.3,2.1,7.9,3.4,8.4,2.5,14.1,3.6,10.5,2.7,46.7,4.3,183787,27211,320538,32403,(X),(X),34.5,(X),(X),(X),6544,542,0.5,0.7,0.5,0.5,0.7,0.9,2.4,1.7,6.1,2.4,7.2,3.6,9.1,2.7,14.0,3.9,10.5,2.8,49.0,4.7,193778,25886,N,N,(X),(X),(X),(X),(X),(X),18539,718,7.6,1.8,3.0,1.3,6.2,1.6,5.5,1.6,6.5,2.0,14.4,2.5,8.9,1.9,13.2,2.1,11.6,2.0,23.2,2.6,95812,6867,146060,9333,(X),(X),(X),(X),29.9,(X)
5,8600000US11215,ZCTA5 11215,28445,510,4.0,0.9,2.1,0.6,3.2,0.7,3.4,0.8,4.4,0.9,9.1,1.2,10.6,1.4,19.1,1.7,13.9,1.5,30.2,1.7,132091,4395,183544,7698,31.3,(X),(X),(X),(X),(X),16621,569,1.8,0.7,0.8,0.5,2.1,0.8,1.8,0.8,3.9,1.0,6.9,1.3,7.9,1.4,17.8,2.0,15.5,1.8,41.5,2.5,170511,9001,229884,11295,(X),(X),29.9,(X),(X),(X),13954,561,1.1,0.7,0.0,0.1,1.2,0.6,1.3,0.7,3.1,1.0,5.8,1.4,6.4,1.3,17.2,2.1,16.0,2.1,47.8,2.8,188946,12173,N,N,(X),(X),(X),(X),(X),(X),11824,595,7.4,1.8,4.2,1.4,5.3,1.5,5.5,1.5,5.8,1.4,12.7,2.0,15.0,2.3,21.0,2.6,11.4,2.3,11.9,1.9,91672,4607,108235,5400,(X),(X),(X),(X),31.5,(X)
6,8600000US10031,ZCTA5 10031,21744,574,9.8,1.9,7.1,1.6,9.8,1.6,9.4,1.9,11.8,1.9,14.8,2.0,11.6,1.8,13.1,1.7,5.7,1.4,7.0,1.3,53660,3965,78767,4819,38.3,(X),(X),(X),(X),(X),10591,512,6.1,1.9,5.2,2.0,11.1,2.4,11.1,2.9,11.0,2.2,16.8,3.3,11.1,2.4,13.2,2.5,6.2,2.3,8.1,1.9,54699,4132,85680,7381,(X),(X),40.7,(X),(X),(X),5177,495,2.0,1.4,3.5,2.5,8.4,2.7,10.8,3.8,10.0,3.2,14.5,4.2,11.1,3.2,19.9,4.3,7.4,3.1,12.5,3.4,78197,12406,N,N,(X),(X),(X),(X),(X),(X),11153,591,14.5,3.0,8.8,2.4,10.0,2.2,8.0,2.1,12.6,2.6,12.8,2.6,11.7,2.7,10.8,2.2,5.4,1.8,5.5,1.9,45735,4549,68336,6814,(X),(X),(X),(X),35.0,(X)


In [251]:
# need to convert the Geographic Area Name column so that I can use this to merge it to the main df
zcta_df["Geographic Area Name"]= zcta_df["Geographic Area Name"].str.replace('ZCTA5', '')
zcta_df['Geographic Area Name'] = zcta_df['Geographic Area Name'].astype(str)
zcta_df['Geographic Area Name'] = zcta_df['Geographic Area Name'].str.lstrip()

In [252]:
for x in list (zcta_df.columns.tolist()):
    print(x)


id
Geographic Area Name
Estimate!!Households!!Total
Margin of Error!!Households!!Total
Estimate!!Households!!Total!!Less than $10,000
Margin of Error!!Households!!Total!!Less than $10,000
Estimate!!Households!!Total!!$10,000 to $14,999
Margin of Error!!Households!!Total!!$10,000 to $14,999
Estimate!!Households!!Total!!$15,000 to $24,999
Margin of Error!!Households!!Total!!$15,000 to $24,999
Estimate!!Households!!Total!!$25,000 to $34,999
Margin of Error!!Households!!Total!!$25,000 to $34,999
Estimate!!Households!!Total!!$35,000 to $49,999
Margin of Error!!Households!!Total!!$35,000 to $49,999
Estimate!!Households!!Total!!$50,000 to $74,999
Margin of Error!!Households!!Total!!$50,000 to $74,999
Estimate!!Households!!Total!!$75,000 to $99,999
Margin of Error!!Households!!Total!!$75,000 to $99,999
Estimate!!Households!!Total!!$100,000 to $149,999
Margin of Error!!Households!!Total!!$100,000 to $149,999
Estimate!!Households!!Total!!$150,000 to $199,999
Margin of Error!!Households!!Total!!$

In [253]:
# the census data df has a lot of information
# isolate the relevant columns for the final df

columns = ['Geographic Area Name',
           
            "Estimate!!Nonfamily households!!Median income (dollars)",
            "Estimate!!Nonfamily households!!Mean income (dollars)",
            "Estimate!!Nonfamily households!!Total!!Less than $10,000",
            "Estimate!!Nonfamily households!!Total!!$10,000 to $14,999",
            "Estimate!!Nonfamily households!!Total!!$15,000 to $24,999",
            "Estimate!!Nonfamily households!!Total!!$25,000 to $34,999",
            "Estimate!!Nonfamily households!!Total!!$35,000 to $49,999",
            "Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",
           
           
           
           "Estimate!!Households!!Median income (dollars)",
           "Estimate!!Households!!Mean income (dollars)",
           "Estimate!!Households!!Total!!Less than $10,000",
            "Estimate!!Households!!Total!!$10,000 to $14,999",
            "Estimate!!Households!!Total!!$15,000 to $24,999",
            "Estimate!!Households!!Total!!$25,000 to $34,999",
            "Estimate!!Households!!Total!!$35,000 to $49,999",
            "Estimate!!Households!!Total!!$50,000 to $74,999"]
           

In [254]:
zcta_df = zcta_df[columns]
zcta_df.head(8)

Unnamed: 0,Geographic Area Name,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999"
1,10005,180389,180629,6.3,0.5,2.9,0.0,2.4,7.9,189702,225453,4.5,0.3,2.6,0.3,1.7,6.2
3,10002,31767,65351,20.9,13.5,11.2,6.2,7.8,9.4,36982,69323,14.7,11.8,13.3,8.6,10.9,11.0
4,10003,95812,146060,7.6,3.0,6.2,5.5,6.5,14.4,118161,196931,5.8,2.5,4.8,4.9,6.1,12.5
5,11215,91672,108235,7.4,4.2,5.3,5.5,5.8,12.7,132091,183544,4.0,2.1,3.2,3.4,4.4,9.1
6,10031,45735,68336,14.5,8.8,10.0,8.0,12.6,12.8,53660,78767,9.8,7.1,9.8,9.4,11.8,14.8
7,11208,24462,37087,23.3,10.5,17.0,11.7,12.3,10.6,42403,60092,15.0,5.9,11.5,9.7,13.5,15.1
8,10023,100531,147850,8.5,3.5,6.0,5.9,4.9,11.0,132605,228373,5.7,2.6,5.4,4.9,3.7,8.0
9,11217,94148,120780,10.1,2.9,7.0,4.8,6.2,9.4,119375,172251,6.7,1.9,5.4,5.0,6.4,8.7


In [255]:
zcta_df = zcta_df.rename(columns={"Geographic Area Name": 'Respondent Address (Zip Code)'})

In [256]:
column_need_convert = ["Estimate!!Nonfamily households!!Median income (dollars)",
            "Estimate!!Nonfamily households!!Mean income (dollars)",
            "Estimate!!Nonfamily households!!Total!!Less than $10,000",
            "Estimate!!Nonfamily households!!Total!!$10,000 to $14,999",
            "Estimate!!Nonfamily households!!Total!!$15,000 to $24,999",
            "Estimate!!Nonfamily households!!Total!!$25,000 to $34,999",
            "Estimate!!Nonfamily households!!Total!!$35,000 to $49,999",
            "Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",
           
           
           
           "Estimate!!Households!!Median income (dollars)",
           "Estimate!!Households!!Mean income (dollars)",
           "Estimate!!Households!!Total!!Less than $10,000",
            "Estimate!!Households!!Total!!$10,000 to $14,999",
            "Estimate!!Households!!Total!!$15,000 to $24,999",
            "Estimate!!Households!!Total!!$25,000 to $34,999",
            "Estimate!!Households!!Total!!$35,000 to $49,999",
            "Estimate!!Households!!Total!!$50,000 to $74,999"]

In [257]:
for x in column_need_convert:
    
    zcta_df[x].replace('-', np.nan, inplace=True)
    zcta_df[x] = zcta_df[x].astype("float64")
    zcta_df[x].fillna(zcta_df[x].mean(), inplace=True)
    
    

In [258]:
zcta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188 entries, 1 to 189
Data columns (total 17 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Respondent Address (Zip Code)                              188 non-null    object 
 1   Estimate!!Nonfamily households!!Median income (dollars)    188 non-null    float64
 2   Estimate!!Nonfamily households!!Mean income (dollars)      188 non-null    float64
 3   Estimate!!Nonfamily households!!Total!!Less than $10,000   188 non-null    float64
 4   Estimate!!Nonfamily households!!Total!!$10,000 to $14,999  188 non-null    float64
 5   Estimate!!Nonfamily households!!Total!!$15,000 to $24,999  188 non-null    float64
 6   Estimate!!Nonfamily households!!Total!!$25,000 to $34,999  188 non-null    float64
 7   Estimate!!Nonfamily households!!Total!!$35,000 to $49,999  188 non-null    float64
 8   Estimate!!

In [259]:
zcta_df.iloc[0,0]

'10005'

In [260]:
df.iloc[7550,17]

'10005'

# merge main df with the census df

In [261]:
merged_train_df = pd.merge(df, zcta_df, how="outer", on=["Respondent Address (Zip Code)"])

In [262]:
merged_train_df.tail(1)

Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result,Respondent ZCTA,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999"
213243,,,,,,,,,,,,,,,,,,10199,,,,,,,,,,,,,,,,,,,,,51558.398844,73118.947977,13.435838,9.293064,11.201734,8.663584,10.505202,14.07052,75616.485549,106351.543353,7.816763,5.321387,8.372254,7.57341,9.93237,14.142197


In [263]:
merged_train_df.drop(merged_train_df.tail(1).index,inplace=True)

In [264]:
merged_train_df.shape

(213243, 54)

In [265]:
#moved the target column to the first for easier visual
first_column = merged_train_df.pop('Hearing Result')
merged_train_df.insert(0, 'Hearing Result', first_column)
merged_train_df.head(2)

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Respondent ZCTA,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999"
0,2.0,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,UNKNOWN,UNKNOWN,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,SAU: MANH,0.0,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMATIC TERMINATION,280.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 NEW YORK,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,11229,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4
1,0.0,176434684,2010-09-10,POLICE DEPARTMENT,KONSTANTIN,TSIPNYATOV,MANHATTAN,1016.0,36.0,1515,BROADWAY,NEW YORK,10036,NEW YORK,BROOKLYN,1815,EAST 17 STREET,BROOKLYN,11229,NEW YORK,UNKNOWN,1000.0,0.0,AG21,20-465.1,VENDING AT TIMES PLACES RESTRICTED BY RULE OF VENDOR REV. PANEL,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 1016.0 36.0 1515 BROADWAY NEW YORK 10036 NEW YORK,BROOKLYN 1815 EAST 17 STREET BROOKLYN 11229 NEW YORK,11229,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4


In [266]:
merged_train_df["Hearing Result"].value_counts(normalize=True)

0.0    0.429200
1.0    0.273505
3.0    0.150786
2.0    0.146509
Name: Hearing Result, dtype: float64

In [267]:
print(list(merged_train_df.columns))

['Hearing Result', 'Ticket Number', 'Violation Date', 'Issuing Agency', 'Respondent First Name', 'Respondent Last Name', 'Violation Location (Borough)', 'Violation Location (Block No.)', 'Violation Location (Lot No.)', 'Violation Location (House #)', 'Violation Location (Street Name)', 'Violation Location (City)', 'Violation Location (Zip Code)', 'Violation Location (State Name)', 'Respondent Address (Borough)', 'Respondent Address (House #)', 'Respondent Address (Street Name)', 'Respondent Address (City)', 'Respondent Address (Zip Code)', 'Respondent Address (State Name)', 'Decision Location (Borough)', 'Penalty Imposed', 'Paid Amount', 'Charge #1: Code', 'Charge #1: Code Section', 'Charge #1: Code Description', 'Charge #1: Infraction Amount', 'Charge #2: Code', 'Charge #2: Code Section', 'Charge #2: Code Description', 'Charge #2: Infraction Amount', 'Charge #3: Code', 'Charge #3: Code Section', 'Charge #3: Code Description', 'Charge #3: Infraction Amount', 'complete violation locatio

columns_fill_na = [
            "Estimate!!Nonfamily households!!Median income (dollars)",
            "Estimate!!Nonfamily households!!Mean income (dollars)",
            "Estimate!!Nonfamily households!!Total!!Less than $10,000",
            "Estimate!!Nonfamily households!!Total!!$10,000 to $14,999",
            "Estimate!!Nonfamily households!!Total!!$15,000 to $24,999",
            "Estimate!!Nonfamily households!!Total!!$25,000 to $34,999",
            "Estimate!!Nonfamily households!!Total!!$35,000 to $49,999",
            "Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",
           "Estimate!!Households!!Median income (dollars)",
           "Estimate!!Households!!Mean income (dollars)",
           "Estimate!!Households!!Total!!Less than $10,000",
            "Estimate!!Households!!Total!!$10,000 to $14,999",
            "Estimate!!Households!!Total!!$15,000 to $24,999",
            "Estimate!!Households!!Total!!$25,000 to $34,999",
            "Estimate!!Households!!Total!!$35,000 to $49,999",
            "Estimate!!Households!!Total!!$50,000 to $74,999"]

def fill_na(column):
    
    for x in column:
        merged_train_df[x].fillna('UNKNOWN', inplace=True)
    
    return merged_train_df.head(2)

fill_na(columns_fill_na)

In [268]:
merged_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213243 entries, 0 to 213242
Data columns (total 54 columns):
 #   Column                                                     Non-Null Count   Dtype  
---  ------                                                     --------------   -----  
 0   Hearing Result                                             213243 non-null  float64
 1   Ticket Number                                              213243 non-null  object 
 2   Violation Date                                             213243 non-null  object 
 3   Issuing Agency                                             213243 non-null  object 
 4   Respondent First Name                                      213243 non-null  object 
 5   Respondent Last Name                                       213243 non-null  object 
 6   Violation Location (Borough)                               213243 non-null  object 
 7   Violation Location (Block No.)                             213243 non-null  object 

In [269]:
merged_train_df["Violation Location (Borough)"].value_counts()

MANHATTAN    63768
BROOKLYN     56753
BRONX        47226
QUEENS       35687
STATEN IS     9791
NOT NYC         13
UNKNOWN          5
Name: Violation Location (Borough), dtype: int64

In [270]:
# looked through each of the unknown category and corrected their borough based on the addressed provided 
list_zip_code_change_MA = [5149, 56147,113291,149645,187366, 95107,194000,211902]

for x in list_zip_code_change_MA:
    merged_train_df.at[x, "Violation Location (Borough)"] = "MANHATTAN"
    
    

list_zip_code_change_QNS = [52891, 100964,183982,201329]

for x in list_zip_code_change_QNS:
    merged_train_df.at[x, "Violation Location (Borough)"] = "QUEENS"
    
    
    
list_zip_code_change_SI = [59790, 72781,60756,7278,73248,73613,206307]

for x in list_zip_code_change_SI:
    merged_train_df.at[x, "Violation Location (Borough)"] = "STATEN IS"
    
    
merged_train_df.at[48408, "Violation Location (Zip Code)"] = 10038
merged_train_df.at[210827, "Violation Location (Zip Code)"] = 11559
merged_train_df.at[210873, "Violation Location (Zip Code)"] = 11692
merged_train_df.at[210879, "Violation Location (Zip Code)"] = 11233
merged_train_df.at[210892, "Violation Location (Zip Code)"] = 11692

In [271]:
#check to make sure this column is corrected
merged_train_df["Violation Location (Borough)"].value_counts()

MANHATTAN    63776
BROOKLYN     56752
BRONX        47226
QUEENS       35691
STATEN IS     9798
Name: Violation Location (Borough), dtype: int64

In [272]:
merged_train_df["Violation Location (Zip Code)"].value_counts()

10036    9676
10001    6684
11207    6075
10466    5417
11368    4778
         ... 
10151       1
11474       1
10168       1
11555       1
10704       1
Name: Violation Location (Zip Code), Length: 233, dtype: int64

In [273]:
merged_train_df["Respondent Address (Zip Code)"].value_counts()



11207        6387
11368        5507
10466        5506
10457        5077
10456        4951
             ... 
117983702       1
12762           1
107             1
10354           1
14074           1
Name: Respondent Address (Zip Code), Length: 1108, dtype: int64

In [274]:
# pulling out weird respondent address zip code 

117792218 = 11779

117571211 = 11757
117433914 
112212517
117576451
117983702
117274069       
105471054       
7055 =07055 NJ
7102 ==07102 NJ
107012849
8879 == 08879NJ
1008 == unclear delete unknown 
115424211 
104576724
101
7506
117573534       
117033221       
107             
116913065       
100103202
115544540
8610
115201726 
2908
103010468 
112170022
7083
116931854
1226
113733607
7166
7050
7036
116914809
103
7011
7112
7043
117691823       
107103211       
117530754       
115503908
1915
109181420
110035033
8701
115503445
7047
7073 
112041721 
116920311  
116900171     
117794358 
7014
116972206
110
113551701 
112324228
115581926       
7065
115504822
116941710
112
7054 
111
7033 
752
100290289       
112321637       
125949759       
112171006 
112022783 
117061409
100
196
7307
7303
1105
7105
116931498
7631
113
7032            
7726
116915609 
100302472
115012242
112181121
107042262
115532021
116932127
105531607
115504718  
116972203
7304
1120
112181852 
11
117252116
104 
7306
110961359

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (<ipython-input-274-ad804dc0ffb4>, line 12)

In [275]:
merged_train_df["Decision Location (Borough)"].value_counts()

UNKNOWN      152391
MANHATTAN     32769
BROOKLYN       8722
QUEENS         6045
BRONX          4846
SAU: MANH      4545
BY PHONE       1412
STATEN IS      1019
LONG ISLA       746
ONE-CLICK       730
SAU: BX           9
SAU: BKLN         8
SAU: LIC          1
Name: Decision Location (Borough), dtype: int64

In [276]:
# based on the value counts above, we can group this column to make it cleaner


merged_train_df['Decision Location (Borough)'] = merged_train_df['Decision Location (Borough)'].map(
                                     {'UNKNOWN': "UNKNOWN",
                                      'MANHATTAN': 'In Person MANHATTAN',
                                      'BROOKLYN': "In Person BROOKLYN",
                                      'QUEENS': "In Person QUEENS",
                                      'BRONX': "In Person BRONX",
                                      'SAU: MANH': "By Mail MANHATTAN",
                                      'BY PHONE': "BY PHONE",
                                      'STATEN IS': "STATEN IS",
                                     'LONG ISLA': "In Person QUEENS",
                                     'SAU: BX': "By Mail BRONX",
                                     'SAU: BKLN': "By Mail BKLN",
                                     'SAU: LIC': "By Mail QUEENS",
                                     'ONE-CLICK ': "Electronic"
                                     })

In [277]:
merged_train_df["Decision Location (Borough)"].value_counts()

UNKNOWN                152391
In Person MANHATTAN     32769
In Person BROOKLYN       8722
In Person QUEENS         6791
In Person BRONX          4846
By Mail MANHATTAN        4545
BY PHONE                 1412
STATEN IS                1019
By Mail BRONX               9
By Mail BKLN                8
By Mail QUEENS              1
Name: Decision Location (Borough), dtype: int64

In [278]:
merged_train_df["Penalty Imposed"].mean()

369.32190048383205

In [279]:
merged_train_df["Paid Amount"].mean()

140.52392961963335

In [280]:
merged_train_df['Paid Amount'].fillna(merged_train_df['Paid Amount'].mean(), inplace=True)
merged_train_df['Penalty Imposed'].fillna(merged_train_df['Penalty Imposed'].mean(), inplace=True)

In [281]:
merged_train_df["Penalty Imposed"].astype(int)
merged_train_df["Paid Amount"].astype(int)

0           0
1           0
2           0
3         121
4         100
         ... 
213238     50
213239    140
213240    449
213241    144
213242      0
Name: Paid Amount, Length: 213243, dtype: int64

In [282]:
merged_train_df["Penalty Imposed - Paid Amount"] = merged_train_df["Penalty Imposed"] - merged_train_df["Paid Amount"]

In [283]:
merged_train_df["Penalty Imposed - Paid Amount"].describe()

count    213243.000000
mean        228.797971
std         478.617166
min      -10888.000000
25%           0.000000
50%         159.476070
75%         300.000000
max       25000.000000
Name: Penalty Imposed - Paid Amount, dtype: float64

In [284]:
Charge #1: Code                                            213243 non-null  object 
Charge #1: Code Section                                    213243 non-null  object 
Charge #1: Code Description                                213243 non-null  object 
Charge #1: Infraction Amount                               213243 non-null  object 
Charge #2: Code                                            213243 non-null  object 
Charge #2: Code Section                                    213243 non-null  object 
Charge #2: Code Description                                213243 non-null  object 
Charge #2: Infraction Amount                               213243 non-null  object 
Charge #3: Code                                            213243 non-null  object 
Charge #3: Code Section                                    213243 non-null  object 
Charge #3: Code Description                                213243 non-null  object 
Charge #3: Infraction Amount 

NameError: name 'Charge' is not defined

In [285]:
# pd.set_option('display.max_rows', 1000000000)
# merged_train_df[merged_train_df["Charge #1: Code Description"] == "DIRTY SIDEWALK DIRTY AREA"]

In [286]:
pd.options.display.max_colwidth = 1000000
pd.set_option('display.max_columns', 2000000000)
merged_train_df.head(3)

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Respondent ZCTA,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999",Penalty Imposed - Paid Amount
0,2.0,0162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,UNKNOWN,UNKNOWN,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,By Mail MANHATTAN,0.0,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMATIC TERMINATION,280.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 NEW YORK,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,11229,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4,0.0
1,0.0,0176434684,2010-09-10,POLICE DEPARTMENT,KONSTANTIN,TSIPNYATOV,MANHATTAN,1016.0,36.0,1515,BROADWAY,NEW YORK,10036,NEW YORK,BROOKLYN,1815,EAST 17 STREET,BROOKLYN,11229,NEW YORK,UNKNOWN,1000.0,0.0,AG21,20-465.1,VENDING AT TIMES PLACES RESTRICTED BY RULE OF VENDOR REV. PANEL,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 1016.0 36.0 1515 BROADWAY NEW YORK 10036 NEW YORK,BROOKLYN 1815 EAST 17 STREET BROOKLYN 11229 NEW YORK,11229,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4,1000.0
2,0.0,040600557P,2009-01-12,SANITATION POLICE,RACHEL,BANAYAN,BROOKLYN,6806.0,3.0,2308,QUENTIN ROAD,BROOKLYN,11229,NEW YORK,BROOKLYN,2308,QUENTIN ROAD,BROOKLYN,11229,NEW YORK,UNKNOWN,350.0,0.0,AS21,A.C. 16-123,"SNOW,ICE DIRT ON SIDEWALKS",100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 6806.0 3.0 2308 QUENTIN ROAD BROOKLYN 11229 NEW YORK,BROOKLYN 2308 QUENTIN ROAD BROOKLYN 11229 NEW YORK,11229,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4,350.0


In [287]:
merged_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213243 entries, 0 to 213242
Data columns (total 55 columns):
 #   Column                                                     Non-Null Count   Dtype  
---  ------                                                     --------------   -----  
 0   Hearing Result                                             213243 non-null  float64
 1   Ticket Number                                              213243 non-null  object 
 2   Violation Date                                             213243 non-null  object 
 3   Issuing Agency                                             213243 non-null  object 
 4   Respondent First Name                                      213243 non-null  object 
 5   Respondent Last Name                                       213243 non-null  object 
 6   Violation Location (Borough)                               213243 non-null  object 
 7   Violation Location (Block No.)                             213243 non-null  object 

In [288]:
column_need_convert = ["Estimate!!Nonfamily households!!Median income (dollars)",
            "Estimate!!Nonfamily households!!Mean income (dollars)",
            "Estimate!!Nonfamily households!!Total!!Less than $10,000",
            "Estimate!!Nonfamily households!!Total!!$10,000 to $14,999",
            "Estimate!!Nonfamily households!!Total!!$15,000 to $24,999",
            "Estimate!!Nonfamily households!!Total!!$25,000 to $34,999",
            "Estimate!!Nonfamily households!!Total!!$35,000 to $49,999",
            "Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",
           
           
           
           "Estimate!!Households!!Median income (dollars)",
           "Estimate!!Households!!Mean income (dollars)",
           "Estimate!!Households!!Total!!Less than $10,000",
            "Estimate!!Households!!Total!!$10,000 to $14,999",
            "Estimate!!Households!!Total!!$15,000 to $24,999",
            "Estimate!!Households!!Total!!$25,000 to $34,999",
            "Estimate!!Households!!Total!!$35,000 to $49,999",
            "Estimate!!Households!!Total!!$50,000 to $74,999"]

In [289]:
for x in column_need_convert:
    merged_train_df[x].fillna(merged_train_df[x].mean(), inplace=True)

In [290]:
merged_train_df.describe()

Unnamed: 0,Hearing Result,Penalty Imposed,Paid Amount,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999",Penalty Imposed - Paid Amount
count,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0,213243.0
mean,1.01888,369.3219,140.52393,40157.45202,59323.355019,17.123871,11.833488,12.26515,9.000868,10.566724,13.29964,58117.064177,82893.516989,10.167535,6.878915,10.238894,8.753856,11.263215,15.053626,228.797971
std,1.085588,533.245229,286.238161,22286.31713,30093.901906,7.503109,5.794265,3.165717,2.827782,2.775013,3.552248,24193.326753,38804.899686,5.127376,3.280287,3.25612,2.187683,2.554491,2.915794,478.617166
min,0.0,0.0,-650.0,12857.0,24496.0,1.5,0.0,0.6,0.0,1.3,2.4,21447.0,39790.0,1.2,0.0,0.5,0.3,1.2,3.7,-10888.0
25%,0.0,100.0,0.0,24255.0,41261.0,10.7,7.8,10.3,7.0,8.7,11.0,39753.0,60372.0,6.2,4.5,8.1,7.4,9.9,13.6,0.0
50%,1.0,300.0,140.52393,35293.0,52463.0,15.5,10.5,12.4,9.0,10.4,13.29964,54646.0,77057.0,8.7,6.4,9.9,9.2,11.263215,15.1,159.47607
75%,2.0,369.3219,140.52393,48343.0,68336.0,23.3,16.2,14.7,11.0,12.4,15.2,68530.0,90753.0,14.4,9.0,12.4,10.1,13.5,17.0,300.0
max,3.0,25000.0,14680.0,180389.0,268474.0,35.9,31.1,23.8,22.2,22.0,26.6,224063.0,358261.0,23.6,16.9,18.8,14.9,17.0,22.2,25000.0


### for the respondents, some are individuals and some are commerical entities as indicated on some rows as "LLC." Therefore, need to create a separate column labeling whether the respondent is a person or otherwise

In [291]:
# pd.set_option('display.max_rows', 1000000000)
merged_train_df["Respondent First Name"].value_counts()

UNKNOWN       72283
MARIA          1503
JOSE           1385
LLC            1345
MOHAMED         992
              ...  
POPO              1
ORMOND            1
SUNNATULLA        1
LEVETT            1
DAWN C            1
Name: Respondent First Name, Length: 34045, dtype: int64

In [292]:
merged_train_df.shape

(213243, 55)

In [293]:
key_words_first_name = ["INC", "CORP", "MANAGEMENT","BUS SERVICE AND TOUR", 
"SCIENCES DIVISION",
"HOUSING DEVELOPMENT"]  

In [294]:
key_words_last_name = ["INC", "CORP", "MANAGEMENT","FIRST HOME PROPERTIES",
"COR",
"3 NYC",
"HPENY HOUSING DEVELOPMENT FUND",
"RT HUDSON ELEMENTARY SCHOOL",
"DEVELOPMENT CO",
"HOLDING CO",
"BANANA KELLY HSG DEVE",
"AQUA PROPERTIES",
"THE BROOKLYN UNION GAS CO",
"VANDERBILT MORTGAGE AND FINANC",
"AMERICAN BROKERS CONDUIT",
"CMI BUSINESS FURNITURE",
"FRIENDS LAND DEVELOP",
"HARBOR VIEW PROP LTD",
"INGERSOLL TENANT ASSOC",
"THE BROOKLYN UNION GAS COMPANY ",
"PLAZA CONSTRUCTION",
"AUTO AUCTION"
"FIRST HOME PROP",
"1046 WASHINGTON AVE HDFC",
"DIEGO BEEKMAN MUTUAL HOUSING A",
"REV MANAGEMENT",
"LANDSLIDE PROPERTIES",
"NEIGHBORHOOD RESTORE HOUSING D",
"HTB ENTERPRISES LTD",
"ALLIANCE OF INDIVIDUA",
"WJR PROPERTIES INC",
"WJR PROPERTIES INC",
"KEYSPAN ENERGY DELIVERY NYC",
"RLTY",
"FIRST UNITED MORTGAGE BANKING",
"ASSET PLUSS MANAGEMENT SERVICE",
"KEYSPAN ENERGY DELIVERY N Y C",
"WELLS FARGO HOME MORT",
"ALLIANCE OF INDIVIDUAL",
"NEIGHBORHOOD RESTORE HDFC",
"WILMINGTON SAVINGS FUND SOCIET",
"YOUNG ISRAEL OF AVENUE K",
"FREMONT INVESTMENT LOAN",
"BELL ATLANTIC",
"EM ESS PETROLEUM CORP",
"PI CONSTRUCTION SERVICE INC",
"US BANK NATIONAL ASSOCIATION",
"CONKLIN MGMT CO",
"CON EDISON",
"CONSOLIDATED EDISON",
"EMPIRE CITY SUBWAY",
"DEUTSCHE BANK NATIONAL TRUST C",
"NATIONAL GRID",
"CONTACT HOLDINGS CORP",
"U S BANK NATIONAL ASSOCIATION",
"G G ASSOCIATES",
"WELLS FARGO BANK",
"LUCKY SEAFOOD",
"AGENT OWNER",
"FEDERAL NATIONAL MORTGAGE ASSO",
"AMENCAN HOME MORTGAGE",
"HOMESIDE LENDING",
"HSBC BANK USA",
"HSBC BANK USA NA",
"HIGH STATE RLTY CORP",
"NYC HOUSING AUTHORITY",
"PLAZA CONSTRUCTION CORP",
"EASY STREET PLUMBING INC",
"1249 WEBSTER AVE RLTY",
"DEVELOP", "BANK", "RESOURCES", "SERVICES", "LLC", "SCHOOL", "HOME","NATIONAL GRID","SAM CONEY ISLAND LLC"
                    "ALL PHASE PLUMBING CORP","ERCAT REALTY CORP"]

In [295]:
merged_train_df['Respondent Last Name'] = merged_train_df['Respondent Last Name'].astype(str)

In [296]:
# def word_checker(sentence):
#     if any(word in key_words_last_name for word in sentence.lower().split()):
#         return 'Not Person'
#     else:
#         return 'Person'

In [297]:
# merged_df['Respondent Status'] = merged_df['Respondent Last Name'].apply(word_checker)  

In [298]:
def get_word(my_string):
    for word in key_words_last_name:
        for x in merged_train_df["Respondent Last Name"]:
            if word.lower() in my_string.lower():
                return "Not Person"
            else:
                return "Person"

In [299]:
merged_train_df["Respondent Status"]= merged_train_df["Respondent Last Name"].apply(get_word)

In [300]:
merged_train_df.sample(2)

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Respondent ZCTA,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999",Penalty Imposed - Paid Amount,Respondent Status
67487,3.0,042805528H,2017-09-27,SANITATION POLICE,GHEORGHE,TILNEAC,QUEENS,3508.0,35.0,59-19,71 AVENUE,RIDGEWOOD,11385,NEW YORK,QUEENS,59-19,71 AVENUE,RIDGEWOOD,11385,NEW YORK,In Person MANHATTAN,100.0,100.0,AS26,A.C. 16-118 2 A,FAILURE TO CLEAN 18 INCHES INTO STREET,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 3508.0 35.0 59-19 71 AVENUE RIDGEWOOD 11385 NEW YORK,QUEENS 59-19 71 AVENUE RIDGEWOOD 11385 NEW YORK,11385,57188.0,68815.0,8.6,6.2,10.2,9.7,10.5,14.7,75340.0,87853.0,4.6,3.2,6.6,7.8,10.4,17.1,0.0,Person
93911,1.0,042547590J,2014-01-22,SANITATION POLICE,UNKNOWN,VALDERRAMA JOHNY,BRONX,2662.0,33.0,779,HOME STREET,BRONX,10456,NEW YORK,BRONX,779,HOME STREET,BRONX,10456,NEW YORK,UNKNOWN,350.0,0.0,AS2C,A.C. 16-123,"SNOW, ICE DIRT ON SIDEWALKS - 2ND OFFENSE",150.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BRONX 2662.0 33.0 779 HOME STREET BRONX 10456 NEW YORK,BRONX 779 HOME STREET BRONX 10456 NEW YORK,10456,15067.0,30447.0,30.2,19.7,15.3,7.8,10.9,11.1,27917.0,45602.0,19.2,11.1,16.2,9.8,13.5,14.5,350.0,Person


In [301]:
merged_train_df['Respondent Status'].value_counts()

Person        202439
Not Person     10804
Name: Respondent Status, dtype: int64

In [302]:
merged_train_df.loc[merged_train_df['Respondent Last Name'] == "NATIONAL GRID"]

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Respondent ZCTA,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999",Penalty Imposed - Paid Amount,Respondent Status
58597,3.0,0182000693,2013-10-12,POLICE DEPT,UNKNOWN,NATIONAL GRID,BROOKLYN,UNKNOWN,UNKNOWN,136,SOUTH 4TH STREET,BROOKLYN,11211,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,250.0000,234.00000,AD16,A.C. 19-122,SAND DIRT RUBBISH DEBRIS NOT REMOVED FROM SITE WITHIN 7 DAYS,25000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 136 SOUTH 4TH STREET BROOKLYN 11211 NEW YORK,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW YORK,11201,104119.00000,125332.000000,10.200000,2.600000,4.40000,5.100000,5.200000,10.40000,129248.000000,186989.000000,6.700000,2.600000,4.700000,4.100000,5.100000,8.000000,16.000000,Person
58607,1.0,0177039574,2010-08-09,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,UNKNOWN,UNKNOWN,UNKNOWN,F O 1664 EASTERN PARKWAY BT,BROOKLYN,11233,NEW YORK,BROOKLYN,ONE,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,3600.0000,1200.00000,AD30,A.C. 19-102 II,FAILURE TO COMPLY WITH THE TERMS AND CONDITIONS OF DOT PERMITS,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN F O 1664 EASTERN PARKWAY BT BROOKLYN 11233 NEW YORK,BROOKLYN ONE METROTECH CENTER BROOKLYN 11201 NEW YORK,11201,104119.00000,125332.000000,10.200000,2.600000,4.40000,5.100000,5.200000,10.40000,129248.000000,186989.000000,6.700000,2.600000,4.700000,4.100000,5.100000,8.000000,2400.000000,Person
58609,3.0,0182000583,2013-07-24,POLICE DEPT,UNKNOWN,NATIONAL GRID,BROOKLYN,3031.0,10.0,12,STAGG STREET,BROOKLYN,11206,NEW YORK,BROOKLYN,1,METROTECT CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,280.0000,262.00000,AD16,A.C. 19-122,SAND DIRT RUBBISH DEBRIS NOT REMOVED FROM SITE WITHIN 7 DAYS,25000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 3031.0 10.0 12 STAGG STREET BROOKLYN 11206 NEW YORK,BROOKLYN 1 METROTECT CENTER BROOKLYN 11201 NEW YORK,11201,104119.00000,125332.000000,10.200000,2.600000,4.40000,5.100000,5.200000,10.40000,129248.000000,186989.000000,6.700000,2.600000,4.700000,4.100000,5.100000,8.000000,18.000000,Person
58610,1.0,0180233662,2012-04-03,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,1200.0,27.0,UNKNOWN,ATLANTIC AVENUE,BROOKLYN,11216,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,1230.0000,1200.00000,AD30,A.C. 19-102(II),FAILURE TO COMPLY WITH THE TERMS AND CONDITIONS OF DOT PERMITS,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1200.0 27.0 ATLANTIC AVENUE BROOKLYN 11216 NEW YORK,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW YORK,11201,104119.00000,125332.000000,10.200000,2.600000,4.40000,5.100000,5.200000,10.40000,129248.000000,186989.000000,6.700000,2.600000,4.700000,4.100000,5.100000,8.000000,30.000000,Person
58620,3.0,0176395450,2012-01-03,POLICE DEPARTMENT,UNKNOWN,NATIONAL GRID,QUEENS,3096.0,7501.0,63-80,WETHEROLE STREET,REGO PARK,11374,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,750.0000,788.00000,AD3C,34 RCNY 2-11 e 5,FAILURE TO MAINTAIN 5FT PEDESTRIAN WALKWAY ON S W,250.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 3096.0 7501.0 63-80 WETHEROLE STREET REGO PARK 11374 NEW YORK,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW YORK,11201,104119.00000,125332.000000,10.200000,2.600000,4.40000,5.100000,5.200000,10.40000,129248.000000,186989.000000,6.700000,2.600000,4.700000,4.100000,5.100000,8.000000,-38.000000,Person
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126260,3.0,0176395010,2012-01-23,POLICE DEPT,UNKNOWN,NATIONAL GRID,QUEENS,3098.0,16.0,64-64,WETHEROLE STREET,REGO PARK,11374,NEW YORK,QUEENS,89-67,162 STREET,JAMAICA,11432,NEW YORK,UNKNOWN,750.0000,770.00000,AD10,A.C. 19-121 B 2,"DEBRIS CONSTR. MATERIALS OBSTRUCTING GUTTERS SIDEWALK, ETC.",25000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 3098.0 16.0 64-64 WETHEROLE STREET REGO PARK 11374 NEW YORK,QUEENS 89-67 162 STREET JAMAICA 11432 NEW YORK,11432,40509.00000,55944.000000,12.200000,8.800000,13.30000,9.900000,13.000000,17.20000,62148.000000,86998.000000,5.200000,4.100000,8.700000,9.300000,13.800000,17.700000,-20.000000,Person
210772,1.0,0169015120,2009-12-16,POLICE DEPARTMENT,UNKNOWN,NATIONAL GRID,BROOKLYN,2102.0,31.0,UNKNOWN,DE KALB AVENUE,BROOKLYN,11205,NEW YORK,BROOKLYN,1,METRO TECH CENTER,BROOKLYN,11202,NEW YORK,UNKNOWN,3600.0000,1292.00000,AD30,A.C. 19-102 II,FAILURE TO COMPLY WITH THE TERMS AND CONDITIONS OF DOT PERMITS,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 2102.0 31.0 DE KALB AVENUE BROOKLYN 11205 NEW YORK,BROOKLYN 1 METRO TECH CENTER BROOKLYN 11202 NEW YORK,,40157.45202,59323.355019,17.123871,11.833488,12.26515,9.000868,10.566724,13.29964,58117.064177,82893.516989,10.167535,6.878915,10.238894,8.753856,11.263215,15.053626,2308.000000,Person
210773,1.0,0177034285,2010-09-29,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,2011.0,12.0,UNKNOWN,CLINTON AVENUE,BROOKLYN,11238,NEW YORK,BROOKLYN,1,METRO TECH CENTER,BROOKLYN,11202,NEW YORK,UNKNOWN,1230.0000,1200.00000,AD30,A.C. 19-102 II,FAILURE TO COMPLY WITH THE TERMS AND CONDITIONS OF DOT PERMITS,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 2011.0 12.0 CLINTON AVENUE BROOKLYN 11238 NEW YORK,BROOKLYN 1 METRO TECH CENTER BROOKLYN 11202 NEW YORK,,40157.45202,59323.355019,17.123871,11.833488,12.26515,9.000868,10.566724,13.29964,58117.064177,82893.516989,10.167535,6.878915,10.238894,8.753856,11.263215,15.053626,30.000000,Person
210779,1.0,0176395130,2012-01-30,POLICE DEPARTMENT,UNKNOWN,NATIONAL GRID,QUEENS,2124.0,20.0,102-18,63 AVENUE,FOREST HILLS,11375,NEW YORK,BROOKLYN,1,MOTER TECH CNT,BROOKLYN,11202,NEW YORK,UNKNOWN,430.0000,400.00000,AD05,A.C. 19-109 A,FAILURE TO PROVIDE ADEQUATE PROTECTION AT WORK SITE,400.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 2124.0 20.0 102-18 63 AVENUE FOREST HILLS 11375 NEW YORK,BROOKLYN 1 MOTER TECH CNT BROOKLYN 11202 NEW YORK,,40157.45202,59323.355019,17.123871,11.833488,12.26515,9.000868,10.566724,13.29964,58117.064177,82893.516989,10.167535,6.878915,10.238894,8.753856,11.263215,15.053626,30.000000,Person


In [303]:
# pd.options.display.max_colwidth = 1000000
# pd.set_option('display.max_columns', 2000000000)
# pd.set_option('display.max_rows', 1000000000)
# pd.set_option('display.expand_frame_repr', True)

In [304]:
merged_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213243 entries, 0 to 213242
Data columns (total 56 columns):
 #   Column                                                     Non-Null Count   Dtype  
---  ------                                                     --------------   -----  
 0   Hearing Result                                             213243 non-null  float64
 1   Ticket Number                                              213243 non-null  object 
 2   Violation Date                                             213243 non-null  object 
 3   Issuing Agency                                             213243 non-null  object 
 4   Respondent First Name                                      213243 non-null  object 
 5   Respondent Last Name                                       213243 non-null  object 
 6   Violation Location (Borough)                               213243 non-null  object 
 7   Violation Location (Block No.)                             213243 non-null  object 

In [305]:
# numerical columns

numerical_columns = merged_train_df[['Penalty Imposed', 'Paid Amount', 
                     'Estimate!!Nonfamily households!!Median income (dollars)', 
'Estimate!!Nonfamily households!!Mean income (dollars)', 
'Estimate!!Nonfamily households!!Total!!Less than $10,000',
 'Estimate!!Nonfamily households!!Total!!$10,000 to $14,999',
 'Estimate!!Nonfamily households!!Total!!$15,000 to $24,999', 
'Estimate!!Nonfamily households!!Total!!$25,000 to $34,999', 
'Estimate!!Nonfamily households!!Total!!$35,000 to $49,999', 
'Estimate!!Nonfamily households!!Total!!$50,000 to $74,999', 
'Estimate!!Households!!Median income (dollars)', 
'Estimate!!Households!!Mean income (dollars)', 
'Estimate!!Households!!Total!!Less than $10,000',
 'Estimate!!Households!!Total!!$10,000 to $14,999', 
'Estimate!!Households!!Total!!$15,000 to $24,999',
 'Estimate!!Households!!Total!!$25,000 to $34,999',
 'Estimate!!Households!!Total!!$35,000 to $49,999', 
'Estimate!!Households!!Total!!$50,000 to $74,999']]

In [306]:
numerical_columns.head(6)

Unnamed: 0,Penalty Imposed,Paid Amount,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999"
0,0.0,0.0,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4
1,1000.0,0.0,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4
2,350.0,0.0,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4
3,100.0,121.0,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4
4,100.0,100.0,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4
5,1000.0,0.0,32281.0,49761.0,15.2,14.0,14.0,8.8,13.1,14.0,64631.0,87355.0,6.8,6.3,9.5,7.8,11.3,13.4


In [310]:
categorical_columns = merged_train_df[['Issuing Agency',
'Violation Location (Borough)',
'Violation Location (City)', 
'Violation Location (Zip Code)', 
'Respondent Address (Borough)', 
'Respondent Address (City)', 
'Respondent Address (Zip Code)', 
'Respondent Address (Zip Code)',
"Decision Location (Borough)",
"Respondent Status"]]

In [311]:
categorical_columns.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213243 entries, 0 to 213242
Data columns (total 10 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   Issuing Agency                 213243 non-null  object
 1   Violation Location (Borough)   213243 non-null  object
 2   Violation Location (City)      213243 non-null  object
 3   Violation Location (Zip Code)  213243 non-null  object
 4   Respondent Address (Borough)   213243 non-null  object
 5   Respondent Address (City)      213243 non-null  object
 6   Respondent Address (Zip Code)  213243 non-null  object
 7   Respondent Address (Zip Code)  213243 non-null  object
 8   Decision Location (Borough)    212513 non-null  object
 9   Respondent Status              213243 non-null  object
dtypes: object(10)
memory usage: 26.0+ MB


In [315]:
merged_train_df["Issuing Agency"]= merged_train_df["Issuing Agency"].astype(str)
merged_train_df["Violation Location (Borough)"] = merged_train_df["Violation Location (Borough)"].astype(str)
merged_train_df["Violation Location (City)"] = merged_train_df["Violation Location (City)"].astype(str)
merged_train_df["Violation Location (Zip Code)"] = merged_train_df["Violation Location (Zip Code)"].astype(str)
merged_train_df["Respondent Status"] = merged_train_df["Respondent Status"].astype(str)
merged_train_df["Respondent Address (City)"] = merged_train_df["Respondent Address (City)"].astype(str)
merged_train_df["Respondent Address (Zip Code)"] = merged_train_df["Respondent Address (Zip Code)"].astype(str)
merged_train_df["Respondent Address (Borough)"] = merged_train_df["Respondent Address (Borough)"].astype(str)
merged_train_df["Decision Location (Borough)"] = merged_train_df["Decision Location (Borough)"].astype(str)

In [313]:
categorical_columns.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213243 entries, 0 to 213242
Data columns (total 10 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   Issuing Agency                 213243 non-null  object
 1   Violation Location (Borough)   213243 non-null  object
 2   Violation Location (City)      213243 non-null  object
 3   Violation Location (Zip Code)  213243 non-null  object
 4   Respondent Address (Borough)   213243 non-null  object
 5   Respondent Address (City)      213243 non-null  object
 6   Respondent Address (Zip Code)  213243 non-null  object
 7   Respondent Address (Zip Code)  213243 non-null  object
 8   Decision Location (Borough)    213243 non-null  object
 9   Respondent Status              213243 non-null  object
dtypes: object(10)
memory usage: 26.0+ MB


In [317]:
# create dummies 

categoricals = ['Issuing Agency',
'Violation Location (Borough)',
'Violation Location (City)', 
'Violation Location (Zip Code)', 
'Respondent Address (Borough)', 
'Respondent Address (City)', 
'Respondent Address (Zip Code)', 
'Decision Location (Borough)',
"Respondent Status"]

train_dummies = merged_train_df[categoricals]

# Create OneHotEncoder object to create dummies
ohe = OneHotEncoder(handle_unknown='ignore')
enc = ohe.fit(train_dummies)
train_dummies_trans = enc.transform(train_dummies)

# Dummies values in matrix form
train_data = train_dummies_trans.todense()

# New dummy column names
names = ohe.get_feature_names(categoricals)

# Make them into Dataframe
train_dummies_trans_df = pd.DataFrame(train_data, columns=names,index = merged_train_df.index)

In [323]:
# scale the numericals

numerical = ['Penalty Imposed', 'Paid Amount', 
                     'Estimate!!Nonfamily households!!Median income (dollars)', 
'Estimate!!Nonfamily households!!Mean income (dollars)', 
'Estimate!!Nonfamily households!!Total!!Less than $10,000',
 'Estimate!!Nonfamily households!!Total!!$10,000 to $14,999',
 'Estimate!!Nonfamily households!!Total!!$15,000 to $24,999', 
'Estimate!!Nonfamily households!!Total!!$25,000 to $34,999', 
'Estimate!!Nonfamily households!!Total!!$35,000 to $49,999', 
'Estimate!!Nonfamily households!!Total!!$50,000 to $74,999', 
'Estimate!!Households!!Median income (dollars)', 
'Estimate!!Households!!Mean income (dollars)', 
'Estimate!!Households!!Total!!Less than $10,000',
 'Estimate!!Households!!Total!!$10,000 to $14,999', 
'Estimate!!Households!!Total!!$15,000 to $24,999',
 'Estimate!!Households!!Total!!$25,000 to $34,999',
 'Estimate!!Households!!Total!!$35,000 to $49,999', 
'Estimate!!Households!!Total!!$50,000 to $74,999']

train_ss = merged_train_df[numerical]


scaler = StandardScaler()

merge_train_df_numerical = scaler.fit_transform(train_ss)

In [336]:
train_dummies_trans_df.columns
numerical_cat = pd.DataFrame(merge_train_df_numerical, index = merged_train_df.index)
train_df = numerical_cat.merge(train_dummies_trans_df, left_index = True , right_index=True)

In [None]:
# display_labels = ['No Emotion', 
#                   'Negative Emotion', 
#                   'Positive Emotion']

def print_cm_with_labels(y_true, 
                         y_pred, 
                         display_labels):
    '''
    Takes the true values and predicted values of a classifier and 
    plots a confusion matrix (normalized by predictions) using 
    a list of given display labels.
    '''
    cm = confusion_matrix(y_true, y_pred, normalize = 'pred')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  #display_labels=display_labels)

    fig, ax = plt.subplots(figsize=(6,6))
    disp.plot(ax=ax)
    ax.grid(False)
    disp.ax_.set_xticklabels(display_labels,rotation=45);

# FSM

just pulled in one feature b/c i need to do more work on the columns. 

In [341]:
dummy_model = DummyClassifier(strategy="most_frequent")
dummy_model.fit(train_df, merged_train_df["Hearing Result"])
y_hat = dummy_model.predict(train_df)

In [342]:
acc = accuracy_score(merged_train_df["Hearing Result"],y_hat)
macro_precision_score=precision_score(merged_train_df["Hearing Result"], y_hat, average='macro')
micro_precision_score=precision_score(merged_train_df["Hearing Result"] , y_hat, average='micro')
macro_recall_score=recall_score(merged_train_df["Hearing Result"], y_hat, average='macro')
micro_recall_score=recall_score(merged_train_df["Hearing Result"], y_hat, average='micro')

print('Accuracy Score: {}'.format(acc))
print('Macro Precision Score: {}'.format(macro_precision_score))
print('Micro Precision Score: {}'.format(micro_precision_score))
print('Macro Recall Score: {}'.format(macro_recall_score))
print('Micro Recall Score: {}'.format(micro_recall_score))

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy Score: 0.4292004895823075
Macro Precision Score: 0.10730012239557687
Micro Precision Score: 0.4292004895823075
Macro Recall Score: 0.25
Micro Recall Score: 0.4292004895823075


# Decision Tree

In [346]:
tree_clf = DecisionTreeClassifier(max_depth=2) 

tree_clf.fit(train_df, merged_train_df["Hearing Result"])

y_hat = tree_clf.predict(train_df)

acc = accuracy_score(merged_train_df["Hearing Result"],y_hat)
macro_precision_score=precision_score(merged_train_df["Hearing Result"], y_hat, average='macro')
micro_precision_score=precision_score(merged_train_df["Hearing Result"], y_hat, average='micro')
macro_recall_score=recall_score(merged_train_df["Hearing Result"], y_hat, average='macro')
micro_recall_score=recall_score(merged_train_df["Hearing Result"], y_hat, average='micro')


print('Accuracy Score: {}'.format(acc))
print('Macro Precision Score: {}'.format(macro_precision_score))
print('Micro Precision Score: {}'.format(micro_precision_score))
print('Macro Recall Score: {}'.format(macro_recall_score))
print('Micro Recall Score: {}'.format(micro_recall_score))

Accuracy Score: 0.6589383942263052
Macro Precision Score: 0.6744972562199326
Micro Precision Score: 0.6589383942263052
Macro Recall Score: 0.6150885962040864
Micro Recall Score: 0.6589383942263052


In [348]:
def print_cv_scores(pipe, X, y):
    '''
    Runs cross_validate on given feature and multiclass target arrays using given pipeline, 
    printing the scoring results for both training and cross_val.
    '''
    scoring = ['accuracy','precision_macro', 'recall_macro', 'f1_macro']
    
    results = cross_validate(pipe, X, 
                                   y, 
                                   return_train_score=True, scoring=scoring)
    
    print(results['train_accuracy'])
    print('Training Accuracy', results['train_accuracy'].mean())
    print('##############')
    print(results['test_accuracy'])
    print('Cross_Val Accuracy', results['test_accuracy'].mean())
    print('##############')
    print('Training Macro Precision:', results['train_precision_macro'].mean())
    print('Cross_Val Macro Precision:', results['test_precision_macro'].mean())
    print('##############')
    print('Training Macro Recall:', results['train_recall_macro'].mean())
    print('Cross_Val Macro Recall:', results['test_recall_macro'].mean())
    print('##############')
    print('Training Macro F1:', results['train_f1_macro'].mean())
    print('Cross_Val Macro F1:', results['test_f1_macro'].mean())

In [349]:
print_cv_scores(tree_clf, 
                train_df, 
                merged_train_df["Hearing Result"])

[0.65983563 0.65572646 0.65650609 0.6597028  0.66292095]
Training Accuracy 0.6589383886610808
##############
[0.65534948 0.67178597 0.6686675  0.6558807  0.64300788]
Cross_Val Accuracy 0.6589383051821908
##############
Training Macro Precision: 0.6746529025748564
Cross_Val Macro Precision: 0.6768917017948969
##############
Training Macro Recall: 0.6150885791854861
Cross_Val Macro Recall: 0.6150883238566063
##############
Training Macro F1: 0.6147117848436012
Cross_Val Macro F1: 0.6148541533936139


In [363]:
type(merged_train_df["Hearing Result"])

pandas.core.series.Series

# Random Forest

In [None]:
y_train = merged_train_df["Hearing Result"]


model_RF = RandomForestClassifier()
model_RF.fit(train_df, y_train)
y_hat = model_RF.predict(y_train)

acc = accuracy_score(y_train,y_hat)
macro_precision_score=precision_score(y_train, y_hat, average='macro')
micro_precision_score=precision_score(y_train, y_hat, average='micro')
macro_recall_score=recall_score(y_train, y_hat, average='macro')
micro_recall_score=recall_score(y_train, y_hat, average='micro')

In [None]:
print('Accuracy Score: {}'.format(acc))
print('Macro Precision Score: {}'.format(macro_precision_score))
print('Micro Precision Score: {}'.format(micro_precision_score))
print('Macro Recall Score: {}'.format(macro_recall_score))
print('Micro Recall Score: {}'.format(micro_recall_score))

In [None]:
def print_cv_scores(pipe, X, y):
    '''
    Runs cross_validate on given feature and multiclass target arrays using given pipeline, 
    printing the scoring results for both training and cross_val.
    '''
    scoring = ['accuracy','precision_macro', 'recall_macro', 'f1_macro']
    
    results = cross_validate(pipe, X, 
                                   y, 
                                   return_train_score=True, scoring=scoring)
    
    print(results['train_accuracy'])
    print('Training Accuracy', results['train_accuracy'].mean())
    print('##############')
    print(results['test_accuracy'])
    print('Cross_Val Accuracy', results['test_accuracy'].mean())
    print('##############')
    print('Training Macro Precision:', results['train_precision_macro'].mean())
    print('Cross_Val Macro Precision:', results['test_precision_macro'].mean())
    print('##############')
    print('Training Macro Recall:', results['train_recall_macro'].mean())
    print('Cross_Val Macro Recall:', results['test_recall_macro'].mean())
    print('##############')
    print('Training Macro F1:', results['train_f1_macro'].mean())
    print('Cross_Val Macro F1:', results['test_f1_macro'].mean())

# XGBoost

In [360]:
boost_model = XGBClassifier(objective='ternary:logistic',
    max_depth = 20,
    min_child_weight = 0.5,
    reg_alpha = 0)


boost_model.fit(train_df, merged_train_df["Hearing Result"])
print("fit model")
    
y_hat = boost_model.predict(merged_train_df["Hearing Result"])
print("predicted on model")





KeyboardInterrupt: 

In [None]:
acc = accuracy_score(merged_train_df["Hearing Result"],y_hat)
macro_precision_score=precision_score(merged_train_df["Hearing Result"], y_hat, average='macro')
micro_precision_score=precision_score(merged_train_df["Hearing Result"], y_hat, average='micro')
macro_recall_score=recall_score(merged_train_df["Hearing Result"], y_hat, average='macro')
micro_recall_score=recall_score(merged_train_df["Hearing Result"], y_hat, average='micro')
    
print('Score: {}'.format(boost_model.score(merged_train_df["Hearing Result"], y_train)))
print('Accuracy Score: {}'.format(acc))
print('Macro Precision Score: {}'.format(macro_precision_score))
print('Micro Precision Score: {}'.format(micro_precision_score))
print('Macro Recall Score: {}'.format(macro_recall_score))
print('Micro Recall Score: {}'.format(micro_recall_score))