In [1132]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier


from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report

In [1133]:
df = pd.read_csv("df_train.csv", low_memory = False)
df['Respondent Address (Zip Code)'] = df['Respondent Address (Zip Code)'].astype(str)

In [1134]:
df.shape

(213243, 38)

In [1135]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result
0,545274,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,,,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,SAU: MANH,0.0,0.0,,,,,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMAT...,280.0,,,,,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 ...,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,2
1,25788,112647160,2003-11-23,POLICE DEPARTMENT,JOHN,PEZZULLO,BROOKLYN,989.0,42.0,256,7 AVENUE,BROOKLYN,11215,NEW YORK,BROOKLYN,359,9 AVE,BROOKLYN,11215,NEW YORK,,50.0,,AG13,20-465(C),STAND OR GOODS TOUCHING/LEANING AGAINST BUILDING,100.0,,,,,,,,,BROOKLYN 989.0 42.0 256 7 AVENUE BROOKLYN 1121...,BROOKLYN 359 9 AVE BROOKLYN 11215 NEW YORK,0


In [1136]:
df = df.drop("Unnamed: 0", axis=1)

In [1137]:
# Create a function to fill in UNKNOWN for all NAs in the dataframe
def fill_na(column):
    
    for x in column:
        df[x].fillna('UNKNOWN', inplace=True)
    
    return df.head(2)

In [1138]:
column_names = list(df.columns)
fill_na(column_names)

Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result
0,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,UNKNOWN,UNKNOWN,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,SAU: MANH,0.0,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMAT...,280.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 ...,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,2
1,112647160,2003-11-23,POLICE DEPARTMENT,JOHN,PEZZULLO,BROOKLYN,989.0,42.0,256,7 AVENUE,BROOKLYN,11215,NEW YORK,BROOKLYN,359,9 AVE,BROOKLYN,11215,NEW YORK,UNKNOWN,50.0,UNKNOWN,AG13,20-465(C),STAND OR GOODS TOUCHING/LEANING AGAINST BUILDING,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 989.0 42.0 256 7 AVENUE BROOKLYN 1121...,BROOKLYN 359 9 AVE BROOKLYN 11215 NEW YORK,0


In [1049]:
# check to see that is the case
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213243 entries, 0 to 213242
Data columns (total 37 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   Ticket Number                     213243 non-null  object
 1   Violation Date                    213243 non-null  object
 2   Issuing Agency                    213243 non-null  object
 3   Respondent First Name             213243 non-null  object
 4   Respondent Last Name              213243 non-null  object
 5   Violation Location (Borough)      213243 non-null  object
 6   Violation Location (Block No.)    213243 non-null  object
 7   Violation Location (Lot No.)      213243 non-null  object
 8   Violation Location (House #)      213243 non-null  object
 9   Violation Location (Street Name)  213243 non-null  object
 10  Violation Location (City)         213243 non-null  object
 11  Violation Location (Zip Code)     213243 non-null  object
 12  Vi

In [1050]:
df["Charge #2: Infraction Amount"].value_counts().head(3)

UNKNOWN    209908
0.0          2447
1000.0        157
Name: Charge #2: Infraction Amount, dtype: int64

### load zip code dataframe to add neighborhood level data to the original data

## Bring in neighborhood level income data from the census 

1. main problem is USPS zip code (which is what the main df has) is different from census's zip code tabulation area number (thought a signficant portion of them match up identically). For example, one's USPS zip code could be 11333 but its zip code tabulation area could be 11332

2. Need to go through the following steps to get the census data to match up with the main dataframe 

     A. add a new column with ZIP Code Tabulation Areas (ZCTAs) so to pull census data using ZCTA
     
     B. web scrape census to get the list of ZCTA based on the dataframe 
     
     C. merge the dataframes together so each row contain neighborhood level income data 

In [1151]:
nyc_zip_codes = ["10001", "10002", "10003", "10004", "10005", "10006",
                 "10007","10009","10010","10011","10012","10013","10014",
                 "10015","10016","10017","10018","10019","10020","10021",
                 "10022","10023","10024","10025","10026","10027","10028",
                 "10029","10030","10031","10032","10033","10034","10035",
                 "10036","10037","10038","10039","10040","10041","10044",
                 "10045","10048","10055","10060","10069","10090","10095",
                 "10098","10099","10103","10104","10105","10106","10107",
                 "10110","10111","10112","10115","10118","10119","10120",
                 "10121","10122","10123","10128","10151","10152","10153",
                 "10154","10155","10158","10161","10162","10165","10166",
                 "10167","10168","10169","10170","10171","10172","10173",
                 "10174","10175","10176","10177","10178","10199","10270",
                 "10271","10278","10279","10280","10281","10282","10301",
                 "10302","10303","10304","10305","10306","10307","10308",
                 "10309","10310","10311","10312","10314","10451","10452",
                 "10453","10454","10455","10456","10457","10458","10459",
                 "10460","10461","10462","10463","10464","10465","10466",
                 "10467","10468","10469","10470","10471","10472","10473",
                 "10474","10475","11004","11101","11102","11103","11104",
                 "11105","11106","11109","11201","11203","11204","11205",
                 "11206","11207","11208","11209","11210","11211","11212",
                 "11213","11214","11215","11216","11217","11218","11219",
                 "11220","11221","11222","11223","11224","11225","11226",
                 "11228","11229","11230","11231","11232","11233","11234",
                 "11235","11236","11237","11238","11239","11241","11242",
                 "11243","11249","11252","11256","11351","11354","11355",
                 "11356","11357","11358","11359","11360","11361","11362",
                 "11363","11364","11365","11366","11367","11368","11369",
                 "11370","11371","11372","11373","11374","11375","11377",
                 "11378","11379","11385","11411","11412","11413","11414",
                 "11415","11416","11417","11418","11419","11420","11421",
                 "11422","11423","11426","11427","11428","11429","11430",
                 "11432","11433","11434","11435","11436","11691","11692",
                 "11693","11694","11697"]


In [1152]:
# this dataset contains USPS zip_code and ZCTA zip code for several states. 
# itindicates whether some a USPS zip code matches with a ZCTA code
# if certain ones doesn't match up, it indicates the equivalence of that


ZiptoZcta_Crosswalk_2021 = pd.read_excel("ZiptoZcta_Crosswalk_2021.xlsx")
ZiptoZcta_Crosswalk_2021["ZIP_CODE"] = ZiptoZcta_Crosswalk_2021["ZIP_CODE"].astype(str)
df['Respondent Address (Zip Code)'] = df['Respondent Address (Zip Code)'].astype(str)

# narrown down the df to only pull out zip codes that matches the nyc zipcode list above
ZiptoZcta_Crosswalk_2021["ZIP_CODE_NYC"] = np.where(ZiptoZcta_Crosswalk_2021["ZIP_CODE"].isin(nyc_zip_codes), "NYC", "Other")

In [1155]:
print(ZiptoZcta_Crosswalk_2021["ZIP_CODE_NYC"].value_counts())

ZiptoZcta_Crosswalk_2021_NYC = ZiptoZcta_Crosswalk_2021.loc[ZiptoZcta_Crosswalk_2021["ZIP_CODE_NYC"] == "NYC"]
ZiptoZcta_Crosswalk_2021_NYC

Other    40873
NYC        234
Name: ZIP_CODE_NYC, dtype: int64


Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type,ZIP_CODE_NYC
0,10001,New York,NY,Zip Code Area,10001,Zip matches ZCTA,NYC
1,10002,New York,NY,Zip Code Area,10002,Zip matches ZCTA,NYC
3190,10003,New York,NY,Zip Code Area,10003,Zip matches ZCTA,NYC
3191,10004,New York,NY,Zip Code Area,10004,Zip matches ZCTA,NYC
3192,10005,New York,NY,Zip Code Area,10005,Zip matches ZCTA,NYC
3193,10006,New York,NY,Zip Code Area,10006,Zip matches ZCTA,NYC
3194,10007,New York,NY,Zip Code Area,10007,Zip matches ZCTA,NYC
3195,10009,New York,NY,Zip Code Area,10009,Zip matches ZCTA,NYC
3196,10010,New York,NY,Zip Code Area,10010,Zip matches ZCTA,NYC
3197,10011,New York,NY,Zip Code Area,10011,Zip matches ZCTA,NYC


In [1156]:
ZiptoZcta_Crosswalk_2021_NYC.to_csv("NYC_Only_ZiptoZcta_Crosswalk_2021.csv")

In [1157]:
#create a dic that has the usps zip code on the left and ZCTA code on the right so we can map it 
ZiptoZcta_Crosswalk_2021_NYC_dict = dict(zip(ZiptoZcta_Crosswalk_2021_NYC.ZIP_CODE, ZiptoZcta_Crosswalk_2021_NYC.ZCTA))
ZiptoZcta_Crosswalk_2021_NYC_dict

{'10001': '10001',
 '10002': '10002',
 '10003': '10003',
 '10004': '10004',
 '10005': '10005',
 '10006': '10006',
 '10007': '10007',
 '10009': '10009',
 '10010': '10010',
 '10011': '10011',
 '10012': '10012',
 '10013': '10013',
 '10014': '10014',
 '10016': '10016',
 '10017': '10017',
 '10018': '10018',
 '10019': '10019',
 '10020': '10020',
 '10021': '10021',
 '10022': '10022',
 '10023': '10023',
 '10024': '10024',
 '10025': '10025',
 '10026': '10026',
 '10027': '10027',
 '10028': '10028',
 '10029': '10029',
 '10030': '10030',
 '10031': '10031',
 '10032': '10032',
 '10033': '10033',
 '10034': '10034',
 '10035': '10035',
 '10036': '10036',
 '10037': '10037',
 '10038': '10038',
 '10039': '10039',
 '10040': '10040',
 '10044': '10044',
 '10069': '10069',
 '10103': '10103',
 '10110': '10110',
 '10111': '10111',
 '10112': '10112',
 '10115': '10115',
 '10119': '10119',
 '10128': '10128',
 '10152': '10152',
 '10153': '10153',
 '10154': '10154',
 '10162': '10162',
 '10165': '10165',
 '10167': '1

In [1158]:
df['Respondent ZCTA'] = df['Respondent Address (Zip Code)'].map(ZiptoZcta_Crosswalk_2021_NYC_dict)

In [1159]:
df["Respondent ZCTA"].isnull().value_counts()

False    208967
True       4276
Name: Respondent ZCTA, dtype: int64

In [1164]:
df['Respondent ZCTA'] = df["Respondent ZCTA"].astype(str)
print("Respondent ZCTA column contains a total {} unique zip codes. I will use this list to do web scraping to get the relevant census files".format(len(df['Respondent ZCTA'].unique())))
Respondent_ZCTA_list = list(set(df['Respondent ZCTA']))
print(Respondent_ZCTA_list)

Respondent ZCTA column contains a total 189 unique zip codes. I will use this list to do web scraping to get the relevant census files
['10023', '11217', '10302', '11366', '11355', '10007', '10003', '11385', '10036', '11105', '10307', '10005', '10469', '10301', '11207', '11234', '11218', '11213', '11373', '10033', '11362', '10457', '11691', '10001', '11214', '11224', '11231', '11216', '10464', '11365', '10020', '11103', '11430', '10024', '10019', '11230', '11416', '10014', '11697', '11423', '11434', '10455', '11413', '11212', '10021', '10452', '11204', '10454', '10128', '11226', '11414', '10461', '11215', '10453', '10312', '10463', '11692', '11367', '11420', '11377', '11429', '11378', '10304', '11417', '11368', '10303', '11436', '10112', '11221', '10012', '10199', '10035', '11411', '11360', '11106', '10040', '11237', '10465', '11359', '11432', '10030', '11101', '10468', '10306', '10309', '11004', '10470', '10473', '11363', '10002', '11104', '10314', '10456', '10022', '11109', '10037', 

# Web Scraping

In [396]:
pip install selenium

Collecting selenium
  Downloading selenium-4.0.0-py3-none-any.whl (954 kB)
[K     |████████████████████████████████| 954 kB 1.6 MB/s eta 0:00:01
[?25hCollecting trio~=0.17
  Downloading trio-0.19.0-py3-none-any.whl (356 kB)
[K     |████████████████████████████████| 356 kB 16.2 MB/s eta 0:00:01
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting outcome
  Downloading outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.0.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.12.0-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 12.2 MB/s eta 0:00:01
[?25hInstalling collected packages: outcome, h11, wsproto, trio, trio-websocket, selenium
Successfully installed h11-0.12.0 outcome-1.1.0 selenium-4.0.0 trio-0.19.0 trio-websocket-0.9.2 wsproto-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install chromedriver-binary

zsh:1: = not found
Note: you may need to restart the kernel to use updated packages.


In [124]:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import chromedriver_binary
#import get to call a get request on the site
from requests import get
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import chromedriver_binary

driver = webdriver.Chrome("/Users/allisongao/Downloads/chromedriver 4")

  driver = webdriver.Chrome("/Users/allisongao/Downloads/chromedriver 4")


In [1165]:
ZCTA_url =[]

for x in Respondent_ZCTA_list:
    ZCTA_url.append("https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US" + x + "&tid=ACSST5Y2019.S1901&hidePreview=true")
    
    
ZCTA_url

['https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10023&tid=ACSST5Y2019.S1901&hidePreview=true',
 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11217&tid=ACSST5Y2019.S1901&hidePreview=true',
 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10302&tid=ACSST5Y2019.S1901&hidePreview=true',
 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11366&tid=ACSST5Y2019.S1901&hidePreview=true',
 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11355&tid=ACSST5Y2019.S1901&hidePreview=true',
 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10007&tid=ACSST5Y2019.S1901&hidePreview=true',
 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10003&tid=ACSST5Y2019.S1901&hidePreview=true',
 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11385&tid=ACSST5Y2019.S1901&hidePreview=true',
 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10036&tid=ACSST5Y2019.S1901&hi

In [1167]:
# divide up the links into smaller groups to be mindful of the computer's limit capacity

def divide_chunks(l, n):
      
    for i in range(0, len(l), n): 
        yield l[i:i + n]
        
x = list(divide_chunks(ZCTA_url, n=5))
print (x)

[['https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10023&tid=ACSST5Y2019.S1901&hidePreview=true', 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11217&tid=ACSST5Y2019.S1901&hidePreview=true', 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10302&tid=ACSST5Y2019.S1901&hidePreview=true', 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11366&tid=ACSST5Y2019.S1901&hidePreview=true', 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11355&tid=ACSST5Y2019.S1901&hidePreview=true'], ['https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10007&tid=ACSST5Y2019.S1901&hidePreview=true', 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10003&tid=ACSST5Y2019.S1901&hidePreview=true', 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11385&tid=ACSST5Y2019.S1901&hidePreview=true', 'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10036&tid=ACSST5Y2019.S1901&hidePre

In [None]:
#code to auto click on the download button to get all the census data
import time
# #importing webdriver from selenium
# from selenium import webdriver
 
# # Here Chrome  will be used
# driver = webdriver.Chrome("/Users/allisongao/Downloads/chromedriver 4")
 
# firs_group = ['https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11229&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11215&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11435&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11208&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11367&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US11206&tid=ACSST5Y2019.S1901&hidePreview=true',
#  'https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US10458&tid=ACSST5Y2019.S1901&hidePreview=true']
    
# # URL of website

# for url in firs_group:
#     driver.get(url)
    
#     downloadBtn = driver.find_element_by_xpath("//*[text() = 'Download']")
#     downloadBtn.click()
#     time.sleep(5)
#     downloadBtn2 = driver.find_element_by_xpath("//button[@class='aqua-button mt-5']")
#     downloadBtn2.click()
#     time.sleep(2)
#     downloadBtn3=driver.find_element_by_xpath('//*[@id="table-download-now-button"]')
#     downloadBtn3.click()


for x in Respondent_ZCTA_list_39:
    
#     link="https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US" + x + "&tid=ACSST5Y2019.S1901&hidePreview=true"
    link= x
    driver.get(link)
    downloadBtn = driver.find_element_by_xpath("//*[text() = 'Download']")
    downloadBtn.click()
    time.sleep(60)
    downloadBtn2 = driver.find_element_by_xpath("//button[@class='aqua-button mt-5']")
    downloadBtn2.click()
    time.sleep(60)
    downloadBtn3=driver.find_element_by_xpath('//*[@id="table-download-now-button"]')
    downloadBtn3.click()
    time.sleep(60)

# load census data

In [1098]:
# load all the census data by zip code in and concat it into a single dataframe

zcta_df = []

for x in range(1, 190):
    data = pd.read_csv(str(x) + "ACS.csv")
    
    zcta_df.append(data)
    
#merge them all horizontally 
zcta_df = pd.concat(zcta_df)   


#drop duplicated rows
zcta_df.drop_duplicates(keep='first', inplace=True)

zcta_df = zcta_df.reset_index(drop=True)

#second row is an observation for the entire country, let's drop it
zcta_df.drop(2, inplace=True)

 #change the first row for the header
new_header = zcta_df.iloc[0]
zcta_df = zcta_df[1:]
zcta_df.columns = new_header 

#make sure df is in good shape
zcta_df.head(5)

Unnamed: 0,id,Geographic Area Name,Estimate!!Households!!Total,Margin of Error!!Households!!Total,"Estimate!!Households!!Total!!Less than $10,000","Margin of Error!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Margin of Error!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Margin of Error!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Margin of Error!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Margin of Error!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999","Margin of Error!!Households!!Total!!$50,000 to $74,999","Estimate!!Households!!Total!!$75,000 to $99,999","Margin of Error!!Households!!Total!!$75,000 to $99,999","Estimate!!Households!!Total!!$100,000 to $149,999","Margin of Error!!Households!!Total!!$100,000 to $149,999","Estimate!!Households!!Total!!$150,000 to $199,999","Margin of Error!!Households!!Total!!$150,000 to $199,999","Estimate!!Households!!Total!!$200,000 or more","Margin of Error!!Households!!Total!!$200,000 or more",Estimate!!Households!!Median income (dollars),Margin of Error!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),Margin of Error!!Households!!Mean income (dollars),Estimate!!Households!!PERCENT ALLOCATED!!Household income in the past 12 months,Margin of Error!!Households!!PERCENT ALLOCATED!!Household income in the past 12 months,Estimate!!Households!!PERCENT ALLOCATED!!Family income in the past 12 months,Margin of Error!!Households!!PERCENT ALLOCATED!!Family income in the past 12 months,Estimate!!Households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Estimate!!Families!!Total,Margin of Error!!Families!!Total,"Estimate!!Families!!Total!!Less than $10,000","Margin of Error!!Families!!Total!!Less than $10,000","Estimate!!Families!!Total!!$10,000 to $14,999","Margin of Error!!Families!!Total!!$10,000 to $14,999","Estimate!!Families!!Total!!$15,000 to $24,999","Margin of Error!!Families!!Total!!$15,000 to $24,999","Estimate!!Families!!Total!!$25,000 to $34,999","Margin of Error!!Families!!Total!!$25,000 to $34,999","Estimate!!Families!!Total!!$35,000 to $49,999","Margin of Error!!Families!!Total!!$35,000 to $49,999","Estimate!!Families!!Total!!$50,000 to $74,999","Margin of Error!!Families!!Total!!$50,000 to $74,999","Estimate!!Families!!Total!!$75,000 to $99,999","Margin of Error!!Families!!Total!!$75,000 to $99,999","Estimate!!Families!!Total!!$100,000 to $149,999","Margin of Error!!Families!!Total!!$100,000 to $149,999","Estimate!!Families!!Total!!$150,000 to $199,999","Margin of Error!!Families!!Total!!$150,000 to $199,999","Estimate!!Families!!Total!!$200,000 or more","Margin of Error!!Families!!Total!!$200,000 or more",Estimate!!Families!!Median income (dollars),Margin of Error!!Families!!Median income (dollars),Estimate!!Families!!Mean income (dollars),Margin of Error!!Families!!Mean income (dollars),Estimate!!Families!!PERCENT ALLOCATED!!Household income in the past 12 months,Margin of Error!!Families!!PERCENT ALLOCATED!!Household income in the past 12 months,Estimate!!Families!!PERCENT ALLOCATED!!Family income in the past 12 months,Margin of Error!!Families!!PERCENT ALLOCATED!!Family income in the past 12 months,Estimate!!Families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Estimate!!Married-couple families!!Total,Margin of Error!!Married-couple families!!Total,"Estimate!!Married-couple families!!Total!!Less than $10,000","Margin of Error!!Married-couple families!!Total!!Less than $10,000","Estimate!!Married-couple families!!Total!!$10,000 to $14,999","Margin of Error!!Married-couple families!!Total!!$10,000 to $14,999","Estimate!!Married-couple families!!Total!!$15,000 to $24,999","Margin of Error!!Married-couple families!!Total!!$15,000 to $24,999","Estimate!!Married-couple families!!Total!!$25,000 to $34,999","Margin of Error!!Married-couple families!!Total!!$25,000 to $34,999","Estimate!!Married-couple families!!Total!!$35,000 to $49,999","Margin of Error!!Married-couple families!!Total!!$35,000 to $49,999","Estimate!!Married-couple families!!Total!!$50,000 to $74,999","Margin of Error!!Married-couple families!!Total!!$50,000 to $74,999","Estimate!!Married-couple families!!Total!!$75,000 to $99,999","Margin of Error!!Married-couple families!!Total!!$75,000 to $99,999","Estimate!!Married-couple families!!Total!!$100,000 to $149,999","Margin of Error!!Married-couple families!!Total!!$100,000 to $149,999","Estimate!!Married-couple families!!Total!!$150,000 to $199,999","Margin of Error!!Married-couple families!!Total!!$150,000 to $199,999","Estimate!!Married-couple families!!Total!!$200,000 or more","Margin of Error!!Married-couple families!!Total!!$200,000 or more",Estimate!!Married-couple families!!Median income (dollars),Margin of Error!!Married-couple families!!Median income (dollars),Estimate!!Married-couple families!!Mean income (dollars),Margin of Error!!Married-couple families!!Mean income (dollars),Estimate!!Married-couple families!!PERCENT ALLOCATED!!Household income in the past 12 months,Margin of Error!!Married-couple families!!PERCENT ALLOCATED!!Household income in the past 12 months,Estimate!!Married-couple families!!PERCENT ALLOCATED!!Family income in the past 12 months,Margin of Error!!Married-couple families!!PERCENT ALLOCATED!!Family income in the past 12 months,Estimate!!Married-couple families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Married-couple families!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Estimate!!Nonfamily households!!Total,Margin of Error!!Nonfamily households!!Total,"Estimate!!Nonfamily households!!Total!!Less than $10,000","Margin of Error!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Margin of Error!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Margin of Error!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Margin of Error!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Margin of Error!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999","Margin of Error!!Nonfamily households!!Total!!$50,000 to $74,999","Estimate!!Nonfamily households!!Total!!$75,000 to $99,999","Margin of Error!!Nonfamily households!!Total!!$75,000 to $99,999","Estimate!!Nonfamily households!!Total!!$100,000 to $149,999","Margin of Error!!Nonfamily households!!Total!!$100,000 to $149,999","Estimate!!Nonfamily households!!Total!!$150,000 to $199,999","Margin of Error!!Nonfamily households!!Total!!$150,000 to $199,999","Estimate!!Nonfamily households!!Total!!$200,000 or more","Margin of Error!!Nonfamily households!!Total!!$200,000 or more",Estimate!!Nonfamily households!!Median income (dollars),Margin of Error!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),Margin of Error!!Nonfamily households!!Mean income (dollars),Estimate!!Nonfamily households!!PERCENT ALLOCATED!!Household income in the past 12 months,Margin of Error!!Nonfamily households!!PERCENT ALLOCATED!!Household income in the past 12 months,Estimate!!Nonfamily households!!PERCENT ALLOCATED!!Family income in the past 12 months,Margin of Error!!Nonfamily households!!PERCENT ALLOCATED!!Family income in the past 12 months,Estimate!!Nonfamily households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months,Margin of Error!!Nonfamily households!!PERCENT ALLOCATED!!Nonfamily income in the past 12 months
1,8600000US10005,ZCTA5 10005,4363,396,4.5,2.3,0.3,0.5,2.6,2.0,0.3,0.4,1.7,1.4,6.2,3.2,5.2,2.9,16.3,5.3,16.4,5.2,46.5,6.6,189702,15832,225453,25771,21.8,(X),(X),(X),(X),(X),1422,296,0.9,1.5,0.0,2.3,2.0,3.3,0.8,1.2,0.2,0.3,2.7,3.9,4.4,4.5,15.3,9.9,8.1,5.7,65.5,11.7,241094,37175,312806,57275,(X),(X),21.3,(X),(X),(X),1268,279,1.0,1.7,0.0,2.5,0.0,2.5,0.0,2.5,0.2,0.4,3.1,4.4,5.0,5.0,17.1,11.1,8.0,6.3,65.6,12.4,239800,34372,N,N,(X),(X),(X),(X),(X),(X),2941,432,6.3,3.4,0.5,0.8,2.9,2.6,0.0,1.1,2.4,2.1,7.9,4.2,5.6,3.8,16.9,6.8,20.7,7.1,36.9,8.1,180389,15320,180629,19749,(X),(X),(X),(X),22.0,(X)
3,8600000US10002,ZCTA5 10002,33459,574,14.7,1.7,11.8,1.3,13.3,1.7,8.6,1.2,10.9,1.6,11.0,1.4,8.6,1.3,9.7,1.2,4.3,0.7,7.1,0.8,36982,2384,69323,3102,32.0,(X),(X),(X),(X),(X),16723,775,8.5,1.9,10.2,2.0,15.6,2.5,10.8,1.9,14.4,2.1,12.8,2.0,7.6,1.4,8.8,1.6,4.1,1.0,7.2,1.1,40045,3383,72064,4340,(X),(X),36.6,(X),(X),(X),9865,727,4.5,1.6,11.5,2.6,14.4,3.0,11.1,2.6,13.6,3.0,11.6,2.4,7.8,2.1,9.5,2.4,5.3,1.7,10.7,1.9,43952,5337,86881,6899,(X),(X),(X),(X),(X),(X),16736,868,20.9,2.6,13.5,2.1,11.2,2.4,6.2,1.3,7.8,2.1,9.4,2.0,9.5,1.8,10.6,1.9,4.2,1.0,6.7,1.1,31767,4138,65351,4947,(X),(X),(X),(X),27.1,(X)
4,8600000US10003,ZCTA5 10003,26043,638,5.8,1.3,2.5,1.0,4.8,1.2,4.9,1.3,6.1,1.5,12.5,2.1,8.8,1.7,13.4,1.8,11.1,1.6,30.2,2.4,118161,10235,196931,12709,31.2,(X),(X),(X),(X),(X),7504,569,1.4,1.1,1.5,1.0,1.2,1.2,3.0,1.6,5.3,2.1,7.9,3.4,8.4,2.5,14.1,3.6,10.5,2.7,46.7,4.3,183787,27211,320538,32403,(X),(X),34.5,(X),(X),(X),6544,542,0.5,0.7,0.5,0.5,0.7,0.9,2.4,1.7,6.1,2.4,7.2,3.6,9.1,2.7,14.0,3.9,10.5,2.8,49.0,4.7,193778,25886,N,N,(X),(X),(X),(X),(X),(X),18539,718,7.6,1.8,3.0,1.3,6.2,1.6,5.5,1.6,6.5,2.0,14.4,2.5,8.9,1.9,13.2,2.1,11.6,2.0,23.2,2.6,95812,6867,146060,9333,(X),(X),(X),(X),29.9,(X)
5,8600000US11215,ZCTA5 11215,28445,510,4.0,0.9,2.1,0.6,3.2,0.7,3.4,0.8,4.4,0.9,9.1,1.2,10.6,1.4,19.1,1.7,13.9,1.5,30.2,1.7,132091,4395,183544,7698,31.3,(X),(X),(X),(X),(X),16621,569,1.8,0.7,0.8,0.5,2.1,0.8,1.8,0.8,3.9,1.0,6.9,1.3,7.9,1.4,17.8,2.0,15.5,1.8,41.5,2.5,170511,9001,229884,11295,(X),(X),29.9,(X),(X),(X),13954,561,1.1,0.7,0.0,0.1,1.2,0.6,1.3,0.7,3.1,1.0,5.8,1.4,6.4,1.3,17.2,2.1,16.0,2.1,47.8,2.8,188946,12173,N,N,(X),(X),(X),(X),(X),(X),11824,595,7.4,1.8,4.2,1.4,5.3,1.5,5.5,1.5,5.8,1.4,12.7,2.0,15.0,2.3,21.0,2.6,11.4,2.3,11.9,1.9,91672,4607,108235,5400,(X),(X),(X),(X),31.5,(X)
6,8600000US10031,ZCTA5 10031,21744,574,9.8,1.9,7.1,1.6,9.8,1.6,9.4,1.9,11.8,1.9,14.8,2.0,11.6,1.8,13.1,1.7,5.7,1.4,7.0,1.3,53660,3965,78767,4819,38.3,(X),(X),(X),(X),(X),10591,512,6.1,1.9,5.2,2.0,11.1,2.4,11.1,2.9,11.0,2.2,16.8,3.3,11.1,2.4,13.2,2.5,6.2,2.3,8.1,1.9,54699,4132,85680,7381,(X),(X),40.7,(X),(X),(X),5177,495,2.0,1.4,3.5,2.5,8.4,2.7,10.8,3.8,10.0,3.2,14.5,4.2,11.1,3.2,19.9,4.3,7.4,3.1,12.5,3.4,78197,12406,N,N,(X),(X),(X),(X),(X),(X),11153,591,14.5,3.0,8.8,2.4,10.0,2.2,8.0,2.1,12.6,2.6,12.8,2.6,11.7,2.7,10.8,2.2,5.4,1.8,5.5,1.9,45735,4549,68336,6814,(X),(X),(X),(X),35.0,(X)


In [1099]:
# need to convert the Geographic Area Name column so that I can use this to merge it to the main df
zcta_df["Geographic Area Name"]= zcta_df["Geographic Area Name"].str.replace('ZCTA5', '')
zcta_df['Geographic Area Name'] = zcta_df['Geographic Area Name'].astype(str)
zcta_df['Geographic Area Name'] = zcta_df['Geographic Area Name'].str.lstrip()

In [1100]:
for x in list (zcta_df.columns.tolist()):
    print(x)


id
Geographic Area Name
Estimate!!Households!!Total
Margin of Error!!Households!!Total
Estimate!!Households!!Total!!Less than $10,000
Margin of Error!!Households!!Total!!Less than $10,000
Estimate!!Households!!Total!!$10,000 to $14,999
Margin of Error!!Households!!Total!!$10,000 to $14,999
Estimate!!Households!!Total!!$15,000 to $24,999
Margin of Error!!Households!!Total!!$15,000 to $24,999
Estimate!!Households!!Total!!$25,000 to $34,999
Margin of Error!!Households!!Total!!$25,000 to $34,999
Estimate!!Households!!Total!!$35,000 to $49,999
Margin of Error!!Households!!Total!!$35,000 to $49,999
Estimate!!Households!!Total!!$50,000 to $74,999
Margin of Error!!Households!!Total!!$50,000 to $74,999
Estimate!!Households!!Total!!$75,000 to $99,999
Margin of Error!!Households!!Total!!$75,000 to $99,999
Estimate!!Households!!Total!!$100,000 to $149,999
Margin of Error!!Households!!Total!!$100,000 to $149,999
Estimate!!Households!!Total!!$150,000 to $199,999
Margin of Error!!Households!!Total!!$

In [1101]:
# the census data df has a lot of information
# isolate the relevant columns for the final df

columns = ['Geographic Area Name',
           
            "Estimate!!Nonfamily households!!Median income (dollars)",
            "Estimate!!Nonfamily households!!Mean income (dollars)",
            "Estimate!!Nonfamily households!!Total!!Less than $10,000",
            "Estimate!!Nonfamily households!!Total!!$10,000 to $14,999",
            "Estimate!!Nonfamily households!!Total!!$15,000 to $24,999",
            "Estimate!!Nonfamily households!!Total!!$25,000 to $34,999",
            "Estimate!!Nonfamily households!!Total!!$35,000 to $49,999",
            "Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",
           
           
           
           "Estimate!!Households!!Median income (dollars)",
           "Estimate!!Households!!Mean income (dollars)",
           "Estimate!!Households!!Total!!Less than $10,000",
            "Estimate!!Households!!Total!!$10,000 to $14,999",
            "Estimate!!Households!!Total!!$15,000 to $24,999",
            "Estimate!!Households!!Total!!$25,000 to $34,999",
            "Estimate!!Households!!Total!!$35,000 to $49,999",
            "Estimate!!Households!!Total!!$50,000 to $74,999"]
           

In [1102]:
zcta_df = zcta_df[columns]
zcta_df.head(8)

Unnamed: 0,Geographic Area Name,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999"
1,10005,180389,180629,6.3,0.5,2.9,0.0,2.4,7.9,189702,225453,4.5,0.3,2.6,0.3,1.7,6.2
3,10002,31767,65351,20.9,13.5,11.2,6.2,7.8,9.4,36982,69323,14.7,11.8,13.3,8.6,10.9,11.0
4,10003,95812,146060,7.6,3.0,6.2,5.5,6.5,14.4,118161,196931,5.8,2.5,4.8,4.9,6.1,12.5
5,11215,91672,108235,7.4,4.2,5.3,5.5,5.8,12.7,132091,183544,4.0,2.1,3.2,3.4,4.4,9.1
6,10031,45735,68336,14.5,8.8,10.0,8.0,12.6,12.8,53660,78767,9.8,7.1,9.8,9.4,11.8,14.8
7,11208,24462,37087,23.3,10.5,17.0,11.7,12.3,10.6,42403,60092,15.0,5.9,11.5,9.7,13.5,15.1
8,10023,100531,147850,8.5,3.5,6.0,5.9,4.9,11.0,132605,228373,5.7,2.6,5.4,4.9,3.7,8.0
9,11217,94148,120780,10.1,2.9,7.0,4.8,6.2,9.4,119375,172251,6.7,1.9,5.4,5.0,6.4,8.7


In [1103]:
zcta_df = zcta_df.rename(columns={"Geographic Area Name": 'Respondent Address (Zip Code)'})

In [1116]:
zcta_df.iloc[0,0]

'10005'

In [1117]:
df.iloc[7550,17]

'10005'

# merge main df with the census df

In [1143]:
merged_train_df = pd.merge(df, zcta_df, how="outer", on=["Respondent Address (Zip Code)"])

In [1144]:
merged_train_df.sample(5)

Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999"
58702,0176571818,2010-02-05,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,KEYSPAN,QUEENS,9837.0,99.0,UNKNOWN,166 STREET,JAMAICA,11432,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,2400.0,830.0,AD01,A.C. 19-102 I,USE OPENING OF STREET W O PERMIT,800.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 9837.0 99.0 166 STREET JAMAICA 11432 N...,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW...,1.0,104119.0,125332.0,10.2,2.6,4.4,5.1,5.2,10.4,129248.0,186989.0,6.7,2.6,4.7,4.1,5.1,8.0
6759,0201266809,2017-10-22,POLICE DEPARTMENT,GERMAN JR,RIVERA,BROOKLYN,3985.0,12.0,676,LIBERTY AVENUE,BROOKLYN,11208,NEW YORK,BROOKLYN,387,CLEVELAND STREET,BROOKLYN,11208,NEW YORK,UNKNOWN,25.0,0.0,AX25,AC 10-125,OPEN CONTAINER CONSUMPTION OF ALCOHOL ON STREETS,25.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 3985.0 12.0 676 LIBERTY AVENUE BROOKL...,BROOKLYN 387 CLEVELAND STREET BROOKLYN 11208 N...,1.0,24462.0,37087.0,23.3,10.5,17.0,11.7,12.3,10.6,42403.0,60092.0,15.0,5.9,11.5,9.7,13.5,15.1
26581,042648887N,2019-01-14,SANITATION POLICE,UNKNOWN,CHASE BANK,BRONX,4997.0,42.0,725,EAST 233 STREET,BRONX,10466,NEW YORK,BRONX,725,EAST 233 STREET,BRONX,10466,NEW YORK,UNKNOWN,280.0,250.0,AS97,16-118 2,"DIRTY SIDEWALK,FAIL TO CLEAN 18 INTO STREET,SI...",250.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BRONX 4997.0 42.0 725 EAST 233 STREET BRONX 10...,BRONX 725 EAST 233 STREET BRONX 10466 NEW YORK,1.0,28310.0,43344.0,23.5,10.9,12.1,10.5,9.7,14.0,58393.0,73965.0,10.9,5.7,8.8,9.4,9.9,16.4
197600,0112278421,1999-07-15,POLICE DEPARTMENT,LETIZIA,MENDOZA,MANHATTAN,763.0,34.0,601,8 AVENUE,MANHATTAN,10018,NEW YORK,QUEENS,111,ROOSEVELT AVE,FAR ROCKAWAY,11096,NEW YORK,UNKNOWN,1000.0,0.0,AF01,17-307 A,UNLICENSED FOOD VENDOR,500.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 763.0 34.0 601 8 AVENUE MANHATTAN 10...,QUEENS 111 ROOSEVELT AVE FAR ROCKAWAY 11096 NE...,0.0,,,,,,,,,,,,,,,,
190100,0114221618,2001-12-31,POLICE DEPARTMENT,SIXTO,MATIFNZO,MANHATTAN,1277.0,2.0,505,5 AVENUE,MANHATTAN,10017,NEW YORK,MANHATTAN,430,E 30TH ST,NEW YORK,10016,NEW YORK,MANHATTAN,25.0,UNKNOWN,AG11,20-465(A),"VENDING ON SIDEWALK LESS THAN 12 FT. WIDE, OR ...",100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 1277.0 2.0 505 5 AVENUE MANHATTAN 10...,MANHATTAN 430 E 30TH ST NEW YORK 10016 NEW YORK,0.0,100619.0,133685.0,6.0,4.1,4.6,4.8,5.8,12.4,126628.0,179164.0,4.8,3.4,4.0,3.8,4.9,10.2


In [1145]:
merged_train_df.shape

(213244, 53)

In [1146]:
#moved the target column to the first for easier visual
first_column = merged_train_df.pop('Hearing Result')
merged_train_df.insert(0, 'Hearing Result', first_column)
merged_train_df.head(2)

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Estimate!!Nonfamily households!!Median income (dollars),Estimate!!Nonfamily households!!Mean income (dollars),"Estimate!!Nonfamily households!!Total!!Less than $10,000","Estimate!!Nonfamily households!!Total!!$10,000 to $14,999","Estimate!!Nonfamily households!!Total!!$15,000 to $24,999","Estimate!!Nonfamily households!!Total!!$25,000 to $34,999","Estimate!!Nonfamily households!!Total!!$35,000 to $49,999","Estimate!!Nonfamily households!!Total!!$50,000 to $74,999",Estimate!!Households!!Median income (dollars),Estimate!!Households!!Mean income (dollars),"Estimate!!Households!!Total!!Less than $10,000","Estimate!!Households!!Total!!$10,000 to $14,999","Estimate!!Households!!Total!!$15,000 to $24,999","Estimate!!Households!!Total!!$25,000 to $34,999","Estimate!!Households!!Total!!$35,000 to $49,999","Estimate!!Households!!Total!!$50,000 to $74,999"
0,2.0,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,UNKNOWN,UNKNOWN,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,SAU: MANH,0.0,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMAT...,280.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 ...,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,32281,49761,15.2,14.0,14.0,8.8,13.1,14.0,64631,87355,6.8,6.3,9.5,7.8,11.3,13.4
1,0.0,176434684,2010-09-10,POLICE DEPARTMENT,KONSTANTIN,TSIPNYATOV,MANHATTAN,1016.0,36.0,1515,BROADWAY,NEW YORK,10036,NEW YORK,BROOKLYN,1815,EAST 17 STREET,BROOKLYN,11229,NEW YORK,UNKNOWN,1000.0,0.0,AG21,20-465.1,VENDING AT TIMES PLACES RESTRICTED BY RULE OF ...,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 1016.0 36.0 1515 BROADWAY NEW YORK 1...,BROOKLYN 1815 EAST 17 STREET BROOKLYN 11229 NE...,32281,49761,15.2,14.0,14.0,8.8,13.1,14.0,64631,87355,6.8,6.3,9.5,7.8,11.3,13.4


In [1147]:
merged_train_df["Hearing Result"].value_counts(normalize=True)

0.0    0.429200
1.0    0.273505
3.0    0.150786
2.0    0.146509
Name: Hearing Result, dtype: float64

### for the respondents, some are individuals and some are commerical entities as indicated on some rows as "LLC." Therefore, need to create a separate column labeling whether the respondent is a person or otherwise

In [1168]:
# pd.set_option('display.max_rows', 1000000000)
merged_df["Respondent First Name"].value_counts()

UNKNOWN                 72283
MARIA                    1503
JOSE                     1385
LLC                      1345
MOHAMED                   992
MOHAMMED                  928
JUAN                      860
INC                       854
JOHN                      829
MICHAEL                   788
JOSEPH                    786
DAVID                     637
AHMED                     579
JAMES                     572
ANTHONY                   565
WILLIAM                   537
CARLOS                    518
ROBERT                    518
MOHAMMAD                  516
LUIS                      507
C                         464
ROSA                      456
GEORGE                    438
PEDRO                     412
MIGUEL                    409
MANUEL                    402
M                         398
ABDUL                     360
MD                        354
THOMAS                    322
CHARLES                   317
RICHARD                   314
RAFAEL                    312
ANTONIO   

In [1169]:
merged_df.shape

(213244, 53)

In [1170]:
key_words_first_name = ["INC", "CORP", "MANAGEMENT","BUS SERVICE AND TOUR", 
"SCIENCES DIVISION",
"HOUSING DEVELOPMENT"]  

In [1171]:
key_words_last_name = ["INC", "CORP", "MANAGEMENT","FIRST HOME PROPERTIES",
"COR",
"3 NYC",
"HPENY HOUSING DEVELOPMENT FUND",
"RT HUDSON ELEMENTARY SCHOOL",
"DEVELOPMENT CO",
"HOLDING CO",
"BANANA KELLY HSG DEVE",
"AQUA PROPERTIES",
"THE BROOKLYN UNION GAS CO",
"VANDERBILT MORTGAGE AND FINANC",
"AMERICAN BROKERS CONDUIT",
"CMI BUSINESS FURNITURE",
"FRIENDS LAND DEVELOP",
"HARBOR VIEW PROP LTD",
"INGERSOLL TENANT ASSOC",
"THE BROOKLYN UNION GAS COMPANY ",
"PLAZA CONSTRUCTION",
"AUTO AUCTION"
"FIRST HOME PROP",
"1046 WASHINGTON AVE HDFC",
"DIEGO BEEKMAN MUTUAL HOUSING A",
"REV MANAGEMENT",
"LANDSLIDE PROPERTIES",
"NEIGHBORHOOD RESTORE HOUSING D",
"HTB ENTERPRISES LTD",
"ALLIANCE OF INDIVIDUA",
"WJR PROPERTIES INC",
"WJR PROPERTIES INC",
"KEYSPAN ENERGY DELIVERY NYC",
"RLTY",
"FIRST UNITED MORTGAGE BANKING",
"ASSET PLUSS MANAGEMENT SERVICE",
"KEYSPAN ENERGY DELIVERY N Y C",
"WELLS FARGO HOME MORT",
"ALLIANCE OF INDIVIDUAL",
"NEIGHBORHOOD RESTORE HDFC",
"WILMINGTON SAVINGS FUND SOCIET",
"YOUNG ISRAEL OF AVENUE K",
"FREMONT INVESTMENT LOAN",
"BELL ATLANTIC",
"EM ESS PETROLEUM CORP",
"PI CONSTRUCTION SERVICE INC",
"US BANK NATIONAL ASSOCIATION",
"CONKLIN MGMT CO",
"CON EDISON",
"CONSOLIDATED EDISON",
"EMPIRE CITY SUBWAY",
"DEUTSCHE BANK NATIONAL TRUST C",
"NATIONAL GRID",
"CONTACT HOLDINGS CORP",
"U S BANK NATIONAL ASSOCIATION",
"G G ASSOCIATES",
"WELLS FARGO BANK",
"LUCKY SEAFOOD",
"AGENT OWNER",
"FEDERAL NATIONAL MORTGAGE ASSO",
"AMENCAN HOME MORTGAGE",
"HOMESIDE LENDING",
"HSBC BANK USA",
"HSBC BANK USA NA",
"HIGH STATE RLTY CORP",
"NYC HOUSING AUTHORITY",
"PLAZA CONSTRUCTION CORP",
"EASY STREET PLUMBING INC",
"1249 WEBSTER AVE RLTY",
"DEVELOP", "BANK", "RESOURCES", "SERVICES", "LLC", "SCHOOL", "HOME","NATIONAL GRID","SAM CONEY ISLAND LLC"
                    "ALL PHASE PLUMBING CORP","ERCAT REALTY CORP"]

In [1172]:
merged_df['Respondent Last Name'] = merged_df['Respondent Last Name'].astype(str)

In [88]:
# def word_checker(sentence):
#     if any(word in key_words_last_name for word in sentence.lower().split()):
#         return 'Not Person'
#     else:
#         return 'Person'

In [89]:
# merged_df['Respondent Status'] = merged_df['Respondent Last Name'].apply(word_checker)  

In [90]:
def get_word(my_string):
    for word in key_words_last_name:
        for x in merged_df["Respondent Last Name"]:
            if word.lower() in my_string.lower():
                return "Not Person"
            else:
                return "Person"

In [91]:
merged_df["Respondent Status"]= merged_df["Respondent Last Name"].apply(get_word)

In [92]:
merged_df.sample(2)

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,COUNT PACIFIC ISLANDER,PERCENT PACIFIC ISLANDER,COUNT HISPANIC LATINO,PERCENT HISPANIC LATINO,COUNT AMERICAN INDIAN,PERCENT AMERICAN INDIAN,COUNT ASIAN NON HISPANIC,PERCENT ASIAN NON HISPANIC,COUNT WHITE NON HISPANIC,PERCENT WHITE NON HISPANIC,COUNT BLACK NON HISPANIC,PERCENT BLACK NON HISPANIC,COUNT OTHER ETHNICITY,PERCENT OTHER ETHNICITY,COUNT ETHNICITY UNKNOWN,PERCENT ETHNICITY UNKNOWN,COUNT ETHNICITY TOTAL,PERCENT ETHNICITY TOTAL,COUNT PERMANENT RESIDENT ALIEN,PERCENT PERMANENT RESIDENT ALIEN,COUNT US CITIZEN,PERCENT US CITIZEN,COUNT OTHER CITIZEN STATUS,PERCENT OTHER CITIZEN STATUS,COUNT CITIZEN STATUS UNKNOWN,PERCENT CITIZEN STATUS UNKNOWN,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL,Respondent Status
152990,2,0178948211,2011-09-15,POLICE DEPARTMENT,MOHAMED M,EL AKHRAS,MANHATTAN,UNKNOWN,UNKNOWN,2,PENN PLAZA,NEW YORK,10001,NEW YORK,BROOKLYN,7209,5 AVENUE,BROOKLYN,11209,NEW YORK,MANHATTAN,UNKNOWN,UNKNOWN,AG26,20-465 P,ILLEGAL USE OF ELECTRICAL OIL GAS POWERED DEVICE,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 2 PENN PLAZA NEW YORK 10001 NEW YORK,BROOKLYN 7209 5 AVENUE BROOKLYN 11209 NEW YORK,11209,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,Person
138975,3,042584192R,2014-07-07,SANITATION POLICE,UNKNOWN,410 REALTY LLC,BRONX,2374.0,78.0,410,EAST 153 STREET,BRONX,10455,NEW YORK,BRONX,410,EAST 153 STREET,BRONX,10455,NEW YORK,MANHATTAN,100.0,106.0,AS26,A.C. 16-118 2 A,FAILURE TO CLEAN 18 INCHES INTO STREET,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BRONX 2374.0 78.0 410 EAST 153 STREET BRONX 10...,BRONX 410 EAST 153 STREET BRONX 10455 NEW YORK,10455,27,17,0.63,10,0.37,0,0,27,100,0,0.0,5,0.19,0,0.0,0,0.0,0,0.0,20,0.74,2,0.07,0,0.0,27,100,1,0.04,25,0.93,1,0.04,0,0,27,100,7,0.26,20,0.74,0,0,27,100,Person


In [93]:
merged_df['Respondent Status'].value_counts()

Person        197595
Not Person     10427
Name: Respondent Status, dtype: int64

In [94]:
merged_df.loc[merged_df['Respondent Last Name'] == "NATIONAL GRID"]

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,COUNT PACIFIC ISLANDER,PERCENT PACIFIC ISLANDER,COUNT HISPANIC LATINO,PERCENT HISPANIC LATINO,COUNT AMERICAN INDIAN,PERCENT AMERICAN INDIAN,COUNT ASIAN NON HISPANIC,PERCENT ASIAN NON HISPANIC,COUNT WHITE NON HISPANIC,PERCENT WHITE NON HISPANIC,COUNT BLACK NON HISPANIC,PERCENT BLACK NON HISPANIC,COUNT OTHER ETHNICITY,PERCENT OTHER ETHNICITY,COUNT ETHNICITY UNKNOWN,PERCENT ETHNICITY UNKNOWN,COUNT ETHNICITY TOTAL,PERCENT ETHNICITY TOTAL,COUNT PERMANENT RESIDENT ALIEN,PERCENT PERMANENT RESIDENT ALIEN,COUNT US CITIZEN,PERCENT US CITIZEN,COUNT OTHER CITIZEN STATUS,PERCENT OTHER CITIZEN STATUS,COUNT CITIZEN STATUS UNKNOWN,PERCENT CITIZEN STATUS UNKNOWN,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL,Respondent Status
58597,3,0182000693,2013-10-12,POLICE DEPT,UNKNOWN,NATIONAL GRID,BROOKLYN,UNKNOWN,UNKNOWN,136,SOUTH 4TH STREET,BROOKLYN,11211,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,250.0,234.0,AD16,A.C. 19-122,SAND DIRT RUBBISH DEBRIS NOT REMOVED FROM SITE...,25000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 136 SOUTH 4TH STREET BROOKLYN 11211...,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
58607,1,0177039574,2010-08-09,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,UNKNOWN,UNKNOWN,UNKNOWN,F O 1664 EASTERN PARKWAY BT,BROOKLYN,11233,NEW YORK,BROOKLYN,ONE,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,3600.0,1200.0,AD30,A.C. 19-102 II,FAILURE TO COMPLY WITH THE TERMS AND CONDITION...,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN F O 1664 EASTERN PARKWAY BT BROOKL...,BROOKLYN ONE METROTECH CENTER BROOKLYN 11201 N...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
58609,3,0182000583,2013-07-24,POLICE DEPT,UNKNOWN,NATIONAL GRID,BROOKLYN,3031.0,10.0,12,STAGG STREET,BROOKLYN,11206,NEW YORK,BROOKLYN,1,METROTECT CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,280.0,262.0,AD16,A.C. 19-122,SAND DIRT RUBBISH DEBRIS NOT REMOVED FROM SITE...,25000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 3031.0 10.0 12 STAGG STREET BROOKLYN ...,BROOKLYN 1 METROTECT CENTER BROOKLYN 11201 NEW...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
58610,1,0180233662,2012-04-03,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,1200.0,27.0,UNKNOWN,ATLANTIC AVENUE,BROOKLYN,11216,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,1230.0,1200.0,AD30,A.C. 19-102(II),FAILURE TO COMPLY WITH THE TERMS AND CONDITION...,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1200.0 27.0 ATLANTIC AVENUE BROOKLYN...,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
58620,3,0176395450,2012-01-03,POLICE DEPARTMENT,UNKNOWN,NATIONAL GRID,QUEENS,3096.0,7501.0,63-80,WETHEROLE STREET,REGO PARK,11374,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,750.0,788.0,AD3C,34 RCNY 2-11 e 5,FAILURE TO MAINTAIN 5FT PEDESTRIAN WALKWAY ON S W,250.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 3096.0 7501.0 63-80 WETHEROLE STREET RE...,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125261,3,0176395010,2012-01-23,POLICE DEPT,UNKNOWN,NATIONAL GRID,QUEENS,3098.0,16.0,64-64,WETHEROLE STREET,REGO PARK,11374,NEW YORK,QUEENS,89-67,162 STREET,JAMAICA,11432,NEW YORK,UNKNOWN,750.0,770.0,AD10,A.C. 19-121 B 2,DEBRIS CONSTR. MATERIALS OBSTRUCTING GUTTERS S...,25000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 3098.0 16.0 64-64 WETHEROLE STREET REGO...,QUEENS 89-67 162 STREET JAMAICA 11432 NEW YORK,11432,2,2,1.00,0,0.00,0,0,2,100,0,0.0,0,0.00,0,0.0,1,0.50,0,0.00,1,0.50,0,0.00,0,0.0,2,100,0,0.0,2,1.0,0,0.0,0,0,2,100,0,0.00,2,1.00,0,0,2,100,Person
207739,1,0169015120,2009-12-16,POLICE DEPARTMENT,UNKNOWN,NATIONAL GRID,BROOKLYN,2102.0,31.0,UNKNOWN,DE KALB AVENUE,BROOKLYN,11205,NEW YORK,BROOKLYN,1,METRO TECH CENTER,BROOKLYN,11202,NEW YORK,UNKNOWN,3600.0,1292.0,AD30,A.C. 19-102 II,FAILURE TO COMPLY WITH THE TERMS AND CONDITION...,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 2102.0 31.0 DE KALB AVENUE BROOKLYN ...,BROOKLYN 1 METRO TECH CENTER BROOKLYN 11202 NE...,11202,0,0,0.00,0,0.00,0,0,0,0,0,0.0,0,0.00,0,0.0,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0.00,0,0.00,0,0,0,0,Person
207740,1,0177034285,2010-09-29,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,2011.0,12.0,UNKNOWN,CLINTON AVENUE,BROOKLYN,11238,NEW YORK,BROOKLYN,1,METRO TECH CENTER,BROOKLYN,11202,NEW YORK,UNKNOWN,1230.0,1200.0,AD30,A.C. 19-102 II,FAILURE TO COMPLY WITH THE TERMS AND CONDITION...,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 2011.0 12.0 CLINTON AVENUE BROOKLYN ...,BROOKLYN 1 METRO TECH CENTER BROOKLYN 11202 NE...,11202,0,0,0.00,0,0.00,0,0,0,0,0,0.0,0,0.00,0,0.0,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0.00,0,0.00,0,0,0,0,Person
207746,1,0176395130,2012-01-30,POLICE DEPARTMENT,UNKNOWN,NATIONAL GRID,QUEENS,2124.0,20.0,102-18,63 AVENUE,FOREST HILLS,11375,NEW YORK,BROOKLYN,1,MOTER TECH CNT,BROOKLYN,11202,NEW YORK,UNKNOWN,430.0,400.0,AD05,A.C. 19-109 A,FAILURE TO PROVIDE ADEQUATE PROTECTION AT WORK...,400.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 2124.0 20.0 102-18 63 AVENUE FOREST HIL...,BROOKLYN 1 MOTER TECH CNT BROOKLYN 11202 NEW YORK,11202,0,0,0.00,0,0.00,0,0,0,0,0,0.0,0,0.00,0,0.0,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0.00,0,0.00,0,0,0,0,Person


In [95]:
merged_df.drop("JURISDICTION NAME", axis=1, inplace=True)

In [96]:
merged_df.head(1)

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,COUNT PACIFIC ISLANDER,PERCENT PACIFIC ISLANDER,COUNT HISPANIC LATINO,PERCENT HISPANIC LATINO,COUNT AMERICAN INDIAN,PERCENT AMERICAN INDIAN,COUNT ASIAN NON HISPANIC,PERCENT ASIAN NON HISPANIC,COUNT WHITE NON HISPANIC,PERCENT WHITE NON HISPANIC,COUNT BLACK NON HISPANIC,PERCENT BLACK NON HISPANIC,COUNT OTHER ETHNICITY,PERCENT OTHER ETHNICITY,COUNT ETHNICITY UNKNOWN,PERCENT ETHNICITY UNKNOWN,COUNT ETHNICITY TOTAL,PERCENT ETHNICITY TOTAL,COUNT PERMANENT RESIDENT ALIEN,PERCENT PERMANENT RESIDENT ALIEN,COUNT US CITIZEN,PERCENT US CITIZEN,COUNT OTHER CITIZEN STATUS,PERCENT OTHER CITIZEN STATUS,COUNT CITIZEN STATUS UNKNOWN,PERCENT CITIZEN STATUS UNKNOWN,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL,Respondent Status
0,2,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,UNKNOWN,UNKNOWN,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,SAU: MANH,0.0,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMAT...,280.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 ...,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,52,32,0.62,20,0.38,0,0,52,100,0,0.0,2,0.04,0,0.0,5,0.1,39,0.75,2,0.04,3,0.06,1,0.02,52,100,3,0.06,49,0.94,0,0.0,0,0,52,100,5,0.1,47,0.9,0,0,52,100,Person


In [97]:
# pd.options.display.max_colwidth = 1000000
# pd.set_option('display.max_columns', 2000000000)
# pd.set_option('display.max_rows', 1000000000)
# pd.set_option('display.expand_frame_repr', True)

# FSM

just pulled in one feature b/c i need to do more work on the columns. 

In [50]:
dummy_model = DummyClassifier(strategy="most_frequent")
dummy_model.fit(df["Violation Location (City)"], df["Hearing Result"])
y_hat = dummy_model.predict(df)

In [54]:
acc = accuracy_score(df["Hearing Result"],y_hat)
macro_precision_score=precision_score(df["Hearing Result"], y_hat, average='macro')
micro_precision_score=precision_score(df["Hearing Result"] , y_hat, average='micro')
macro_recall_score=recall_score(df["Hearing Result"], y_hat, average='macro')
micro_recall_score=recall_score(df["Hearing Result"], y_hat, average='micro')

print('Accuracy Score: {}'.format(acc))
print('Macro Precision Score: {}'.format(macro_precision_score))
print('Micro Precision Score: {}'.format(micro_precision_score))
print('Macro Recall Score: {}'.format(macro_recall_score))
print('Micro Recall Score: {}'.format(micro_recall_score))

Accuracy Score: 0.4292004895823075
Macro Precision Score: 0.10730012239557687
Micro Precision Score: 0.4292004895823075
Macro Recall Score: 0.25
Micro Recall Score: 0.4292004895823075


  _warn_prf(average, modifier, msg_start, len(result))
