In [1]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier


from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report

In [2]:
df = pd.read_csv("df_train.csv", low_memory = False)

In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),...,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result
0,545274,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,,,1711,...,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMAT...,280.0,,,,,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 ...,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,2
1,25788,112647160,2003-11-23,POLICE DEPARTMENT,JOHN,PEZZULLO,BROOKLYN,989.0,42.0,256,...,,,,,,,,BROOKLYN 989.0 42.0 256 7 AVENUE BROOKLYN 1121...,BROOKLYN 359 9 AVE BROOKLYN 11215 NEW YORK,0


In [4]:
df = df.drop("Unnamed: 0", axis=1)

In [5]:
def fill_na(column):
    
    for x in column:
        df[x].fillna('UNKNOWN', inplace=True)
    
    return df.head(2)

In [6]:
column_names = list(df.columns)

In [7]:
fill_na(column_names)

Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),...,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result
0,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,UNKNOWN,UNKNOWN,1711,EAST 33 STREET,...,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMAT...,280.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 ...,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,2
1,112647160,2003-11-23,POLICE DEPARTMENT,JOHN,PEZZULLO,BROOKLYN,989.0,42.0,256,7 AVENUE,...,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 989.0 42.0 256 7 AVENUE BROOKLYN 1121...,BROOKLYN 359 9 AVE BROOKLYN 11215 NEW YORK,0


In [8]:
df["Charge #2: Infraction Amount"].value_counts().head(3)

UNKNOWN    209908
0.0          2447
1000.0        157
Name: Charge #2: Infraction Amount, dtype: int64

In [9]:
#to match 

df["Respondent Address (Zip Code)"].value_counts(normalize=True)

11207    0.029952
11368    0.025825
10466    0.025820
10457    0.023809
10456    0.023218
           ...   
11703    0.000005
10385    0.000005
10066    0.000005
12182    0.000005
11167    0.000005
Name: Respondent Address (Zip Code), Length: 1108, dtype: float64

### load zip code dataframe to add neighborhood level data to the original data

In [10]:
zip_code = pd.read_csv("Demographic_Statistics_By_Zip_Code.csv", low_memory = False)

In [11]:
zip_code.loc[zip_code["JURISDICTION NAME"] == 12783]

Unnamed: 0,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,...,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL
229,12783,201,66,0.33,135,0.67,0,0,201,100,...,201,100,77,0.38,124,0.62,0,0,201,100


In [12]:
merged_df = pd.merge(df, zip_code, left_on = "Respondent Address (Zip Code)", right_on = "JURISDICTION NAME", how="inner")

In [13]:
merged_df['Ticket Number'].is_unique

True

In [14]:
pd.set_option('display.max_columns', 2000000000)
merged_df.iloc[277360:]

Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,COUNT PACIFIC ISLANDER,PERCENT PACIFIC ISLANDER,COUNT HISPANIC LATINO,PERCENT HISPANIC LATINO,COUNT AMERICAN INDIAN,PERCENT AMERICAN INDIAN,COUNT ASIAN NON HISPANIC,PERCENT ASIAN NON HISPANIC,COUNT WHITE NON HISPANIC,PERCENT WHITE NON HISPANIC,COUNT BLACK NON HISPANIC,PERCENT BLACK NON HISPANIC,COUNT OTHER ETHNICITY,PERCENT OTHER ETHNICITY,COUNT ETHNICITY UNKNOWN,PERCENT ETHNICITY UNKNOWN,COUNT ETHNICITY TOTAL,PERCENT ETHNICITY TOTAL,COUNT PERMANENT RESIDENT ALIEN,PERCENT PERMANENT RESIDENT ALIEN,COUNT US CITIZEN,PERCENT US CITIZEN,COUNT OTHER CITIZEN STATUS,PERCENT OTHER CITIZEN STATUS,COUNT CITIZEN STATUS UNKNOWN,PERCENT CITIZEN STATUS UNKNOWN,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL


In [15]:
#moved the target column to the first for easier visual

first_column = merged_df.pop('Hearing Result')
merged_df.insert(0, 'Hearing Result', first_column)
merged_df.head(2)

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,COUNT PACIFIC ISLANDER,PERCENT PACIFIC ISLANDER,COUNT HISPANIC LATINO,PERCENT HISPANIC LATINO,COUNT AMERICAN INDIAN,PERCENT AMERICAN INDIAN,COUNT ASIAN NON HISPANIC,PERCENT ASIAN NON HISPANIC,COUNT WHITE NON HISPANIC,PERCENT WHITE NON HISPANIC,COUNT BLACK NON HISPANIC,PERCENT BLACK NON HISPANIC,COUNT OTHER ETHNICITY,PERCENT OTHER ETHNICITY,COUNT ETHNICITY UNKNOWN,PERCENT ETHNICITY UNKNOWN,COUNT ETHNICITY TOTAL,PERCENT ETHNICITY TOTAL,COUNT PERMANENT RESIDENT ALIEN,PERCENT PERMANENT RESIDENT ALIEN,COUNT US CITIZEN,PERCENT US CITIZEN,COUNT OTHER CITIZEN STATUS,PERCENT OTHER CITIZEN STATUS,COUNT CITIZEN STATUS UNKNOWN,PERCENT CITIZEN STATUS UNKNOWN,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL
0,2,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,UNKNOWN,UNKNOWN,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,SAU: MANH,0.0,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMAT...,280.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 ...,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,11229,52,32,0.62,20,0.38,0,0,52,100,0,0.0,2,0.04,0,0.0,5,0.1,39,0.75,2,0.04,3,0.06,1,0.02,52,100,3,0.06,49,0.94,0,0.0,0,0,52,100,5,0.1,47,0.9,0,0,52,100
1,0,176434684,2010-09-10,POLICE DEPARTMENT,KONSTANTIN,TSIPNYATOV,MANHATTAN,1016.0,36.0,1515,BROADWAY,NEW YORK,10036,NEW YORK,BROOKLYN,1815,EAST 17 STREET,BROOKLYN,11229,NEW YORK,UNKNOWN,1000.0,0.0,AG21,20-465.1,VENDING AT TIMES PLACES RESTRICTED BY RULE OF ...,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 1016.0 36.0 1515 BROADWAY NEW YORK 1...,BROOKLYN 1815 EAST 17 STREET BROOKLYN 11229 NE...,11229,52,32,0.62,20,0.38,0,0,52,100,0,0.0,2,0.04,0,0.0,5,0.1,39,0.75,2,0.04,3,0.06,1,0.02,52,100,3,0.06,49,0.94,0,0.0,0,0,52,100,5,0.1,47,0.9,0,0,52,100


In [16]:
df["Hearing Result"].value_counts(normalize=True)

0    0.429200
1    0.273505
3    0.150786
2    0.146509
Name: Hearing Result, dtype: float64

### for the respondents, some are individuals and some are commerical entities as indicated on some rows as "LLC." Therefore, need to create a separate column labeling whether the respondent is a person or otherwise

In [17]:
# pd.set_option('display.max_rows', 1000000000)
merged_df["Respondent First Name"].value_counts()

UNKNOWN      70502
MARIA         1438
JOSE          1358
LLC           1336
MOHAMED        974
             ...  
KYARA            1
CAO GUANG        1
ORTEAGA          1
MANZI            1
AUHAMMAD         1
Name: Respondent First Name, Length: 33344, dtype: int64

In [18]:
merged_df.shape

(208022, 83)

In [19]:
key_words_first_name = ["INC", "CORP", "MANAGEMENT","BUS SERVICE AND TOUR", 
"SCIENCES DIVISION",
"HOUSING DEVELOPMENT"]  

In [20]:
key_words_last_name = ["INC", "CORP", "MANAGEMENT","FIRST HOME PROPERTIES",
"COR",
"3 NYC",
"HPENY HOUSING DEVELOPMENT FUND",
"RT HUDSON ELEMENTARY SCHOOL",
"DEVELOPMENT CO",
"HOLDING CO",
"BANANA KELLY HSG DEVE",
"AQUA PROPERTIES",
"THE BROOKLYN UNION GAS CO",
"VANDERBILT MORTGAGE AND FINANC",
"AMERICAN BROKERS CONDUIT",
"CMI BUSINESS FURNITURE",
"FRIENDS LAND DEVELOP",
"HARBOR VIEW PROP LTD",
"INGERSOLL TENANT ASSOC",
"THE BROOKLYN UNION GAS COMPANY ",
"PLAZA CONSTRUCTION",
"AUTO AUCTION"
"FIRST HOME PROP",
"1046 WASHINGTON AVE HDFC",
"DIEGO BEEKMAN MUTUAL HOUSING A",
"REV MANAGEMENT",
"LANDSLIDE PROPERTIES",
"NEIGHBORHOOD RESTORE HOUSING D",
"HTB ENTERPRISES LTD",
"ALLIANCE OF INDIVIDUA",
"WJR PROPERTIES INC",
"WJR PROPERTIES INC",
"KEYSPAN ENERGY DELIVERY NYC",
"RLTY",
"FIRST UNITED MORTGAGE BANKING",
"ASSET PLUSS MANAGEMENT SERVICE",
"KEYSPAN ENERGY DELIVERY N Y C",
"WELLS FARGO HOME MORT",
"ALLIANCE OF INDIVIDUAL",
"NEIGHBORHOOD RESTORE HDFC",
"WILMINGTON SAVINGS FUND SOCIET",
"YOUNG ISRAEL OF AVENUE K",
"FREMONT INVESTMENT LOAN",
"BELL ATLANTIC",
"EM ESS PETROLEUM CORP",
"PI CONSTRUCTION SERVICE INC",
"US BANK NATIONAL ASSOCIATION",
"CONKLIN MGMT CO",
"CON EDISON",
"CONSOLIDATED EDISON",
"EMPIRE CITY SUBWAY",
"DEUTSCHE BANK NATIONAL TRUST C",
"NATIONAL GRID",
"CONTACT HOLDINGS CORP",
"U S BANK NATIONAL ASSOCIATION",
"G G ASSOCIATES",
"WELLS FARGO BANK",
"LUCKY SEAFOOD",
"AGENT OWNER",
"FEDERAL NATIONAL MORTGAGE ASSO",
"AMENCAN HOME MORTGAGE",
"HOMESIDE LENDING",
"HSBC BANK USA",
"HSBC BANK USA NA",
"HIGH STATE RLTY CORP",
"NYC HOUSING AUTHORITY",
"PLAZA CONSTRUCTION CORP",
"EASY STREET PLUMBING INC",
"1249 WEBSTER AVE RLTY",
"DEVELOP", "BANK", "RESOURCES", "SERVICES", "LLC", "SCHOOL", "HOME","NATIONAL GRID","SAM CONEY ISLAND LLC"
                    "ALL PHASE PLUMBING CORP","ERCAT REALTY CORP"]

In [21]:
merged_df['Respondent Last Name'] = merged_df['Respondent Last Name'].astype(str)

In [22]:
# def word_checker(sentence):
#     if any(word in key_words_last_name for word in sentence.lower().split()):
#         return 'Not Person'
#     else:
#         return 'Person'

In [23]:
# merged_df['Respondent Status'] = merged_df['Respondent Last Name'].apply(word_checker)  

In [24]:
def get_word(my_string):
    for word in key_words_last_name:
        for x in merged_df["Respondent Last Name"]:
            if word.lower() in my_string.lower():
                return "Not Person"
            else:
                return "Person"

In [25]:
merged_df["Respondent Status"]= merged_df["Respondent Last Name"].apply(get_word)

In [26]:
merged_df.sample(2)

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,COUNT PACIFIC ISLANDER,PERCENT PACIFIC ISLANDER,COUNT HISPANIC LATINO,PERCENT HISPANIC LATINO,COUNT AMERICAN INDIAN,PERCENT AMERICAN INDIAN,COUNT ASIAN NON HISPANIC,PERCENT ASIAN NON HISPANIC,COUNT WHITE NON HISPANIC,PERCENT WHITE NON HISPANIC,COUNT BLACK NON HISPANIC,PERCENT BLACK NON HISPANIC,COUNT OTHER ETHNICITY,PERCENT OTHER ETHNICITY,COUNT ETHNICITY UNKNOWN,PERCENT ETHNICITY UNKNOWN,COUNT ETHNICITY TOTAL,PERCENT ETHNICITY TOTAL,COUNT PERMANENT RESIDENT ALIEN,PERCENT PERMANENT RESIDENT ALIEN,COUNT US CITIZEN,PERCENT US CITIZEN,COUNT OTHER CITIZEN STATUS,PERCENT OTHER CITIZEN STATUS,COUNT CITIZEN STATUS UNKNOWN,PERCENT CITIZEN STATUS UNKNOWN,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL,Respondent Status
136322,3,0148055060,2005-06-09,SANITATION POLICE,WELZ,MARVIN,BROOKLYN,7160.0,72.0,2333,STILLWELL AVENUE,BROOKLYN,11223,NEW YORK,BROOKLYN,2333,STILLWELL AVENUE,BROOKLYN,11223,NEW YORK,MANHATTAN,100.0,100.0,AS06,16-118 2,DIRTY SIDEWALK,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 7160.0 72.0 2333 STILLWELL AVENUE BRO...,BROOKLYN 2333 STILLWELL AVENUE BROOKLYN 11223 ...,11223,109,53,0.49,56,0.51,0,0,109,100,0,0.0,1,0.01,0,0.0,7,0.06,95,0.87,1,0.01,5,0.05,0,0.0,109,100,6,0.06,102,0.94,1,0.01,0,0,109,100,20,0.18,89,0.82,0,0,109,100,Person
97529,0,040504747X,2011-12-13,SANITATION POLICE,ILAN,ZIV,QUEENS,15818.0,70.0,139,BEACH 26 STREET,FAR ROCKAWAY,11691,NEW YORK,QUEENS,139,BEACH 26 STREET,FAR ROCKAWAY,11691,NEW YORK,UNKNOWN,300.0,0.0,ASP4,A.C. 16-120 A,IMPROPER DISPOSAL BEDDING 1ST OFFENSE,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 15818.0 70.0 139 BEACH 26 STREET FAR RO...,QUEENS 139 BEACH 26 STREET FAR ROCKAWAY 11691 ...,11691,37,12,0.32,25,0.68,0,0,37,100,0,0.0,0,0.0,0,0.0,0,0.0,37,1.0,0,0.0,0,0.0,0,0.0,37,100,0,0.0,37,1.0,0,0.0,0,0,37,100,3,0.08,34,0.92,0,0,37,100,Person


In [27]:
merged_df['Respondent Status'].value_counts()

Person        197595
Not Person     10427
Name: Respondent Status, dtype: int64

In [28]:
merged_df.loc[merged_df['Respondent Last Name'] == "NATIONAL GRID"]

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,COUNT PACIFIC ISLANDER,PERCENT PACIFIC ISLANDER,COUNT HISPANIC LATINO,PERCENT HISPANIC LATINO,COUNT AMERICAN INDIAN,PERCENT AMERICAN INDIAN,COUNT ASIAN NON HISPANIC,PERCENT ASIAN NON HISPANIC,COUNT WHITE NON HISPANIC,PERCENT WHITE NON HISPANIC,COUNT BLACK NON HISPANIC,PERCENT BLACK NON HISPANIC,COUNT OTHER ETHNICITY,PERCENT OTHER ETHNICITY,COUNT ETHNICITY UNKNOWN,PERCENT ETHNICITY UNKNOWN,COUNT ETHNICITY TOTAL,PERCENT ETHNICITY TOTAL,COUNT PERMANENT RESIDENT ALIEN,PERCENT PERMANENT RESIDENT ALIEN,COUNT US CITIZEN,PERCENT US CITIZEN,COUNT OTHER CITIZEN STATUS,PERCENT OTHER CITIZEN STATUS,COUNT CITIZEN STATUS UNKNOWN,PERCENT CITIZEN STATUS UNKNOWN,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL,Respondent Status
58597,3,0182000693,2013-10-12,POLICE DEPT,UNKNOWN,NATIONAL GRID,BROOKLYN,UNKNOWN,UNKNOWN,136,SOUTH 4TH STREET,BROOKLYN,11211,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,250.0,234.0,AD16,A.C. 19-122,SAND DIRT RUBBISH DEBRIS NOT REMOVED FROM SITE...,25000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 136 SOUTH 4TH STREET BROOKLYN 11211...,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
58607,1,0177039574,2010-08-09,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,UNKNOWN,UNKNOWN,UNKNOWN,F O 1664 EASTERN PARKWAY BT,BROOKLYN,11233,NEW YORK,BROOKLYN,ONE,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,3600.0,1200.0,AD30,A.C. 19-102 II,FAILURE TO COMPLY WITH THE TERMS AND CONDITION...,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN F O 1664 EASTERN PARKWAY BT BROOKL...,BROOKLYN ONE METROTECH CENTER BROOKLYN 11201 N...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
58609,3,0182000583,2013-07-24,POLICE DEPT,UNKNOWN,NATIONAL GRID,BROOKLYN,3031.0,10.0,12,STAGG STREET,BROOKLYN,11206,NEW YORK,BROOKLYN,1,METROTECT CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,280.0,262.0,AD16,A.C. 19-122,SAND DIRT RUBBISH DEBRIS NOT REMOVED FROM SITE...,25000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 3031.0 10.0 12 STAGG STREET BROOKLYN ...,BROOKLYN 1 METROTECT CENTER BROOKLYN 11201 NEW...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
58610,1,0180233662,2012-04-03,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,1200.0,27.0,UNKNOWN,ATLANTIC AVENUE,BROOKLYN,11216,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,1230.0,1200.0,AD30,A.C. 19-102(II),FAILURE TO COMPLY WITH THE TERMS AND CONDITION...,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1200.0 27.0 ATLANTIC AVENUE BROOKLYN...,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
58620,3,0176395450,2012-01-03,POLICE DEPARTMENT,UNKNOWN,NATIONAL GRID,QUEENS,3096.0,7501.0,63-80,WETHEROLE STREET,REGO PARK,11374,NEW YORK,BROOKLYN,1,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,750.0,788.0,AD3C,34 RCNY 2-11 e 5,FAILURE TO MAINTAIN 5FT PEDESTRIAN WALKWAY ON S W,250.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 3096.0 7501.0 63-80 WETHEROLE STREET RE...,BROOKLYN 1 METROTECH CENTER BROOKLYN 11201 NEW...,11201,11,6,0.55,5,0.45,0,0,11,100,0,0.0,2,0.18,0,0.0,1,0.09,1,0.09,5,0.45,2,0.18,0,0.0,11,99,0,0.0,11,1.0,0,0.0,0,0,11,100,2,0.18,9,0.82,0,0,11,100,Person
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125261,3,0176395010,2012-01-23,POLICE DEPT,UNKNOWN,NATIONAL GRID,QUEENS,3098.0,16.0,64-64,WETHEROLE STREET,REGO PARK,11374,NEW YORK,QUEENS,89-67,162 STREET,JAMAICA,11432,NEW YORK,UNKNOWN,750.0,770.0,AD10,A.C. 19-121 B 2,DEBRIS CONSTR. MATERIALS OBSTRUCTING GUTTERS S...,25000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 3098.0 16.0 64-64 WETHEROLE STREET REGO...,QUEENS 89-67 162 STREET JAMAICA 11432 NEW YORK,11432,2,2,1.00,0,0.00,0,0,2,100,0,0.0,0,0.00,0,0.0,1,0.50,0,0.00,1,0.50,0,0.00,0,0.0,2,100,0,0.0,2,1.0,0,0.0,0,0,2,100,0,0.00,2,1.00,0,0,2,100,Person
207739,1,0169015120,2009-12-16,POLICE DEPARTMENT,UNKNOWN,NATIONAL GRID,BROOKLYN,2102.0,31.0,UNKNOWN,DE KALB AVENUE,BROOKLYN,11205,NEW YORK,BROOKLYN,1,METRO TECH CENTER,BROOKLYN,11202,NEW YORK,UNKNOWN,3600.0,1292.0,AD30,A.C. 19-102 II,FAILURE TO COMPLY WITH THE TERMS AND CONDITION...,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 2102.0 31.0 DE KALB AVENUE BROOKLYN ...,BROOKLYN 1 METRO TECH CENTER BROOKLYN 11202 NE...,11202,0,0,0.00,0,0.00,0,0,0,0,0,0.0,0,0.00,0,0.0,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0.00,0,0.00,0,0,0,0,Person
207740,1,0177034285,2010-09-29,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,2011.0,12.0,UNKNOWN,CLINTON AVENUE,BROOKLYN,11238,NEW YORK,BROOKLYN,1,METRO TECH CENTER,BROOKLYN,11202,NEW YORK,UNKNOWN,1230.0,1200.0,AD30,A.C. 19-102 II,FAILURE TO COMPLY WITH THE TERMS AND CONDITION...,1200.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 2011.0 12.0 CLINTON AVENUE BROOKLYN ...,BROOKLYN 1 METRO TECH CENTER BROOKLYN 11202 NE...,11202,0,0,0.00,0,0.00,0,0,0,0,0,0.0,0,0.00,0,0.0,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0.00,0,0.00,0,0,0,0,Person
207746,1,0176395130,2012-01-30,POLICE DEPARTMENT,UNKNOWN,NATIONAL GRID,QUEENS,2124.0,20.0,102-18,63 AVENUE,FOREST HILLS,11375,NEW YORK,BROOKLYN,1,MOTER TECH CNT,BROOKLYN,11202,NEW YORK,UNKNOWN,430.0,400.0,AD05,A.C. 19-109 A,FAILURE TO PROVIDE ADEQUATE PROTECTION AT WORK...,400.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,QUEENS 2124.0 20.0 102-18 63 AVENUE FOREST HIL...,BROOKLYN 1 MOTER TECH CNT BROOKLYN 11202 NEW YORK,11202,0,0,0.00,0,0.00,0,0,0,0,0,0.0,0,0.00,0,0.0,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0.00,0,0.00,0,0,0,0,Person


In [29]:
merged_df.drop("JURISDICTION NAME", axis=1, inplace=True)

In [30]:
merged_df.head(1)

Unnamed: 0,Hearing Result,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,COUNT PACIFIC ISLANDER,PERCENT PACIFIC ISLANDER,COUNT HISPANIC LATINO,PERCENT HISPANIC LATINO,COUNT AMERICAN INDIAN,PERCENT AMERICAN INDIAN,COUNT ASIAN NON HISPANIC,PERCENT ASIAN NON HISPANIC,COUNT WHITE NON HISPANIC,PERCENT WHITE NON HISPANIC,COUNT BLACK NON HISPANIC,PERCENT BLACK NON HISPANIC,COUNT OTHER ETHNICITY,PERCENT OTHER ETHNICITY,COUNT ETHNICITY UNKNOWN,PERCENT ETHNICITY UNKNOWN,COUNT ETHNICITY TOTAL,PERCENT ETHNICITY TOTAL,COUNT PERMANENT RESIDENT ALIEN,PERCENT PERMANENT RESIDENT ALIEN,COUNT US CITIZEN,PERCENT US CITIZEN,COUNT OTHER CITIZEN STATUS,PERCENT OTHER CITIZEN STATUS,COUNT CITIZEN STATUS UNKNOWN,PERCENT CITIZEN STATUS UNKNOWN,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL,Respondent Status
0,2,162420262,2012-08-30,POLICE DEPARTMENT,RONALD,BARONE,BROOKLYN,UNKNOWN,UNKNOWN,1711,EAST 33 STREET,BROOKLYN,11234,NEW YORK,BROOKLYN,21,SEBA AVENUE,BROOKLYN,11229,NEW YORK,SAU: MANH,0.0,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,AN68,A.C. 24-238 A,IMPROPER AUDIBLE BURGLAR ALARM WITH NO AUTOMAT...,280.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 1711 EAST 33 STREET BROOKLYN 11234 ...,BROOKLYN 21 SEBA AVENUE BROOKLYN 11229 NEW YORK,52,32,0.62,20,0.38,0,0,52,100,0,0.0,2,0.04,0,0.0,5,0.1,39,0.75,2,0.04,3,0.06,1,0.02,52,100,3,0.06,49,0.94,0,0.0,0,0,52,100,5,0.1,47,0.9,0,0,52,100,Person


In [51]:
# pd.options.display.max_colwidth = 1000000
# pd.set_option('display.max_columns', 2000000000)
# pd.set_option('display.max_rows', 1000000000)
# pd.set_option('display.expand_frame_repr', True)

# FSM

just pulled in one feature b/c i need to do more work on the columns. 

In [50]:
dummy_model = DummyClassifier(strategy="most_frequent")
dummy_model.fit(df["Violation Location (City)"], df["Hearing Result"])
y_hat = dummy_model.predict(df)

In [54]:
acc = accuracy_score(df["Hearing Result"],y_hat)
macro_precision_score=precision_score(df["Hearing Result"], y_hat, average='macro')
micro_precision_score=precision_score(df["Hearing Result"] , y_hat, average='micro')
macro_recall_score=recall_score(df["Hearing Result"], y_hat, average='macro')
micro_recall_score=recall_score(df["Hearing Result"], y_hat, average='micro')

print('Accuracy Score: {}'.format(acc))
print('Macro Precision Score: {}'.format(macro_precision_score))
print('Micro Precision Score: {}'.format(micro_precision_score))
print('Macro Recall Score: {}'.format(macro_recall_score))
print('Micro Recall Score: {}'.format(micro_recall_score))

Accuracy Score: 0.4292004895823075
Macro Precision Score: 0.10730012239557687
Micro Precision Score: 0.4292004895823075
Macro Recall Score: 0.25
Micro Recall Score: 0.4292004895823075


  _warn_prf(average, modifier, msg_start, len(result))


## Create a  ZIP Code Tabulation Areas (ZCTAs) column based on the zip code column so that ZCTA can be used to pull in neighborhood level data such as income and education level

1. add a new column to the datafram with ZIP Code Tabulation Areas (ZCTAs)
2. bring in income data matching on ZCTAs
3. to download ZCTAS, will need to do web scrapping

In [31]:
nyc_zip_codes = ["10001",
"10002",
"10003",
"10004",
"10005",
"10006",
"10007",
"10009",
"10010",
"10011",
"10012",
"10013",
"10014",
"10015",
"10016",
"10017",
"10018",
"10019",
"10020",
"10021",
"10022",
"10023",
"10024",
"10025",
"10026",
"10027",
"10028",
"10029",
"10030",
"10031",
"10032",
"10033",
"10034",
"10035",
"10036",
"10037",
"10038",
"10039",
"10040",
"10041",
"10044",
"10045",
"10048",
"10055",
"10060",
"10069",
"10090",
"10095",
"10098",
"10099",
"10103",
"10104",
"10105",
"10106",
"10107",
"10110",
"10111",
"10112",
"10115",
"10118",
"10119",
"10120",
"10121",
"10122",
"10123",
"10128",
"10151",
"10152",
"10153",
"10154",
"10155",
"10158",
"10161",
"10162",
"10165",
"10166",
"10167",
"10168",
"10169",
"10170",
"10171",
"10172",
"10173",
"10174",
"10175",
"10176",
"10177",
"10178",
"10199",
"10270",
"10271",
"10278",
"10279",
"10280",
"10281",
"10282",
"10301",
"10302",
"10303",
"10304",
"10305",
"10306",
"10307",
"10308",
"10309",
"10310",
"10311",
"10312",
"10314",
"10451",
"10452",
"10453",
"10454",
"10455",
"10456",
"10457",
"10458",
"10459",
"10460",
"10461",
"10462",
"10463",
"10464",
"10465",
"10466",
"10467",
"10468",
"10469",
"10470",
"10471",
"10472",
"10473",
"10474",
"10475",
"11004",
"11101",
"11102",
"11103",
"11104",
"11105",
"11106",
"11109",
"11201",
"11203",
"11204",
"11205",
"11206",
"11207",
"11208",
"11209",
"11210",
"11211",
"11212",
"11213",
"11214",
"11215",
"11216",
"11217",
"11218",
"11219",
"11220",
"11221",
"11222",
"11223",
"11224",
"11225",
"11226",
"11228",
"11229",
"11230",
"11231",
"11232",
"11233",
"11234",
"11235",
"11236",
"11237",
"11238",
"11239",
"11241",
"11242",
"11243",
"11249",
"11252",
"11256",
"11351",
"11354",
"11355",
"11356",
"11357",
"11358",
"11359",
"11360",
"11361",
"11362",
"11363",
"11364",
"11365",
"11366",
"11367",
"11368",
"11369",
"11370",
"11371",
"11372",
"11373",
"11374",
"11375",
"11377",
"11378",
"11379",
"11385",
"11411",
"11412",
"11413",
"11414",
"11415",
"11416",
"11417",
"11418",
"11419",
"11420",
"11421",
"11422",
"11423",
"11426",
"11427",
"11428",
"11429",
"11430",
"11432",
"11433",
"11434",
"11435",
"11436",
"11691",
"11692",
"11693",
"11694",
"11697"]


In [32]:
# this dataset contains USPS zip_code and ZCTA zip code for several states. 
#It indicates whether some USPS and ZCTA willl match up and if it doesn't, the equivalence of that
ZiptoZcta_Crosswalk_2021 = pd.read_excel("ZiptoZcta_Crosswalk_2021.xlsx")
ZiptoZcta_Crosswalk_2021["ZIP_CODE"] = ZiptoZcta_Crosswalk_2021["ZIP_CODE"].astype(str)
df['Respondent Address (Zip Code)'] = df['Respondent Address (Zip Code)'].astype(str)

# narrown down the df to only pull out zip codes that matches the nyc zipcode list above
ZiptoZcta_Crosswalk_2021["ZIP_CODE_NYC"] = np.where(ZiptoZcta_Crosswalk_2021["ZIP_CODE"].isin(nyc_zip_codes), "NYC", "Other")

In [33]:
ZiptoZcta_Crosswalk_2021["ZIP_CODE_NYC"].value_counts()

Other    40873
NYC        234
Name: ZIP_CODE_NYC, dtype: int64

In [34]:
ZiptoZcta_Crosswalk_2021_NYC = ZiptoZcta_Crosswalk_2021.loc[ZiptoZcta_Crosswalk_2021["ZIP_CODE_NYC"] == "NYC"]
ZiptoZcta_Crosswalk_2021_NYC

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type,ZIP_CODE_NYC
0,10001,New York,NY,Zip Code Area,10001,Zip matches ZCTA,NYC
1,10002,New York,NY,Zip Code Area,10002,Zip matches ZCTA,NYC
3190,10003,New York,NY,Zip Code Area,10003,Zip matches ZCTA,NYC
3191,10004,New York,NY,Zip Code Area,10004,Zip matches ZCTA,NYC
3192,10005,New York,NY,Zip Code Area,10005,Zip matches ZCTA,NYC
...,...,...,...,...,...,...,...
5087,11242,Brooklyn,NY,Post Office or large volume customer,11201,Spatial join to ZCTA,NYC
5088,11243,Brooklyn,NY,Post Office or large volume customer,11217,Spatial join to ZCTA,NYC
5091,11249,Brooklyn,NY,Zip Code Area,11211,Spatial join to ZCTA,NYC
5093,11252,Brooklyn,NY,Zip Code Area,11209,Spatial join to ZCTA,NYC


In [35]:
ZiptoZcta_Crosswalk_2021_NYC.to_csv("NYC_Only_ZiptoZcta_Crosswalk_2021.csv")

In [36]:
ZiptoZcta_Crosswalk_2021_NYC

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type,ZIP_CODE_NYC
0,10001,New York,NY,Zip Code Area,10001,Zip matches ZCTA,NYC
1,10002,New York,NY,Zip Code Area,10002,Zip matches ZCTA,NYC
3190,10003,New York,NY,Zip Code Area,10003,Zip matches ZCTA,NYC
3191,10004,New York,NY,Zip Code Area,10004,Zip matches ZCTA,NYC
3192,10005,New York,NY,Zip Code Area,10005,Zip matches ZCTA,NYC
...,...,...,...,...,...,...,...
5087,11242,Brooklyn,NY,Post Office or large volume customer,11201,Spatial join to ZCTA,NYC
5088,11243,Brooklyn,NY,Post Office or large volume customer,11217,Spatial join to ZCTA,NYC
5091,11249,Brooklyn,NY,Zip Code Area,11211,Spatial join to ZCTA,NYC
5093,11252,Brooklyn,NY,Zip Code Area,11209,Spatial join to ZCTA,NYC


In [37]:
ZiptoZcta_Crosswalk_2021_NYC_dict = dict(zip(ZiptoZcta_Crosswalk_2021_NYC.ZIP_CODE, ZiptoZcta_Crosswalk_2021_NYC.ZCTA))

In [38]:
ZiptoZcta_Crosswalk_2021_NYC_dict

{'10001': '10001',
 '10002': '10002',
 '10003': '10003',
 '10004': '10004',
 '10005': '10005',
 '10006': '10006',
 '10007': '10007',
 '10009': '10009',
 '10010': '10010',
 '10011': '10011',
 '10012': '10012',
 '10013': '10013',
 '10014': '10014',
 '10016': '10016',
 '10017': '10017',
 '10018': '10018',
 '10019': '10019',
 '10020': '10020',
 '10021': '10021',
 '10022': '10022',
 '10023': '10023',
 '10024': '10024',
 '10025': '10025',
 '10026': '10026',
 '10027': '10027',
 '10028': '10028',
 '10029': '10029',
 '10030': '10030',
 '10031': '10031',
 '10032': '10032',
 '10033': '10033',
 '10034': '10034',
 '10035': '10035',
 '10036': '10036',
 '10037': '10037',
 '10038': '10038',
 '10039': '10039',
 '10040': '10040',
 '10044': '10044',
 '10069': '10069',
 '10103': '10103',
 '10110': '10110',
 '10111': '10111',
 '10112': '10112',
 '10115': '10115',
 '10119': '10119',
 '10128': '10128',
 '10152': '10152',
 '10153': '10153',
 '10154': '10154',
 '10162': '10162',
 '10165': '10165',
 '10167': '1

In [39]:
df['Respondent ZCTA'] = df['Respondent Address (Zip Code)'].map(ZiptoZcta_Crosswalk_2021_NYC_dict)

In [40]:
df.sample(50)

Unnamed: 0,Ticket Number,Violation Date,Issuing Agency,Respondent First Name,Respondent Last Name,Violation Location (Borough),Violation Location (Block No.),Violation Location (Lot No.),Violation Location (House #),Violation Location (Street Name),Violation Location (City),Violation Location (Zip Code),Violation Location (State Name),Respondent Address (Borough),Respondent Address (House #),Respondent Address (Street Name),Respondent Address (City),Respondent Address (Zip Code),Respondent Address (State Name),Decision Location (Borough),Penalty Imposed,Paid Amount,Charge #1: Code,Charge #1: Code Section,Charge #1: Code Description,Charge #1: Infraction Amount,Charge #2: Code,Charge #2: Code Section,Charge #2: Code Description,Charge #2: Infraction Amount,Charge #3: Code,Charge #3: Code Section,Charge #3: Code Description,Charge #3: Infraction Amount,complete violation location,complete respondent location,Hearing Result,Respondent ZCTA
174751,0155299531,2006-03-14,POLICE DEPT,MOHAMMED B,AKHTER,MANHATTAN,0.0,0.0,UNKNOWN,48 MADISON,NEW YORK,0,NEW YORK,QUEENS,28-45,35 STREET,ASTORIA,11103,NEW YORK,MANHATTAN,250.0,295.0,AF18,17-315 E,"IN BUS STOP, OR 10 FT. OF DRIVE, SUBWAY, CROSS...",10000.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 0.0 0.0 48 MADISON NEW YORK 00000 N...,QUEENS 28-45 35 STREET ASTORIA 11103 NEW YORK,3,11103
184764,0169619836,2010-09-21,POLICE DEPARTMENT,JAMES,STEWART,MANHATTAN,210.0,5.0,336,CANAL STREET,NEW YORK,10013,NEW YORK,MANHATTAN,95,LENOX AVENUE,NEW YORK,10026,NEW YORK,UNKNOWN,250.0,0.0,AG65,6 RCNY 2-307 B,FAILURE TO DISPLAY PRICE,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 210.0 5.0 336 CANAL STREET NEW YORK ...,MANHATTAN 95 LENOX AVENUE NEW YORK 10026 NEW YORK,0,10026
162748,042648665H,2018-11-21,SANITATION POLICE,UNKNOWN,RSRSOK LLC,BRONX,3533.0,1.0,567,OLMSTEAD AVENUE,BRONX,10473,NEW YORK,BRONX,567,OLMSTEAD AVENUE,BRONX,10473,NEW YORK,UNKNOWN,300.0,0.0,AS06,A.C. 16-118 2 A,DIRTY SIDEWALK DIRTY AREA,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BRONX 3533.0 1.0 567 OLMSTEAD AVENUE BRONX 104...,BRONX 567 OLMSTEAD AVENUE BRONX 10473 NEW YORK,1,10473
207212,0112791223,2000-05-09,POLICE DEPARTMENT,JESUS,TORRES,MANHATTAN,786.0,1.0,520,8 AVENUE,MANHATTAN,10018,NEW YORK,QUEENS,104-39,44 AVENUE,QUEENS,11368,NEW YORK,UNKNOWN,1000.0,UNKNOWN,AF02,17-307(B),"LACK OF PERMIT FOR VEHICLE, PUSHCART OR STAND",500.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 786.0 1.0 520 8 AVENUE MANHATTAN 100...,QUEENS 104-39 44 AVENUE QUEENS 11368 NEW YORK,0,11368
120843,0202979691,2018-11-07,POLICE DEPARTMENT,LOUIS,PADILLA,MANHATTAN,1792.0,5.0,2495,2 AVENUE,NEW YORK,10035,NEW YORK,MANHATTAN,600,EAST 125 STREET,NEW YORK,10035,NEW YORK,UNKNOWN,112.0,0.0,AS9I,16-118 6,PUBLIC URINATION 1ST OFFENSE,75.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 1792.0 5.0 2495 2 AVENUE NEW YORK 10...,MANHATTAN 600 EAST 125 STREET NEW YORK 10035 N...,1,10035
47536,0112678969,2000-03-28,POLICE DEPARTMENT,ANNA,GUILERMO,MANHATTAN,2118.0,1.0,3820,BROADWAY,MANHATTAN,10032,NEW YORK,MANHATTAN,545,WEST 164 STREET,MANHATTAN,10032,NEW YORK,UNKNOWN,1000.0,UNKNOWN,AF02,17-307(B),"LACK OF PERMIT FOR VEHICLE, PUSHCART OR STAND",500.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,MANHATTAN 2118.0 1.0 3820 BROADWAY MANHATTAN 1...,MANHATTAN 545 WEST 164 STREET MANHATTAN 10032 ...,0,10032
192260,040484956R,2011-05-11,SANITATION POLICE,UNKNOWN,B D DEV INC,BROOKLYN,4078.0,116.0,573,JEROME STREET,BROOKLYN,11207,NEW YORK,BROOKLYN,573,JEROME STREET,BROOKLYN,11207,NEW YORK,UNKNOWN,300.0,0.0,AS97,16-118 2,DIRTY SIDEWALK- 2ND OFFENSE,250.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 4078.0 116.0 573 JEROME STREET BROOKL...,BROOKLYN 573 JEROME STREET BROOKLYN 11207 NEW ...,0,11207
21059,040429267J,2016-05-23,SANITATION POLICE,UNKNOWN,BOWEN GILLIAN,BROOKLYN,4504.0,64.0,817,LOGAN STREET,BROOKLYN,11208,NEW YORK,BROOKLYN,817,LOGAN STREET,BROOKLYN,11208,NEW YORK,UNKNOWN,300.0,0.0,AS97,16-118 2,"DIRTY SIDEWALK,FAIL TO CLEAN 18 INTO STREET,SI...",250.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 4504.0 64.0 817 LOGAN STREET BROOKLYN...,BROOKLYN 817 LOGAN STREET BROOKLYN 11208 NEW YORK,1,11208
23857,0179838102,2011-08-22,NYPD TRANSPORT INTELLIGENCE DI,UNKNOWN,NATIONAL GRID,BROOKLYN,UNKNOWN,UNKNOWN,UNKNOWN,F O 239 ST MARKS AVE BETWEEN,BROOKLYN,11238,NEW YORK,BROOKLYN,ONE,METROTECH CENTER,BROOKLYN,11201,NEW YORK,UNKNOWN,1830.0,1800.0,AD03,19-107.,STREET CLOSING WITHOUT PERMIT,1800.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN F O 239 ST MARKS AVE BETWEEN BROOK...,BROOKLYN ONE METROTECH CENTER BROOKLYN 11201 N...,1,11201
19341,0154760531,2006-10-15,SANITATION POLICE,MICHAEL,TEPLER,BROOKLYN,0.0,0.0,415,PARK AVENUE,BROOKLYN,11205,NEW YORK,BROOKLYN,415,PARK AVENUE,BROOKLYN,11205,NEW YORK,SAU: MANH,UNKNOWN,UNKNOWN,AS16,A.C. 16-120 A,MAINTAINING RECEPTACLES,100.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,BROOKLYN 0.0 0.0 415 PARK AVENUE BROOKLYN 1120...,BROOKLYN 415 PARK AVENUE BROOKLYN 11205 NEW YORK,2,11205


In [41]:
df["Respondent ZCTA"].isnull().value_counts()

False    208967
True       4276
Name: Respondent ZCTA, dtype: int64

In [42]:
df['Respondent ZCTA'] = df["Respondent ZCTA"].astype(str)

In [43]:
len(df['Respondent ZCTA'].unique())

189

In [44]:
Respondent_ZCTA_list = list(df["Respondent ZCTA"])

### Web Scraping

In [396]:
pip install selenium

Collecting selenium
  Downloading selenium-4.0.0-py3-none-any.whl (954 kB)
[K     |████████████████████████████████| 954 kB 1.6 MB/s eta 0:00:01
[?25hCollecting trio~=0.17
  Downloading trio-0.19.0-py3-none-any.whl (356 kB)
[K     |████████████████████████████████| 356 kB 16.2 MB/s eta 0:00:01
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting outcome
  Downloading outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.0.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.12.0-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 12.2 MB/s eta 0:00:01
[?25hInstalling collected packages: outcome, h11, wsproto, trio, trio-websocket, selenium
Successfully installed h11-0.12.0 outcome-1.1.0 selenium-4.0.0 trio-0.19.0 trio-websocket-0.9.2 wsproto-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [398]:
pip install chromedriver-binary

Collecting chromedriver-binary
  Downloading chromedriver-binary-95.0.4638.17.0.tar.gz (4.8 kB)
Building wheels for collected packages: chromedriver-binary
  Building wheel for chromedriver-binary (setup.py) ... [?25ldone
[?25h  Created wheel for chromedriver-binary: filename=chromedriver_binary-95.0.4638.17.0-py3-none-any.whl size=8225099 sha256=a9d650e55ac44fd8b0b273613256871a18fca1053da1ead808a1844dffa6911e
  Stored in directory: /Users/allisongao/Library/Caches/pip/wheels/40/38/62/cf18d5e0fda72737dbf7a404c5116fdf974c768072e542e244
Successfully built chromedriver-binary
Installing collected packages: chromedriver-binary
Successfully installed chromedriver-binary-95.0.4638.17.0
Note: you may need to restart the kernel to use updated packages.


In [61]:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_binary
#import get to call a get request on the site
from requests import get
import re

browser = webdriver.Chrome("/Users/allisongao/Downloads/chromedriver")

  browser = webdriver.Chrome("/Users/allisongao/Downloads/chromedriver")


SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 95
Current browser version is 94.0.4606.81 with binary path /Applications/Google Chrome.app/Contents/MacOS/Google Chrome
Stacktrace:
0   chromedriver                        0x000000010d14bbc9 __gxx_personality_v0 + 573977
1   chromedriver                        0x000000010d0d89a3 __gxx_personality_v0 + 102387
2   chromedriver                        0x000000010cc9a488 chromedriver + 173192
3   chromedriver                        0x000000010ccc0cf0 chromedriver + 330992
4   chromedriver                        0x000000010ccbc7d1 chromedriver + 313297
5   chromedriver                        0x000000010ccb92c9 chromedriver + 299721
6   chromedriver                        0x000000010ccf2ccb chromedriver + 535755
7   chromedriver                        0x000000010cceca73 chromedriver + 510579
8   chromedriver                        0x000000010ccc30e0 chromedriver + 340192
9   chromedriver                        0x000000010ccc4345 chromedriver + 344901
10  chromedriver                        0x000000010d108d5f __gxx_personality_v0 + 299951
11  chromedriver                        0x000000010d11f8db __gxx_personality_v0 + 393003
12  chromedriver                        0x000000010d12585f __gxx_personality_v0 + 417455
13  chromedriver                        0x000000010d12100a __gxx_personality_v0 + 398938
14  chromedriver                        0x000000010d0fd95c __gxx_personality_v0 + 253868
15  chromedriver                        0x000000010d13c198 __gxx_personality_v0 + 509928
16  chromedriver                        0x000000010d13c321 __gxx_personality_v0 + 510321
17  chromedriver                        0x000000010d153108 __gxx_personality_v0 + 603992
18  libsystem_pthread.dylib             0x00007fff2039f8fc _pthread_start + 224
19  libsystem_pthread.dylib             0x00007fff2039b443 thread_start + 15


In [None]:
ZCTA_url =[]

for x in Respondent_ZCTA_list:
    ZCTA_url.append("https://data.census.gov/cedsci/table?q=mean%20income&g=8600000US" + x + "&tid=ACSST5Y2019.S1901&hidePreview=true")
    
    
ZCTA_url

In [51]:
from selenium import webdriver
import chromedriver_binary

In [54]:
# import time
# importing webdriver from selenium
from selenium import webdriver
 
# Here Chrome  will be used
driver = webdriver.Chrome("/Users/allisongao/Downloads")
 
# URL of website
# urls = "https://www.geeksforgeeks.org/"
for url in ZCTA_url:
    driver.get(url)



    

# getting the button by class name
button = driver.find_element_by_class_name("aqua-button.mt-5")
 
# clicking on the button
button.click()

  driver = webdriver.Chrome("/Users/allisongao/Downloads")


WebDriverException: Message: 'Downloads' executable may have wrong permissions. Please see https://chromedriver.chromium.org/home
