In [3]:
import pandas as pd
from warnings import simplefilter
import psycopg2
import psycopg2.extras
import requests
from io import StringIO

In [6]:
# function to download and parse site data
def download_and_parse_data(url, timeout=60):
    """Download a CSV file over HTTP and parse it into a pandas DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV file to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a server which stops
        responding cannot hang the notebook indefinitely. Defaults to
        60 seconds.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents; the first row is used as the header.

    Raises
    ------
    Exception
        If the server answers with any status code other than 200.
        Exceptions raised by ``requests.get`` itself (e.g. on timeout)
        propagate unchanged.
    """
    response = requests.get(url, timeout=timeout)

    # Ensure the request was successful
    if response.status_code != 200:
        raise Exception(f"Failed to download data: {response.status_code}")

    # Parse the CSV text from the response body into a pandas DataFrame
    return pd.read_csv(StringIO(response.text), header=0)

# URL of the IRS file 21zpallagi.csv (served from the irs-soi publication path).
# NOTE(review): presumably the tax-year-2021 ZIP-code-level AGI data — confirm.
url = 'https://www.irs.gov/pub/irs-soi/21zpallagi.csv'

# Download the file and parse it into a DataFrame (one HTTP request per run)
irs_data = download_and_parse_data(url)

   STATEFIPS STATE  zipcode  agi_stub        N1     mars1     MARS2     MARS4  \
0          1    AL        0         1  720280.0  466930.0   75840.0  165330.0   
1          1    AL        0         2  524160.0  264900.0  109720.0  134980.0   
2          1    AL        0         3  291860.0  120090.0  116380.0   46410.0   
3          1    AL        0         4  184320.0   48650.0  115740.0   16120.0   
4          1    AL        0         5  287720.0   38750.0  234070.0    9640.0   

        ELF    CPREP  ...  N85300  A85300    N11901    A11901    N11900  \
0  677820.0  21700.0  ...     0.0     0.0   42310.0   35969.0  632780.0   
1  495550.0  14170.0  ...     0.0     0.0   87370.0  115842.0  435160.0   
2  275270.0   8580.0  ...     0.0     0.0   79750.0  165576.0  212390.0   
3  173910.0   4010.0  ...     0.0     0.0   60450.0  172959.0  122910.0   
4  272240.0   7750.0  ...   110.0    34.0  128440.0  638854.0  159690.0   

      A11900    N11902     A11902  N12000   A12000  
0  218299

In [13]:
# Keep only the rows whose STATE column equals 'NY', then renumber the
# index from zero so it no longer carries positions from the full frame.
ny_df = (
    irs_data
    .loc[irs_data['STATE'] == 'NY']
    .reset_index(drop=True)
)

In [9]:
# Fetch the NYC zip-code lookup CSV into a second DataFrame.
# NOTE: this rebinds the module-level `url`, which earlier held the IRS file URL.
url = 'https://raw.githubusercontent.com/erikgregorywebb/nyc-housing/master/Data/nyc-zip-codes.csv'
zips = download_and_parse_data(url)


  Borough            Neighborhood  ZipCode
0   Bronx           Central Bronx    10453
1   Bronx           Central Bronx    10457
2   Bronx           Central Bronx    10460
3   Bronx  Bronx Park and Fordham    10458
4   Bronx  Bronx Park and Fordham    10467


In [11]:
# Align the lookup table's key column name ('ZipCode') with the tax
# data's 'zipcode' so both frames can be joined on one shared label.
zips = zips.rename({'ZipCode': 'zipcode'}, axis='columns')


  Borough            Neighborhood  zipcode
0   Bronx           Central Bronx    10453
1   Bronx           Central Bronx    10457
2   Bronx           Central Bronx    10460
3   Bronx  Bronx Park and Fordham    10458
4   Bronx  Bronx Park and Fordham    10467


In [14]:
# Inner-join the New York tax rows with the zip-code lookup table on
# 'zipcode'; rows whose zipcode is absent from either frame are dropped.
nyc_irs = ny_df.merge(zips, how='inner', on='zipcode')

In [33]:
# Persist the merged frame as CSV (row index omitted); this file is the
# input for the later step that adds the data to the database.
nyc_irs.to_csv(path_or_buf='./nyc_irs.csv', index=False)