In [1]:
import pandas as pd
import sqlite3

# NYC Neighborhoods (ETL): Extract

In [2]:
# Load the NYC neighborhoods list attributed to the NYS Department of Health
# Source: https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm
df = pd.read_csv(filepath_or_buffer='../Data/neighborhoods.csv')
df

Unnamed: 0,Borough,Neighborhood,ZIP Codes
0,Bronx,Central Bronx,"10453, 10457, 10460"
1,Bronx,Bronx Park and Fordham,"10458, 10467, 10468"
2,Bronx,High Bridge and Morrisania,"10451, 10452, 10456"
3,Bronx,Hunts Point and Mott Haven,"10454, 10455, 10459, 10474"
4,Bronx,Kingsbridge and Riverdale,"10463, 10471"
5,Bronx,Northeast Bronx,"10466, 10469, 10470, 10475"
6,Bronx,Southeast Bronx,"10461, 10462,10464, 10465, 10472, 10473"
7,Brooklyn,Central Brooklyn,"11212, 11213, 11216, 11233, 11238"
8,Brooklyn,Southwest Brooklyn,"11209, 11214, 11228"
9,Brooklyn,Borough Park,"11204, 11218, 11219, 11230"


# NYC Neighborhoods (ETL): Transform

In [3]:
# Reorganize the dataframe so that every ZIP code gets its own record
nyc_zip_codes = []
nyc_neighborhoods = []
nyc_boroughs = []

# Walk the three columns in parallel instead of looking rows up with
# df[col][row]: that form is a label lookup and only happens to work while
# the frame still carries the default 0..n-1 index.
for borough, neighborhood, zip_field in zip(df['Borough'],
                                            df['Neighborhood'],
                                            df['ZIP Codes']):
    for code in zip_field.split(','):
        code = code.strip()
        if not code:
            # a stray or trailing comma yields an empty token; skip it
            continue
        nyc_zip_codes.append(int(code))
        nyc_neighborhoods.append(neighborhood)
        nyc_boroughs.append(borough)

# Build the one-row-per-ZIP dataframe (it keeps the default integer index)
nyc = pd.DataFrame({'ZIP Code': nyc_zip_codes,
                    'Neighborhood': nyc_neighborhoods,
                    'Borough': nyc_boroughs})

# ZipCode is the PRIMARY KEY of the neighborhoods table created in the load
# step, so a repeated ZIP would fail at insert time; surface it here instead.
assert nyc['ZIP Code'].is_unique, 'duplicate ZIP codes in neighborhoods data'

In [4]:
# Preview the first five rows of the reshaped dataframe
nyc.head(n=5)

Unnamed: 0,ZIP Code,Neighborhood,Borough
0,10453,Central Bronx,Bronx
1,10457,Central Bronx,Bronx
2,10460,Central Bronx,Bronx
3,10458,Bronx Park and Fordham,Bronx
4,10467,Bronx Park and Fordham,Bronx


# NYC Neighborhoods (ETL): Load

In [5]:
# Connect to the project's SQLite database and get a cursor for statements
conn = sqlite3.connect(database='../Data/pet_care_industry.db')
c = conn.cursor()

#function to create table
def create_table(query):
    """Run ``query`` on the shared module-level cursor ``c``.

    query -- SQL text handed unchanged to ``c.execute``. The value returned
             by ``execute`` is discarded, so this function returns None.
    """
    cursor = c
    cursor.execute(query)

#function to close connection
def close_c_conn():
    """Close the shared cursor ``c`` and then the connection ``conn``.

    The connection is closed in a ``finally`` clause so it is released even
    when closing the cursor raises; that exception still propagates to the
    caller.
    """
    try:
        c.close()
    finally:
        conn.close()

In [6]:
# Rebuild the neighborhoods table from scratch: drop any existing copy first,
# then create the empty table, so the cell can be run again.
c.execute('DROP TABLE IF EXISTS neighborhoods')

create_query = """CREATE TABLE neighborhoods
                (ZipCode INTEGER PRIMARY KEY,
                 Neighborhood TEXT,
                 Borough TEXT);"""
create_table(create_query)

In [7]:
#function to insert neighborhoods into table
def insert_neighborhoods(neighborhoods):
    """Insert every row of ``neighborhoods`` into the SQLite ``neighborhoods`` table.

    Uses the module-level cursor ``c`` and connection ``conn``.

    Parameters
    ----------
    neighborhoods : pandas.DataFrame
        Must provide the columns 'ZIP Code', 'Neighborhood' and 'Borough'.

    Raises
    ------
    sqlite3.Error
        Propagated from the database (e.g. ``IntegrityError`` when a ZIP code
        is already present); the pending inserts are rolled back first.
    """
    # Column-wise iteration avoids building a Series per cell through .iloc;
    # int() turns NumPy integers into plain ints that sqlite3 can bind.
    records = [
        (int(zip_code), neighborhood, borough)
        for zip_code, neighborhood, borough in zip(
            neighborhoods['ZIP Code'],
            neighborhoods['Neighborhood'],
            neighborhoods['Borough'])
    ]

    # The connection context manager commits on success and rolls back on
    # error, so a failed load never leaves a half-written transaction open.
    with conn:
        c.executemany("""INSERT INTO neighborhoods
                  (ZipCode,
                   Neighborhood,
                   Borough)
                   VALUES
                   (?,?,?)""",
                   records)
    
#load the rows of the nyc dataframe into the neighborhoods table
insert_neighborhoods(neighborhoods=nyc)

In [8]:
# Read the table back through the open connection to check what was loaded
neighborhoods = pd.read_sql_query(sql="SELECT * FROM neighborhoods;", con=conn)
neighborhoods

Unnamed: 0,ZipCode,Neighborhood,Borough
0,10001,Chelsea and Clinton,Manhattan
1,10002,Lower East Side,Manhattan
2,10003,Lower East Side,Manhattan
3,10004,Lower Manhattan,Manhattan
4,10005,Lower Manhattan,Manhattan
5,10006,Lower Manhattan,Manhattan
6,10007,Lower Manhattan,Manhattan
7,10009,Lower East Side,Manhattan
8,10010,Gramercy Park and Murray Hill,Manhattan
9,10011,Chelsea and Clinton,Manhattan


In [9]:
#close the shared cursor and the SQLite connection opened earlier in the notebook
close_c_conn()