In [4]:
import pandas as pd
import os
from sqlalchemy import create_engine

cwd = os.getcwd()
cwd

'C:\\Users\\bxprd\\Data Analytics Bootcamp\\Git_Repos\\ETL-challenge'

### Extract CSVs into DataFrames

In [7]:
crime_file = "./Resources/NYPD_Complaint_Data.csv"
crime_df = pd.read_csv(crime_file)
crime_df

Unnamed: 0,CMPLNT_NUM,BORO_NM,CMPLNT_FR_DT,CMPLNT_TO_DT,CRM_ATPT_CPTD_CD,KY_CD,LAW_CAT_CD,OFNS_DESC,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,Latitude,Longitude
0,192701448,QUEENS,10/7/1918,2/2/2019,COMPLETED,109,FELONY,GRAND LARCENY,UNKNOWN,UNKNOWN,U,45-64,WHITE,M,40.762070,-73.929744
1,122423264,STATEN ISLAND,11/24/1919,11/24/2019,COMPLETED,109,FELONY,GRAND LARCENY,,,,45-64,WHITE,M,40.538354,-74.215574
2,595267268,MANHATTAN,1/9/1920,1/9/2020,COMPLETED,341,MISDEMEANOR,PETIT LARCENY,UNKNOWN,BLACK,U,UNKNOWN,UNKNOWN,D,40.722053,-73.988215
3,759752818,MANHATTAN,4/29/1920,4/29/2020,COMPLETED,341,MISDEMEANOR,PETIT LARCENY,UNKNOWN,WHITE HISPANIC,M,UNKNOWN,UNKNOWN,D,40.731890,-73.994546
4,351329583,QUEENS,7/6/1920,7/7/2020,COMPLETED,341,MISDEMEANOR,PETIT LARCENY,45-64,ASIAN / PACIFIC ISLANDER,M,18-24,BLACK HISPANIC,M,40.758563,-73.865846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413407,462517741,BROOKLYN,09/04/1010,9/10/2020,COMPLETED,341,MISDEMEANOR,PETIT LARCENY,25-44,WHITE,F,UNKNOWN,UNKNOWN,D,40.692681,-73.990914
413408,386907725,MANHATTAN,11/03/1010,11/9/2020,COMPLETED,341,MISDEMEANOR,PETIT LARCENY,25-44,WHITE HISPANIC,M,UNKNOWN,UNKNOWN,D,40.762394,-73.983421
413409,577517798,BRONX,12/04/1010,12/10/2020,COMPLETED,341,MISDEMEANOR,PETIT LARCENY,UNKNOWN,UNKNOWN,U,<18,UNKNOWN,F,40.824551,-73.910235
413410,938263841,BRONX,12/04/1020,12/2/2020,COMPLETED,351,MISDEMEANOR,CRIMINAL MISCHIEF & RELATED OF,,,,45-64,BLACK HISPANIC,M,40.841978,-73.904158


### Transform Crime DataFrame

In [4]:
# Create a filtered dataframe from specific columns
crime_cols = ["CMPLNT_NUM", "BORO_NM", "CMPLNT_FR_DT", "CMPLNT_TO_DT", "CRM_ATPT_CPTD_CD", "KY_CD", "LAW_CAT_CD",
                "OFNS_DESC", "SUSP_AGE_GROUP", "SUSP_RACE", "SUSP_SEX", "VIC_AGE_GROUP", 
                "VIC_RACE", "VIC_SEX", "Latitude", "Longitude"]

crime_transformed = crime_df[crime_cols].copy()

# Rename the column headers
crime_transformed = crime_transformed.rename(columns={"CMPLNT_NUM": "Complaint_ID", 
                                                      "BORO_NM": "Boro_Name", 
                                                      "CMPLNT_FR_DT":"Start_Date", 
                                                      "CMPLNT_TO_DT":"End_Date", 
                                                      "CRM_ATPT_CPTD_CD":"Complaint_Status", 
                                                      "KY_CD":"Complaint_Code", 
                                                      "LAW_CAT_CD": "Complaint_Cat",
                                                      "OFNS_DESC":"Complaint_Desc", 
                                                      "SUSP_AGE_GROUP":"Suspect_Age", 
                                                      "SUSP_RACE":"Suspect_Race", 
                                                      "SUSP_SEX": "Suspect_Age", 
                                                      "VIC_AGE_GROUP":"Victim_Age", 
                                                      "VIC_RACE":"Victim_Race", 
                                                      "VIC_SEX":"Victim_Gender", 
                                                      "Latitude":"Latitude", 
                                                      "Longitude":"Longitude"})

# Clean the data by dropping duplicates and setting the index
premise_transformed.drop_duplicates("id", inplace=True)
premise_transformed.set_index("id", inplace=True)

premise_transformed.head()

Unnamed: 0_level_0,premise_name,county_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1311660,CANA ARRIBA GROCERY NO 2 INC,0
1310023,JOHANA GROCERY & DELI CORP,0
1310024,TKO BEVERAGES LLC,1
1311663,181 LEXINGTON AVENUE BBQ LLC,2
1310029,AZIZ DELI & GRILL CORP,1


### Transform county DataFrame

In [5]:
county_cols = ["ID", "County Name (Licensee)", "County ID Code", "License Count"]
county_transformed = county_df[county_cols].copy()

# Rename the column headers
county_transformed = county_transformed.rename(columns={"ID": "id",
                                                         "County Name (Licensee)": "county_name",
                                                         "License Count": "license_count",
                                                         "County ID Code": "county_id"})

# Set index
county_transformed.set_index("id", inplace=True)

county_transformed.head()

Unnamed: 0_level_0,county_name,county_id,license_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,ALBANY,5,77
1,ALLEGANY,59,4
2,BRONX,0,104
3,BROOME,35,14
4,CATTARAUGUS,41,9


### Create database connection

In [6]:
connection_string = "postgres:postgres@localhost:5432/customer_db"
engine = create_engine(f'postgresql://{connection_string}')

In [7]:
# Confirm tables
engine.table_names()

['county', 'premise']

### Load DataFrames into database

In [8]:
premise_transformed.to_sql(name='premise', con=engine, if_exists='append', index=True)

In [9]:
county_transformed.to_sql(name='county', con=engine, if_exists='append', index=True)