# Airbnb Database Load into PostgreSQL

## Project III 

    Katy Fuentes, Nicole Pipkins, Radhika Balasubramaniam, Reza Abasaltian
    December 5, 2020

### CSV files and the tables they are loaded into
        
        merged_crime_census.csv --> merged_census_crime

In [1]:
# Import dependencies
import requests
import random
import sqlalchemy
import warnings
import pandas as pd
import json
import ast
import os

from pprint import pprint
from datetime import datetime, timedelta
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
from pandas.core.common import SettingWithCopyWarning

# Postgres database user and password import
# from config import password
from db_key import user, password

# Silence pandas' SettingWithCopyWarning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides genuine chained-assignment
# problems anywhere in this notebook — confirm that is intended.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Capture the current local date and time once
t = datetime.now()

# Today's date as mm/dd/yy, plus a separator-free mmddyy variant
date = f'{t:%m/%d/%y}'
datef = f'{t:%m%d%y}'

# Round to the nearest hour: truncate to the top of the hour, then move
# forward one hour when 30 or more minutes have already passed
time = t.replace(minute=0, second=0, microsecond=0)
if t.minute >= 30:
    time = time + timedelta(hours=1)

# Express the rounded hour in "hundreds" form, e.g. 1600
hour = f'{time:%H}00'

print(f'Today is {date} @ {hour} hour.')

Today is 12/08/20 @ 1600 hour.


In [2]:
# Read the merged crime/census CSV and keep it as the working DataFrame
census_csv_path = '../data/census-csv/merged_crime_census.csv'
csv_load = pd.read_csv(census_csv_path)
df_census = pd.DataFrame(csv_load)

# Preview the first rows
df_census.head()

Unnamed: 0.1,Unnamed: 0,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Unemployment,Crime_RatePer100K,Murder,Rape,Robbery,Agg.Assault,Burglary,Larceny,MotorVeh Theft,Arson
0,0,California,Alameda County,1629615,799649,829966,22.5,32.2,10.7,0.3,...,6.0,753.693574,153,477,6220,5050,13245,35841,13044,337
1,1,California,Los Angeles County,10105722,4979641,5126081,48.4,26.5,7.9,0.2,...,7.8,444.800814,600,1976,18923,23057,49107,142604,40555,2210
2,2,California,San Diego County,3283665,1651147,1632518,33.4,46.2,4.7,0.4,...,7.1,368.672406,107,692,3200,7840,14076,43412,12136,346
3,3,California,San Francisco County,864263,440633,423630,15.3,40.8,5.1,0.2,...,5.4,701.42171,69,110,3554,2141,5401,29256,5381,211
4,4,District of Columbia,District of Columbia,672391,319046,353345,10.7,36.0,46.9,0.2,...,8.0,1216.801325,88,236,4037,3505,3519,23575,3663,51


In [3]:
# List the column labels of the census frame
df_census.columns

Index(['Unnamed: 0', 'State', 'County', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'IncomePerCap',
       'Professional', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute',
       'Unemployment', 'Crime_RatePer100K', 'Murder', 'Rape', 'Robbery',
       'Agg.Assault', 'Burglary', 'Larceny', 'MotorVeh Theft', 'Arson'],
      dtype='object')

In [4]:
# Replace the 'MotorVeh Theft' label with 'MotorVeh', then list the labels again
column_renames = {'MotorVeh Theft': 'MotorVeh'}
df_census = df_census.rename(columns=column_renames)
df_census.columns

Index(['Unnamed: 0', 'State', 'County', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'IncomePerCap',
       'Professional', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute',
       'Unemployment', 'Crime_RatePer100K', 'Murder', 'Rape', 'Robbery',
       'Agg.Assault', 'Burglary', 'Larceny', 'MotorVeh', 'Arson'],
      dtype='object')

In [17]:
# Database connection to the local airbnb_db Postgres database
engine = create_engine(f'postgresql://{user}:{password}@localhost:5432/airbnb_db')

# List the tables through the inspector: Engine.table_names() is deprecated in
# SQLAlchemy 1.4 and no longer exists in 2.0, while Inspector.get_table_names()
# is available in both.
inspect(engine).get_table_names()

['top_neighborhood_overview',
 'listings_info',
 'neighborhood_overview',
 'neighborhood_insights',
 'rental_rates',
 'rental_rates_info',
 'city_nbh',
 'top_airbnb_cities',
 'merged_census_crime']

In [11]:
# Read the top_neighborhood_overview table back to confirm it has been loaded
nbh_query = 'select * from top_neighborhood_overview'
df_nbh = pd.read_sql_query(nbh_query, con=engine)

# Preview the first rows
df_nbh.head()

Unnamed: 0,nbh_id,name,county,state
0,273471,Dignowity Hill-St. Paul Square,Bexar,TX
1,271352,Downtown,Bexar,TX
2,275804,United Homeowners,Bexar,TX
3,271349,Dignowity Hill,Bexar,TX
4,274033,Historic Gardens,Bexar,TX


In [12]:
# Sample one (county, nbh_id) pair from every block of five rows in df_nbh.
# NOTE(review): presumably df_nbh holds five neighborhoods per city, so each
# block corresponds to one city — confirm against the table load.
group_size = 5
nbh_cnty = []
last_pos = len(df_nbh) - 1
for i in range(0, len(df_nbh), group_size):
    # Default to the first row of the block. When its county is blank, read
    # the block's fifth row instead, clamped to the last row of the frame so a
    # short final block cannot index past the end.
    pos = i
    if df_nbh['county'].iloc[i] == "":
        pos = min(i + group_size - 1, last_pos)

    county = df_nbh['county'].iloc[pos]
    nbh_id = df_nbh['nbh_id'].iloc[pos]
    nbh_cnty.append({
        "county": county,
        "nbh_id": nbh_id,
    })

df_nbh_cnty = pd.DataFrame(nbh_cnty)
df_nbh_cnty.head()

Unnamed: 0,county,nbh_id
0,Bexar,273471
1,Travis,271360
2,Tarrant,422770
3,Dallas,276189
4,Harris,271389


In [13]:
# assign a custom crime id and link to nbh_id
df_census['crime_id'] = ""
df_census['nbh_id'] = 0
alph = 'abcdefghijklmnopqrstuvwxyz'
for i in range(len(df_census)):
    cid = 0
    nbh_id = 0
    name = str(df_census['County'][i]).replace('County','').rstrip()
    st = str(df_census['State'][i])[0:2]
    for j in range(len(name)):
        cid = cid + alph.find(name[j].lower())**3
    for k in range(len(st)):
        cid = cid + alph.find(st[k].lower())**3
    for m in range(len(df_nbh_cnty)):
        if (str(df_nbh_cnty['county'][m]) == name):
            nbh_id = int(df_nbh_cnty['nbh_id'][m])
            
    df_census['crime_id'][i] = cid
    df_census['County'][i] = name
    df_census['nbh_id'][i] = nbh_id
df_census.drop(df_census.columns[[0,1,2]], axis=1, inplace=True)
df_census2 = df_census.drop(df_census.index[10])
df_census3 = df_census2.drop(df_census.index[11])
df_census4 = df_census3.drop(df_census.index[17])
df_census4.head()

Unnamed: 0,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,IncomePerCap,...,Murder,Rape,Robbery,Agg.Assault,Burglary,Larceny,MotorVeh,Arson,crime_id,nbh_id
0,1629615,799649,829966,22.5,32.2,10.7,0.3,28.7,0.8,41363,...,153,477,6220,5050,13245,35841,13044,337,3158,274853
1,10105722,4979641,5126081,48.4,26.5,7.9,0.2,14.3,0.2,30798,...,600,1976,18923,23057,49107,142604,40555,2210,19618,416300
2,3283665,1651147,1632518,33.4,46.2,4.7,0.4,11.5,0.4,34350,...,107,692,3200,7840,14076,43412,12136,346,11599,273808
3,864263,440633,423630,15.3,40.8,5.1,0.2,33.9,0.3,59508,...,69,110,3554,2141,5401,29256,5381,211,24375,417519
4,672391,319046,353345,10.7,36.0,46.9,0.2,3.7,0.0,50832,...,88,236,4037,3505,3519,23575,3663,51,43252,403138


In [14]:
# Load the prepared census/crime frame into Postgres.
# if_exists='append' inserts the rows again on every execution, so re-running
# this cell would duplicate the data. Skip the insert when the table already
# holds rows; an absent or empty table is loaded as before.
rows_present = 0
if 'merged_census_crime' in inspect(engine).get_table_names():
    row_count = pd.read_sql_query('select count(*) as n from merged_census_crime', con=engine)
    rows_present = int(row_count['n'].iloc[0])

if rows_present:
    print(f'merged_census_crime already holds {rows_present} rows; skipping load.')
else:
    df_census4.to_sql(name='merged_census_crime', con=engine, if_exists='append', index=False)

In [15]:
# Read the merged_census_crime table back to confirm the rows are in Postgres
census_query = 'select * from merged_census_crime'
census_sql = pd.read_sql_query(census_query, con=engine)

# Preview the first rows
census_sql.head()

Unnamed: 0,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,IncomePerCap,...,Murder,Rape,Robbery,Agg.Assault,Burglary,Larceny,MotorVeh,Arson,crime_id,nbh_id
0,1629615,799649,829966,22.5,32.2,10.7,0.3,28.7,0.8,41363,...,153,477,6220,5050,13245,35841,13044,337,3158,274853
1,10105722,4979641,5126081,48.4,26.5,7.9,0.2,14.3,0.2,30798,...,600,1976,18923,23057,49107,142604,40555,2210,19618,416300
2,3283665,1651147,1632518,33.4,46.2,4.7,0.4,11.5,0.4,34350,...,107,692,3200,7840,14076,43412,12136,346,11599,273808
3,864263,440633,423630,15.3,40.8,5.1,0.2,33.9,0.3,59508,...,69,110,3554,2141,5401,29256,5381,211,24375,417519
4,672391,319046,353345,10.7,36.0,46.9,0.2,3.7,0.0,50832,...,88,236,4037,3505,3519,23575,3663,51,43252,403138


In [16]:
# END DATABASE LOAD