# EXTRACT

In [46]:
# Import dependencies
import json
import warnings
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine
from sqlalchemy import inspect, text

from config import connection_string
# NOTE(review): called without a category or module filter, this silences every
# warning raised after this point (pandas and SQLAlchemy deprecation notices included),
# so problems in later cells will not be reported. Consider narrowing or removing it.
warnings.filterwarnings("ignore")

In [47]:
# Load the two raw inputs from the Resources folder:
# the state lookup table (CSV) and the Walmart store records (JSON).
us_state = pd.read_csv('Resources/US_states.csv')
walmart_df = pd.read_json('Resources/walmart.json')

In [48]:
# Preview the first rows of the raw Walmart store data loaded from the JSON file
walmart_df.head()

Unnamed: 0,id,storeType,timeZone,openDate,name,postalCode,address1,city,state,country,latitude,longitude,phone_number
0,6601,3,K,01/04/1994 12:00,Sam's Club,99515,8801 Old Seward Hwy,Anchorage,AK,US,61.14077,-149.860016,(907) 522-2333
1,6602,3,K,01/06/1994 12:00,Sam's Club,99504,1074 N. Muldoon Road,Anchorage,AK,US,61.231079,-149.740844,(907) 276-2996
2,2074,1,K,03/29/1994 12:00,Walmart Supercenter,99654,1350 S Seward Meridian Pkwy,Wasilla,AK,US,61.568752,-149.363647,(907) 376-9780
3,2070,1,K,03/29/1994 12:00,Walmart Supercenter,99503,3101 A St,Anchorage,AK,US,61.192337,-149.880356,(907) 563-5900
4,6603,3,K,01/08/1994 12:00,Sam's Club,99701,48 College Rd,Fairbanks,AK,US,64.852867,-147.707336,(907) 451-4800


In [49]:
# Preview the first rows of the US state lookup loaded from the CSV file
us_state.head()

Unnamed: 0,ID,STATE
0,1.0,AL
1,2.0,AK
2,4.0,AZ
3,5.0,AR
4,6.0,CA


# TRANSFORM

In [50]:
# Per-column summary of a dataframe: number of NaN values, their percentage, and the dtype (walmart):
def missing_data(data):
    """Summarise missing values and dtypes for every column of ``data``.

    Returns a transposed summary frame with one column per column of ``data``
    and three rows: ``Total`` (count of null values), ``Percent`` (nulls as a
    percentage of the number of rows) and ``Types`` (the dtype name as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_share = null_counts / null_mask.count() * 100
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    # Record each dtype as a plain string so it can sit next to the numeric rows
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.T

# Run the summary on the raw Walmart frame to see null counts, null percentages and dtypes per column:
missing_data(walmart_df)

Unnamed: 0,id,storeType,timeZone,openDate,name,postalCode,address1,city,state,country,latitude,longitude,phone_number
Total,0,0,15,838,0,0,0,0,0,0,0,0,0
Percent,0,0,0.257688,14.3962,0,0,0,0,0,0,0,0,0
Types,int64,int64,object,object,object,int64,object,object,object,object,float64,float64,object


In [57]:
# Keep only rows with storeType 1 or 2 (per the original note: Walmart Supercenter and Walmart stores).
# .copy() makes `walmart` an independent frame instead of a slice of walmart_df, so later
# column edits on `walmart` cannot touch walmart_df or trigger SettingWithCopyWarning.
walmart = walmart_df.loc[walmart_df["storeType"].isin([1, 2])].copy()
walmart

Unnamed: 0,id,storeType,timeZone,openDate,name,postalCode,address1,city,state,country,latitude,longitude,phone_number
2,2074,1,K,03/29/1994 12:00,Walmart Supercenter,99654,1350 S Seward Meridian Pkwy,Wasilla,AK,US,61.568752,-149.363647,(907) 376-9780
3,2070,1,K,03/29/1994 12:00,Walmart Supercenter,99503,3101 A St,Anchorage,AK,US,61.192337,-149.880356,(907) 563-5900
5,2071,1,K,03/29/1994 12:00,Walmart Supercenter,99515,8900 Old Seward Hwy,Anchorage,AK,US,61.140263,-149.868835,(907) 344-5300
6,2188,1,K,04/26/2000 12:00,Walmart Supercenter,99577,18600 Eagle River Rd,Eagle River,AK,US,61.309483,-149.534912,(907) 694-9780
7,3814,1,K,09/12/2007 12:00,Walmart Supercenter,99801,6525 Glacier Hwy,Juneau,AK,US,58.358349,-134.514862,(907) 789-5000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5815,1456,1,MST,08/01/1990 12:00,Walmart Supercenter,82930,125 N 2Nd St,Evanston,WY,US,41.265720,-110.954559,(307) 789-0010
5816,1485,1,MST,01/30/1991 12:00,Walmart Supercenter,82718,2300 S Douglas Hwy,Gillette,WY,US,44.273472,-105.494208,(307) 686-4060
5817,1457,1,MST,06/27/1990 12:00,Walmart Supercenter,82501,1733 N Federal Blvd,Riverton,WY,US,43.042599,-108.379036,(307) 856-3261
5818,1778,1,MST,04/01/1992 12:00,Walmart Supercenter,82414,321 Yellowstone Ave,Cody,WY,US,44.517460,-109.089333,(307) 527-4673


In [59]:
# Merge the filtered stores with us_state to include the ID for each state.
# rename() returns a new frame that is rebound to `walmart`; the previous inplace=True
# rename mutated a filtered slice of walmart_df, which pandas flags as unsafe.
walmart = walmart.rename(columns={"state": "STATE"})
# Inner join on STATE: stores whose STATE has no match in us_state are dropped.
walmart_state = walmart.merge(us_state, on="STATE", how="inner")
walmart_state.head()

Unnamed: 0,id,storeType,timeZone,openDate,name,postalCode,address1,city,STATE,country,latitude,longitude,phone_number,ID
0,2074,1,K,03/29/1994 12:00,Walmart Supercenter,99654,1350 S Seward Meridian Pkwy,Wasilla,AK,US,61.568752,-149.363647,(907) 376-9780,2.0
1,2070,1,K,03/29/1994 12:00,Walmart Supercenter,99503,3101 A St,Anchorage,AK,US,61.192337,-149.880356,(907) 563-5900,2.0
2,2071,1,K,03/29/1994 12:00,Walmart Supercenter,99515,8900 Old Seward Hwy,Anchorage,AK,US,61.140263,-149.868835,(907) 344-5300,2.0
3,2188,1,K,04/26/2000 12:00,Walmart Supercenter,99577,18600 Eagle River Rd,Eagle River,AK,US,61.309483,-149.534912,(907) 694-9780,2.0
4,3814,1,K,09/12/2007 12:00,Walmart Supercenter,99801,6525 Glacier Hwy,Juneau,AK,US,58.358349,-134.514862,(907) 789-5000,2.0


In [62]:
# Select the columns that go to the database and give them their final names.
cols_to_keep = ["ID", "city", "postalCode", "id", "address1", "latitude", "longitude"]
# The rename is chained so it produces a new frame; inplace=True on a column subset of
# walmart_state is the pattern that raises SettingWithCopyWarning.
walmart_state_df = walmart_state[cols_to_keep].rename(
    columns={"id": "store id", "address1": "address"}
)
walmart_state_df.head()

Unnamed: 0,ID,city,postalCode,store id,address,latitude,longitude
0,2.0,Wasilla,99654,2074,1350 S Seward Meridian Pkwy,61.568752,-149.363647
1,2.0,Anchorage,99503,2070,3101 A St,61.192337,-149.880356
2,2.0,Anchorage,99515,2071,8900 Old Seward Hwy,61.140263,-149.868835
3,2.0,Eagle River,99577,2188,18600 Eagle River Rd,61.309483,-149.534912
4,2.0,Juneau,99801,3814,6525 Glacier Hwy,58.358349,-134.514862


# LOAD

In [63]:
# Build the SQLAlchemy engine for the PostgreSQL database;
# the connection details come from connection_string in config.
engine = create_engine(
    f"postgresql://{connection_string}"
)

In [None]:
# Create table "walmart_state_df" from walmart_state_df (the DataFrame index is written as well).
# if_exists='replace' rebuilds the table on every run. With 'append', re-running the notebook
# inserted the same stores a second time, which either duplicates rows or collides with the
# primary key on "store id" that the last cell of this notebook adds.
walmart_state_df.to_sql(name="walmart_state_df", con=engine, if_exists='replace', index=True)

In [None]:
# Check table names in database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of names and works on both versions.
inspect(engine).get_table_names()

In [None]:
# Add primary key constraint on "store id" to table walmart_state_df.
# Engine.execute() with a raw SQL string was removed in SQLAlchemy 2.0, so the DDL is wrapped
# in text() and run on an explicit connection. engine.begin() commits when the block exits
# normally and rolls back if the statement raises.
with engine.begin() as connection:
    connection.execute(text('ALTER TABLE "walmart_state_df" ADD PRIMARY KEY ("store id")'))