# Data Extraction

In [1]:
#Import dependencies
import pandas as pd
import requests
import json
import os
import numpy as np

In [2]:
#Extract and view data from CSV
walmart_marketShare = pd.read_csv('Resources/Walmart_MarketShare_data.csv')
walmart_marketShare.head()

Unnamed: 0,CITY,STATE,POPULATION,MARKET_SHARE
0,Atchison,Kan.,16580,95%
1,Portales,N.M.,19730,95%
2,Sterling,Colo.,22068,91%
3,Deming,N.M.,24699,90%
4,Guymon,Ohio,21385,90%


In [3]:
#Extract and view data from JSON

#1.Specify url
url = "https://gist.githubusercontent.com/anonymous/83803696b0e3430a52f1/raw/29f2b252981659dfa6ad51922c8155e66ac261b2/walmart.json"

#2. Get and print response
walmart_locations = pd.DataFrame(requests.get(url).json())
walmart_locations.head()

Unnamed: 0,id,storeType,timeZone,openDate,name,postalCode,address1,city,state,country,latitude,longitude,phone_number
0,6601,3,K,01/04/1994 12:00,Sam's Club,99515,8801 Old Seward Hwy,Anchorage,AK,US,61.14077,-149.860016,(907) 522-2333
1,6602,3,K,01/06/1994 12:00,Sam's Club,99504,1074 N. Muldoon Road,Anchorage,AK,US,61.231079,-149.740844,(907) 276-2996
2,2074,1,K,03/29/1994 12:00,Walmart Supercenter,99654,1350 S Seward Meridian Pkwy,Wasilla,AK,US,61.568752,-149.363647,(907) 376-9780
3,2070,1,K,03/29/1994 12:00,Walmart Supercenter,99503,3101 A St,Anchorage,AK,US,61.192337,-149.880356,(907) 563-5900
4,6603,3,K,01/08/1994 12:00,Sam's Club,99701,48 College Rd,Fairbanks,AK,US,64.852867,-147.707336,(907) 451-4800


# Data Transformation

CSV Data Cleaning

In [4]:
#Determine datatypes
walmart_marketShare.dtypes

CITY            object
STATE           object
POPULATION       int64
MARKET_SHARE    object
dtype: object

In [5]:
#Remove '%' from 'MARKET_SHARE' 
walmart_marketShare['MARKET_SHARE'] = walmart_marketShare['MARKET_SHARE'].str.replace('%','')
#walmart_marketShare.head()

#Convert from object to int
walmart_marketShare = walmart_marketShare.astype({"MARKET_SHARE": int})
walmart_marketShare.dtypes

CITY            object
STATE           object
POPULATION       int64
MARKET_SHARE     int32
dtype: object

In [13]:
#Create dataframe
marketShare_df = pd.DataFrame(walmart_marketShare)
marketShare_df.head()

Unnamed: 0,CITY,STATE,POPULATION,MARKET_SHARE,STATE_CODE
0,Atchison,Kan.,16580,95,
1,Portales,N.M.,19730,95,
2,Sterling,Colo.,22068,91,
3,Deming,N.M.,24699,90,
4,Guymon,Ohio,21385,90,


In [14]:
us_state_abbrev={'Ala.': 'AL','Alaska': 'AK','AS': 'AS','Ariz.': 'AZ','Ark.': 'AR',
                'Calif.': 'CA','Colo.': 'CO','Conn.': 'CT',
                'Del.': 'DE', 'D.C.':'DC',
                'Fla.': 'FL',
                'Ga.': 'GA','Guam': 'GU',
                 'Hawaii': 'HI',
                 'Idaho': 'ID','Ill.': 'IL','Ind.': 'IN','Iowa': 'IA',
                 'Kan.': 'KS','Ky.': 'KY',
                 'La.': 'LA',
                 'Maine': 'ME','Md.': 'MD','Mass.': 'MA','Mich.': 'MI','Minn.': 'MN','Miss.': 'MS','Mo.': 'MO','Mont.': 'MT',
                 'Neb.': 'NE','Nev.': 'NV','N.H.': 'NH','N.J.': 'NJ','N.M.': 'NM','N.Y.': 'NY','N.C.': 'NC','N.D.': 'ND',
                 'M.P.': 'MP',
                 'Ohio': 'OH','Okla.': 'OK','Ore.': 'OR',
                 'Pa.': 'PA','P.R.': 'PR',
                 'R.I.': 'RI',
                 'S.C.': 'SC','S.D.': 'SD',
                 'Tenn.': 'TN','Tex.': 'TX',
                 'Utah': 'UT',
                 'Vt.': 'VT','VI': 'VI','Va.': 'VA',
                 'Wash.': 'WA','W.Va.': 'WV','Wis.': 'WI','Wyo.': 'WY'
                }

In [None]:
us_state_code = {'Ala.':'AL',
                    'Alaska':'AK'}

In [18]:
#Replace 'STATE' values with two-letter state codes
marketShare_df['STATE_CODE'] = marketShare_df['STATE'].map(us_state_abbrev, na_action='ignore')
marketShare_df

Unnamed: 0,CITY,STATE,POPULATION,MARKET_SHARE,STATE_CODE
0,Atchison,Kan.,16580,95,
1,Portales,N.M.,19730,95,
2,Sterling,Colo.,22068,91,
3,Deming,N.M.,24699,90,
4,Guymon,Ohio,21385,90,
...,...,...,...,...,...
198,Maysville,Ky.,17230,50,
199,Enid,Okla.,62602,50,
200,Shawnee,Okla.,71961,50,
201,Troy,Ala.,33368,50,


In [None]:
walmart_marketShare_df["STATE"].map(us_state_abbrev)
walmart_marketShare_df.head()

In [None]:
import pandas as pd
df= pd.DataFrame({'Country':['China','India','USA','Indonesia','Brazil'],
                  'Population':[1403500365,1324171354,322179605,261115456,207652865]})
df

In [None]:
country_capital={
'Germany':'Berlin',
'Brazil':'Brasília',
'Budapest':'Hungary',
'China':'Beijing',
'India':'New Delhi',
'Norway':'Oslo',
'France':'Paris',
'Indonesia': 'Jakarta',
'USA':'Washington'
}

In [None]:
df['Capital'] = df['Country'].map(country_capital)

In [None]:
df

In [None]:
print(df1.columns)

In [None]:
df1.columns = ['ID', 'STORE_TYPE', 'TIMEZONE', 'OPEN_DATE', 'NAME', 'POSTAL_CODE', 'ADDRESS1', 'CITY', 'STATE', 'COUNTRY', 'LATITUDE', 'LONGITUDE','PHONE_NUMBER']

In [None]:
df1

In [None]:
#view csv content
df2 = pd.read_csv('Resources/Walmart_MarketShare_data.csv')
df2

Clean "POPULATION" column, integer dtype.

In [None]:
df2.iloc[:, 1]

In [None]:
print(df2.columns)

In [None]:
df2.dtypes

In [None]:
#replace "STATE" with two-letter code


In [None]:
merged_df = pd.merge(df1, df2, on='CITY')
merged_df

In [None]:
rem_duplicates = merged_df.drop_duplicates()
rem_duplicates

Summary tables/groupby tables per state

In [None]:
state = merged_df.groupby('STATE_x')
state