# Data Extraction

In [1]:
#pip install us

In [2]:
#Import dependencies
import os
import us
import pandas as pd
import numpy as np
import re
import json
import requests

In [3]:
#Retrieve data from us python package
us_state = us.states.mapping('fips','abbr')

#Convert dictionary to dataframe
state_id = pd.DataFrame.from_dict(us_state, orient='index',
                       columns=['STATE'])

#Export to csv
#state_id.to_csv('Resources/US_states.csv', index=True) #comment out after first run

In [4]:
#Create dictionary showing abbreviation and state id for each state
abbr_id = dict(us.states.mapping('ap_abbr', 'fips'))
print(abbr_id)

{'Ala.': '01', 'Alaska': '02', 'Ariz.': '04', 'Ark.': '05', 'Calif.': '06', 'Colo.': '08', 'Conn.': '09', 'Del.': '10', 'Fla.': '12', 'Ga.': '13', 'Hawaii': '15', 'Idaho': '16', 'Ill.': '17', 'Ind.': '18', 'Iowa': '19', 'Kan.': '20', 'Ky.': '21', 'La.': '22', 'Maine': '23', 'Md.': '24', 'Mass.': '25', 'Mich.': '26', 'Minn.': '27', 'Miss.': '28', 'Mo.': '29', 'Mont.': '30', 'Neb.': '31', 'Nev.': '32', 'N.H.': '33', 'N.J.': '34', 'N.M.': '35', 'N.Y.': '36', 'N.C.': '37', 'N.D.': '38', 'Ohio': '39', 'Okla.': '40', 'Ore.': '41', 'Pa.': '42', 'R.I.': '44', 'S.C.': '45', 'S.D.': '46', 'Tenn.': '47', 'Texas': '48', 'Utah': '49', 'Vt.': '50', 'Va.': '51', 'Wash.': '53', 'W.Va.': '54', 'Wis.': '55', 'Wyo.': '56', None: None, 'D.C.': '11'}


In [5]:
#Extract and view data from CSV
walmart_marketShare = pd.read_csv('Resources/Walmart_MarketShare_data.csv')
walmart_marketShare.head()

Unnamed: 0,CITY,STATE,POPULATION,MARKET_SHARE
0,Atchison,Kan.,16580,95%
1,Portales,N.M.,19730,95%
2,Sterling,Colo.,22068,91%
3,Deming,N.M.,24699,90%
4,Guymon,Ohio,21385,90%


# Data Transformation

In [6]:
#Create datafrae for Walmart market share
marketShare_df = pd.DataFrame(walmart_marketShare)
marketShare_df

Unnamed: 0,CITY,STATE,POPULATION,MARKET_SHARE
0,Atchison,Kan.,16580,95%
1,Portales,N.M.,19730,95%
2,Sterling,Colo.,22068,91%
3,Deming,N.M.,24699,90%
4,Guymon,Ohio,21385,90%
...,...,...,...,...
198,Maysville,Ky.,17230,50%
199,Enid,Okla.,62602,50%
200,Shawnee,Okla.,71961,50%
201,Troy,Ala.,33368,50%


In [7]:
#Clean data

#1.Determine datatypes
marketShare_df.dtypes

CITY            object
STATE           object
POPULATION       int64
MARKET_SHARE    object
dtype: object

In [8]:
#Remove '%' from 'MARKET_SHARE', then view
marketShare_df['MARKET_SHARE'] = marketShare_df['MARKET_SHARE'].str.replace('%','')
#marketShare_df.head()

#Convert from object ('MARKET_SHARE') to int
marketShare_df['MARKET_SHARE'] = marketShare_df['MARKET_SHARE'].astype(int)
marketShare_df.dtypes

CITY            object
STATE           object
POPULATION       int64
MARKET_SHARE     int32
dtype: object

In [9]:
marketShare_df['STATE_ID'] = marketShare_df['STATE'].replace(abbr_id, regex=True)
marketShare_df.head()

Unnamed: 0,CITY,STATE,POPULATION,MARKET_SHARE,STATE_ID
0,Atchison,Kan.,16580,95,20
1,Portales,N.M.,19730,95,35
2,Sterling,Colo.,22068,91,8
3,Deming,N.M.,24699,90,35
4,Guymon,Ohio,21385,90,39


In [10]:
#Delete 'STATE' column and set index to 'STATE_ID'
del marketShare_df['STATE']
marketShare_df.set_index('STATE_ID')
marketShare_df

Unnamed: 0,CITY,POPULATION,MARKET_SHARE,STATE_ID
0,Atchison,16580,95,20
1,Portales,19730,95,35
2,Sterling,22068,91,08
3,Deming,24699,90,35
4,Guymon,21385,90,39
...,...,...,...,...
198,Maysville,17230,50,21
199,Enid,62602,50,40
200,Shawnee,71961,50,40
201,Troy,33368,50,01


In [12]:
#Rearrange columns and display final table format
cols = marketShare_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
marketShare_df = marketShare_df[cols]
marketShare_df.head()

Unnamed: 0,MARKET_SHARE,STATE_ID,CITY,POPULATION
0,95,20,Atchison,16580
1,95,35,Portales,19730
2,91,8,Sterling,22068
3,90,35,Deming,24699
4,90,39,Guymon,21385
