# Example on Data Preparation: US States Data

In [1]:
import pandas as pd

In [2]:
# Download the state population table into the working directory.
# --fail makes curl exit with an error on an HTTP failure instead of saving the
# server's error page under the name state-population.csv.
!curl --fail -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  7 57935    7  4134    0     0   2331      0  0:00:24  0:00:01  0:00:23  2336
100 57935  100 57935    0     0  27975      0  0:00:02  0:00:02 --:--:-- 28042


In [3]:
# Load the downloaded population table and preview its first five rows.
pop = pd.read_csv(filepath_or_buffer='state-population.csv')
pop.head(n=5)

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [4]:
# Inspect rows 500-699 by position; pandas elides the middle of a long display.
pop.iloc[500:700]

Unnamed: 0,state/region,ages,year,population
500,GA,total,2004,8769252.0
501,GA,under18,2004,2308855.0
502,GA,total,2001,8377038.0
503,GA,under18,2001,2215390.0
504,GA,total,2002,8508256.0
...,...,...,...,...
695,IN,under18,2001,1579527.0
696,IN,total,2002,6155967.0
697,IN,under18,2002,1580814.0
698,IN,total,1999,6044970.0


In [5]:
# Download the state area table into the working directory.
# --fail makes curl exit with an error on an HTTP failure instead of saving the
# server's error page under the name state-areas.csv.
!curl --fail -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   835    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   835  100   835    0     0   1606      0 --:--:-- --:--:-- --:--:--  1615


In [6]:
# Load the downloaded area table and preview its first five rows.
areas = pd.read_csv(filepath_or_buffer='state-areas.csv')
areas.head(n=5)

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [6]:
# Download the state abbreviation lookup into the working directory.
# --fail makes curl exit with an error on an HTTP failure instead of saving the
# server's error page under the name state-abbrevs.csv.
!curl --fail -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   872  100   872    0     0   1656      0 --:--:-- --:--:-- --:--:--  1664


In [7]:
# Load the downloaded abbreviation lookup and preview its first five rows.
abbrevs = pd.read_csv(filepath_or_buffer='state-abbrevs.csv')
abbrevs.head(n=5)

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


## Merge the Population Data with State Abbreviations

In [9]:
# Outer-join the population rows to the abbreviation lookup, matching the
# 'state/region' code on the left against 'abbreviation' on the right. An outer
# join keeps codes that appear in only one of the two tables.
pop_ab = pop.merge(
    abbrevs,
    how='outer',
    left_on='state/region',
    right_on='abbreviation',
)
pop_ab.head(n=5)

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AK,total,1990,553290.0,Alaska,AK
1,AK,under18,1990,177502.0,Alaska,AK
2,AK,total,1992,588736.0,Alaska,AK
3,AK,under18,1991,182180.0,Alaska,AK
4,AK,under18,1992,184878.0,Alaska,AK


In [10]:
# 'abbreviation' duplicates 'state/region' on every matched row, so remove it.
pop_ab = pop_ab.drop(columns=['abbreviation'])
pop_ab.head(n=5)

Unnamed: 0,state/region,ages,year,population,state
0,AK,total,1990,553290.0,Alaska
1,AK,under18,1990,177502.0,Alaska
2,AK,total,1992,588736.0,Alaska
3,AK,under18,1991,182180.0,Alaska
4,AK,under18,1992,184878.0,Alaska


## Fill in Missing Values for States

In [11]:
# Report, per column, whether it holds at least one missing value.
pop_ab.isna().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [12]:
# Preview the rows that received no state name from the merge.
pop_ab.loc[pop_ab['state'].isna()].head(n=5)

Unnamed: 0,state/region,ages,year,population,state
1872,PR,under18,1990,,
1873,PR,total,1990,,
1874,PR,total,1991,,
1875,PR,under18,1991,,
1876,PR,total,1993,,


In [13]:
# Among the rows with no state name, list the distinct 'state/region' codes.
pop_ab.loc[pop_ab['state'].isna(), 'state/region'].unique()

array(['PR', 'USA'], dtype=object)

In [14]:
# Pattern: frame.loc[row_mask, column_to_update] = new_value
# Supply full names for the two codes that had no match in the abbreviation table.
pop_ab.loc[pop_ab['state/region'].eq('PR'), 'state'] = 'Puerto Rico'
pop_ab.loc[pop_ab['state/region'].eq('USA'), 'state'] = 'United States'

# Re-check which columns still contain missing values.
pop_ab.isna().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

## Merge the Third Dataset: State Areas

In [16]:
# Reload the area table. "utf-8-sig" discards a leading byte-order mark if the
# file starts with one, and stripping the header labels removes surrounding
# whitespace, so the join key below is exactly "state".
areas = pd.read_csv(filepath_or_buffer="state-areas.csv", encoding="utf-8-sig")
areas.columns = areas.columns.str.strip()

# Same header clean-up on the left-hand table (optional; keeps things tidy).
pop_ab.columns = pop_ab.columns.str.strip()

# Left join: every population row is kept and gains its state's area where the
# area table has a matching "state" value.
final = pop_ab.merge(areas, how="left", on="state")
final.head(n=5)

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


In [17]:
# Report, per column, whether the merged table holds at least one missing value.
final.isna().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

In [18]:
# Discard every row that has a missing value in any column, modifying `final`
# itself, then preview what remains.
final.dropna(axis=0, how='any', inplace=True)
final.head(n=5)

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


## Creating New Variables

In [19]:
# Derive density as population divided by area in square miles.
final['density'] = final['population'].div(final['area (sq. mi)'])
final.head(n=5)

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi),density
0,AK,total,1990,553290.0,Alaska,656425.0,0.842884
1,AK,under18,1990,177502.0,Alaska,656425.0,0.270407
2,AK,total,1992,588736.0,Alaska,656425.0,0.896882
3,AK,under18,1991,182180.0,Alaska,656425.0,0.277534
4,AK,under18,1992,184878.0,Alaska,656425.0,0.281644


## Subsetting the Data

In [20]:
# Keep only the 2010 rows for the 'total' age group.
# Inside query(), `&` is evaluated with the precedence of `and`, so the two
# comparisons do not need parentheses.
data2010 = final.query(expr=" year == 2010 & ages == 'total' ")
data2010.head(n=5)

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi),density
43,AK,total,2010,713868.0,Alaska,656425.0,1.087509
51,AL,total,2010,4785570.0,Alabama,52423.0,91.287603
141,AR,total,2010,2922280.0,Arkansas,53182.0,54.948667
149,AZ,total,2010,6408790.0,Arizona,114006.0,56.214497
197,CA,total,2010,37333601.0,California,163707.0,228.051342


In [21]:
# Index the 2010 subset by state name. The guard keeps this cell safe to re-run:
# once 'state' has become the index it is no longer a column, so calling
# set_index('state') a second time would raise KeyError.
if data2010.index.name != 'state':
    data2010 = data2010.set_index('state')

In [22]:
# Display the 2010 'total' rows, now indexed by state name.
data2010

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi),density
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alaska,AK,total,2010,713868.0,656425.0,1.087509
Alabama,AL,total,2010,4785570.0,52423.0,91.287603
Arkansas,AR,total,2010,2922280.0,53182.0,54.948667
Arizona,AZ,total,2010,6408790.0,114006.0,56.214497
California,CA,total,2010,37333601.0,163707.0,228.051342
Colorado,CO,total,2010,5048196.0,104100.0,48.493718
Connecticut,CT,total,2010,3579210.0,5544.0,645.600649
District of Columbia,DC,total,2010,605125.0,68.0,8898.897059
Delaware,DE,total,2010,899711.0,1954.0,460.445752
Florida,FL,total,2010,18846054.0,65758.0,286.597129
