In [1]:
import pandas as pd

In [2]:
# Load the three source tables in one pass: state-name/abbreviation lookup,
# per-state areas, and the population records.
df_state_abbr, df_state_areas, df_state_popl = map(
    pd.read_csv,
    (
        'csv/state-abbrevs.csv',
        'csv/state-areas.csv',
        'csv/state-population.csv',
    ),
)

In [3]:
# Preview the first five rows of the name -> abbreviation lookup table.
df_state_abbr.head(n=5)

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [4]:
# Preview the first five rows of the per-state area table.
df_state_areas.head(n=5)

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [5]:
# Preview the first five rows of the population records.
df_state_popl.head(n=5)

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


## Problem: Rank US states and territories by their 2010 population density

In [6]:
# Attach each state's full name to its population records by matching the
# 'abbreviation' column of the lookup table against the 'state/region' code
# of the population table. This is pandas' default inner join, so only codes
# present in BOTH frames are kept.
# NOTE(review): any 'state/region' code with no entry in df_state_abbr is
# silently dropped here -- confirm that the territories mentioned in the
# problem statement actually survive this join.
df_abbr_pop = df_state_abbr.merge(
    df_state_popl,
    left_on="abbreviation",
    right_on='state/region',
)

# After the join 'state/region' carries the same codes as 'abbreviation'.
df_abbr_pop = df_abbr_pop.drop('state/region', axis=1)

df_abbr_pop.head()

Unnamed: 0,state,abbreviation,ages,year,population
0,Alabama,AL,under18,2012,1117489.0
1,Alabama,AL,total,2012,4817528.0
2,Alabama,AL,under18,2010,1130966.0
3,Alabama,AL,total,2010,4785570.0
4,Alabama,AL,under18,2011,1125763.0


## Let's check whether the merged data has any missing values.

In [7]:
# One flag per column: True if that column holds at least one missing value.
df_abbr_pop.isnull().any(axis=0)

state           False
abbreviation    False
ages            False
year            False
population      False
dtype: bool

In [8]:
# Bring in each state's area by joining on the shared 'state' name column
# (default inner join).
df_all = df_abbr_pop.merge(df_state_areas, on='state')
df_all.head()

Unnamed: 0,state,abbreviation,ages,year,population,area (sq. mi)
0,Alabama,AL,under18,2012,1117489.0,52423
1,Alabama,AL,total,2012,4817528.0,52423
2,Alabama,AL,under18,2010,1130966.0,52423
3,Alabama,AL,total,2010,4785570.0,52423
4,Alabama,AL,under18,2011,1125763.0,52423


In [9]:
# Keep only the all-ages ('total') rows for the year 2010.
# With DataFrame.query's default 'pandas' parser, `&` binds as loosely as
# `and`, so this expression is evaluated as
# (year == 2010) and (ages == 'total').
filter_2010_total = df_all.query("year == 2010 & ages == 'total'")

# A cell renders only its final expression, so show the one thing of interest:
# the surviving row labels are still the scattered row numbers inherited from
# df_all rather than anything human-readable.
filter_2010_total.index

Int64Index([   3,   91,  101,  189,  197,  283,  293,  379,  389,  475,  485,
             570,  581,  666,  677,  762,  773,  858,  869,  954,  965, 1050,
            1061, 1146, 1157, 1242, 1253, 1338, 1349, 1434, 1445, 1530, 1541,
            1626, 1637, 1722, 1733, 1818, 1829, 1914, 1925, 2010, 2021, 2106,
            2117, 2202, 2213, 2298, 2309, 2394, 2405],
           dtype='int64')

In [10]:
# Index the 2010 totals by state name so results are labelled with readable
# names instead of opaque row numbers.
# The guard keeps this cell re-runnable: once 'state' has become the index it
# is no longer a column, and an unconditional set_index('state') would raise
# KeyError on a second run. Rebinding (instead of inplace=True) also avoids
# mutating a filtered frame in place.
if filter_2010_total.index.name != 'state':
    filter_2010_total = filter_2010_total.set_index('state')

# Population density: people per square mile, aligned on the state index.
density = filter_2010_total['population'] / filter_2010_total['area (sq. mi)']

In [11]:
# Rank from most to least dense. The sorted result is assigned back explicitly
# rather than sorting with inplace=True, which pandas discourages and which
# hides the state change from anyone reading the cell.
density = density.sort_values(ascending=False)
density.head()

state
District of Columbia    8898.897059
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
Massachusetts            621.815538
dtype: float64

In [12]:
# Sanity check on the container type: dividing one column by another yields a
# pandas Series (labelled by the frame's index), not a DataFrame.
type(density)

pandas.core.series.Series