In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [6]:
# Read the three input tables from the local data/ directory:
# state abbreviations, state areas, and state population records.
abbrevs, area, population = (
    pd.read_csv('data/state-abbrevs.csv'),
    pd.read_csv('data/state-areas.csv'),
    pd.read_csv('data/state-population.csv'),
)

In [8]:
# Preview only the first rows of each table instead of rendering the full
# frames; a short sample is enough to check the column layout.
display(abbrevs.head(10), population.head(10))

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA
5,Colorado,CO
6,Connecticut,CT
7,Delaware,DE
8,District of Columbia,DC
9,Florida,FL


Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0
5,AL,total,2011,4801627.0
6,AL,total,2009,4757938.0
7,AL,under18,2009,1134192.0
8,AL,under18,2013,1111481.0
9,AL,total,2013,4833722.0


In [21]:
# Attach the full state name to every population record. The join matches
# population['state/region'] against abbrevs['abbreviation']; an outer join
# is used so that rows without a counterpart on either side are kept.
new_pop = pd.merge(
    population,
    abbrevs,
    how='outer',
    left_on='state/region',
    right_on='abbreviation',
)


Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama
5,AL,total,2011,4801627.0,Alabama
6,AL,total,2009,4757938.0,Alabama
7,AL,under18,2009,1134192.0,Alabama
8,AL,under18,2013,1111481.0,Alabama
9,AL,total,2013,4833722.0,Alabama


In [24]:
# Remove the abbreviation column (a column-wise drop, i.e. axis=1).
new_pop = new_pop.drop(columns='abbreviation')

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama
5,AL,total,2011,4801627.0,Alabama
6,AL,total,2009,4757938.0,Alabama
7,AL,under18,2009,1134192.0,Alabama
8,AL,under18,2013,1111481.0,Alabama
9,AL,total,2013,4833722.0,Alabama


In [42]:
# Flag the columns that contain missing data: .any() reports True for a
# column as soon as a single one of its entries is null.
new_pop.isna().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [53]:
# Show the rows that have missing data, selected with a boolean mask that is
# True where the value is null.
# Only the last expression of a cell is rendered automatically, so both
# selections are passed to display() explicitly; otherwise the population
# selection would be computed and silently discarded.
display(
    new_pop[new_pop['population'].isnull()],
    new_pop[new_pop['state'].isnull()],
)

Unnamed: 0,state/region,ages,year,population,state
2448,PR,under18,1990,,
2449,PR,total,1990,,
2450,PR,total,1991,,
2451,PR,under18,1991,,
2452,PR,total,1993,,
2453,PR,under18,1993,,
2454,PR,under18,1992,,
2455,PR,total,1992,,
2456,PR,under18,1994,,
2457,PR,total,1994,,


In [54]:
# Collect the rows whose state name is NaN, then list the distinct
# state/region codes that produced them.
bad_state = new_pop.loc[new_pop['state'].isnull()]
bad_state['state/region'].unique()

array(['PR', 'USA'], dtype=object)

In [94]:
# Supply the missing state name for the 'PR' code, so that the state column
# no longer holds NaN for those rows.
new_pop.loc[new_pop['state/region'].eq('PR'), 'state'] = 'Puerto Rico'


In [None]:
# Fill in the correct state name for the state/region codes found above, so
# that the NaN values in the state column are removed.
# NOTE(review): this cell repeats the 'PR' assignment of the preceding cell
# verbatim and has no execution count. Assigning the same constant to the same
# rows again does not change the data, so the cell looks redundant — consider
# removing it.
new_pop.loc[new_pop['state/region']=='PR','state'] = 'Puerto Rico'

In [92]:
# Supply the missing state name for the 'USA' code as well, then show the
# resulting frame.
new_pop.loc[new_pop['state/region'].eq('USA'), 'state'] = 'United States of America'
new_pop

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama
5,AL,total,2011,4801627.0,Alabama
6,AL,total,2009,4757938.0,Alabama
7,AL,under18,2009,1134192.0,Alabama
8,AL,under18,2013,1111481.0,Alabama
9,AL,total,2013,4833722.0,Alabama


In [130]:
# Discard every row that has no population figure. (The previous comment on
# this cell was copied from the state-name fill step and did not describe it.)
new_pop = new_pop.dropna(subset=['population'])


In [93]:
# Inspect the state-area table before it is merged into the population data.
area

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707
5,Colorado,104100
6,Connecticut,5544
7,Delaware,1954
8,Florida,65758
9,Georgia,59441


In [101]:
# Bring in each state's area with a left join, so every population row is
# kept. No key is given, so pandas joins on the column names the two frames
# have in common.
new_pop = pd.merge(new_pop, area, how='left')

In [131]:
# Check again which columns still contain missing data.
new_pop.isna().any()

state/region     False
ages             False
year             False
population       False
state            False
area (sq. mi)     True
dtype: bool

In [135]:
# The 'area (sq. mi)' column has missing values; to locate the affected rows,
# list the distinct states that have no area recorded.
new_pop.loc[new_pop['area (sq. mi)'].isnull(), 'state'].unique()


array(['United States of America'], dtype=object)

In [139]:
# Remove the rows with missing data, i.e. the 'United States of America'
# rows found to lack an area. A copy of the frame is kept in `back` first.
back = new_pop.copy()
new_pop = new_pop.drop(
    index=new_pop.index[new_pop['state'].eq('United States of America')]
)
new_pop


Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0
5,AL,total,2011,4801627.0,Alabama,52423.0
6,AL,total,2009,4757938.0,Alabama,52423.0
7,AL,under18,2009,1134192.0,Alabama,52423.0
8,AL,under18,2013,1111481.0,Alabama,52423.0
9,AL,total,2013,4833722.0,Alabama,52423.0


In [140]:
# Confirm whether any column still has missing data.
new_pop.isna().any()

state/region     False
ages             False
year             False
population       False
state            False
area (sq. mi)    False
dtype: bool

In [141]:
# Display the merged table after the missing-data clean-up steps.
new_pop

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0
5,AL,total,2011,4801627.0,Alabama,52423.0
6,AL,total,2009,4757938.0,Alabama,52423.0
7,AL,under18,2009,1134192.0,Alabama,52423.0
8,AL,under18,2013,1111481.0,Alabama,52423.0
9,AL,total,2013,4833722.0,Alabama,52423.0


In [175]:
# Sum the 2010 population over all remaining rows whose age group is 'total'.
# Both conditions go into a single df.query() expression, so the frame is
# filtered once instead of running the same year query twice and masking one
# result with the other.
new_pop.query("year == 2010 and ages == 'total'")['population'].sum()

313047503.0

In [186]:
# Take the 2010 'total' rows and use the state column as the new row index
# (set_index). The two conditions are combined in one query so the year
# filter is evaluated only once.
div = new_pop.query("year == 2010 and ages == 'total'").set_index('state')
div

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi)
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,AL,total,2010,4785570.0,52423.0
Alaska,AK,total,2010,713868.0,656425.0
Arizona,AZ,total,2010,6408790.0,114006.0
Arkansas,AR,total,2010,2922280.0,53182.0
California,CA,total,2010,37333601.0,163707.0
Colorado,CO,total,2010,5048196.0,104100.0
Connecticut,CT,total,2010,3579210.0,5544.0
Delaware,DE,total,2010,899711.0,1954.0
District of Columbia,DC,total,2010,605125.0,68.0
Florida,FL,total,2010,18846054.0,65758.0


In [209]:
# Population density = population / area, computed per state from `div`.
# The ratio has no name, so the resulting single-column frame is labelled 0.
# Sort in descending order and keep the five densest states.
pop = div['population']
are = div['area (sq. mi)']
new = pop.div(are).to_frame()
new.sort_values(by=0, ascending=False).head(5)

Unnamed: 0_level_0,0
state,Unnamed: 1_level_1
District of Columbia,8898.897059
Puerto Rico,1058.665149
New Jersey,1009.253268
Rhode Island,681.339159
Connecticut,645.600649


In [210]:
# Sort the density column (labelled 0) in ascending order and keep the five
# least dense states.
new.sort_values(by=0, ascending=True).head(5)

Unnamed: 0_level_0,0
state,Unnamed: 1_level_1
Alaska,1.087509
Wyoming,5.768079
Montana,6.736171
North Dakota,9.537565
South Dakota,10.583512
