## 美国各州人口数据分析

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# 州名
abbr = pd.read_csv('./USA/state-abbrevs.csv')
display(abbr.shape, abbr.head())

(51, 2)

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [3]:
# 面积
areas = pd.read_csv('./USA/state-areas.csv')
display(areas.shape, areas.head())

(52, 2)

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [4]:
# 人口
pop = pd.read_csv('./USA/state-population.csv')
display(pop.shape, pop.head())

(2544, 4)

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [5]:
# 合并人口和州名（缩写）
pop2 = pop.merge(abbr, how='outer', left_on='state/region', right_on='abbreviation')
pop2

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AL,under18,2012,1117489.0,Alabama,AL
1,AL,total,2012,4817528.0,Alabama,AL
2,AL,under18,2010,1130966.0,Alabama,AL
3,AL,total,2010,4785570.0,Alabama,AL
4,AL,under18,2011,1125763.0,Alabama,AL
5,AL,total,2011,4801627.0,Alabama,AL
6,AL,total,2009,4757938.0,Alabama,AL
7,AL,under18,2009,1134192.0,Alabama,AL
8,AL,under18,2013,1111481.0,Alabama,AL
9,AL,total,2013,4833722.0,Alabama,AL


In [9]:
# 删除abbreviation列
pop2.drop(labels='abbreviation', axis=1, inplace=True)

In [10]:
# 查询哪些属性为空
pop2.isnull().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [17]:
# 查找state为空的州名

cond = pop2['state'].isnull()
states = pop2['state/region'][cond]
states.unique()

array(['PR', 'USA'], dtype=object)

In [20]:
# 将缩写为PR的洲名换成Puerto Rico
cond = pop2['state/region'] == 'PR'
print(pop2[cond])
pop2['state'][cond] = 'Puerto Rico'
print(pop2[cond])

     state/region     ages  year  population state
2448           PR  under18  1990         NaN   NaN
2449           PR    total  1990         NaN   NaN
2450           PR    total  1991         NaN   NaN
2451           PR  under18  1991         NaN   NaN
2452           PR    total  1993         NaN   NaN
2453           PR  under18  1993         NaN   NaN
2454           PR  under18  1992         NaN   NaN
2455           PR    total  1992         NaN   NaN
2456           PR  under18  1994         NaN   NaN
2457           PR    total  1994         NaN   NaN
2458           PR    total  1995         NaN   NaN
2459           PR  under18  1995         NaN   NaN
2460           PR  under18  1996         NaN   NaN
2461           PR    total  1996         NaN   NaN
2462           PR  under18  1998         NaN   NaN
2463           PR    total  1998         NaN   NaN
2464           PR    total  1997         NaN   NaN
2465           PR  under18  1997         NaN   NaN
2466           PR    total  199

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
# 将缩写为USA的州名换成the United States
cond = pop2['state/region'] == 'USA'
display(pop2[cond])
pop2['state'][cond] = 'the United States'
display(pop2[cond])

Unnamed: 0,state/region,ages,year,population,state
2496,USA,under18,1990,64218512.0,
2497,USA,total,1990,249622814.0,
2498,USA,total,1991,252980942.0,
2499,USA,under18,1991,65313018.0,
2500,USA,under18,1992,66509177.0,
2501,USA,total,1992,256514231.0,
2502,USA,total,1993,259918595.0,
2503,USA,under18,1993,67594938.0,
2504,USA,under18,1994,68640936.0,
2505,USA,total,1994,263125826.0,


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,state/region,ages,year,population,state
2496,USA,under18,1990,64218512.0,the United States
2497,USA,total,1990,249622814.0,the United States
2498,USA,total,1991,252980942.0,the United States
2499,USA,under18,1991,65313018.0,the United States
2500,USA,under18,1992,66509177.0,the United States
2501,USA,total,1992,256514231.0,the United States
2502,USA,total,1993,259918595.0,the United States
2503,USA,under18,1993,67594938.0,the United States
2504,USA,under18,1994,68640936.0,the United States
2505,USA,total,1994,263125826.0,the United States


In [24]:
areas

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707
5,Colorado,104100
6,Connecticut,5544
7,Delaware,1954
8,Florida,65758
9,Georgia,59441


In [25]:
pop2.head()

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


In [29]:
# 将面积合并到表中
pop3 = pop2.merge(areas, how="outer")
pop3.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [30]:
pop3.isnull().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

In [32]:
pop3['area (sq. mi)'].isnull().sum()
# 48

# 找出面积为空的洲名
cond = pop3['area (sq. mi)'].isnull()
states = pop3['state/region'][cond]
states.unique()

array(['USA'], dtype=object)

In [34]:
areas.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [35]:
# 计算总面积
areas['area (sq. mi)'].sum()

3790399

In [45]:
# 将USA面积替换
cond = pop3['state/region'] == 'USA'
pop3['area (sq. mi)'][cond] = 3790399
display(pop3[cond])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
2496,USA,under18,1990,64218512.0,the United States,3790399.0
2497,USA,total,1990,249622814.0,the United States,3790399.0
2498,USA,total,1991,252980942.0,the United States,3790399.0
2499,USA,under18,1991,65313018.0,the United States,3790399.0
2500,USA,under18,1992,66509177.0,the United States,3790399.0
2501,USA,total,1992,256514231.0,the United States,3790399.0
2502,USA,total,1993,259918595.0,the United States,3790399.0
2503,USA,under18,1993,67594938.0,the United States,3790399.0
2504,USA,under18,1994,68640936.0,the United States,3790399.0
2505,USA,total,1994,263125826.0,the United States,3790399.0


In [47]:
pop3.isnull().sum()

state/region      0
ages              0
year              0
population       20
state             0
area (sq. mi)     0
dtype: int64

In [48]:
# 清空空数据
pop3.dropna(inplace=True)

In [49]:
pop3.isnull().any()

state/region     False
ages             False
year             False
population       False
state            False
area (sq. mi)    False
dtype: bool