In [1]:
import pandas as pd
# Load the 1880 file. Column names are supplied explicitly, so the first
# line of the file is read as data rather than as a header.
name1880 = pd.read_csv(
    'pydata-book/datasets/babynames/yob1880.txt',
    names=['name', 'sex', 'births'],
)
name1880

In [2]:
# Total number of births per sex in the 1880 file.
name1880.groupby('sex')['births'].sum()

In [3]:
# Gather all the data: read every yearly file (1880 through 2010), tag each
# row with the year it came from, and stack everything into one frame.
years = range(1880, 2011)
columns = ['name', 'sex', 'births']
pieces = [
    pd.read_csv('pydata-book/datasets/babynames/yob%d.txt' % year, names=columns)
      .assign(year=year)
    for year in years
]
names = pd.concat(pieces, ignore_index=True)
names

In [4]:
# Aggregate births by year (rows) and sex (columns).
# The aggregation is given by name: passing the builtin `sum` is deprecated
# in recent pandas, while the string form works on old and new versions.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
total_births.tail()

In [5]:
# Add a `prop` column: the number of babies given a name relative to the
# total number of births in the group.
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group. The input frame is left untouched, because pandas does
    not support functions passed to ``groupby(...).apply`` mutating the
    object they receive.
    """
    births = group.births.astype(float)
    return group.assign(prop=births / births.sum())
# group_keys=False keeps the original row index. Without it, newer pandas
# prepends year/sex as index levels, which then clash with the columns of the
# same name in every later groupby(['year', 'sex']).
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
names

In [6]:
# Sanity check: the prop values within every (year, sex) group should add up to 1.
import numpy as np
np.allclose(
    names.groupby(['year', 'sex'])['prop'].sum(),
    1,
)

In [7]:
# Extract a subset of each group.
def get_top1000(group):
    """Return the first 1000 rows of ``group`` after sorting by ``births`` descending."""
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.iloc[:1000]
grouped = names.groupby(['year', 'sex'])
# Drop the (year, sex) group index that apply() prepends. Both are still
# ordinary columns, and keeping them in the index as well makes later
# groupby / pivot_table calls on 'year' or 'sex' ambiguous on newer pandas.
top1000 = grouped.apply(get_top1000).reset_index(drop=True)
top1000

In [8]:
# Split the top-1000 table into a boys part and a girls part.
boys = top1000.loc[top1000['sex'] == 'M']
girls = top1000.loc[top1000['sex'] == 'F']

In [9]:
# Births per year (rows) for every name (columns) in the top-1000 table.
# NOTE: this rebinds `total_births`, which the earlier pivot defined as a
# year x sex table.
# The aggregation is given by name because passing the builtin `sum` is
# deprecated in recent pandas.
total_births = top1000.pivot_table('births', index=['year'], columns=['name'], aggfunc='sum')
total_births

In [10]:
# Plot yearly birth counts for a few example names, one panel per name.
import matplotlib.pyplot as plt
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title='Number of births per year',
)
plt.show()

In [11]:
# Sum of prop over the top-1000 rows, per year and sex: the share of all
# births covered by the names kept in top1000.
# The aggregation is given by name because passing the builtin `sum` is
# deprecated in recent pandas.
table = top1000.pivot_table('prop', index=['year'], columns=['sex'], aggfunc='sum')
table.plot(
    title='Sum of table1000.prop by year and sex',
    yticks=np.linspace(0, 1.2, 13),
    xticks=range(1880, 2020, 10),
)
plt.show()

In [12]:
# Boys' top-1000 rows for the year 2010.
df = boys[boys['year'] == 2010]
df

In [13]:
# Running total of prop, starting from the name with the largest share.
prop_cumsum = df.sort_values(by='prop', ascending=False)['prop'].cumsum()
prop_cumsum.iloc[:10]

In [14]:
# Zero-based position at which the cumulative share first reaches 0.5.
# Searching the underlying array yields a plain scalar on every pandas
# version (Series.searchsorted returned a one-element array in older ones).
prop_cumsum.values.searchsorted(0.5)

In [15]:
# Same computation for 1900; the +1 turns the zero-based position into a
# count of names.
df = boys[boys.year == 1900]
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()
# Search the underlying array so the result is a scalar on every pandas version.
in1900.values.searchsorted(0.5) + 1

In [16]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    Rows are sorted by ``prop`` in descending order and accumulated; the
    return value is the 1-based position at which the running total first
    becomes greater than or equal to ``q``.
    """
    ranked = group.sort_values(by='prop', ascending=False)
    # Search the raw array so the result is always a scalar; older pandas
    # returned a one-element array from Series.searchsorted, which left
    # array-valued cells in the aggregated result.
    return ranked.prop.cumsum().values.searchsorted(q) + 1
# Apply get_quantile_count to every (year, sex) group, then pivot the result
# so that each sex becomes a column.
diversity = (
    top1000.groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)

In [19]:
# Preview the first five years of the diversity table.
diversity.head(5)

In [20]:
# Plot the diversity measure per sex over time. plt.show() matches the other
# plotting cells and keeps the Axes repr out of the cell output.
diversity.plot(title='Number of popular names in top 50%')
plt.show()

In [21]:
# Display the diversity table (one row per year, one column per sex).
diversity

In [27]:
# Tabulate births by the last letter of the name, for every sex and year.
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]

last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'
# The aggregation is given by name because passing the builtin `sum` is
# deprecated in recent pandas.
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')

In [29]:
# Keep only three years from the 'year' level of the columns.
subtable = table.reindex(
    columns=[1920, 1960, 2010],
    level='year',
)
subtable.head(5)

In [30]:
# Total births for each (sex, year) column.
subtable.sum(axis=0)

In [31]:
# Normalise every (sex, year) column by its total, giving the share of births
# per last letter.
letter_prop = subtable.div(subtable.sum().astype(float), axis='columns')

In [33]:
import matplotlib.pyplot as plt
# Two stacked bar charts of the last-letter shares: boys on top, girls below.
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
for panel, sex_code, panel_title in zip(axes, ['M', 'F'], ['Male', 'Female']):
    letter_prop[sex_code].plot(kind='bar', rot=0, ax=panel, title=panel_title)
plt.show()

In [38]:
# Share of boys born each year whose names end in d / n / y.
# NOTE: this rebinds `letter_prop` to the table covering all years.
letter_prop = table / table.sum().astype(float)
# .loc replaces the .ix indexer, which was removed in pandas 1.0; the
# selection here is purely label-based, so the result is the same.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()

In [39]:
# Line chart of the d / n / y shares over the years.
dny_ts.plot(kind='line')
plt.show()

In [41]:
# Names from the top-1000 table that contain 'lesl', ignoring case.
all_names = top1000['name'].unique()
mask = np.array(['lesl' in candidate.lower() for candidate in all_names])
lesley_like = all_names[mask]
lesley_like


In [43]:
# Restrict the table to those names and total the births for each spelling.
filtered = top1000[top1000['name'].isin(lesley_like)]
filtered.groupby('name')['births'].sum()

In [44]:
# Births by year and sex for the selected names, normalised so that each
# year's row sums to 1.
# The aggregation is given by name because passing the builtin `sum` is
# deprecated in recent pandas; the row-sum axis is spelled out by keyword.
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
table = table.div(table.sum(axis=1), axis=0)
table.tail()

In [48]:
# Per-sex share over time: solid black line for M, dashed black line for F.
table.plot(
    style={
        'M': 'k-',
        'F': 'k--',
    },
)
plt.show()