In [1]:
!head -n 10 babynames/yob1994.txt

Jessica,F,32117
Ashley,F,30278
Emily,F,24148
Samantha,F,22817
Sarah,F,22281
Taylor,F,20731
Brittany,F,18899
Amanda,F,18715
Elizabeth,F,16778
Megan,F,16578


In [2]:
import pandas as pd

In [3]:
names1994 = pd.read_csv('babynames/yob1994.txt', names=['name', 'sex', 'births'])

In [4]:
names1994

Unnamed: 0,name,sex,births
0,Jessica,F,32117
1,Ashley,F,30278
2,Emily,F,24148
3,Samantha,F,22817
4,Sarah,F,22281
5,Taylor,F,20731
6,Brittany,F,18899
7,Amanda,F,18715
8,Elizabeth,F,16778
9,Megan,F,16578


In [14]:
years = range(1880, 2015)

In [15]:
pieces = []

In [16]:
columns = ['name', 'sex', 'births']

In [21]:
for year in years:
    path = 'babynames/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    
    frame['year'] = year
    pieces.append(frame)

In [22]:
names = pd.concat(pieces, ignore_index=True)

In [23]:
names

Unnamed: 0,name,sex,births,year
0,Emma,F,20886,2014
1,Olivia,F,19761,2014
2,Sophia,F,18563,2014
3,Isabella,F,17027,2014
4,Ava,F,15660,2014
5,Mia,F,13484,2014
6,Emily,F,12622,2014
7,Abigail,F,12049,2014
8,Madison,F,10291,2014
9,Charlotte,F,10094,2014


In [25]:
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc=sum)

In [26]:
total_births.tail()

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,1773440,1914629
2011,1754424,1894262
2012,1755254,1890889
2013,1747544,1883945
2014,3554484,3821752


In [27]:
def add_prop(group):
    births = group.births.astype(float)
    
    group['prop'] = births / births.sum()
    return group

In [28]:
names = names.groupby(['year', 'sex']).apply(add_prop)

In [29]:
names

Unnamed: 0,name,sex,births,year,prop
0,Emma,F,20886,2014,0.005876
1,Olivia,F,19761,2014,0.005559
2,Sophia,F,18563,2014,0.005222
3,Isabella,F,17027,2014,0.004790
4,Ava,F,15660,2014,0.004406
5,Mia,F,13484,2014,0.003794
6,Emily,F,12622,2014,0.003551
7,Abigail,F,12049,2014,0.003390
8,Madison,F,10291,2014,0.002895
9,Charlotte,F,10094,2014,0.002840


In [31]:
import numpy as np

In [32]:
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

True

In [36]:
def get_top1000(group):
    return group.sort_values(by='births', ascending=False)[:1000]

In [37]:
grouped = names.groupby(['year', 'sex'])

In [38]:
top1000 = grouped.apply(get_top1000)

In [39]:
top1000

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name,sex,births,year,prop
year,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1880,F,33176,Mary,F,7065,1880,0.077644
1880,F,33177,Anna,F,2604,1880,0.028618
1880,F,33178,Emma,F,2003,1880,0.022013
1880,F,33179,Elizabeth,F,1939,1880,0.021310
1880,F,33180,Minnie,F,1746,1880,0.019189
1880,F,33181,Margaret,F,1578,1880,0.017342
1880,F,33182,Ida,F,1472,1880,0.016177
1880,F,33183,Alice,F,1414,1880,0.015540
1880,F,33184,Bertha,F,1320,1880,0.014507
1880,F,33185,Sarah,F,1288,1880,0.014155


In [40]:
 boys = top1000[top1000.sex == 'M']

In [41]:
 girls = top1000[top1000.sex == 'F']

In [42]:
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc=sum)

In [43]:
total_births

name,Aaden,Aaliyah,Aarav,Aaron,Aarush,Ab,Abagail,Abb,Abbey,Abbie,...,Zoa,Zoe,Zoey,Zoie,Zola,Zollie,Zona,Zora,Zula,Zuri
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1880,,,,102.0,,,,,,71.0,...,8.0,23.0,,,7.0,,8.0,28.0,27.0,
1881,,,,94.0,,,,,,81.0,...,,22.0,,,10.0,,9.0,21.0,27.0,
1882,,,,85.0,,,,,,80.0,...,8.0,25.0,,,9.0,,17.0,32.0,21.0,
1883,,,,105.0,,,,,,79.0,...,,23.0,,,10.0,,11.0,35.0,25.0,
1884,,,,97.0,,,,,,98.0,...,13.0,31.0,,,14.0,6.0,8.0,58.0,27.0,
1885,,,,88.0,,6.0,,,,88.0,...,6.0,27.0,,,12.0,6.0,14.0,48.0,38.0,
1886,,,,86.0,,,,,,84.0,...,13.0,25.0,,,8.0,,20.0,52.0,43.0,
1887,,,,78.0,,,,,,104.0,...,9.0,34.0,,,23.0,,28.0,46.0,33.0,
1888,,,,90.0,,,,,,137.0,...,11.0,42.0,,,23.0,7.0,30.0,42.0,45.0,
1889,,,,85.0,,,,,,107.0,...,14.0,29.0,,,22.0,,29.0,53.0,55.0,


In [48]:
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt

In [54]:
subset = total_births[['Victor', 'Harry', 'Mary', 'Marilyn']]

In [52]:
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number of births per year")

<IPython.core.display.Javascript object>

array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f08dacdf110>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f08aa4d3790>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f08aa4a9690>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f08aa3cd210>], dtype=object)

In [55]:
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc=sum)

In [57]:
table.plot(title='Sum of table1000.prop by year and sex', yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f08aa1cef50>

In [58]:
df = boys[boys.year == 2010]

In [59]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name,sex,births,year,prop
year,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010,M,1710685,Jacob,M,22095,2010,0.011540
2010,M,1710686,Ethan,M,17993,2010,0.009398
2010,M,1710687,Michael,M,17324,2010,0.009048
2010,M,1710688,Jayden,M,17158,2010,0.008962
2010,M,1710689,William,M,17036,2010,0.008898
2010,M,1710690,Alexander,M,16746,2010,0.008746
2010,M,1710691,Noah,M,16438,2010,0.008585
2010,M,1710692,Daniel,M,15819,2010,0.008262
2010,M,1710693,Aiden,M,15520,2010,0.008106
2010,M,1710694,Anthony,M,15475,2010,0.008083


In [60]:
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()

In [61]:
prop_cumsum[:10]

year  sex         
2010  M    1710685    0.011540
           1710686    0.020938
           1710687    0.029986
           1710688    0.038947
           1710689    0.047845
           1710690    0.056592
           1710691    0.065177
           1710692    0.073439
           1710693    0.081545
           1710694    0.089628
Name: prop, dtype: float64

In [62]:
df = boys[boys.year == 1900]

In [64]:
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()

In [65]:
in1900.searchsorted(0.5) + 1

array([25])

In [68]:
def get_quantile_count(group, q=0.5):
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().searchsorted(q) + 1

In [69]:
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)

In [70]:
diversity = diversity.unstack('sex')

In [74]:
diversity

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,[38],[14]
1881,[38],[14]
1882,[38],[15]
1883,[39],[15]
1884,[39],[16]
1885,[40],[16]
1886,[41],[16]
1887,[41],[17]
1888,[42],[17]
1889,[43],[18]


In [80]:
diversity[['F', 'M']] = diversity[['F', 'M']].astype(int)
diversity.dtypes

diversity.plot(title = 'Number of popular names in top 50%')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f08aa19a950>

## Revolution "the last letter"

In [81]:
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc=sum)

In [83]:
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')

In [84]:
subtable.head()

sex,F,F,F,M,M,M
year,1910,1960,2010,1910,1960,2010
last_letter,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,108398.0,691271.0,676151.0,977.0,5212.0,28837.0
b,,694.0,454.0,411.0,3912.0,39229.0
c,5.0,49.0,954.0,482.0,15463.0,23325.0
d,6751.0,3728.0,2635.0,22113.0,262168.0,44780.0
e,133601.0,435056.0,316425.0,28665.0,178817.0,130130.0


In [85]:
subtable.sum()

sex  year
F    1910     396503.0
     1960    2022121.0
     2010    1773440.0
M    1910     194216.0
     1960    2132756.0
     2010    1914629.0
dtype: float64

In [86]:
letter_prop = subtable / subtable.sum().astype(float)

In [87]:
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f08a9e0a110>

In [88]:
letter_prop = table / table.sum().astype(float)

In [89]:
dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T

In [90]:
dny_ts.head()

last_letter,d,n,y
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1880,0.083057,0.153217,0.075763
1881,0.083242,0.153212,0.077455
1882,0.085332,0.149561,0.077538
1883,0.084051,0.151653,0.079148
1884,0.086121,0.149926,0.080407


In [91]:
dny_ts.plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f08a996ba50>

## Men's names have become women and vice versa

In [92]:
all_names = top1000.name.unique()

In [93]:
mask = np.array(['lesl' in x.lower() for x in all_names])

In [94]:
lesley_like = all_names[mask]

In [97]:
np.array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)

array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)

In [98]:
filtered = top1000[top1000.name.isin(lesley_like)]

In [100]:
filtered.groupby('name').births.sum()

name
Leslee       993
Lesley     35032
Lesli        929
Leslie    376158
Lesly      11155
Name: births, dtype: int64

In [101]:
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')

In [102]:
table = table.div(table.sum(1), axis=0)

In [103]:
table.tail()

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,1.0,
2011,1.0,
2012,1.0,
2013,1.0,
2014,1.0,


In [104]:
table.plot(style={'M': 'k-', 'F': 'k--'})

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f08a9845790>