In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np

## Function for Reading Files

In [2]:
def get_data(year, data_dir = 'datasets/babynames'):
    """Load one year of baby-name records into a DataFrame.

    Parameters
    ----------
    year : int or str
        Year to load; the file read is ``{data_dir}/yob{year}.txt``.
    data_dir : str, optional
        Directory holding the ``yob<year>.txt`` files. Defaults to the
        location the notebook originally hard-coded.

    Returns
    -------
    pd.DataFrame
        Columns ``Name``, ``Sex`` and ``BirthCount`` as read from the file,
        plus a ``Year`` column set to ``year`` on every row.
    """
    path = f'{data_dir}/yob{year}.txt'
    # Column names are supplied explicitly, so the first line of the file is
    # read as data rather than as a header.
    data = pd.read_csv(path, names = ['Name', 'Sex', 'BirthCount'])
    data['Year'] = year
    return data

## Joining Files Together

In [6]:
min_year = 1880
max_year = 2011  # exclusive bound for range(): the last year loaded is max_year - 1

# Collect the per-year frames and concatenate once at the end. Calling
# pd.concat inside the loop copies the whole accumulated frame on every
# iteration, which is quadratic in the number of rows.
frames = []
for year in range(min_year, max_year):
    frame = get_data(year)
    frames.append(frame)

names = pd.concat(frames, ignore_index = True)

In [7]:
# Display the combined frame (one row per Name/Sex/Year record).
names

Unnamed: 0,Name,Sex,BirthCount,Year
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880
...,...,...,...,...
1690779,Zymaire,M,5,2010
1690780,Zyonne,M,5,2010
1690781,Zyquarius,M,5,2010
1690782,Zyran,M,5,2010


In [None]:
# Rows whose Name is exactly 'Mary'.
# NOTE(review): the saved output under this cell lists 'Zyquarius' rows, so it
# looks stale relative to this filter — re-run the cell to refresh it.
names[names['Name'] == 'Mary']

Unnamed: 0,Name,Sex,BirthCount,Year
1513354,Zyquarius,M,9,2005
1587335,Zyquarius,M,5,2007
1622340,Zyquarius,M,5,2008
1656941,Zyquarius,M,5,2009
1690781,Zyquarius,M,5,2010


## Grouping Data By Feature

In [None]:
# Bucket the records by Sex; as_index=False keeps Sex as a regular column in aggregated results.
group_s = names.groupby(by = 'Sex', as_index = False)

In [None]:
# Report the number of Sex groups, followed by the row count of each group.
print(
    f'Dataset Contains {len(group_s.size())} unique Sex groups: \n\n',
    group_s.size(),
)

## Plotting Births Over Time

In [None]:
# Total BirthCount per Year, one column per Sex. The string 'sum' selects
# pandas' own aggregation; passing the builtin `sum` is deprecated in recent
# pandas releases and emits a FutureWarning.
total_births = names.pivot_table('BirthCount', index='Year', columns='Sex', aggfunc='sum')

In [None]:
# Plot every column of total_births against its index (Year), one line per Sex column.
total_births.plot(title = 'Total Births by Sex and Year')

## Most Common Names

In [None]:
def add_proportion(group):
    """Return a copy of ``group`` with a ``Proportion`` column added.

    ``Proportion`` is each row's ``BirthCount`` divided by the sum of
    ``BirthCount`` over all rows of ``group``.

    Parameters
    ----------
    group : pd.DataFrame
        Frame with a numeric ``BirthCount`` column (typically one group
        handed over by ``groupby(...).apply``).

    Returns
    -------
    pd.DataFrame
        New frame with the same rows and index as ``group`` plus the
        ``Proportion`` column; ``group`` itself is left unmodified.
    """
    total = group['BirthCount'].sum()
    # assign() builds a new frame instead of writing into the object that
    # groupby.apply passed in, which avoids mutating the caller's data.
    return group.assign(Proportion = group['BirthCount'] / total)

In [None]:
# Group by (Year, Sex); group_keys=False keeps the group keys out of the index of apply() results.
g = names.groupby(by = ['Year', 'Sex'], group_keys = False)
g.size()

In [None]:
df = g.apply(add_proportion)
# idxmax() returns an index *label*, so the row must be fetched with .loc.
# Positional .iloc only returns the right row when labels happen to equal positions.
male_proportion = df.loc[df['Sex'] == 'M', 'Proportion']
df.loc[male_proportion.idxmax()]

## Sorting Values

In [None]:
def get_top(group, results = 1000):
    """Return the rows of ``group`` with the largest ``BirthCount``.

    Parameters
    ----------
    group : pd.DataFrame
        Frame with a ``BirthCount`` column.
    results : int, optional
        Maximum number of rows to return (default 1000).

    Returns
    -------
    pd.DataFrame
        Up to ``results`` rows sorted by ``BirthCount`` in descending order.
        If ``group`` has fewer than ``results`` rows, all of them are
        returned. A non-positive ``results`` yields an empty frame with the
        same columns.
    """
    if results <= 0:
        # "Top 0" is nothing; return an empty frame rather than None so
        # callers always get a DataFrame back.
        return group.iloc[:0]
    # head() caps at the available rows, so small groups are returned whole
    # instead of being dropped from a groupby.apply result.
    return group.sort_values('BirthCount', ascending = False).head(results)

In [None]:
# Split the records by (Year, Sex), then keep each group's highest-count rows
# using get_top with its default limit.
grouped = names.groupby(by = ['Year', 'Sex'])
top = grouped.apply(get_top)

In [None]:
# Display the result of the grouped apply (presumably still indexed by the group keys at this point — confirm).
top

In [None]:
# Discard the whole index left by the grouped apply (presumably the Year/Sex
# group keys plus the original row labels — confirm) and renumber rows from 0.
# Later cells read Year and Sex as ordinary columns, so they are assumed to
# still be present as columns after this step.
top = top.reset_index(drop=True)
top

## Shuffling and Sampling

In [None]:
# Use a seeded generator so the shuffle, and the sample taken from it, is the
# same on every Restart & Run All.
rng = np.random.default_rng(42)
shuffle_index = rng.permutation(top.index.to_numpy())
shuffle_index

In [None]:
# Reorder the rows to follow the shuffled labels, then keep the first 1000 of them.
shuffled_df = top.reindex(index = shuffle_index)
sample = shuffled_df.iloc[:1000]

In [None]:
# Display the first 1000 rows of the shuffled frame.
sample

In [None]:
# Rows of the sample whose Sex is 'M'.
males = sample.loc[sample['Sex'].eq('M')]
males

In [None]:
# Rows of the sample whose Sex is 'F'.
females = sample.loc[sample['Sex'].eq('F')]
females

In [None]:
# Total BirthCount per Year with one column per Name, built from `top`.
# Note that this rebinds `total_births`, which earlier held the per-Sex pivot.
# The string 'sum' is used because passing the builtin `sum` is deprecated in
# recent pandas releases.
total_births = top.pivot_table('BirthCount', index='Year', columns='Name', aggfunc='sum')

total_births

In [None]:
# Pick four name columns and draw each one in its own subplot.
# NOTE(review): 'Marylin' may be intended as 'Marilyn' — confirm; selecting a
# name that is not a column of total_births raises KeyError.
subset = total_births[['John', 'Harry', 'Mary', 'Marylin']]
subset.plot(subplots = True, figsize = (12,10), ylabel='BirthCount', xlabel='Year')

## Unisex Names Over Time

In [None]:
# Distinct names appearing in `top`, wrapped in a Series so .str methods are available.
all_names = pd.Series(pd.unique(top['Name']))
all_names

In [None]:
# Names containing the substring 'Lesl'.
lesley_like = all_names.loc[all_names.str.contains('Lesl')]
lesley_like

In [None]:
# Keep only the rows of `top` whose Name is one of the 'Lesl' names.
filtered = top.loc[top['Name'].isin(lesley_like)]
filtered

In [None]:
# Total BirthCount for each of the selected names.
filtered.groupby(by = 'Name')['BirthCount'].sum()

In [None]:
# Sum BirthCount of the selected names per Year, one column per Sex.
table = filtered.pivot_table(
    values = 'BirthCount',
    index = 'Year',
    columns = 'Sex',
    aggfunc = 'sum',
)

table

In [None]:
# Divide every row by its own total so each year's values become shares of that year.
row_totals = table.sum(axis = 1)
table = table.div(row_totals, axis = 0)

table

In [None]:
# Plot both columns with per-column line styles: 'M' as a solid black line, 'F' as a dashed black line.
table.plot(style={'M': 'k-', 'F': 'k--'})