# Birth analysis from split annotations 
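- Given a dataset of birth counts
by name, sex, and year, keeps the 1,000 most popular
names per year and sex, then computes the fraction of
births for names containing “lesl” (case-insensitive; the
code matches anywhere in the name, not only at the start), grouped
by gender and year-of-birth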
- Code: https://github.com/weld-project/split-annotations/blob/master/python/benchmarks/birth_analysis/birth_analysis.py
- Data: https://github.com/weld-project/split-annotations/blame/master/python/benchmarks/datasets/birth_analysis/babynames.txt.gz

## Notes: 
- This is the original Python script; it is organized as functions called from `main` rather than as top-to-bottom notebook cells.

In [1]:
import argparse
import pandas as pd
import sys
import time

In [4]:

import argparse
import pandas as pd
import sys
import time

def get_top1000(group):
    """Return the rows of ``group`` with the 1000 largest ``births`` values.

    The result is ordered by ``births`` descending. A group with fewer than
    1000 rows is returned whole, in that same order.
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.head(1000)

def analyze(top1000):
    """Compute the per-year share of births by sex for "lesl"-like names.

    Prints the elapsed wall-clock time of the analysis to stdout.

    Args:
        top1000: DataFrame with ``name``, ``year``, ``sex`` and ``births``
            columns.

    Returns:
        DataFrame indexed by ``year`` with one column per ``sex`` value. Each
        cell is that sex's fraction of the year's births among names that
        contain "lesl" (case-insensitive).
    """
    start1 = time.time()
    name_column = top1000['name']
    all_names = pd.Series(name_column.unique())
    # na=False: a missing name would otherwise leave NaN in the mask, and
    # indexing with a mask that contains NaN raises.
    is_lesley_like = all_names.str.lower().str.contains('lesl', na=False)
    lesley_like = all_names[is_lesley_like]
    filtered = top1000[name_column.isin(lesley_like)]
    table = filtered.pivot_table('births', index='year',
                                 columns='sex', aggfunc='sum')

    # Divide each row by its own total so the shares for a year sum to 1.
    table = table.div(table.sum(axis=1), axis=0)
    end1 = time.time()
    print("Analysis:", end1 - start1)
    return table

def run(filename):
    """Load the baby-names CSV, keep the top 1000 names per group, and analyze.

    Progress messages and per-stage wall-clock timings are printed to stdout.
    The read itself is not included in the reported "Total time".

    Args:
        filename: Path of a CSV without a header row whose columns are, in
            order, year, sex, name, births.

    Returns:
        The table returned by ``analyze`` for the selected rows.
    """
    columns = ['year', 'sex', 'name', 'births']

    # No trailing newline here, so "done." lands on the same output line.
    sys.stdout.write("Reading data...")
    sys.stdout.flush()
    names = pd.read_csv(filename, names=columns)
    print("done.")
    sys.stdout.flush()

    print("Size of names:", len(names))

    e2e_start = time.time()

    # Time preprocessing step
    start0 = time.time()
    grouped = names.groupby(['year', 'sex'])
    end0 = time.time()
    print("GroupBy:", end0 - start0)
    # The apply stage is timed from the moment the groupby stage ended.
    start0 = end0

    top1000 = grouped.apply(get_top1000)
    # Replace the index produced by the grouped apply with a fresh 0..n-1 one.
    top1000.reset_index(inplace=True, drop=True)

    end0 = time.time()
    print("Apply:", end0-start0)
    print("Elements in top1000:", len(top1000))

    result = analyze(top1000)

    e2e_end = time.time()
    print("Total time:", e2e_end - e2e_start)

    print(top1000['births'].sum())

    # Hand the table back instead of discarding it; callers that ignore the
    # return value are unaffected.
    return result

def main(filename: str):
    """Print the input path to stdout, then run the benchmark on that file."""
    print(f"File: {filename}")
    run(filename)


# Run the script-style benchmark on the local copy of the dataset.
main(filename='./data/babynames.txt')

File: ./data/babynames.txt
done.ng data...
Size of names: 1792091
GroupBy: 0.0035479068756103516
Apply: 0.4545860290527344
Elements in top1000: 267877
Analysis: 0.04907989501953125
Total time: 0.5072929859161377
304919459


  top1000 = grouped.apply(get_top1000)


## Notes: 
- Refactored into a more notebook-like style.
- The pipeline is groupby -> sort -> filter (by name and uniqueness; more like a merge?) -> sum, so it may be effective to organize the sort after the groupby.

In [2]:
# Column labels, in file order, applied to the CSV when it is read.
columns = [
    'year',
    'sex',
    'name',
    'births',
]

# NOTE(review): `pieces` and `years` are not referenced by any cell visible
# here; presumably leftovers from the original script. Confirm before removing.
pieces = []
years = range(1880, 2011)

In [3]:
# Path of the baby-names CSV read by this notebook.
filename = './data/babynames.txt'
print(f"File: {filename}")

# The file has no header row of its own; label the columns with `columns`.
print("Reading data...")
names = pd.read_csv(filename, header=None, names=columns)
print("done.")

File: ./data/babynames.txt
Reading data...
done.


In [4]:
# Start the end-to-end clock, then time the groupby stage on its own.
e2e_start = time.time()
start0 = time.time()
# Group the rows by year and sex.
grouped = names.groupby(by=['year', 'sex'])
end0 = time.time()
print(f"GroupBy: {end0 - start0}")

GroupBy: 0.002891063690185547


In [5]:
# Restart the stage clock inside this cell. Reusing `end0` from the GroupBy
# cell would count the pause between running the two cells as "Apply" time.
start0 = time.time()

# Within each (year, sex) group, sort by births descending and keep the first
# 1000 rows.
top1000 = grouped.apply(lambda group: group.sort_values(by='births', ascending=False)[0:1000])
# Replace the index produced by the grouped apply with a fresh 0..n-1 one.
top1000.reset_index(inplace=True, drop=True)

end0 = time.time()
print("Apply:", end0-start0)
print("Elements in top1000:", len(top1000))

Apply: 2.201852798461914
Elements in top1000: 267877


  top1000 = grouped.apply(lambda group: group.sort_values(by='births', ascending=False)[0:1000])


In [6]:
start1 = time.time()
all_names = pd.Series(top1000.name.unique()) # find all unique names
# Keep the names containing "lesl", ignoring case. na=False keeps the mask
# strictly boolean when a name is missing; a NaN in the mask would make the
# indexing raise.
lesley_like = all_names[all_names.str.lower().str.contains('lesl', na=False)]
filtered = top1000[top1000.name.isin(lesley_like)] # keep only rows for those names
table = filtered.pivot_table('births', index='year',
                             columns='sex', aggfunc='sum') # births summed by year and sex

table = table.div(table.sum(axis=1), axis=0) # Normalize: divide each row by its total births
end1 = time.time()
result = table
print("Analysis:", end1 - start1)

Analysis: 0.05785799026489258


In [7]:
# `e2e_start` was taken in the GroupBy cell, so this total also covers any
# time that passed between running the cells in between.
e2e_end = time.time()
print(f"Total time: {e2e_end - e2e_start}")

# Total births across all rows kept in `top1000`.
print(top1000['births'].sum())

Total time: 5.948984146118164
304919459
