In [1]:
import pandas as pd
# Notebook-wide display settings: never truncate rows or columns when a frame
# is rendered, and centre the column headers.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.colheader_justify', 'center',
)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw tab-separated birthday table.
df = pd.read_csv('birthday_challenge.txt', sep='\t')

In [3]:
# Preview the first five rows (five is the pandas default, spelled out here).
df.head(n=5)

Unnamed: 0,date of birth,number of people
0,1900_01_01,8
1,1900_01_02,1
2,1900_01_04,1
3,1900_01_06,1
4,1900_01_10,1


In [4]:
# Preview the last five rows (five is the pandas default, spelled out here).
df.tail(n=5)

Unnamed: 0,date of birth,number of people
29809,1999_11_17,1
29810,1999_11_22,1
29811,1999_11_28,1
29812,1999_12_02,1
29813,1999_12_20,1


In [18]:
# Summary statistics of the numeric columns, with the default quartiles
# written out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,number of people
count,29814.0
mean,69.355974
std,64.154373
min,1.0
25%,14.0
50%,47.0
75%,120.0
max,541.0


In [5]:
# Rank the rows by 'number of people', largest first, and show the top 100.
(
    df
    .sort_values(by='number of people', ascending=False)
    .head(100)
)

Unnamed: 0,date of birth,number of people
23579,1968_01_01,541
22849,1966_01_01,533
22484,1965_01_01,533
22118,1964_01_01,527
24310,1970_01_01,508
20657,1960_01_01,505
23214,1967_01_01,479
23005,1966_06_06,452
23945,1969_01_01,451
21023,1961_01_01,445


In [6]:
# Tally the people whose birth month and birth day share the same two digits
# (01_01, 02_02, ... 12_12) and report their share of everyone in `df`.
month_nums = [f'{m:02d}' for m in range(1, 13)]

# Dates look like YYYY_MM_DD, so characters 5-6 hold the month and 8-9 the day.
# Both slices are the same for every iteration, so compute them once up front.
birth_month = df['date of birth'].str[5:7]
birth_day = df['date of birth'].str[8:10]

total = 0
for mm in month_nums:
    same_digits = (birth_month == mm) & (birth_day == mm)
    res = df.loc[same_digits, 'number of people'].sum()
    total += res
    print(f'Month & day is {mm}: {res}')

share = total / df['number of people'].sum()
print('Percentage of people that fit pattern: ' + '{:.2%}'.format(share))

Month & day is 01: 19951
Month & day is 02: 10436
Month & day is 03: 9986
Month & day is 04: 9145
Month & day is 05: 8672
Month & day is 06: 11061
Month & day is 07: 7668
Month & day is 08: 7813
Month & day is 09: 7550
Month & day is 10: 11235
Month & day is 11: 7376
Month & day is 12: 7944
Percentage of people that fit pattern: 5.75%


In [7]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df2 = df.copy(deep=True)

In [8]:
def remove_year(dob: str) -> str:
    """Return ``dob`` without its first five characters.

    For the ``YYYY_MM_DD`` strings used in this notebook that removes the
    ``YYYY_`` prefix and leaves ``MM_DD``. Inputs shorter than five characters
    yield an empty string.
    """
    year_prefix_len = len('YYYY_')  # four year digits plus the underscore
    return dob[year_prefix_len:]

In [9]:
# Build the MM_DD key from the untouched `df` column rather than from `df2`
# itself. Reading and overwriting the same `df2` column made this cell
# non-idempotent: a second run would slice five more characters off the
# already-stripped values. `df2` is a copy of `df`, so the two share an index
# and the assignment lines up row for row.
df2['date of birth'] = df['date of birth'].apply(remove_year)

In [10]:
# Summarise how many rows share each date key in df2. value_counts() counts
# rows; it does not add up 'number of people'.
(
    df2['date of birth']
    .value_counts()
    .describe()
)

count    366.000000
mean      81.459016
std        3.672514
min       21.000000
25%       80.000000
50%       81.000000
75%       83.000000
max       92.000000
Name: date of birth, dtype: float64

In [11]:
# The five date keys in df2 that occur in the most rows.
(
    df2['date of birth']
    .value_counts()
    .head(n=5)
)

01_01    92
07_18    87
10_15    87
06_06    86
02_15    86
Name: date of birth, dtype: int64

In [12]:
# Same row count on the full dates in the raw frame: any value above 1 means
# that date appears in more than one row.
(
    df['date of birth']
    .value_counts()
    .head(n=5)
)

1903_01_21    2
1961_07_11    1
1958_11_18    1
1960_10_31    1
1972_05_13    1
Name: date of birth, dtype: int64

In [13]:
# Load the comma-separated cleaned dataset and preview its first five rows.
# NOTE(review): nothing visible in this notebook writes 'clean_data.csv' - confirm where it comes from.
clean_df = pd.read_csv('clean_data.csv', sep=',')
clean_df.head(n=5)

Unnamed: 0,date_of_birth,yearofbirth,monthofbirth,dayofbirth,number_of_people
0,1900_01_01,1900,1,1,8
1,1900_01_02,1900,1,2,1
2,1900_01_04,1900,1,4,1
3,1900_01_06,1900,1,6,1
4,1900_01_10,1900,1,10,1


In [14]:
# Show the dtype pandas assigned to each column of the cleaned frame.
clean_df.dtypes

date_of_birth       object
yearofbirth          int64
monthofbirth         int64
dayofbirth           int64
number_of_people     int64
dtype: object

In [16]:
# Non-null entries per column for every birth year, ordered by the
# 'number_of_people' count (largest first) with ties broken by earlier year.
display(
    clean_df
    .groupby('yearofbirth')
    .count()
    .sort_values(
        by=['number_of_people', 'yearofbirth'],
        ascending=[False, True],
    )
)

Unnamed: 0_level_0,date_of_birth,monthofbirth,dayofbirth,number_of_people
yearofbirth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1912,366,366,366,366
1916,366,366,366,366
1920,366,366,366,366
1924,366,366,366,366
1928,366,366,366,366
1932,366,366,366,366
1936,366,366,366,366
1940,366,366,366,366
1944,366,366,366,366
1948,366,366,366,366
