## Import

In [2]:
import pandas as pd
import numpy as np

## Load the dataset

We will use the dataset about babies names in France in the period 1900 to 2017

In [3]:
# Load the new dataset. Its fields are separated by tabs.
# We ask pandas to interpret the columns 'annais' and 'dpt' as strings to avoid error with missing
# values
df = pd.read_csv('./data/prenoms-fr-1900-2017.tsv.gz', sep='\t', dtype={'annais':str, 'dpt':str})
df.shape

(3573026, 5)

## Clean the dataset

In [4]:
# Rename some columns to use more meaningful names
df = df.rename(columns={
    'sexe':      'sex',
    'preusuel':  'name',
    'annais':    'year',
    'dpt':       'department',
    'nombre':    'count'})

# Drop rows with missing 'department', 'year' and special 'name'
df.drop(df[df['department'] == 'XX'].index, inplace=True)
df.drop(df[df['year'] == 'XXXX'].index, inplace=True)
df.drop(df[df['name'] == '_PRENOMS_RARES'].index, inplace=True)

# Convert columns 'department' and 'year' to numeric values
df['department'] = pd.to_numeric(df['department'])
df['year']       = pd.to_numeric(df['year'])

## Display a small random sample of the dataset

In [5]:
df.sample(10)

Unnamed: 0,sex,name,year,department,count
562853,1,GASTON,1918,23,17
2610487,2,JUSTINE,1998,86,32
2088537,2,COLETTE,1928,76,88
1187291,1,NESSIM,2013,68,3
1265698,1,PHILIPPE,1909,83,4
2082215,2,CLÉMENCE,2005,77,27
1714409,2,ALINE,1921,50,21
376134,1,DÉSIRÉ,1906,75,43
1187470,1,NESTOR,1929,972,5
80434,1,ALIX,2013,1,3


In [6]:
# In this dataset, the sex is represented as 1 for males and 2 for females
# Define some convenient constants
MALE, FEMALE = 1, 2

# Create two views of the dataframe: one for boys and one for girls
boys = df[df['sex'] == MALE]
girls = df[df['sex'] == FEMALE]

## Exercise 1:

Determine the top 5 names separately for boys and for girls born since 2010:

In [134]:
# Select the individuals born since the year of interst
year = 2017
boys_recent  = boys[boys['year'] >= year]
girls_recent = girls[girls['year'] >= year]

# Create groups by 'name' and sum the number of individuals with each
# name for each year
boys_top_names  = boys_recent.groupby(['name'])['count'].sum()
girls_top_names = girls_recent.groupby(['name'])['count'].sum()

In [8]:
boys_top_names.nlargest(5)

name
GABRIEL    5428
LOUIS      4406
RAPHAËL    4184
JULES      4179
ADAM       4156
Name: count, dtype: int64

In [9]:
girls_top_names.nlargest(5)

name
EMMA      4809
LOUISE    4037
JADE      4024
ALICE     3381
CHLOÉ     3148
Name: count, dtype: int64

## Excercise 2:

**Question 1:** determine the year when the largest number of girls named `'MARIE'` were born:

In [135]:
# Groupe the 'MARIE' par year, sum their count for all the departments
maries_per_year = girls[girls['name'] == "MARIE"].groupby(['year'])['count'].sum()

# Get the maximum value and the position of that maximum in the series
count_maries = maries_per_year.max()
year = maries_per_year.idxmax()

print(f'The largest number of girls named Marie was {count_maries:,} in {year}')

The largest number of girls named Marie was 52,167 in 1901


**Question 2**: what percentage of all the girls born that same year were named `'MARIE'`?

In [122]:
# Count the total number of girls born in the year computed in the previous question
total_girls = girls[girls['year'] == year]['count'].sum()

print(f'{count_maries*100/total_girls:.0f}% of the girls born in {year} were named MARIE')

21% of the girls born in 1901 were named MARIE


## Exercise:

Determine the most popular name for boys and for girls for the whole period included in the dataset

In [131]:
boys_max  = boys.groupby(['name'])['count'].sum().idxmax()
girls_max = girls.groupby(['name'])['count'].sum().idxmax()

print(f'The most popular names over the period 1900-2017 were {boys_max} and {girls_max}')

The most popular names over the period 1900-2017 were JEAN and MARIE


## Exercise

Among the girls born in 1970, were there more named `'ISABELLE'` than `'BRIGITTE'` ?

In [137]:
# Select the girls born in 1970
girls_1970 = girls[girls['year'] == 1970]

# Group those girls by their given name, and sum their counts for ALL the departments
girls_per_name_1970 = girls_1970.groupby(['name'])['count'].sum()

# Select the rows for ISABELLE and BRIGITTE and compare their values
isabelles_1970 = girls_per_name_1970.loc['ISABELLE']
brigittes_1970 = girls_per_name_1970.loc['BRIGITTE']

print(isabelles_1970 > brigittes_1970)

True


## Exercise:

Determine the position of your given name in the ranking of the all the names (boys and girls) the year you were born:

In [89]:
my_year = 2000
my_name = 'SÉBASTIEN'

xx = df[df['name'] == my_name].groupby(['year'])

xx.loc[my_year, :] # ['count'].sum()

AttributeError: Cannot access callable attribute 'loc' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [60]:
sebs_per_year = boys[boys['name'] == "SÉBASTIEN"].groupby(['year'])['count'].sum()
count_sebs = sebs_per_year.max()
year = sebs_per_year.idxmax()


total_boys = boys[boys['year'] == year]['count'].sum()
print(year, count_sebs, total_boys,  (count_sebs*100/total_boys) )

1977 19469 353222 5.511831086398922


In [167]:
boys[boys['year'] == 1978].head()

Unnamed: 0,sex,name,year,department,count
1561,1,ABDALLAH,1978,38,3
1562,1,ABDALLAH,1978,59,3
1563,1,ABDALLAH,1978,69,4
1564,1,ABDALLAH,1978,92,5
2201,1,ABDEL,1978,13,3


In [193]:
osmans       = boys['name'] == "OSMAN"
born_in_1978 = boys['year'] == 1978
dept_42      = boys['department'] == 42

boys[osmans & dept_42]

Unnamed: 0,sex,name,year,department,count
1231324,1,OSMAN,1981,42,4
1231326,1,OSMAN,1982,42,3
1231332,1,OSMAN,1985,42,3
1231358,1,OSMAN,1995,42,4
1231363,1,OSMAN,1997,42,5
1231377,1,OSMAN,2002,42,3
1231385,1,OSMAN,2004,42,5
1231389,1,OSMAN,2005,42,3
1231428,1,OSMAN,2016,42,5


## Excercise 3:

Determine the most popular name the year you were born, for each sex:

In [169]:
year = 2003

In [170]:
boys_born_my_year = boys[boys['year'] == year]
boys_born_my_year.groupby(['name'])['count'].sum().nlargest(5)

name
LUCAS     8291
THÉO      7857
HUGO      7388
THOMAS    6856
ENZO      6747
Name: count, dtype: int64

In [171]:
girls_born_my_year = girls[girls['year'] == year]
girls_born_my_year.groupby(['name'])['count'].sum().nlargest(5)

name
LÉA        8987
MANON      6925
EMMA       6232
CHLOÉ      5801
CAMILLE    5364
Name: count, dtype: int64

In [189]:
girls_born_my_year[girls_born_my_year['name'] == 'MANUELA']['count'].sum()

49

In [10]:
girls.sample(10)

Unnamed: 0,sex,name,year,department,count
3399162,2,SOLANGE,1904,70,3
2968115,2,MARIE-THÉRÈSE,1936,17,7
2875676,2,MARGO,1995,38,5
2926027,2,MARIE-CLAUDE,1969,69,12
2174947,2,DOROTHEE,1915,972,10
3262373,2,PRISCILLIA,1989,77,9
3001991,2,MARTHE,1942,30,5
2402122,2,GERTRUDE,1926,13,3
2821229,2,LÉONTINE,1906,58,18
2900031,2,MARIE,1928,10,41
