# Importer les datasets

In [1]:
import pandas as pd

In [2]:
# data_sample = pd.read_csv("dpt2018.csv",
#                           sep=";",
#                           na_values=['XXXX', 'XX'],
#                           nrows=20)

# import data
data = pd.read_csv("dpt2018.csv",
                   sep=";",
                   na_values=['XXXX', 'XX'])

In [134]:
data_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
sexe        20 non-null int64
preusuel    20 non-null object
annais      14 non-null float64
dpt         14 non-null float64
nombre      20 non-null int64
dtypes: float64(2), int64(2), object(1)
memory usage: 928.0+ bytes


In [136]:
data['sexe_txt'] = data.sexe.replace({1:'homme', 2:'femme'})

In [142]:
data

Unnamed: 0,sexe,preusuel,annais,dpt,nombre,sexe_txt
0,1,A,,,27,homme
1,1,AADAM,,,27,homme
2,1,AADEL,,,55,homme
3,1,AADIL,1983.0,84.0,3,homme
4,1,AADIL,1992.0,92.0,3,homme
...,...,...,...,...,...,...
3624989,2,ÉYA,2014.0,69.0,3,femme
3624990,2,ÉYA,,,23,femme
3624991,2,ÏNAYA,,,21,femme
3624992,2,ÖZGE,,,30,femme


# Quelques exercices basics

## Nombre de prénom masculin/féminin par année

In [104]:
data.loc[:,['annais','sexe_txt', 'nombre']].groupby(['annais','sexe_txt']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,nombre
annais,sexe_txt,Unnamed: 2_level_1
1900.0,femme,225100
1900.0,homme,168332
1901.0,femme,245022
1901.0,homme,186710
1902.0,femme,248817
...,...,...
2016.0,homme,309316
2017.0,femme,271703
2017.0,homme,301974
2018.0,femme,266733


## Proportion de prénom masculin/féminin par an

In [6]:
# work with another variable
data_prop = data.loc[:,['annais','sexe_txt', 'nombre']].groupby(['annais','sexe_txt']).sum()

In [106]:
ratio = lambda x:100 * x / float(x.sum())
data_prop = data_prop.groupby(level=0).apply(ratio)

In [8]:
data_prop

Unnamed: 0_level_0,Unnamed: 1_level_0,nombre
annais,sexe_txt,Unnamed: 2_level_1
1900.0,femme,57.214461
1900.0,homme,42.785539
1901.0,femme,56.753264
1901.0,homme,43.246736
1902.0,femme,56.054257
...,...,...
2016.0,homme,52.697790
2017.0,femme,47.361669
2017.0,homme,52.638331
2018.0,femme,47.271377


## Présenter l'évolution des ratios hommes/femmes au cours du temps

In [130]:
data_prop.squeeze().unstack()

sexe_txt,femme,homme
annais,Unnamed: 1_level_1,Unnamed: 2_level_1
1900.0,57.214461,42.785539
1901.0,56.753264,43.246736
1902.0,56.054257,43.945743
1903.0,55.679047,44.320953
1904.0,55.191002,44.808998
...,...,...
2014.0,47.331198,52.668802
2015.0,47.259920,52.740080
2016.0,47.302210,52.697790
2017.0,47.361669,52.638331


In [9]:
data_prop.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f6464688350>

In [121]:
data_prop.squeeze().unstack().plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x7f645e754790>

In [11]:
%matplotlib
import matplotlib.pyplot as plt
import numpy as np

ax = data_prop.unstack().plot(kind='bar')
xticks = ax.xaxis.get_major_ticks()
ax.set_xlabel('Année')

for i,tick in enumerate(xticks):
        if i%10 != 0:
            tick.label1.set_visible(False)
xticks[-1].label1.set_visible(True)
plt.show()

Using matplotlib backend: Qt5Agg


## Présenter l'évolution des ratios hommes/femmes au cours du temps tous les 10 ans

### avec une boucle (très longue)

In [38]:
# VERY LONG

d={}
step = 10
decades = range(1900,2020,step)
for j in decades:
    d[j]=range(j, j+step)

def func(x):
    for decade, years in d.items():
        if x in years:
            return decade
data['decennie'] = data.annais.apply(func)

### avec cut de pandas

In [89]:
cut_bins = list(range(1900, 2021, 10))
cut_labels = cut_bins[:-1]

In [94]:
data['decennie']=pd.cut(data.annais,
                        bins=cut_bins,
                        labels=cut_labels)

In [110]:
data_dec = data.groupby(['decennie','sexe_txt']).sum()['nombre']

In [114]:
data_dec.unstack().plot(kind='bar', stacked=True, width=0.9)

<matplotlib.axes._subplots.AxesSubplot at 0x7f645ed70510>

# Reproduire les résultats du Figaro

## En 1900, combien de prénom et proportion de Marie

In [61]:
import pandas as pd
data = pd.read_csv("dpt2018.csv",
                          sep=";",
                          na_values=['XXXX', 'XX'],
                          #nrows=20
                          )
data['sexe_txt'] = data.sexe.replace({1:'homme', 2:'femme'})

In [77]:
nbr_prenom = len(data[data['annais']==1900].preusuel.unique())
print(f'En 1900, il y a {nbr_prenom} qui ont été recensés hors prénom unique')

prenom_rare = data[(data.annais==1900)].preusuel.value_counts()['_PRENOMS_RARES']
print(f'En considérant les prénoms rares, {nbr_prenom + prenom_rare} qui ont été recensés.')

En 1900, il y a 998 qui ont été recensés hors prénom unique
En considérant les prénoms rares, 1159 qui ont été recensés.


In [5]:
#proportion de Marie
nbr_marie = len(data[(data.annais==1900) & (data.preusuel=='MARIE')])
ratio = 100*nbr_marie/nbr_prenom
print(f'En 1900, il y a {nbr_marie} qui ont été nommées')
print(f"Soit {ratio:.0f} % de Marie.")

En 1900, il y a 169 qui ont été nommées
Soit 17 % de Marie.


In [6]:
# plus performant pour calculer le nombre de Marie de 30x.
data[(data.annais==1900)].preusuel.value_counts()['MARIE']

169

In [74]:
[i for i in data.preusuel.dropna().unique() if 'PRENOM' in i]

['_PRENOMS_RARES']

## En 2017 ? Nombre de prénoms ?

In [7]:
nbr_pren_2017 = data[(data.annais==2017)].preusuel.value_counts().size
print(f'En 2017, il y a {nbr_pren_2017} de prénoms donnés.')

En 2017, il y a 4399 de prénoms donnés.


## Evolution de Marie dans le temps ? Et pic des prénoms ?

In [8]:
marie_by_year = data[(data.preusuel=='MARIE')].loc[:, ['annais', 'preusuel', 'nombre']]
marie_by_year = marie_by_year.groupby('annais').sum()

In [29]:
%matplotlib
ax = marie_by_year.plot()
ax.set_xlabel('Années')
ax.legend(['Nombre de Marie'])

Using matplotlib backend: Qt5Agg


<matplotlib.legend.Legend at 0x7f04ff251110>

In [26]:
marie_by_year.idxmax()

nombre    1901.0
dtype: float64

## Evolution des prénoms composés avec Marie dedans ?

In [213]:
mask = data.preusuel.str.contains('MARIE-|-MARIE', na=False)
data.loc[mask].dropna()

Unnamed: 0,sexe,preusuel,annais,dpt,nombre,sexe_txt
109075,1,ANDRÉ-MARIE,1949.0,59.0,3,homme
109076,1,ANDRÉ-MARIE,1950.0,59.0,3,homme
109077,1,ANDRÉ-MARIE,1954.0,59.0,5,homme
109078,1,ANDRÉ-MARIE,1955.0,62.0,4,homme
109079,1,ANDRÉ-MARIE,1969.0,59.0,3,homme
...,...,...,...,...,...,...
3528619,2,THÉRÈSE-MARIE,1966.0,62.0,5,femme
3528620,2,THÉRÈSE-MARIE,1967.0,62.0,3,femme
3528621,2,THÉRÈSE-MARIE,1968.0,62.0,3,femme
3528622,2,THÉRÈSE-MARIE,1969.0,59.0,3,femme


In [214]:
pren_comp = data.loc[mask]

In [215]:
nbr_pren_comp = pren_comp.groupby('annais').nombre.sum()

In [216]:
nbr_pren_comp.plot(title="Evolution de prénoms composés -Marie-")

<matplotlib.axes._subplots.AxesSubplot at 0x7f049593d110>

In [124]:
print(f"L'année où a lieu le pic de prénoms composés avec Marie est l'année : {nbr_pren_comp.idxmax():.0f}")

L'année où a lieu le pic de prénoms composés avec Marie est l'année : 1949


In [None]:
# data.groupby(['annais','preusuel']).nombre.sum()

## Pic de prénoms recensées

In [136]:
a = data.loc[:, ['annais', 'nombre', 'preusuel']].groupby(['annais','preusuel']).sum()
a.groupby(level=0).sum().plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f049593d110>

## Nombre de prénoms qui représentent 50% de bébés

In [173]:
nbr_prenom_by_year = data.groupby(['annais', 'preusuel']).nombre.sum()

In [212]:
nbr_prenom_by_year.loc[1900].describe()

count      998.000000
mean       394.220441
std       1937.431523
min          3.000000
25%          6.000000
50%         17.000000
75%        113.750000
max      49752.000000
Name: nombre, dtype: float64

In [206]:
nbr_prenom_by_year.loc[(slice(None), '_PRENOMS_RARES')]

annais
1900.0     2934
1901.0     3091
1902.0     3055
1903.0     3024
1904.0     3221
          ...  
2014.0    51682
2015.0    51482
2016.0    52111
2017.0    53190
2018.0    53978
Name: nombre, Length: 119, dtype: int64

In [210]:
nbr_prenom_by_year.drop(level=1, labels="_PRENOMS_RARES").loc[1900].describe()

count      997.000000
mean       391.673019
std       1936.730942
min          3.000000
25%          6.000000
50%         17.000000
75%        113.000000
max      49752.000000
Name: nombre, dtype: float64

In [234]:
nbr_prenom_by_year.to_frame().sort_values('nombre', ascending=True).sort_index(level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,nombre
annais,preusuel,Unnamed: 2_level_1
1900.0,ABEL,382
1900.0,ABRAHAM,9
1900.0,ACHILLE,152
1900.0,ACHILLES,4
1900.0,ADAM,9
...,...,...
2018.0,ÉZIO,4
2018.0,ÉZÉCHIEL,9
2018.0,ÉZÉKIEL,6
2018.0,ÎMRAN,3


In [235]:
nbr_prenom_by_year = nbr_prenom_by_year.copy()

In [292]:
nbr_prenom_by_year

annais  preusuel  
1900.0  MARIE         49752
        JEAN          14100
        JEANNE        13981
        LOUIS          9051
        MARGUERITE     8058
                      ...  
2018.0  AFNANE            3
        AGATA             3
        AELA              3
        ADRIÀN            3
        AERYN             3
Name: nombre, Length: 240554, dtype: int64

In [259]:
nbr_prenom_by_year.sort_values(ascending=False, inplace=True)
nbr_prenom_by_year.sort_index(level=0, sort_remaining=False, inplace=True)

In [270]:
cumPrenomByYear = nbr_prenom_by_year.groupby(level=0)

In [300]:
%timeit dist_prenom_by_year = (cumPrenomByYear.cumsum()/cumPrenomByYear.sum())<=0.50

13.3 ms ± 181 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [298]:
dist_prenom_by_year.groupby(level=0).sum()

annais
1900.0     23.0
1901.0     24.0
1902.0     24.0
1903.0     24.0
1904.0     24.0
          ...  
2014.0    103.0
2015.0    104.0
2016.0    104.0
2017.0    107.0
2018.0    106.0
Name: nombre, Length: 119, dtype: float64

In [301]:
dist_prenom_by_year.groupby(level=0).sum().plot(title="Nombre de prénom minimal pour représenter 50% des bébés")

<matplotlib.axes._subplots.AxesSubplot at 0x7f048093f3d0>

## La part des 10 prénoms les plus utilisés

In [320]:
#drop _PRENOMS_RARES
prenom_by_year_wout_rare = nbr_prenom_by_year.drop(level=1, labels='_PRENOMS_RARES')

In [394]:
prenom_by_year_wout_rare.nlargest(2)

annais  preusuel
1946.0  JEAN        53718
1901.0  MARIE       53176
Name: nombre, dtype: int64

In [391]:
cumPrenomByYear = prenom_by_year_wout_rare.groupby('annais')

In [332]:
most_used = cumPrenomByYear.nlargest(10).droplevel(0)

In [333]:
share_most_used = most_used.groupby(level=0).sum()

In [340]:
share_most_used

annais
1900.0    130257
1901.0    141938
1902.0    145194
1903.0    143645
1904.0    144889
           ...  
2014.0     48145
2015.0     46981
2016.0     46186
2017.0     43361
2018.0     42587
Name: nombre, Length: 119, dtype: int64

In [334]:
share_most_used[1900]/share_most_used[2018]

3.058609434804048

In [339]:
share_most_used.plot(title='Nombre de bébé qui ont un des 10 prenoms les plus donnés')

<matplotlib.axes._subplots.AxesSubplot at 0x7f04809e50d0>

## Le prénom Loïc et sa diffusion

### avec graphique et chiffre

In [365]:
# comptabiliser par département et par an le nombre de prénom Loïc
nbr_loic = data[data.preusuel=='LOÏC'].groupby(['dpt', 'annais']).nombre.sum()

In [366]:
bretagne = [29,22,35,56]

In [407]:
nbr_loic_in_bret_by_year = nbr_loic.loc[bretagne].groupby('annais').sum()

In [381]:
nbr_loic_elsewhere_by_year = nbr_loic.drop(bretagne).groupby('annais').sum()

In [408]:
nbr_loic_in_bret_by_year.plot()
nbr_loic_elsewhere_by_year.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f047f94f790>

### avec Folium

In [16]:
import folium
import numpy as np
from ipywidgets import interact
from folium.plugins import HeatMap
from matplotlib import pyplot as plt

In [6]:
# import data
import pandas as pd
data = pd.read_csv("dpt2018.csv",
                   sep=";",
                   na_values=['XXXX', 'XX'])

# select only LOIC by year and by departement
nbr_prenom = data[data.preusuel=='CAROLINE']
nbr_prenom = nbr_prenom.drop('sexe', axis=1)

In [7]:
# import departement
import pickle
departement = pickle.load(open("dico_adress.pkl","rb"))
france_gps = [46.227638 ,2.213749]

In [8]:
# create a new column COORDS with the coordonates and the number (wieght) of babies 
nbr_prenom['COORDS'] = nbr_prenom.dpt.dropna().apply(lambda x: np.append(departement[x], x))

In [9]:
# to verify the type
# WARNING, there is nan type !!! Clean it with myDataFrame.dropna()
nbr_prenom.COORDS.values

array([array([43.7 ,  7.25,  6.  ]),
       array([42.966667,  1.6     ,  9.      ]),
       array([43.3,  5.4, 13. ]), ..., array([48.9,  2.2, 92. ]),
       array([49.033333,  2.066667, 95.      ]), nan], dtype=object)

In [10]:
nbr_prenom

Unnamed: 0,preusuel,annais,dpt,nombre,COORDS
1995837,CAROLINE,1900.0,6.0,15,"[43.7, 7.25, 6.0]"
1995838,CAROLINE,1900.0,9.0,3,"[42.966667, 1.6, 9.0]"
1995839,CAROLINE,1900.0,13.0,6,"[43.3, 5.4, 13.0]"
1995840,CAROLINE,1900.0,20.0,8,"[42.3083335, 9.091666499999999, 20.0]"
1995841,CAROLINE,1900.0,22.0,9,"[48.516667, -2.783333, 22.0]"
...,...,...,...,...,...
2001029,CAROLINE,2018.0,75.0,9,"[48.866667, 2.333333, 75.0]"
2001030,CAROLINE,2018.0,77.0,3,"[48.533333, 2.666667, 77.0]"
2001031,CAROLINE,2018.0,92.0,3,"[48.9, 2.2, 92.0]"
2001032,CAROLINE,2018.0,95.0,6,"[49.033333, 2.066667, 95.0]"


In [11]:
# to put the year in index, avoid to myDataFrame.loc[myDataFrame.annais==19xx]
idx = nbr_prenom.annais
nbr_prenom.set_index('annais', drop=False)

Unnamed: 0_level_0,preusuel,annais,dpt,nombre,COORDS
annais,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1900.0,CAROLINE,1900.0,6.0,15,"[43.7, 7.25, 6.0]"
1900.0,CAROLINE,1900.0,9.0,3,"[42.966667, 1.6, 9.0]"
1900.0,CAROLINE,1900.0,13.0,6,"[43.3, 5.4, 13.0]"
1900.0,CAROLINE,1900.0,20.0,8,"[42.3083335, 9.091666499999999, 20.0]"
1900.0,CAROLINE,1900.0,22.0,9,"[48.516667, -2.783333, 22.0]"
...,...,...,...,...,...
2018.0,CAROLINE,2018.0,75.0,9,"[48.866667, 2.333333, 75.0]"
2018.0,CAROLINE,2018.0,77.0,3,"[48.533333, 2.666667, 77.0]"
2018.0,CAROLINE,2018.0,92.0,3,"[48.9, 2.2, 92.0]"
2018.0,CAROLINE,2018.0,95.0,6,"[49.033333, 2.066667, 95.0]"


In [17]:
def loic_par_departement(i):
    
    nbr_prenom.annais.hist(figsize=(5, 3), bins=50)
    plt.axvline(i, c='k', ls='--')
    plt.xlabel("Annee")
    plt.ylabel('Nombre de prénom Caroline')
    plt.show()
    
    
    carte = folium.Map(france_gps, zoom_start=5)
    HeatMap(nbr_prenom.loc[nbr_prenom.annais==i].dropna().COORDS.values,
    radius=25, #weight=2,
    max_val=1.2).add_to(carte)
    return carte

In [18]:
interact(loic_par_departement, i=range(1900, 2019));

interactive(children=(Dropdown(description='i', options=(1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908,…

In [12]:
carte = folium.Map(france_gps, zoom_start=5)

In [13]:
carte

In [24]:
HeatMap(nbr_loic.loc[nbr_loic.annais==2000].dropna().COORDS.values,
        radius=10,
#         weight=2,
        max_val=1.2).add_to(carte)

<folium.plugins.heat_map.HeatMap at 0x7f0836490f50>

In [25]:
carte

In [151]:
list()

TypeError: list expected at most 1 arguments, got 4744

In [173]:
nbr_loic.dropna().COORDS.values

array([array([47.216667, -1.55    , 44.      ]),
       array([47.216667, -1.55    , 44.      ]),
       array([48.516667, -2.783333, 22.      ]), ...,
       array([48.783333,  2.466667, 94.      ]),
       array([49.033333,  2.066667, 95.      ]),
       array([  0.,   0., 973.])], dtype=object)

In [165]:
nbr_loic.COORDS.values.tolist()

[array([47.216667, -1.55    , 44.      ]),
 array([47.216667, -1.55    , 44.      ]),
 array([48.516667, -2.783333, 22.      ]),
 array([48.083333, -1.683333, 35.      ]),
 array([48.516667, -2.783333, 22.      ]),
 array([48.083333, -1.683333, 35.      ]),
 array([47.216667, -1.55    , 44.      ]),
 array([47.466667, -0.55    , 49.      ]),
 array([46.666667, -1.433333, 85.      ]),
 array([48.516667, -2.783333, 22.      ]),
 array([47.216667, -1.55    , 44.      ]),
 array([47.216667, -1.55    , 44.      ]),
 array([48.516667, -2.783333, 22.      ]),
 array([48.083333, -1.683333, 35.      ]),
 array([47.216667, -1.55    , 44.      ]),
 array([47.666667, -2.75    , 56.      ]),
 array([48.516667, -2.783333, 22.      ]),
 array([48. , -4.1, 29. ]),
 array([48.083333, -1.683333, 35.      ]),
 array([47.216667, -1.55    , 44.      ]),
 array([48.516667, -2.783333, 22.      ]),
 array([48.083333, -1.683333, 35.      ]),
 array([47.216667, -1.55    , 44.      ]),
 array([48.516667, -2.7833