In [134]:
import altair as alt
import pandas as pd
import geopandas as gpd # Requires geopandas -- e.g.: conda install -c conda-forge geopandas
# alt.data_transformers.enable('json') # Let Altair/Vega-Lite work with large data sets
alt.data_transformers.enable('default') # work-around to let Altair handle larger data sets
alt.data_transformers.disable_max_rows()

alt.renderers.enable('default') # In some old versions of Jupyter, you may need to enable this.

RendererRegistry.enable('default')

# Reading our names data



In [135]:
names = pd.read_csv('dpt2020.csv', sep=';')
names.drop(names[names.preusuel == '_PRENOMS_RARES'].index, inplace=True)
names.drop(names[names.dpt == 'XX'].index, inplace=True)

names.sample(5)

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
3611913,2,SYLVETTE,1951,30,21
756598,1,JACQUES,1944,21,79
1380897,1,RAPHAËL,1983,28,6
1485469,1,SALOMON,1996,75,5
1876679,2,ANGÈLE,1991,80,3


In [136]:
# change dpt column name to code for future merge
names.rename(columns={'dpt': 'code'}, inplace=True)
names

Unnamed: 0,sexe,preusuel,annais,code,nombre
10885,1,AADIL,1983,84,3
10886,1,AADIL,1992,92,3
10888,1,AAHIL,2016,95,3
10892,1,AARON,1962,75,3
10893,1,AARON,1976,75,3
...,...,...,...,...,...
3727545,2,ZYA,2013,44,4
3727546,2,ZYA,2013,59,3
3727547,2,ZYA,2017,974,3
3727548,2,ZYA,2018,59,3


In [137]:
# Keep a reference around to the plain pandas dataframe, without geometry data, just in case
just_names = names.copy()

just_names.sample(5)

Unnamed: 0,sexe,preusuel,annais,code,nombre
1677464,1,YANIS,2012,69,76
1419614,1,RENÉ,1936,8,77
1812511,2,ALINE,2004,37,7
2826037,2,LEÏNA,2019,76,6
1918272,2,ANNE-LISE,1993,49,6


# Show a name over all years

Now we'll choose a name to show across all years.  To that, we'll group all of the names in a department together (squashing the years together) and use the sum.

In [8]:
grouped = just_names.groupby(['code', 'preusuel', 'sexe',], as_index=False).sum()
grouped = depts.merge(grouped,on='code') # Add geometry data back in
grouped

Unnamed: 0,code,nom,geometry,preusuel,sexe,annais,nombre
0,01,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ...",AARON,1,2005200720082009201020112012201320142015201620...,160
1,01,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ...",ABBY,2,2008,3
2,01,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ...",ABDALLAH,1,20142016,7
3,01,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ...",ABDEL,1,1967,3
4,01,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ...",ABDELKADER,1,1965,3
...,...,...,...,...,...,...,...
224722,95,Val-d'Oise,"POLYGON ((2.59052 49.07965, 2.57203 49.06149, ...",ÉRIC,1,19681971,7
224723,95,Val-d'Oise,"POLYGON ((2.59052 49.07965, 2.57203 49.06149, ...",ÉTHAN,1,201720182020,14
224724,95,Val-d'Oise,"POLYGON ((2.59052 49.07965, 2.57203 49.06149, ...",ÉVA,2,2012201420152016201820192020,36
224725,95,Val-d'Oise,"POLYGON ((2.59052 49.07965, 2.57203 49.06149, ...",ÉVAN,1,2019,4


Now let's pick a name and check out how it's distribution over the last 120 years across Metropolitan France.  In this example, I choose the name “Lucien,” which I rather like for some reason.

# Visualizations
First some general observations and charts of the dataset. Those helped us to put the data into perspective when diving deeper into the data.

## Visualization 3:
Are there gender effects in the data? Does popularity of names given to both sexes evolve consistently? (Note: this data set treats sex as binary; this is a simplification that carries into this assignment but does not generally hold.)

In [None]:
just_names

Unnamed: 0,sexe,preusuel,annais,code,nombre
10885,1,AADIL,1983,84,3
10886,1,AADIL,1992,92,3
10888,1,AAHIL,2016,95,3
10892,1,AARON,1962,75,3
10893,1,AARON,1976,75,3
...,...,...,...,...,...
3727545,2,ZYA,2013,44,4
3727546,2,ZYA,2013,59,3
3727547,2,ZYA,2017,974,3
3727548,2,ZYA,2018,59,3


### make some processing on the data

In [63]:
# Aggregate the data by year and gender
gender_year_counts = just_names.groupby(['annais', 'sexe']).agg({'nombre': 'sum'}).reset_index()
names_by_gender_year=just_names.groupby(['annais','preusuel', 'sexe']).agg({'nombre': 'sum'}).reset_index()

# names give to both
both_names=just_names.groupby(['preusuel','sexe']).agg({'nombre': 'sum'}).reset_index()# that make unique row per name&sexe
both_names=both_names.groupby('preusuel').agg({"sexe":"sum"}).reset_index()# each name has 1 or 2  or both as gender, if we sum sexe we could know if the name have both
both_names=both_names[both_names['sexe']>=3]# if bith gender are present the sum is > than 3
both_names_to_keep=both_names['preusuel'].to_list()# keep names in a list

# filter data frame
both_names =just_names[just_names['preusuel'].isin(both_names_to_keep)]

# names of genters 1
names1=just_names[just_names["sexe"]==1].reset_index()

# names of gender 2
names2= just_names[just_names['sexe']==2].reset_index()

In [64]:
top10names_both = both_names.groupby(['preusuel'])['nombre'].sum().sort_values(ascending= False).head(10).index.tolist()
top10_both = both_names[both_names.preusuel.isin(top10names_both)]
top10_both = top10_both.groupby(['preusuel', 'annais'])['nombre'].sum()

top10_both = top10_both.to_frame()
top10_both.reset_index(inplace=True)

#names fo gender 1
top10names_1 = names1.groupby(['preusuel'])['nombre'].sum().sort_values(ascending= False).head(10).index.tolist()
top10_1 =names1[names1.preusuel.isin(top10names_1)]
top10_1 = top10_1.groupby(['preusuel', 'annais'])['nombre'].sum()

top10_1 = top10_1.to_frame()
top10_1.reset_index(inplace=True)

# names of gender 2

top10names_2 = names2.groupby(['preusuel'])['nombre'].sum().sort_values(ascending= False).head(10).index.tolist()
top10_2 = names2[names2.preusuel.isin(top10names_2)]
top10_2 = top10_2.groupby(['preusuel', 'annais'])['nombre'].sum()

top10_2 = top10_2.to_frame()
top10_2.reset_index(inplace=True)


In [132]:
top10_1

Unnamed: 0,preusuel,annais,nombre
0,ALAIN,1900,83
1,ALAIN,1901,99
2,ALAIN,1902,106
3,ALAIN,1903,120
4,ALAIN,1904,136
...,...,...,...
1175,RENÉ,1998,3
1176,RENÉ,1999,3
1177,RENÉ,2002,3
1178,RENÉ,2006,3


#### look at a specifique name

In [54]:
both_names[both_names['preusuel']=='MARIE']

Unnamed: 0,sexe,preusuel,annais,code,nombre
1105666,1,MARIE,1900,01,42
1105667,1,MARIE,1900,03,7
1105668,1,MARIE,1900,06,3
1105669,1,MARIE,1900,07,8
1105670,1,MARIE,1900,08,7
...,...,...,...,...,...
3060631,2,MARIE,2020,94,12
3060632,2,MARIE,2020,95,19
3060633,2,MARIE,2020,971,4
3060634,2,MARIE,2020,973,7


### Lets make a line plot on the 10 most popular names give to both gender ohter time, 10 most name give to gender 1 and 10 most of gender 2

In [73]:

area_both = alt.Chart(top10_both).mark_line().encode(
    alt.X("annais:T", title='Year'),
    alt.Y("nombre:Q", title='Number of occurrences'),
    color=alt.Color("preusuel:N", legend=alt.Legend(title="preusuel")),
    #opacity = alt.condition(single, alt.value(1.0), alt.value(0.5)),
    tooltip=[alt.Tooltip(field='preusuel', title="Name")],
).properties(width=1000, height=300, title='Evolution of top 10 names give to both gender')

area_1 = alt.Chart(top10_1).mark_line().encode(
    alt.X("annais:T", title='Year'),
    alt.Y("nombre:Q", title='Number of occurrences'),
    color=alt.Color("preusuel:N", legend=alt.Legend(title="preusuel")),
    #opacity = alt.condition(single, alt.value(1.0), alt.value(0.5)),
    tooltip=[alt.Tooltip(field='preusuel', title="Name")],
).properties(width=1000, height=300, title='Evolution of top 10 names give to  gender 1')

area_2 = alt.Chart(top10_2).mark_line().encode(
    alt.X("annais:T", title='Year'),
    alt.Y("nombre:Q", title='Number of occurrences'),
    color=alt.Color("preusuel:N", legend=alt.Legend(title="preusuel")),
    #opacity = alt.condition(single, alt.value(1.0), alt.value(0.5)),
    tooltip=[alt.Tooltip(field='preusuel', title="Name")],
).properties(width=1000, height=300, title='Evolution of top 10 names give to gender 2')

area_both&area_1&area_2

### Comment
In the line chart we that the most popular names give to both gender are also present in 10 most popular in each gender. By the the legend of the line plot, we can see that the tree plots have in aggregate 20 names. 
the number of names in each cas tend to flaten with time after 1965. That can be due to the mondialisation and the names are more divertified 

### how are the neutrality of names give to both gender


In [114]:
both=both_names.groupby(['preusuel','annais','sexe']).agg({'nombre':'sum'}).reset_index()
both1=both[both['sexe']==1]
both2=both[both['sexe']==2]
final=both2.merge(both1, how="outer",on =["preusuel","annais"]) # outer join to keep any appearence

# fil NAN
final['sexe_x']=final['sexe_x'].fillna(2)
final['sexe_y']=final['sexe_y'].fillna(1)
final=final.fillna(0) # fill the rest ("nombre_x and nombre_y") by 0
final["sexe_x"].isna().sum()

#comput the rapport
final['rapport']=(final['nombre_x'])-(final['nombre_y']) # add 1 to avoid divid by 0
final

Unnamed: 0,preusuel,annais,sexe_x,nombre_x,sexe_y,nombre_y,rapport
0,ABDON,1918,2.0,3.0,1.0,5.0,-2.0
1,ABDON,1922,2.0,4.0,1.0,15.0,-11.0
2,ABDON,1927,2.0,9.0,1.0,11.0,-2.0
3,ABDON,1935,2.0,3.0,1.0,11.0,-8.0
4,ABDON,1937,2.0,3.0,1.0,3.0,0.0
...,...,...,...,...,...,...,...
49268,ÉLIE,2012,2.0,0.0,1.0,26.0,-26.0
49269,ÉLIE,2013,2.0,0.0,1.0,32.0,-32.0
49270,ÉLIE,2014,2.0,0.0,1.0,26.0,-26.0
49271,ÉLIE,2016,2.0,0.0,1.0,62.0,-62.0


### plot rapport of ten populart names give to both gender

In [131]:
#top1000names_both = both_names.groupby(['preusuel'])['nombre'].sum().sort_values(ascending= False).head(10).index.tolist()
names_list=["ALIX","CHARLIE","EDEN","DANY","DOMINIQUE","CAMILLE"]
top10_both_rapport = final[final.preusuel.isin(top10names_both)]
line=alt.Chart(top10_both_rapport).mark_line().encode(
    alt.X("annais:T", title='Year'),
    alt.Y("rapport:Q", title='Number of occurrences'),
    color=alt.Color("preusuel:N", legend=alt.Legend(title="preusuel")),
    #opacity = alt.condition(single, alt.value(1.0), alt.value(0.5)),
    tooltip=[alt.Tooltip(field='preusuel', title="Name")],
).properties(width=1000, height=300, title='Evolution of top 10 names give to both gender')
line

Even if the names are give to both gender some are more neutral than other, Name like PHILIPPE has been more neutral until 1950, It has then been give to many children of gender 1. In other hand names like MARIE or Jean are more categorit to one gender.

### Here some name that have change from on gender to another one

In [126]:
names_list=["ALIX","CHARLIE","EDEN","DANY","DOMINIQUE","CAMILLE"]
top10_both_rapport = final[final.preusuel.isin(names_list)]
line=alt.Chart(top10_both_rapport).mark_line().encode(
    alt.X("annais:T", title='Year'),
    alt.Y("rapport:Q", title='Number of occurrences'),
    color=alt.Color("preusuel:N", legend=alt.Legend(title="preusuel")),
    #opacity = alt.condition(single, alt.value(1.0), alt.value(0.5)),
    tooltip=[alt.Tooltip(field='preusuel', title="Name")],
).properties(width=1000, height=800, title='Evolution of top 10 names give to both gender')
line