In [1]:
import altair as alt
import pandas as pd
import numpy as np
import geopandas as gpd
from pyproj import Proj, transform

In this section, we limit the data to improve chart performance and avoid potential issues with large datasets.

In [2]:
from altair import pipe, limit_rows, to_values
t = lambda data: pipe(data, limit_rows(max_rows=1000000), to_values)
alt.data_transformers.register('custom', t)
alt.data_transformers.enable('custom')

DataTransformerRegistry.enable('custom')

We fetch and process geographic data to create a CSV file with the coordinates of French departments.

In [None]:
geojson_url = 'https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/departements.geojson'

gdf = gpd.read_file(geojson_url)

proj = Proj(init='epsg:2154') # projection Lambert-93, EPSG:2154

x_coords = []
y_coords = []

for geometry in gdf['geometry']:
    centroid = geometry.centroid
    x, y = proj(centroid.x, centroid.y)
    x_coords.append(x)
    y_coords.append(y)

data = {
    'department_code': gdf['code'],
    'department_name': gdf['nom'],
    'x': x_coords,
    'y': y_coords
}

df = pd.DataFrame(data)

csv_file = 'departaments.csv'
df.to_csv(csv_file, index=False)

csv_file


  in_crs_string = _prepare_from_proj_string(in_crs_string)


'departaments.csv'

We load the baby names data and the department coordinates data.

In [4]:
names = pd.read_csv('babynames.csv', sep=';')

In [3]:
dpts = pd.read_csv('departaments.csv')

We filter the baby names data to exclude rare names and names with invalid department numbers.

In [5]:
names_filtered = names[(names['preusuel'] != '_PRENOMS_RARES') &
                                 (names['preusuel'].str.len() > 1) &
                                 (names['dpt'].notnull()) &
                                 (names['dpt'].str.isnumeric())]

We merge the filtered baby names dataset with the department coordinates.

In [66]:
merged_data = pd.merge(names_filtered, dpts, left_on='dpt', right_on='department_code')

grouped_data = merged_data.groupby(['dpt', 'preusuel', 'x', 'y']).agg({'nombre': 'sum'}).reset_index()


In [83]:
grouped_data

Unnamed: 0,dpt,preusuel,x,y,nombre,side_length,x_offset,y_offset
0,01,AARON,881418.799480,6.558245e+06,160.0,12.649111,881418.799480,6.558245e+06
1,01,ABDALLAH,881418.799480,6.558245e+06,7.0,2.645751,881430.026192,6.558252e+06
2,01,ABDEL,881418.799480,6.558245e+06,3.0,1.732051,881431.762970,6.558268e+06
3,01,ABDELKADER,881418.799480,6.558245e+06,3.0,1.732051,881418.799480,6.558284e+06
4,01,ABDULLAH,881418.799480,6.558245e+06,3.0,1.732051,881392.872500,6.558290e+06
...,...,...,...,...,...,...,...,...
7548,95,ANTONI,636512.725411,6.887335e+06,12.0,3.464102,635488.422946,6.885561e+06
7549,95,ANTONIN,636512.725411,6.887335e+06,30.0,5.477226,636512.725411,6.885274e+06
7550,95,AÏDAN,636512.725411,6.887335e+06,27.0,5.196152,637549.519369,6.885539e+06
7551,95,AÏDEN,636512.725411,6.887335e+06,26.0,5.099020,638319.323174,6.886292e+06


We create a chart to display the distribution of names across France.

In [62]:
map = alt.Chart(grouped_data).mark_point(size=1).encode(
    x=alt.X('x:Q', axis=None),
    y=alt.Y('y:Q', axis=None, scale=alt.Scale(zero=False)),
    size='nombre:Q',
    color='preusuel:N',
    tooltip=['preusuel:N', 'nombre:Q', 'dpt:N']
).properties(
    width=800,
    height=800,
    title='Baby Names in French Departments'
)

map



However, we can see that all the names circles are positioned at the same placce, while it would be more indicative for them to be positioned around the department coordinate. We also supposed that squares would be more useful for clearer names placement. To do that we calculate the side length for each square representing a name based on its frequency.

In [68]:
grouped_data['side_length'] = grouped_data['nombre'] ** 0.5

We calculate the offset for the x and y coordinates to position the squares around the department points.

In [80]:
def spiral_positions(center_x, center_y, side_lengths):
    positions = []
    angle = 0
    distance = 0
    angle_step = np.pi / 6
    distance_step = side_lengths.max() * 1.1

    for side_length in side_lengths:
        x_offset = center_x + distance * np.cos(angle)
        y_offset = center_y + distance * np.sin(angle)
        positions.append((x_offset, y_offset))

        angle += angle_step
        distance += distance_step / (2 * np.pi)

    return positions

grouped_data['x_offset'] = 0.0
grouped_data['y_offset'] = 0.0

for dept in grouped_data['dpt'].unique():
    dept_data = grouped_data[grouped_data['dpt'] == dept]
    x_center = dept_data.iloc[0]['x']
    y_center = dept_data.iloc[0]['y']

    side_lengths = dept_data['side_length']
    positions = spiral_positions(x_center, y_center, side_lengths)

    for i, (index, row) in enumerate(dept_data.iterrows()):
        grouped_data.at[index, 'x_offset'] = positions[i][0]
        grouped_data.at[index, 'y_offset'] = positions[i][1]

In [79]:
chart = alt.Chart(grouped_data).mark_square().encode(
    x=alt.X('x_offset:Q', axis=None),
    y=alt.Y('y_offset:Q', axis=None, scale=alt.Scale(zero=False)),
    size=alt.Size('nombre:Q', legend=None),
    color=alt.Color('nombre:Q', scale=alt.Scale(scheme='blues'), legend=None),
    tooltip=['preusuel:N', 'nombre:Q', 'dpt:N']
).properties(
    width=800,
    height=600,
    title='Baby Names in French Departments'
)

chart



Although the distribution appears somewhat unusual, this graph effectively highlights the popular names within each region. We then explored an alternative approach, utilizing a color scale to emphasize the frequency of name usage and spotlight the most popular names.

In [82]:
charts = []

# Iterate through each department and create a chart
for dpt in grouped_data['dpt'].unique():
    data_subset = grouped_data[grouped_data['dpt'] == dpt]

    chart = alt.Chart(data_subset).mark_square().encode(
        x=alt.X('preusuel', axis=None),
        y=alt.Y('nombre', axis=None, scale=alt.Scale(zero=False)),
        size=alt.Size('nombre:Q', scale=alt.Scale(range=[50, 500])),
        color='preusuel:N',
        tooltip=['preusuel', 'nombre']
    ).properties(
        width=40,  # Adjust the width of each chart as needed
        title=f'{dpt}'
    )

    charts.append(chart)

# Combine all charts into a single row using Altair's `hconcat` method
combined_chart = alt.hconcat(*charts)

combined_chart



This approach is effective for observing trends, as the color scale reveals that the same names frequently top the list in many departments.
While color alone may not be sufficient for identifying every name due to the sheer number of names, it is valuable for highlighting overall patterns and tendencies.

However, in some departments some trends differ. This prompted us to question whether geographical factors might be influencing these variations, leading us to create a geographical map.

For that we create a chart to visualize the names distribution for one department.

In [14]:
import math

dpt_len = len(grouped_data[grouped_data['dpt'] == "01"])
square_length = math.ceil(dpt_len ** 0.5)

ngd01 = grouped_data[grouped_data['dpt'] == "01"]
ngd01['x_offset'] = 0
ngd01['y_offset'] = 0
dpt_x = ngd01["x"][0]
dpt_y = ngd01["y"][0]

x_offset_list = []
y_offset_list = []
for x in range(1, int(square_length) + 1):
   for y in range(1, int(square_length) + 1):
    if len(x_offset_list) < dpt_len:
      x_offset_list.append(x * dpt_x)
      y_offset_list.append(y * dpt_y)
ngd01['x_offset'] = x_offset_list
ngd01['y_offset'] = y_offset_list

61
8
[881418.799479974, 881418.799479974, 881418.799479974, 881418.799479974, 881418.799479974, 881418.799479974, 881418.799479974, 881418.799479974, 1762837.598959948, 1762837.598959948, 1762837.598959948, 1762837.598959948, 1762837.598959948, 1762837.598959948, 1762837.598959948, 1762837.598959948, 2644256.398439922, 2644256.398439922, 2644256.398439922, 2644256.398439922, 2644256.398439922, 2644256.398439922, 2644256.398439922, 2644256.398439922, 3525675.197919896, 3525675.197919896, 3525675.197919896, 3525675.197919896, 3525675.197919896, 3525675.197919896, 3525675.197919896, 3525675.197919896, 4407093.99739987, 4407093.99739987, 4407093.99739987, 4407093.99739987, 4407093.99739987, 4407093.99739987, 4407093.99739987, 4407093.99739987, 5288512.796879844, 5288512.796879844, 5288512.796879844, 5288512.796879844, 5288512.796879844, 5288512.796879844, 5288512.796879844, 5288512.796879844, 6169931.596359817, 6169931.596359817, 6169931.596359817, 6169931.596359817, 6169931.596359817, 616

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ngd01['x_offset'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ngd01['y_offset'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ngd01['x_offset'] = x_offset_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [15]:
chart = alt.Chart(ngd01).mark_square(size=1000).encode(
    x=alt.X('x_offset'),
    y=alt.Y('y_offset'),
    color=alt.Color('nombre:Q', scale=alt.Scale(scheme='blues')),
    tooltip=['preusuel', 'nombre']
).properties(
    title='01'
)

chart



On this graph we can wee the most popular names with colour. We also use size to highlight the most popular names.

In [16]:
chart = alt.Chart(ngd01).mark_square().encode(
    x=alt.X('x_offset', axis=None),
    y=alt.Y('y_offset', axis=None, scale=alt.Scale(zero=False)),
    size=alt.Size('nombre:Q', scale=alt.Scale(range=[0, 3000])),
    color=alt.Color('nombre:Q', scale=alt.Scale(scheme='blues')),
    tooltip=['preusuel', 'nombre']
).properties(
    title='01'
)

chart



For future work, we plan to place the department names charts on the coordinates of each department to visualize the data more effectively.