In [13]:
import pandas as pd
import json
import random
import numpy as np

# Filter values

In [3]:
# Input file
data = 'data.csv'

# Create lists
events = []
event_editions = []

# Read file
with open(data, "r", encoding="utf-8") as f:
    for line in f:
        try:
            data = json.loads(line.strip())  # Convertir en dictionnaire
            type_value = data.get("type")

            if type_value == "api::event.event":
                events.append(data)
            elif type_value == "api::event-edition.event-edition":
                event_editions.append(data)

        except json.JSONDecodeError:
            continue


# Convert lists to df
df_events = pd.DataFrame(events)
df_event_editions = pd.DataFrame(event_editions)

# Work on events

In [4]:
df_events = df_events.drop("type", axis=1)
df_events

Unnamed: 0,id,data
0,15,"{'documentId': 'h4e8nx3mnj9fmd22983hg07x', 'de..."
1,16,"{'documentId': 'n8o21rucbzqrgv372m7ht13z', 'de..."


In [5]:
df_events_filtered = df_events[["id"]].copy()

df_events_filtered["name"] = df_events["data"].apply(lambda x: x.get("name") if isinstance(x, dict) else None)
df_events_filtered["timezone"] = df_events["data"].apply(lambda x: x.get("timezone") if isinstance(x, dict) else None)

df_events_filtered

Unnamed: 0,id,name,timezone
0,15,Marathon du lac d'Annecy,Europe/Paris
1,16,Tabasco,Atlantic/Cape_verde


# Work on event editions

In [6]:
df_event_editions = df_event_editions.drop("type", axis=1)
df_event_editions

Unnamed: 0,id,data
0,23,"{'documentId': 'escwtfkm1a1sd0di8mrzxgtk', 'ev..."
1,24,"{'documentId': 'r0d5fnp7939v19jemcu6edgs', 'ev..."
2,25,"{'documentId': 'mhfo3ftbcqx1l3uls8h50dym', 'ev..."


In [7]:
df_event_editions_filtered = df_event_editions[["id"]].copy()

df_event_editions_filtered["name"] = df_event_editions["data"].apply(lambda x: x.get("name") if isinstance(x, dict) else None)
df_event_editions_filtered["date_start"] = df_event_editions["data"].apply(lambda x: x.get("date_start") if isinstance(x, dict) else None)
df_event_editions_filtered["date_end"] = df_event_editions["data"].apply(lambda x: x.get("date_end") if isinstance(x, dict) else None)
df_event_editions_filtered["annee"] = 2025

df_event_editions_filtered = df_event_editions_filtered.drop("id", axis=1)
df_event_editions_filtered.replace("Marathon du lac d'Annecy 2025", "Marathon du lac d'Annecy", inplace=True)
df_event_editions_filtered

Unnamed: 0,name,date_start,date_end,annee
0,Marathon du lac d'Annecy,,,2025
1,Tabasco party,,,2025
2,Marathon du lac d'Annecy,,,2025


In [8]:
unique_value = df_event_editions_filtered.drop_duplicates()
unique_value.drop(columns=["date_start", "date_end"], axis=1, inplace=True)
unique_value["ville"] = ["Annecy", "T"]
unique_value

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_value.drop(columns=["date_start", "date_end"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_value["ville"] = ["Annecy", "T"]


Unnamed: 0,name,annee,ville
0,Marathon du lac d'Annecy,2025,Annecy
1,Tabasco party,2025,T


In [9]:
villes = ["Grenoble", "Montpellier", "Lyon", "Marseille", "Genève", "Paris", "Nantes", "Bordeaux"]

annees = list(range(2025, 2031))
evenements_par_annee = [5, 7, 8, 9, 10, 11]  # Nombre croissant d'événements


nouveaux_evenements = []
for annee, nombre in zip(annees, evenements_par_annee):
    for _ in range(nombre):
        ville = random.choice(villes)
        event_name = f"Marathon de {ville}"
        nouveaux_evenements.append({"name": event_name, "annee": annee, "ville": ville})

df_new_events = pd.DataFrame(nouveaux_evenements)
df_new_events

Unnamed: 0,name,annee,ville
0,Marathon de Bordeaux,2025,Bordeaux
1,Marathon de Nantes,2025,Nantes
2,Marathon de Grenoble,2025,Grenoble
3,Marathon de Genève,2025,Genève
4,Marathon de Grenoble,2025,Grenoble
5,Marathon de Montpellier,2026,Montpellier
6,Marathon de Bordeaux,2026,Bordeaux
7,Marathon de Genève,2026,Genève
8,Marathon de Paris,2026,Paris
9,Marathon de Lyon,2026,Lyon


In [10]:
df_final = pd.concat([unique_value, df_new_events])
df_final

Unnamed: 0,name,annee,ville
0,Marathon du lac d'Annecy,2025,Annecy
1,Tabasco party,2025,T
0,Marathon de Bordeaux,2025,Bordeaux
1,Marathon de Nantes,2025,Nantes
2,Marathon de Grenoble,2025,Grenoble
3,Marathon de Genève,2025,Genève
4,Marathon de Grenoble,2025,Grenoble
5,Marathon de Montpellier,2026,Montpellier
6,Marathon de Bordeaux,2026,Bordeaux
7,Marathon de Genève,2026,Genève


In [11]:
df_final.to_csv("events.csv", index=False)

In [12]:
df_for_looker = df_final.copy()

df_for_looker["Pays"] = "France"
df_for_looker

Unnamed: 0,name,annee,ville,Pays
0,Marathon du lac d'Annecy,2025,Annecy,France
1,Tabasco party,2025,T,France
0,Marathon de Bordeaux,2025,Bordeaux,France
1,Marathon de Nantes,2025,Nantes,France
2,Marathon de Grenoble,2025,Grenoble,France
3,Marathon de Genève,2025,Genève,France
4,Marathon de Grenoble,2025,Grenoble,France
5,Marathon de Montpellier,2026,Montpellier,France
6,Marathon de Bordeaux,2026,Bordeaux,France
7,Marathon de Genève,2026,Genève,France


In [14]:
df_for_looker["population"] = np.random.randint(5000, 30001, size=len(df_for_looker))
df_for_looker

Unnamed: 0,name,annee,ville,Pays,population
0,Marathon du lac d'Annecy,2025,Annecy,France,7825
1,Tabasco party,2025,T,France,19648
0,Marathon de Bordeaux,2025,Bordeaux,France,11004
1,Marathon de Nantes,2025,Nantes,France,14839
2,Marathon de Grenoble,2025,Grenoble,France,15672
3,Marathon de Genève,2025,Genève,France,7670
4,Marathon de Grenoble,2025,Grenoble,France,28136
5,Marathon de Montpellier,2026,Montpellier,France,20513
6,Marathon de Bordeaux,2026,Bordeaux,France,22448
7,Marathon de Genève,2026,Genève,France,10617


In [15]:
df_for_looker.to_csv("events2.csv", index=False)