In [5]:
# Extract the Olympic CSV archive; per the captured output it creates the "Partie 1/" folder.
!unzip "/content/Partie1.zip"

Archive:  /content/Partie1.zip
   creating: Partie 1/
  inflating: Partie 1/Olympic_Athlete_Event_Results.csv  
  inflating: Partie 1/Olympics_Games.csv  
  inflating: Partie 1/Olympics_Country.csv  
  inflating: Partie 1/Olympic_Games_Medal_Tally.csv  
  inflating: Partie 1/Olympic_Athlete_Bio.csv  
  inflating: Partie 1/Olympic_Results.csv  


In [6]:
# Extract the World Athletics Championships archive; per the captured output the per-event
# result files land in "World_Athletic_Championships/results/<n>.csv".
!unzip "/content/World_Athletic_Championships.zip"

Archive:  /content/World_Athletic_Championships.zip
   creating: World_Athletic_Championships/
  inflating: World_Athletic_Championships/.DS_Store  
  inflating: World_Athletic_Championships/World_Athletic_Championships.csv  
   creating: World_Athletic_Championships/results/
  inflating: World_Athletic_Championships/results/0.csv  
  inflating: World_Athletic_Championships/results/1.csv  
  inflating: World_Athletic_Championships/results/10.csv  
  inflating: World_Athletic_Championships/results/100.csv  
  inflating: World_Athletic_Championships/results/101.csv  
  inflating: World_Athletic_Championships/results/102.csv  
  inflating: World_Athletic_Championships/results/103.csv  
  inflating: World_Athletic_Championships/results/104.csv  
  inflating: World_Athletic_Championships/results/105.csv  
  inflating: World_Athletic_Championships/results/106.csv  
  inflating: World_Athletic_Championships/results/107.csv  
  inflating: World_Athletic_Championships/results/108.csv  
  inflat

In [8]:
import pandas as pd
import json
import os

# File locations (adapt to your own folder layout).
# NOTE(review): the unzip cells extract to "Partie 1/" and
# "World_Athletic_Championships/results/", and no visible cell creates the two
# folders below -- confirm that the data was moved/renamed before this cell runs.
PATH_OLYMPEDIA = "./olympedia/"
PATH_WORLDS = "./world_championships/"

# 1. Load the athlete biographies
bio_df = pd.read_csv(os.path.join(PATH_OLYMPEDIA, 'Olympic_Athlete_Bio.csv'))

# In-memory collections, keyed by document id, used to accumulate the counters
# while the result rows are processed.
athletes_map = {}
events_map = {}
editions_map = {}


def _bio_value(value, default=None):
    """Return ``value``, or ``default`` when pandas reports it as missing (NaN/None)."""
    return default if pd.isna(value) else value


# Build one `athletes` document per biography row. Missing cells are stored as
# None (JSON null) rather than float NaN or the literal string "nan".
for record in bio_df.to_dict("records"):
    a_id = str(record['athlete_id'])
    born = _bio_value(record['born'])
    athletes_map[a_id] = {
        "_id": a_id,
        "name": _bio_value(record['name']),
        "sex": _bio_value(record['sex']),
        "born": str(born) if born is not None else None,
        "height": _bio_value(record['height']),
        "weight": _bio_value(record['weight']),
        # Empty string when the CSV has no 'country_noc' column at all.
        "country_origin": _bio_value(record.get('country_noc', "")),
        "total_medals": 0,
        "medals_detail": {"Gold": 0, "Silver": 0, "Bronze": 0}
    }

In [13]:
# Processes one result row (Olympics or World Championships): denormalizes the
# athlete's sex and the year into a `results` document and updates the running
# aggregates behind the `athletes`, `events` and `editions` collections.
results_to_import = []
import re


def _is_missing(value):
    """Return True when ``value`` is None or a scalar that pandas treats as missing (NaN/NA)."""
    return value is None or (pd.api.types.is_scalar(value) and pd.isna(value))


def _first_present(row, keys, default):
    """Return the first non-missing ``row`` value found under ``keys``, else ``default``."""
    for key in keys:
        value = row.get(key)
        if not _is_missing(value):
            return value
    return default


def process_row(row, comp_name):
    """Turn one result row into a `results` document and update the aggregates.

    Args:
        row: A mapping-like row (a pandas Series from ``iterrows``) exposing
            ``.get``. Columns read: 'year', 'edition', 'athlete' / 'name' /
            'Name', 'athlete_id', 'event', 'medal', 'country_noc' / 'noc', 'pos'.
        comp_name: Competition label stored on the document and on the edition.

    Side effects:
        Appends a document to ``results_to_import``; increments the medal
        counters of the matching entry in ``athletes_map``; records the year in
        ``events_map`` and the event in ``editions_map``.

    A cell that is absent OR empty (NaN) is treated the same way and replaced
    by the field's default ('Unknown' / 'na'), so NaN never reaches the
    documents or the generated ids.
    """
    # 1. Year: use the 'year' column when filled in, otherwise take the first
    #    4-digit run of 'edition' (e.g. "1908 Summer Olympics"); 0 when neither
    #    yields a year.
    raw_year = _first_present(row, ['year'], None)
    if raw_year is not None:
        year = int(raw_year)
    else:
        match = re.search(r'\d{4}', str(_first_present(row, ['edition'], '')))
        year = int(match.group()) if match else 0

    # 2. Athlete name and id; the name doubles as the id when no id is available.
    name = _first_present(row, ['athlete', 'name', 'Name'], 'Unknown')
    a_id = str(_first_present(row, ['athlete_id'], name))

    # 3. Remaining fields
    event_name = _first_present(row, ['event'], 'Unknown')
    medal = str(_first_present(row, ['medal'], "na")).strip()
    noc = _first_present(row, ['country_noc', 'noc'], 'Unknown')
    pos = str(_first_present(row, ['pos'], 'na'))

    # 4. Sex comes from the biography loaded earlier; unknown athletes and
    #    biographies without a sex both fall back to 'Unknown'.
    sex = athletes_map.get(a_id, {}).get('sex')
    if _is_missing(sex):
        sex = 'Unknown'

    # Document for the 'results' collection
    results_to_import.append({
        "athlete_id": a_id,
        "athlete_name": name,
        "sex": sex,
        "year": year,
        "competition": comp_name,
        "event": event_name,
        "pos": pos,
        "medal": medal,
        "noc": noc
    })

    # --- COUNTERS FOR THE OTHER COLLECTIONS ---

    # 'athletes': medal totals, only for athletes present in the biography file
    if medal in ["Gold", "Silver", "Bronze"] and a_id in athletes_map:
        athletes_map[a_id]["total_medals"] += 1
        athletes_map[a_id]["medals_detail"][medal] += 1

    # 'events': set of years in which the (event, sex) pair appears
    event_key = f"{event_name}_{sex}".lower().replace(" ", "_")
    if event_key not in events_map:
        events_map[event_key] = {"_id": event_key, "event_name": event_name, "gender": sex, "years": set()}
    events_map[event_key]["years"].add(year)

    # 'editions': set of events per edition; the full edition label is the id
    # (e.g. "1908 Summer Olympics"), falling back to the year when there is none.
    edit_id = str(_first_present(row, ['edition'], year))
    if edit_id not in editions_map:
        editions_map[edit_id] = {"_id": edit_id, "year": year, "competition": comp_name, "disciplines": set()}
    editions_map[edit_id]["disciplines"].add(event_name)

In [14]:
# Start from clean accumulators so this cell can be re-run safely: process_row()
# appends to / increments these shared structures, and a second run would
# otherwise duplicate every result and double-count every medal.
results_to_import.clear()
events_map.clear()
editions_map.clear()
for athlete_doc in athletes_map.values():
    athlete_doc["total_medals"] = 0
    athlete_doc["medals_detail"] = {"Gold": 0, "Silver": 0, "Bronze": 0}

# Process the large Olympic results file
results_jo = pd.read_csv(os.path.join(PATH_OLYMPEDIA, 'Olympic_Athlete_Event_Results.csv'))
for _, row in results_jo.iterrows():
    process_row(row, "Olympics")

# Process every numbered World Championships file ("0.csv", "1.csv", ...) found
# in PATH_WORLDS, in numeric order, instead of assuming a fixed count of files.
world_file_numbers = []
if os.path.isdir(PATH_WORLDS):
    for entry in os.listdir(PATH_WORLDS):
        stem, ext = os.path.splitext(entry)
        # Keep only canonical names such as "12.csv" (no "012.csv", no "notes.csv").
        if ext == ".csv" and stem.isascii() and stem.isdigit() and str(int(stem)) == stem:
            world_file_numbers.append(int(stem))
world_file_numbers.sort()

if not world_file_numbers:
    # A wrong path must not silently produce an export without any World Championships data.
    print(f"Warning: no numbered CSV files found in {PATH_WORLDS!r}; no World Championships rows were processed.")

for file_number in world_file_numbers:
    df_world = pd.read_csv(os.path.join(PATH_WORLDS, f"{file_number}.csv"))
    for _, row in df_world.iterrows():
        process_row(row, "World Championships")

# Finalize the computed fields for 'events' and 'editions'. New dictionaries are
# built so the temporary sets (not JSON-serializable) are left out of the
# exported documents without being deleted from events_map / editions_map.
final_events = [
    {
        **{key: value for key, value in event.items() if key != "years"},
        "nb_editions": len(event["years"]),
        "years_active": sorted(event["years"]),
    }
    for event in events_map.values()
]

final_editions = [
    {
        **{key: value for key, value in edition.items() if key != "disciplines"},
        "count_disciplines": len(edition["disciplines"]),
    }
    for edition in editions_map.values()
]



In [15]:
def save_json(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 encoded, strictly valid JSON.

    Non-finite floats (NaN, +/-Infinity) anywhere inside nested dicts, lists or
    tuples are written as ``null``: by default ``json.dump`` would emit the
    bare tokens ``NaN`` / ``Infinity``, which are not part of the JSON grammar.

    Args:
        data: JSON-serializable structure (dicts, lists, strings, numbers, ...).
        filename: Path of the file to create or overwrite.
    """
    import math

    def _strict(value):
        # Recursively copy the structure, replacing non-finite floats by None.
        if isinstance(value, float):
            return value if math.isfinite(value) else None
        if isinstance(value, dict):
            return {key: _strict(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [_strict(item) for item in value]
        return value

    with open(filename, 'w', encoding='utf-8') as f:
        # allow_nan=False guarantees that no NaN/Infinity token can slip through.
        json.dump(_strict(data), f, ensure_ascii=False, allow_nan=False)

# Write each collection to its own JSON file in the current working directory.
collections_by_file = {
    "athletes.json": list(athletes_map.values()),
    "results.json": results_to_import,
    "events.json": final_events,
    "editions.json": final_editions,
}
for target_name, documents in collections_by_file.items():
    save_json(documents, target_name)

print("Fichiers JSON prêts pour l'importation !")

Fichiers JSON prêts pour l'importation !


In [17]:
# Usage: !zip -r [output_archive_name].zip [path_of_folder_to_compress]
# NOTE(review): the export cell writes athletes.json, results.json, events.json and
# editions.json to the current working directory; no visible cell moves them into
# /content/fichier_JSON -- confirm the archive contains the freshly generated files.
!zip -r dossier_JSON_MONGODB.zip /content/fichier_JSON

updating: content/fichier_JSON/ (stored 0%)
updating: content/fichier_JSON/athletes.json (deflated 86%)
updating: content/fichier_JSON/editions.json (deflated 89%)
updating: content/fichier_JSON/results.json (deflated 92%)
updating: content/fichier_JSON/events.json (deflated 91%)
