In [None]:
!pip -q install pandas openpyxl xlsxwriter

In [None]:
import pandas as pd

input_path = "Fichier initial-11.xlsx"

# Load the sheet with no header row, then discard its first line so that the
# data starts at the second line of the workbook.
df_raw = pd.read_excel(input_path, header=None)
df = df_raw.iloc[1:].reset_index(drop=True)

# Column roles are purely positional: first column = expression,
# last column = annotation, everything in between = candidate test columns.
expression = df.iloc[:, 0]
annotation = df.iloc[:, -1]
middle = df.iloc[:, 1:-1]

# Keep at most the first three in-between columns, then pad with all-NA
# columns until exactly three test columns exist.
tests = [middle.iloc[:, pos] for pos in range(min(3, middle.shape[1]))]
while len(tests) < 3:
    tests.append(pd.Series([pd.NA] * len(df)))

# Assemble the single-annotator (slot 1) layout.
slot_one_columns = {
    "expression": expression,
    "annotateur_1": ["H√©l√®ne"] * len(df),
    "annotation_1": annotation,
}
for position, test_series in enumerate(tests, start=1):
    slot_one_columns[f"test{position}_1"] = test_series.reset_index(drop=True)
new_df = pd.DataFrame(slot_one_columns)

# Write the reshaped table next to the input and echo the path as cell output.
output_path = "Fichier initial-11-.xlsx"
new_df.to_excel(output_path, index=False)

output_path

'Fichier initial-11-.xlsx'

In [None]:

# Workbook paths used by the merge step.
# NOTE(review): the same two assignments are repeated at the top of the next
# cell, so this cell looks redundant — confirm before removing.
base_path = "Fichier final.xlsx"
new_path  = "Fichier initial-11-.xlsx"


In [7]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

# --- Load the DataFrames according to the chosen option ---
# Option A (upload) — disabled, kept for reference only:
# try:
#     base_df = pd.read_excel(io.BytesIO(uploaded[base_key]))
#     new_df  = pd.read_excel(io.BytesIO(uploaded[new_key]))
# except:
# Option B (Drive) — the active path: read both workbooks from disk.
base_path = "Fichier final.xlsx"
new_path  = "Fichier initial-11-.xlsx"
# First read uses pandas' default header handling (first row = column labels).
base_df = pd.read_excel(base_path)
new_df  = pd.read_excel(new_path)


def find_header_row(df: pd.DataFrame, num_rows_to_check=10) -> int:
    """Return the position of the first row that holds at least one value.

    Only the first ``num_rows_to_check`` rows (or fewer, if the frame is
    shorter) are inspected. When every inspected row is entirely NaN, or the
    frame is empty, 0 is returned.
    """
    limit = min(num_rows_to_check, len(df))
    candidates = (pos for pos in range(limit) if not df.iloc[pos].isna().all())
    # Fall back to the first row when no populated row was found.
    return next(candidates, 0)

# Probe each workbook WITHOUT a header row. The frames loaded earlier already
# consumed sheet row 0 as column labels, so a position found in them is shifted
# by one relative to the sheet; reading with header=None makes the returned
# position a real 0-indexed sheet row, which is what `header=` expects.
base_header_row = find_header_row(pd.read_excel(base_path, header=None))
new_header_row = find_header_row(pd.read_excel(new_path, header=None))

# Reload dataframes with the identified header row
base_df = pd.read_excel(base_path, header=base_header_row)
new_df  = pd.read_excel(new_path, header=new_header_row)


# ---- Helper: quick display of the detected columns ----
print("Colonnes ‚Äî Fichier initial :", list(base_df.columns))
print("Colonnes ‚Äî Fichier modifi√© :", list(new_df.columns))

# --------- Fonctions d‚Äôinf√©rence de sch√©ma (tol√©rantes) ----------
def _norm(s: str) -> str:
    return re.sub(r"\s+", " ", str(s)).strip().lower()

def deviner_col_expression(df: pd.DataFrame) -> str:
    """
    Guess which column holds the expression.

    Columns are scanned in order and the first one whose normalized name
    contains one of the known keywords ('expression', its Russian equivalent,
    'locution', 'mwe') or the standalone abbreviation 'ep' is returned.
    When nothing matches, the first column is returned.
    """
    for col in df.columns:
        n = _norm(col)
        if "expression" in n or "–≤—ã—Ä–∞–∂–µ–Ω–∏–µ" in n or "locution" in n or "mwe" in n:
            return col
        # "ep" is only two letters, so a plain substring test also fires on
        # unrelated headers ("step", "reponse", "exception"). Require it to be
        # a token of its own: not glued to a letter or digit on either side
        # (underscores, spaces and punctuation count as separators).
        if re.search(r"(?<![^\W_])ep(?![^\W_])", n):
            return col
    return df.columns[0]

def deviner_col_annotateur(df: pd.DataFrame):
    """
    Guess which column holds the annotator's name.

    Columns are scanned in order and the first one whose normalized name
    contains 'annotateur', the Russian word for 'name', 'auteur', 'nom' or
    'name' is returned. When nothing matches, the second column is used as a
    positional fallback. ``None`` is returned only when the frame has fewer
    than two columns, so that fallback cannot be applied.
    """
    for col in df.columns:
        n = _norm(col)
        if "annotateur" in n or "–∏–º—è" in n or "auteur" in n or "nom" in n or "name" in n:
            return col
    # Positional fallback; guarded so a one-column frame no longer raises IndexError.
    if len(df.columns) > 1:
        return df.columns[1]
    return None

def deviner_col_annotation(df: pd.DataFrame, exclure=()):
    """
    Guess which column holds the annotation text.

    Resolution order, always skipping the columns listed in ``exclure``:
      1. first column whose normalized name contains 'annotation',
         'commentaire', 'note' or the Russian word for 'description';
      2. first column with dtype ``object``;
      3. the sixth column (historical positional default), when it exists;
      4. the first remaining column.

    Returns ``None`` when every column is excluded or the frame has none.
    """
    candidates = [col for col in df.columns if col not in exclure]

    for col in candidates:
        n = _norm(col)
        if "annotation" in n or "commentaire" in n or "note" in n or "–æ–ø–∏—Å–∞–Ω–∏–µ" in n:
            return col

    # Fallback to a text column
    for col in candidates:
        if df[col].dtype == 'object':
            return col

    # Historical default: the sixth column. It used to be returned
    # unconditionally, which raised IndexError on frames with fewer than six
    # columns and could hand back an excluded column.
    if len(df.columns) > 5 and df.columns[5] not in exclure:
        return df.columns[5]
    return candidates[0] if candidates else None


def deviner_cols_tests(df: pd.DataFrame, exclure=(), nombre=3):
    """
    Pick up to ``nombre`` columns that look like 'test' columns.

    A column qualifies when its normalized name contains 'test' (Latin or
    Cyrillic spelling) or one of the short test-family keywords. If fewer than
    ``nombre`` qualify, the list is topped up with the remaining non-excluded
    columns in their original order. At most ``nombre`` names are returned.
    """
    keywords = ["vic", "coord", "meta", "reif", "euf", "opac", "–æ–ø–∞—Ü"]

    def _looks_like_test(normalized):
        if "test" in normalized or "—Ç–µ—Å—Ç" in normalized:
            return True
        return any(k in normalized for k in keywords)

    chosen = [c for c in df.columns if c not in exclure and _looks_like_test(_norm(c))]

    # Top up with arbitrary leftover columns when too few matched by name.
    if len(chosen) < nombre:
        for c in df.columns:
            if c in exclure or c in chosen:
                continue
            chosen.append(c)
            if len(chosen) == nombre:
                break

    return chosen[:nombre]

# ---- Column detection in each file ----
# Each guess excludes the columns already claimed by the previous guesses,
# so one column is not assigned two roles by the name-based passes.
base_expr = deviner_col_expression(base_df)
base_nom  = deviner_col_annotateur(base_df)
base_ann  = deviner_col_annotation(base_df, exclure=(base_expr, base_nom))
base_tests = deviner_cols_tests(base_df, exclure=(base_expr, base_nom, base_ann))

new_expr = deviner_col_expression(new_df)
new_nom  = deviner_col_annotateur(new_df)
new_ann  = deviner_col_annotation(new_df, exclure=(new_expr, new_nom))
new_tests = deviner_cols_tests(new_df, exclure=(new_expr, new_nom, new_ann))


# Print the resulting mapping for both files so it can be checked by eye.
print("üìå Mappage ‚Äî Fichier initial :")
print("  Expression :", base_expr)
print("  Annotateur :", base_nom)
print("  Annotation :", base_ann)
print("  Tests      :", base_tests)
print("üìå Mappage ‚Äî Fichier initial-2 :")
print("  Expression :", new_expr)
print("  Annotateur :", new_nom)
print("  Annotation :", new_ann)
print("  Tests      :", new_tests)


# -------- Construction du tableau canonique √† partir du fichier initial --------
def construire_base(df, expr_col, nom_col, ann_col, test_cols):
    """
    Build the canonical wide table from the base file.

    The result has an ``expression`` column (stringified and stripped) followed,
    for every slot k from 1 to the highest slot found, by the five columns
    ``annotateur_k``, ``annotation_k``, ``test1_k``, ``test2_k``, ``test3_k``.
    Slot columns missing from ``df`` are created filled with NaN.

    Parameters
    ----------
    df : DataFrame to convert.
    expr_col : name of the column holding the expression.
    nom_col, ann_col, test_cols : accepted for signature compatibility but not
        used; only columns already named in the canonical ``<role>_<k>`` form
        are copied.
        NOTE(review): a base file without slotted columns therefore yields only
        the ``expression`` column — confirm this is intended.

    Returns
    -------
    (DataFrame, int) : the canonical table and the highest slot number found
        (0 when the base file has no slotted column).
    """
    # Only the canonical role prefixes define a slot. A looser "anything ending
    # in _<digits>" rule would let an unrelated column such as "score_2024"
    # inflate the slot count and create thousands of empty columns.
    slot_pattern = re.compile(r"(?:annotateur|annotation|test[123])_(\d+)")
    slot_numbers = [
        int(found.group(1))
        for found in (slot_pattern.fullmatch(str(col)) for col in df.columns)
        if found
    ]
    max_base_slot = max(slot_numbers, default=0)

    # Collect the columns first and build the frame once, instead of inserting
    # them one at a time.
    columns = {"expression": df[expr_col].astype(str).str.strip()}
    for k in range(1, max_base_slot + 1):
        for role in ("annotateur", "annotation", "test1", "test2", "test3"):
            name = f"{role}_{k}"
            columns[name] = df[name] if name in df.columns else np.nan
    out = pd.DataFrame(columns)

    return out, max_base_slot

# Seed the merged table from the base file; `max_base_slot` is the highest
# annotator slot number reported by construire_base for that file.
merged, max_base_slot = construire_base(base_df, base_expr, base_nom, base_ann, base_tests)

# ---- Ajout des annotations du 2e fichier ----
def _assurer_colonnes_slot(df, k: int):
    """Cr√©e les colonnes du slot k si elles n‚Äôexistent pas."""
    for col in [f"annotateur_{k}", f"annotation_{k}", f"test1_{k}", f"test2_{k}", f"test3_{k}"]:#
        if col not in df.columns:
            df[col] = np.nan

# Fold every row of the new file into `merged`, one row at a time. Statement
# order matters here because `merged` is mutated (rows and columns are added)
# while the loop is running.
for _, row in new_df.iterrows():
    expr = str(row[new_expr]).strip()
    # Skip rows without an expression; str() turns a NaN cell into "nan".
    if expr == "" or expr.lower() == "nan":
        continue

    # Find the rows in merged_df that match the expression
    mask = merged["expression"].astype(str).str.strip() == expr

    if not mask.any():
        # New expression -> new row, stored in slot 1
        _assurer_colonnes_slot(merged, 1)
        rec = {
            "expression":   expr,
            "annotateur_1": row[new_nom] if (new_nom in new_df.columns) else np.nan,
            "annotation_1": row[new_ann] if (new_ann in new_df.columns) else np.nan,
            "test1_1": row[new_tests[0]] if (len(new_tests)>0 and new_tests[0] in new_df.columns) else np.nan,
            "test2_1": row[new_tests[1]] if (len(new_tests)>1 and new_tests[1] in new_df.columns) else np.nan,
            "test3_1": row[new_tests[2]] if (len(new_tests)>2 and new_tests[2] in new_df.columns) else np.nan,
        }
        # Row-enlargement assignment under the label len(merged).
        # NOTE(review): assumes that label is not already in use — confirm.
        merged.loc[len(merged)] = rec
    else:
        # Expression already present -> check for existing identical annotation
        duplicate_found = False
        # Highest slot currently present: any column name ending in _<digits> counts.
        current_max_slot = 0
        for col in merged.columns:
             m = re.match(r".*_(\d+)$", str(col))
             if m:
                 current_max_slot = max(current_max_slot, int(m.group(1)))

        for k in range(1, current_max_slot + 1):
            annotator_col = f"annotateur_{k}"
            annotation_col = f"annotation_{k}"
            test1_col = f"test1_{k}"
            test2_col = f"test2_{k}"
            test3_col = f"test3_{k}"

            # Only slots whose five columns all exist are compared.
            if (annotator_col in merged.columns and
                annotation_col in merged.columns and
                test1_col in merged.columns and
                test2_col in merged.columns and
                test3_col in merged.columns):

                # Check if the annotation in the current slot is identical to the one from the new file.
                # Values are compared as stripped strings, so NaN compares equal to NaN (both "nan"),
                # and .all() requires every row matched by `mask` to agree.
                # NOTE(review): row[new_nom] / row[new_ann] are read here without the
                # "in new_df.columns" guard used elsewhere in this loop.
                if (merged.loc[mask, annotator_col].astype(str).str.strip().eq(str(row[new_nom]).strip()).all() and
                    merged.loc[mask, annotation_col].astype(str).str.strip().eq(str(row[new_ann]).strip()).all() and
                    merged.loc[mask, test1_col].astype(str).str.strip().eq(str(row[new_tests[0]]).strip() if len(new_tests)>0 else str(np.nan)).all() and
                    merged.loc[mask, test2_col].astype(str).str.strip().eq(str(row[new_tests[1]]).strip() if len(new_tests)>1 else str(np.nan)).all() and
                    merged.loc[mask, test3_col].astype(str).str.strip().eq(str(row[new_tests[2]]).strip() if len(new_tests)>2 else str(np.nan)).all()):

                    duplicate_found = True
                    break

        if not duplicate_found:
            # Find the first available empty slot: the lowest k whose annotateur_k
            # column is missing, or is NaN for every row matched by `mask`.
            k = 1
            while f"annotateur_{k}" in merged.columns and merged.loc[mask, f"annotateur_{k}"].notna().any():
                 k += 1

            # Create the slot's columns if needed, then write the new annotation
            # into every row matched by `mask`.
            _assurer_colonnes_slot(merged, k)
            merged.loc[mask, f"annotateur_{k}"] = row[new_nom] if (new_nom in new_df.columns) else np.nan
            merged.loc[mask, f"annotation_{k}"] = row[new_ann] if (new_ann in new_df.columns) else np.nan
            merged.loc[mask, f"test1_{k}"] = row[new_tests[0]] if (len(new_tests)>0 and new_tests[0] in new_df.columns) else np.nan
            merged.loc[mask, f"test2_{k}"] = row[new_tests[1]] if (len(new_tests)>1 and new_tests[1] in new_df.columns) else np.nan
            merged.loc[mask, f"test3_{k}"] = row[new_tests[2]] if (len(new_tests)>2 and new_tests[2] in new_df.columns) else np.nan


# ---- Tri par ‚Äúexpression‚Äù + r√©organisation des colonnes par groupes de slots ----
# merged = merged.sort_values(by="expression", kind="stable").reset_index(drop=True) # Removed sorting

def reordonner_colonnes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return `df` with its columns grouped slot by slot.

    Order: ``expression`` first, then for each slot k (ascending) whichever of
    ``annotateur_k``, ``annotation_k``, ``test1_k``, ``test2_k``, ``test3_k``
    exist, and finally every other column in its original relative order.
    """
    # Highest numeric suffix among the column names (never below 1).
    suffixes = (re.match(r".*_(\d+)$", str(name)) for name in df.columns)
    max_slot = max([1] + [int(found.group(1)) for found in suffixes if found])

    cols = ["expression"]
    for k in range(1, max_slot + 1):
        slot_names = [f"{base}_{k}" for base in ["annotateur", "annotation", "test1", "test2", "test3"]]
        cols.extend(name for name in slot_names if name in df.columns)

    # Safety net: keep any column that was not placed above.
    for name in df.columns:
        if name in cols:
            continue
        cols.append(name)

    return df[cols]

# Regroup the columns slot by slot, report the final size, and preview the
# first rows as the cell output.
merged = reordonner_colonnes(merged)
print("‚úÖ Fusion termin√©e. Lignes :", len(merged), "Colonnes :", len(merged.columns))
merged.head(10)

FileNotFoundError: [Errno 2] No such file or directory: 'Fichier final.xlsx'

In [None]:
# Output file name for the merged table.
nom_fichier = f"Fichier final-.xlsx"


# (A) Save locally in the Colab environment (then download manually)
chemin_sortie = nom_fichier



# Write `merged` to a single sheet named "merged", without the index column.
with pd.ExcelWriter(chemin_sortie, engine="xlsxwriter") as writer:
    merged.to_excel(writer, index=False, sheet_name="merged")

print("Fichier √©crit :", chemin_sortie)


Fichier √©crit : Fichier final-.xlsx


In [25]:
# --- Settings ---
INPUT_XLSX_PATH = "Annotations finales.xlsx"   # change the path if needed
SHEET_NAME = 0  # sheet name or index
COL_EXPR = "expression"     # column holding the expression

# What to do with the legacy "Nom Composé" label:
#   "ignore"      -> leave it out of the vote (it still shows in the summary)
#   "map_to_autre"-> fold it into "Autre"
#   "keep"        -> keep it as a full-fledged label
NOM_COMPOSE_HANDLING = "ignore"

# Fold every "Autre*", "Autre_..." variant into the single class "Autre"?
FOLD_AUTRE_FAMILY = True

# --- –ò–º–ø–æ—Ä—Ç ---
import re
import pandas as pd
from collections import Counter, defaultdict

# --- Read ---
df = pd.read_excel(INPUT_XLSX_PATH, sheet_name=SHEET_NAME)

# Collect every annotation column named annotation_1, annotation_2, ...
ann_cols = [c for c in df.columns if re.fullmatch(r"annotation_\d+", str(c))]
# Fail early when the workbook has no annotation column at all.
if not ann_cols:
    raise ValueError("–ù–µ –Ω–∞–π–¥–µ–Ω—ã –∫–æ–ª–æ–Ω–∫–∏ –∞–Ω–Ω–æ—Ç–∞—Ü–∏–π –≤–∏–¥–∞ 'annotation_1', 'annotation_2', ...")

# Fail early when the expression column is missing.
if COL_EXPR not in df.columns:
    raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω–∞ –∫–æ–ª–æ–Ω–∫–∞ —Å –≤—ã—Ä–∞–∂–µ–Ω–∏–µ–º: {COL_EXPR}")

# --- Label reference lists ---
MAIN_LABELS = [
    "Expression_idiomatique",
    "Collocation_opaque",
    "Collocation_transparente",
    "Expression_libre",
    "Nom_compos√©_transparent",
    "Nom_compos√©_opaque" # Added Nom_compos√©_opaque to labels
    # 'Autre' is not a complete label; the Autre* family is matched separately (see below)
]

# Both spellings (with and without the accent) of the legacy label.
NOM_COMPOSE_LABELS = ["Nom Compos√©", "Nom Compose"]

# Build the pattern used to search cell text:
# one case-insensitive alternation of all escaped labels, delimited by word boundaries.
escaped = [re.escape(x) for x in MAIN_LABELS + NOM_COMPOSE_LABELS]
labels_pattern = r"\b(" + "|".join(escaped) + r")\b"
labels_re = re.compile(labels_pattern, flags=re.IGNORECASE)

# Separate pattern for the Autre* family: "Autre" followed by any run of word characters or hyphens.
autre_family_re = re.compile(r"\bAutre[\w-]*\b", flags=re.IGNORECASE)


def normalize_label(raw: str) -> str:
    """Map a matched label to its canonical spelling.

    The lookup is done on the stripped, lowercased text. Known labels come back
    in canonical form (the legacy transparent compound-noun label is folded
    into ``Collocation_transparente``); unknown text comes back stripped only.
    """
    stripped = raw.strip()
    canonical_by_lowercase = {
        "expression_idiomatique": "Expression_idiomatique",
        "collocation_opaque": "Collocation_opaque",
        "collocation_transparente": "Collocation_transparente",
        "expression_libre": "Expression_libre",
        "nom compos√©": "Nom Compos√©",
        "nom compose": "Nom Compos√©",
        "nom_compos√©_transparent": "Collocation_transparente",
        # Left unchanged here; the caller gives this label special treatment.
        "nom_compos√©_opaque": "Nom_compos√©_opaque",
    }
    key = stripped.lower()
    if key in canonical_by_lowercase:
        return canonical_by_lowercase[key]
    return stripped

def extract_from_cell(val):
    """Extract the labels found in one cell.

    Non-string or blank cells yield an empty list. Otherwise the text is
    scanned for the main labels and the legacy compound-noun label (via
    ``labels_re``), then for the Autre* family (via ``autre_family_re``).
    The result keeps first-seen order and contains each label once.
    """
    if not isinstance(val, str):
        if DEBUG_MODE and DEBUG_EXPR in str(val):
             print(f"  DEBUG: extract_from_cell received non-string value: {val}")
        return []

    text = val.strip()
    if text == "":
        return []

    collected = []

    # Main labels and the legacy compound-noun label.
    for hit in labels_re.finditer(text):
        label = normalize_label(hit.group(1))
        if label == "Nom_compos√©_opaque":
            # The opaque compound-noun label counts as two labels.
            collected.extend(["Collocation_opaque", "Expression_idiomatique"])
        else:
            collected.append(label)

    # Autre* variants: folded into plain "Autre", or kept verbatim.
    for hit in autre_family_re.finditer(text):
        if FOLD_AUTRE_FAMILY:
            collected.append("Autre")
        else:
            variant = hit.group(0).strip()
            if variant not in collected:
                collected.append(variant)

    # Drop repeats while keeping first-seen order.
    unique_found = list(dict.fromkeys(collected))

    if DEBUG_MODE and DEBUG_EXPR in text:
         print(f"  DEBUG: extract_from_cell('{val}') -> {unique_found}")

    return unique_found


def apply_nom_compose_policy(labels):
    """Return a new label list with the NOM_COMPOSE_HANDLING policy applied.

    The opaque compound-noun label is always removed first. Then the legacy
    compound-noun label is dropped ("ignore"), replaced by "Autre"
    ("map_to_autre"), or left in place ("keep"). Any other policy value raises
    ValueError.
    """
    kept = [label for label in labels if label != "Nom_compos√©_opaque"]

    if NOM_COMPOSE_HANDLING == "keep":
        return kept
    if NOM_COMPOSE_HANDLING == "ignore":
        return [label for label in kept if label != "Nom Compos√©"]
    if NOM_COMPOSE_HANDLING == "map_to_autre":
        remapped = []
        for label in kept:
            remapped.append("Autre" if label == "Nom Compos√©" else label)
        return remapped

    raise ValueError("NOM_COMPOSE_HANDLING –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å 'ignore' | 'map_to_autre' | 'keep'.")

# Debugging settings: when enabled, rows whose expression equals DEBUG_EXPR
# are traced in detail.
DEBUG_MODE = True
DEBUG_EXPR = "aller de soi"


# Defined BEFORE the tally loop: the loop's debug branch calls this function,
# which raised NameError on a fresh kernel when the definition came after it.
def resolve_annotation(votes: Counter) -> str:
    """Turn a vote tally into a display string.

    - no votes                       -> ""
    - a single label with one vote   -> "<label>!"
    - a single label, several votes  -> "<label> (<count>)"
    - top two labels tied            -> tie marker listing every label that
                                        shares the top count
    - otherwise                      -> "<top> (<count> vs <other> (<n>), ...)"
    """
    if not votes:
        return ""  # no annotations

    items = votes.most_common()
    top_label, top_count = items[0]

    if len(items) == 1:
        if top_count == 1:
            return f"{top_label}!"  # a single annotation
        return f"{top_label} ({top_count})"  # several annotations, all the same label

    if items[1][1] == top_count:
        tied = [label for label, count in items if count == top_count]
        return f"–Ω–∏—á—å—è: {' / '.join(tied)} (–ø–æ {top_count})"

    # Several labels and a clear winner: show it against all the others.
    other_counts = ", ".join(f"{label} ({count})" for label, count in items[1:])
    return f"{top_label} ({top_count} vs {other_counts})"


# --- Per-expression tallies ---
per_expr_raw = defaultdict(Counter)    # labels as extracted (legacy compound-noun label included)
per_expr_vote = defaultdict(Counter)   # after the compound-noun policy (used for voting)


for index, row in df.iterrows():
    expr = row[COL_EXPR]
    # Convert expression to string for consistent comparison
    expr_str = str(expr).strip()

    if DEBUG_MODE and expr_str == DEBUG_EXPR:
        print(f"\n--- Processing row for '{expr_str}' (Index {index}) ---")
        print("Original Row Data:")
        print(row) # Print the entire row data for context

    # Gather the labels from every annotation column of this row.
    row_labels = []
    for c in ann_cols:
        extracted = extract_from_cell(row[c])
        row_labels.extend(extracted)
        if DEBUG_MODE and expr_str == DEBUG_EXPR and extracted:
            print(f"  DEBUG: From column '{c}': {extracted}")

    # raw tally
    for a in row_labels:
        per_expr_raw[expr_str][a] += 1
    # voting tally
    for a in apply_nom_compose_policy(row_labels):
        per_expr_vote[expr_str][a] += 1

    # Debugging for the specific expression after processing all columns
    if DEBUG_MODE and expr_str == DEBUG_EXPR:
        print(f"--- Debugging summary for '{expr_str}' ---")
        print("All extracted labels for expression:", row_labels)
        print("Raw votes (before policy):", per_expr_raw[expr_str])
        print("Votes (after policy):", per_expr_vote[expr_str])
        print("Resolved annotation:", resolve_annotation(per_expr_vote[expr_str]))
        print("------------------------------------------\n")


def counter_to_str(counter: Counter) -> str:
    """Render a tally as "label:count, label:count", most frequent first ("" when empty)."""
    if not counter:
        return ""
    parts = []
    for label, count in counter.most_common():
        parts.append(f"{label}:{count}")
    return ", ".join(parts)

# --- Build the result ---
# Map every row's expression to its resolved label and to its raw vote summary.
# Keys are str(...).strip() so they match the keys used when tallying.
resolved = df[COL_EXPR].map(lambda e: resolve_annotation(per_expr_vote[str(e).strip()]))
summary = df[COL_EXPR].map(lambda e: counter_to_str(per_expr_raw[str(e).strip()]))


# Insert the new columns into a copy of the original dataframe, right after
# the expression column, so the original row order is kept.
df_out = df.copy()
expr_pos = list(df_out.columns).index(COL_EXPR)
df_out.insert(expr_pos + 1, "annotation_resolved", resolved)
df_out.insert(expr_pos + 2, "annotation_votes_summary", summary)

# NOTE: columns are deliberately not regrouped here. A commented-out copy of
# reordonner_colonnes (and of its call on `merged`) used to sit at this spot.


print("‚úÖ Fusion termin√©e. Lignes :", len(df_out), "Colonnes :", len(df_out.columns))
display(df_out.head(10))

# --- Save ---
# The same table is written twice: as an Excel workbook and as a UTF-8 CSV.
OUT_XLSX = "Annotations_finales_resolved.xlsx"
OUT_CSV  = "Annotations_finales_resolved.csv"
df_out.to_excel(OUT_XLSX, index=False)
df_out.to_csv(OUT_CSV, index=False, encoding="utf-8")

print("–ì–æ—Ç–æ–≤–æ.")
print("–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤:")
print(" -", OUT_XLSX)
print(" -", OUT_CSV)


--- Processing row for 'aller de soi' (Index 20) ---
Original Row Data:
expression             aller de soi
annotateur_1                  Anna2
annotation_1     Collocation_opaque
test1_1                  Test_MOTIV
test2_1           Test_N_ABSTR/PRED
                        ...        
annotateur_12                   NaN
annotation_12                   NaN
test1_12                        NaN
test2_12                        NaN
test3_12                        NaN
Name: 20, Length: 61, dtype: object
  DEBUG: From column 'annotation_1': ['Collocation_opaque']
  DEBUG: From column 'annotation_2': ['Expression_idiomatique']
  DEBUG: From column 'annotation_3': ['Expression_idiomatique']
--- Debugging summary for 'aller de soi' ---
All extracted labels for expression: ['Collocation_opaque', 'Expression_idiomatique', 'Expression_idiomatique']
Raw votes (before policy): Counter({'Expression_idiomatique': 2, 'Collocation_opaque': 1})
Votes (after policy): Counter({'Expression_idiomatique': 2,

Unnamed: 0,expression,annotation_resolved,annotation_votes_summary,annotateur_1,annotation_1,test1_1,test2_1,test3_1,annotateur_2,annotation_2,...,annotateur_11,annotation_11,test1_11,test2_11,test3_11,annotateur_12,annotation_12,test1_12,test2_12,test3_12
0,abonnement mensuel,Collocation_transparente (3),Collocation_transparente:3,Amalia,Collocation_transparente,Test_SEM_REST_OUI,Test_ID_OUI,Test_METON_NON,Devika,Collocation_transparente,...,,,,,,,,,,
1,aboyer (le chien),Autre (2 vs Collocation_transparente (1)),"Autre:2, Collocation_transparente:1",Pingping,----------,----------,----------,-----------,Devika,Autre (Expliquer dans Commentaires),...,,,,,,,,,,
2,accent aigu,Collocation_opaque (2 vs Collocation_transpare...,"Collocation_opaque:2, Collocation_transparente:1",Anna,Collocation_opaque,Test_LEX,Test_ID_OUI,Test_METON_OUI,Devika,Collocation_transparente,...,,,,,,,,,,
3,accent am√©ricain,Collocation_transparente (2 vs Expression_libr...,"Collocation_transparente:2, Expression_libre:1",Amalia,Expression_libre,Test_SEM_REST_NON,----------,-----------,Devika,Collocation_transparente,...,,,,,,,,,,
4,acquisition des connaissances,–Ω–∏—á—å—è: Collocation_transparente / Autre (–ø–æ 1),"Collocation_transparente:1, Autre:1",Anna,Collocation_transparente,Test_SEM_REST_OUI,Test_ID_OUI,Test_METON_NON,Amalia,Autre (expliquer dans le commentaire),...,,,,,,,,,,
5,acqu√©rir la nationalit√©,Collocation_transparente (8 vs Collocation_opa...,"Collocation_transparente:8, Collocation_opaque:2",Anna,Collocation_transparente,Test_MORPH,Test_V_SPEC,Test_METON_NON,Amalia,Collocation_transparente,...,,,,,,,,,,
6,action humanitaire,Collocation_transparente (2 vs Collocation_opa...,"Collocation_transparente:2, Collocation_opaque:1",Pingping,Collocation_transparente,Test_MODIF,Test_ID_OUI,Test_METON_NON,Anna,Collocation_transparente,...,,,,,,,,,,
7,activit√© humaine,Collocation_transparente (2 vs Collocation_opa...,"Collocation_transparente:2, Collocation_opaque:1",Anna,Collocation_transparente,Test_SEM_REST_OUI,Test_ID_OUI,Test_METON_NON,Devika,Collocation_transparente,...,,,,,,,,,,
8,adresser la parole,Collocation_opaque (3 vs Collocation_transpare...,"Collocation_opaque:3, Collocation_transparente:1",Pingping,Collocation_transparente,Test_MORPH,Test_V_SPEC,Test_METON_NON,Anna,Collocation_opaque,...,,,,,,,,,,
9,agence de publicit√©,Collocation_transparente (7 vs Expression_libr...,"Collocation_transparente:7, Expression_libre:1",Cristian,Collocation_transparente,Test_SEM_REST_OUI,Test_ID_OUI,-----------,Ioana,Collocation_transparente,...,,,,,,,,,,


–ì–æ—Ç–æ–≤–æ.
–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤:
 - Annotations_finales_resolved.xlsx
 - Annotations_finales_resolved.csv


Nom_composé_transparent est converti par défaut en Collocation_transparente, car il s'agit d'une ancienne étiquette d'annotation.

171 cas - pas de majorité

42 cas - une seule annotation

Je préférerais que ces entrées soient annotées avant les autres.