In [42]:
import numpy as np
import pandas as pd

pd.options.plotting.backend = "plotly"

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_ipcw
from sksurv.util import Surv

maf_df = pd.read_csv("../../data/molecular_train.csv")
maf_eval = pd.read_csv("../../data/molecular_val.csv")


In [43]:
import re
import time
import requests
import pandas as pd

# -------------------------------------------------------------------
# 0. df_all = train + eval (seules colonnes nécessaires ici)
# -------------------------------------------------------------------
# On suppose que maf_df et maf_eval existent déjà et contiennent au
# moins les colonnes "GENE" et "PROTEIN_CHANGE".
df_all = pd.concat(
    [
        maf_df[["GENE", "PROTEIN_CHANGE"]].copy(),
        maf_eval[["GENE", "PROTEIN_CHANGE"]].copy(),
    ],
    ignore_index=True,
)

# -------------------------------------------------------------------
# 1. Normalisation des gènes
# -------------------------------------------------------------------

# Mapping d’alias si besoin (à compléter si tu as d'autres alias)
GENE_ALIASES = {
    "MLL": "KMT2A",
    "MLL2": "KMT2D",
    "MLL3": "KMT2C",
    "MLL4": "KMT2B",
}

df_all["GENE_std"] = (
    df_all["GENE"]
    .astype(str)
    .str.strip()
    .str.upper()
    .replace(GENE_ALIASES)
)

# -------------------------------------------------------------------
# 2. Normalisation / parsing des annotations protéiques
# -------------------------------------------------------------------

# Regex pour les annotations nucléotidiques déguisées (splice-like)
splice_like_re = re.compile(
    r"^p\.(\d+(?:\+|-)\d+[ACGT-]*>[ACGT-]+)$"
)

def normalize_protein_change(pc: str | None) -> str | None:
    """
    Met les annotations protéiques dans une forme canonique :
    - enlève les parenthèses p.(...) -> p....
    - ajoute 'p.' si besoin.
    """
    if not isinstance(pc, str):
        return pc
    pc = pc.strip()
    m = re.match(r"^p\.\((.+)\)$", pc)
    if m:
        pc = "p." + m.group(1)
    if not pc:
        return pc
    if pc.startswith("p."):
        return pc
    if re.match(r"^[A-Za-z]\.", pc):
        # cas du style 'p.R132C' déjà correct, ou 'c.123T>G' etc.
        # ici on considère que si ça commence par 'p.' ou 'c.', c'est déjà labellisé.
        # Mais pour rester proche de ton code initial :
        return "p." + pc[2:]
    return "p." + pc


def is_splice_like(pc: str) -> bool:
    """Vrai si c'est une annotation en fait nucléotidique (c.804+1G>A etc.)."""
    m = splice_like_re.match(pc)
    return m is not None


# Regex pour récupérer les positions AA : AA ou '*' suivi d’un nombre
AA_POS_RE = re.compile(r"[A-Z\*](\d+)")


def extract_max_position_from_pc(pc_raw: str | None) -> float | None:
    """
    Renvoie la position AA max trouvée dans une annotation protéique.
    - Ignore les cas indéterminés / spéciaux
    - Ne regarde que les annotations vraiment protéiques
    - Ne prend que les entiers qui suivent un AA ou '*'
    """
    if pc_raw is None:
        return None

    pc = normalize_protein_change(pc_raw)

    if not isinstance(pc, str):
        return None

    # Cas sans effet protéique clair
    if pc in ("p.?", "?"):
        return None

    # Cas spéciaux purement symboliques
    if pc in ("p.FLT3_ITD", "p.MLL_PTD", "FLT3_ITD", "MLL_PTD"):
        return None

    # Cas nucléotidiques déguisés
    if is_splice_like(pc):
        return None

    core = pc[2:] if pc.startswith("p.") else pc

    positions = [int(m.group(1)) for m in AA_POS_RE.finditer(core)]
    if not positions:
        return None

    # Filtre grossier pour éviter des coordonnées aberrantes
    positions = [p for p in positions if p <= 4000]
    if not positions:
        return None

    return max(positions)


# Calcul du max par gène (pour info et fallback)
df_all["max_pos"] = df_all["PROTEIN_CHANGE"].map(extract_max_position_from_pc)

max_pos_by_gene = (
    df_all
    .dropna(subset=["GENE_std", "max_pos"])
    .groupby("GENE_std")["max_pos"]
    .max()
    .to_dict()
)

print("Exemples de max_pos_by_gene :", list(max_pos_by_gene.items())[:10])

# -------------------------------------------------------------------
# 3. Extraction AA_ref + position pour les substitutions simples
# -------------------------------------------------------------------

# p.R201Q, p.Y453*, etc.
AA_SUB_RE = re.compile(r"^p\.([A-Z\*])(\d+)([A-Z\*])$")


def extract_ref_aa_and_pos(pc_raw: str | None) -> tuple[str | None, int | None]:
    """
    Retourne (AA_ref, position) pour les substitutions simples p.R201Q, p.Y453*, etc.
    Sinon (None, None).
    """
    if pc_raw is None:
        return (None, None)

    pc = normalize_protein_change(pc_raw)
    if not isinstance(pc, str):
        return (None, None)

    if pc in ("p.?", "?") or is_splice_like(pc):
        return (None, None)

    m = AA_SUB_RE.match(pc)
    if not m:
        return (None, None)

    aa_ref, pos_str, aa_alt = m.groups()
    return aa_ref, int(pos_str)


# Regroupe des variants simples par gène pour calibrer les isoformes
variants_by_gene: dict[str, list[tuple[str, int]]] = {}

for gene, pc in zip(df_all["GENE_std"], df_all["PROTEIN_CHANGE"]):
    if not isinstance(gene, str):
        continue
    aa_ref, pos = extract_ref_aa_and_pos(pc)
    if aa_ref is None or pos is None:
        continue
    variants_by_gene.setdefault(gene, []).append((aa_ref, pos))

# -------------------------------------------------------------------
# 4. Récupération des séquences UniProt par gène
# -------------------------------------------------------------------

def parse_fasta(text: str) -> list[tuple[str, str]]:
    lines = text.strip().splitlines()
    records: list[tuple[str, str]] = []
    header = None
    seq_chunks = []
    for line in lines:
        if line.startswith(">"):
            if header is not None and seq_chunks:
                records.append((header, "".join(seq_chunks)))
            header = line[1:].strip()
            seq_chunks = []
        else:
            seq_chunks.append(line.strip())
    if header is not None and seq_chunks:
        records.append((header, "".join(seq_chunks)))
    return records


def fetch_uniprot_fasta_for_gene(gene_symbol: str) -> list[tuple[str, str]]:
    """
    Retourne une liste de (header, seq) pour un gène donné (symbol HGNC).
    Essaye d'abord gene_exact:, puis gene: en fallback.
    """
    # Requête "propre"
    base_query = f"gene_exact:{gene_symbol}+AND+organism_id:9606+AND+reviewed:true"
    url = (
        "https://rest.uniprot.org/uniprotkb/search"
        f"?query={base_query}&format=fasta&size=500"
    )
    r = requests.get(url)
    if r.status_code == 200 and r.text.strip():
        return parse_fasta(r.text)

    # Fallback plus permissif (pour des synonymes)
    fallback_query = f"gene:{gene_symbol}+AND+organism_id:9606+AND+reviewed:true"
    url = (
        "https://rest.uniprot.org/uniprotkb/search"
        f"?query={fallback_query}&format=fasta&size=500"
    )
    r = requests.get(url)
    if r.status_code == 200 and r.text.strip():
        return parse_fasta(r.text)

    print(f"[WARN] Pas de résultat clean pour {gene_symbol}, status={r.status_code}")
    return []


# Liste des gènes standardisés
GENES = sorted(df_all["GENE_std"].dropna().unique().tolist())

# -------------------------------------------------------------------
# 5. Choix de la "meilleure" isoforme pour chaque gène
# -------------------------------------------------------------------

def choose_best_isoform_for_gene(gene: str, records: list[tuple[str, str]]) -> str | None:
    """
    records : liste de (header, seq) UniProt
    Choisit l'isoforme qui maximise le nombre de variants simples correctement
    référencés (AA_ref, position).
    Si aucun variant simple utilisable, on se rabat sur un choix par longueur.
    """
    var_list = variants_by_gene.get(gene, [])

    # Aucun variant de calibration : retombera sur ta logique de longueur + max_pos
    if not var_list:
        if not records:
            return None
        seqs = [seq for _, seq in records]
        max_pos = max_pos_by_gene.get(gene)
        if max_pos is not None:
            candidates = [s for s in seqs if len(s) >= max_pos]
            if candidates:
                # parmi les isoformes compatibles, on prend la plus courte
                return min(candidates, key=len)
        # Sinon on prend la plus longue
        return max(seqs, key=len)

    best_seq = None
    best_score = -1

    for header, seq in records:
        score = 0
        for aa_ref, pos in var_list:
            if 1 <= pos <= len(seq) and seq[pos - 1] == aa_ref:
                score += 1
        # on favorise celui qui match le plus de variants;
        # tie-breaker : séquence plus courte
        if score > best_score or (score == best_score and best_seq is not None and len(seq) < len(best_seq)):
            best_score = score
            best_seq = seq

    # Si vraiment aucun match, on retombe sur la logique max_pos/longueur
    if best_seq is None and records:
        seqs = [seq for _, seq in records]
        max_pos = max_pos_by_gene.get(gene)
        if max_pos is not None:
            candidates = [s for s in seqs if len(s) >= max_pos]
            if candidates:
                return min(candidates, key=len)
        return max(seqs, key=len)

    return best_seq


gene_to_protein_seq: dict[str, str] = {}

for g in GENES:
    records = fetch_uniprot_fasta_for_gene(g)
    if not records:
        continue

    best_seq = choose_best_isoform_for_gene(g, records)
    if best_seq is None:
        continue

    gene_to_protein_seq[g] = best_seq
    print(
        f"{g} → len = {len(best_seq)} "
        f"(nb variants de calibration: {len(variants_by_gene.get(g, []))})"
    )

    # on évite de flooder l'API
    time.sleep(0.2)

# -------------------------------------------------------------------
# 6. Flag "aa_ref_ok" : cohérence AA de référence vs séquence UniProt
# -------------------------------------------------------------------

def is_aa_reference_consistent(row) -> bool | None:
    """
    Compare l'AA de référence et la position de PROTEIN_CHANGE à la séquence
    retenue pour ce gène. Renvoie:
    - True si cohérent
    - False si clairement discordant
    - None si non testable (pas de séquence, pas de (AA, pos) exploitable, etc.)
    """
    gene = row["GENE_std"]
    pc = row["PROTEIN_CHANGE"]
    seq = gene_to_protein_seq.get(gene)
    if seq is None:
        return None
    aa_ref, pos = extract_ref_aa_and_pos(pc)
    if aa_ref is None or pos is None:
        return None
    if not (1 <= pos <= len(seq)):
        return False
    return seq[pos - 1] == aa_ref

df_all["aa_ref_ok"] = df_all.apply(is_aa_reference_consistent, axis=1)

print(df_all["aa_ref_ok"].value_counts(dropna=False))

# -------------------------------------------------------------------
# 7. Propagation vers maf_df / maf_eval (optionnel mais pratique)
# -------------------------------------------------------------------

n_train = len(maf_df)
maf_df["aa_ref_ok"] = df_all.loc[: n_train - 1, "aa_ref_ok"].values
maf_eval["aa_ref_ok"] = df_all.loc[n_train:, "aa_ref_ok"].reset_index(drop=True).values

# Tu peux ensuite filtrer les discordances par ex.:
# bad_rows = maf_df[maf_df["aa_ref_ok"] == False]
# ou les comparer avec problem_type == "discordance_aa_reference"


Exemples de max_pos_by_gene : [('ABL1', 258.0), ('ARID1A', 1154.0), ('ARID2', 1776.0), ('ASXL1', 1433.0), ('ASXL2', 1135.0), ('ATM', 3008.0), ('ATRX', 2356.0), ('BCL10', 190.0), ('BCL2', 131.0), ('BCOR', 1745.0)]
ABL1 → len = 1130 (nb variants de calibration: 2)
ARID1A → len = 2285 (nb variants de calibration: 3)
ARID2 → len = 1835 (nb variants de calibration: 15)
ASXL1 → len = 1541 (nb variants de calibration: 292)
ASXL2 → len = 1435 (nb variants de calibration: 18)
ATM → len = 3056 (nb variants de calibration: 6)
ATRX → len = 2492 (nb variants de calibration: 14)
BAP1 → len = 1491 (nb variants de calibration: 0)
BCL10 → len = 233 (nb variants de calibration: 1)
BCL2 → len = 239 (nb variants de calibration: 1)
BCOR → len = 1755 (nb variants de calibration: 104)
BCORL1 → len = 1785 (nb variants de calibration: 49)
BLM → len = 1417 (nb variants de calibration: 3)
BRAF → len = 766 (nb variants de calibration: 25)
BRCA2 → len = 3418 (nb variants de calibration: 4)
BRCC3 → len = 316 (nb va

In [44]:
gene_to_protein_seq

{'ABL1': 'MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQGWVPSNYITPVNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRSISLRYEGRVYHYRINTASDGKLYVSSESRFNTLAELVHHHSTVADGLITTLHYPAPKRNKPTVYGVSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIITEFMTYGNLLDYLRECNRQEVNAVVLLYMATQISSAMEYLEKKNFIHRDLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSISDEVEKELGKQGVRGAVSTLLQAPELPTKTRTSRRAAEHRDTTDVPEMPHSKGQGESDPLDHEPAVSPLLPRKERGPPEGGLNEDERLLPKDKKTNLFSALIKKKKKTAPTPPKRSSSFREMDGQPERRGAGEEEGRDISNGALAFTPLDTADPAKSPKPSNGAGVPNGALRESGGSGFRSPHLWKKSSTLTSSRLATGEEEGGGSSSKRFLRSCSASCVPHGAKDTEWRSVTLPRDLQSTGRQFDSSTFGGHKSEKPALPRKRAGENRSDQVTRGTVTPPPRLVKKNEEAADEVFKDIMESSPGSSPPNLTPKPLRRQVTVAPASGLPHKEEAGKGSALGTPAAAEPVTPTSKAGSGAPGGTSKGPAEESRVRRHKHSSESPGRDKGKLSRLKPAPPPPPAASAGKAGGKPSQSPSQEAAGEAVLGAKTKATSLVDAVNSDAAKPSQPGEGLKKPVLPATPKPQSAKPSGTPISPAPVPSTLPSASS

In [45]:
maf_df["protein_seq"] = maf_df["GENE"].map(gene_to_protein_seq)
maf_eval["protein_seq"] = maf_eval["GENE"].map(gene_to_protein_seq)


print(f"maf_df: {maf_df['protein_seq'].notna().sum()} séquences sur {len(maf_df)} lignes")
print(f"maf_eval: {maf_eval['protein_seq'].notna().sum()} séquences sur {len(maf_eval)} lignes")

maf_df: 10839 séquences sur 10935 lignes
maf_eval: 3056 séquences sur 3089 lignes


In [46]:
import re

# --- Regex HGVS ---------------------------------------------------------

missense_or_nonsense_re = re.compile(r"^p\.([A-Z])(\d+)([A-Z\*])$")
frameshift_re = re.compile(r"^p\.([A-Z])(\d+)fs.*$")
insertion_re = re.compile(r"^p\.([A-Z])(\d+)_([A-Z])(\d+)ins([A-Z]+)$")
delins_re = re.compile(r"^p\.([A-Z])(\d+)_([A-Z])(\d+)delins(\*|[A-Z]+)$")
del_with_seq_re = re.compile(r"^p\.([A-Z])(\d+)_([A-Z])(\d+)del([A-Z]+)$")
simple_stop_re = re.compile(r"^p\.([A-Z])(\d+)\*$")
simple_del_range_re = re.compile(r"^p\.([A-Z])(\d+)_([A-Z])(\d+)del$")
simple_del_single_re = re.compile(r"^p\.([A-Z])(\d+)del$")
simple_del_single_re = re.compile(r"^p\.([A-Z])(\d+)del$")
single_del_with_aa_re = re.compile(r"^p\.([A-Z])(\d+)del([A-Z])$")
insertion_re = re.compile(r"^p\.([A-Z])(\d+)_([A-Z])(\d+)ins(\*|[A-Z]+|[A-Z]+\*)$")
delins_re = re.compile(r"^p\.([A-Z])(\d+)_([A-Z])(\d+)delins(\*|[A-Z]+|[A-Z]+\*)$")
single_delins_re = re.compile(r"^p\.([A-Z])(\d+)delins(\*|[A-Z]+|[A-Z]+\*)$")
stoploss_re = re.compile(r"^p\.\*(\d+)([A-Z])$")
same_stop_re = re.compile(r"^p\.\*(\d+)\*$")
multi_missense_2_re = re.compile(r"^p\.([A-Z])(\d+)([A-Z])_([A-Z])(\d+)([A-Z])$")
synonymous_re = re.compile(r"^p\.([A-Z])(\d+)=?$")
del_with_seq_and_ins_re = re.compile(r"^p\.([A-Z])(\d+)_([A-Z])(\d+)del([A-Z]+)ins(\*|[A-Z]+|[A-Z]+\*)$")
sub_plus_del_re = re.compile(r"^p\.([A-Z])(\d+)([A-Z])_([A-Z])(\d+)del([A-Z]+)$")
sub_plus_delins_re = re.compile(r"^p\.([A-Z])(\d+)([A-Z])_([A-Z])(\d+)del([A-Z]+)ins(\*|[A-Z]+|[A-Z]+\*)$")
flt3_itd_re = re.compile(r"^p\.FLT3_ITD$")
t549re_star_re = re.compile(r"^p\.T549RE\*$")

splice_like_re = re.compile(r"^p\.(\d+(?:\+|-)\d+[ACGT-]*>[ACGT-]+)$")
pos_only_del_range_re = re.compile(r"^p\.(\d+)_(\d+)del$")
pos_only_del_single_re = re.compile(r"^p\.(\d+)del$")





def _normalize_protein_change(protein_change: str) -> str | None:
    """
    Essaie de mettre protein_change dans un format 'p.<...>' :
    - si ça commence déjà par 'p.', on ne touche à rien
    - si ça commence par une autre lettre + '.', on remplace par 'p.'
      ex: 'c.123A>T' -> 'p.123A>T'
    - sinon, on ajoute 'p.' devant
      ex: 'R132C' -> 'p.R132C'
    """
    if not isinstance(protein_change, str):
        return protein_change

    pc = protein_change.strip()

    # enlever des parenthèses autour de la partie après 'p.'
    m = re.match(r"^p\.\((.+)\)$", pc)
    if m:
        pc = "p." + m.group(1)

    if not pc:
        return pc

    if pc.startswith("p."):
        return pc

    # Cas 'c.', 'g.', 'm.' etc. → faire comme si c'était un 'p.'
    if re.match(r"^[A-Za-z]\.", pc):
        # 'c.123A>T' -> 'p.123A>T' : on enlève la 1ère lettre
        return "p." + pc[2:]

    # Cas sans préfixe du tout : 'R132C', 'S810fs*?' etc.
    return "p." + pc


def _context_around(seq: str, pos: int, window: int = 10) -> str:
    """
    Retourne un voisinage de la séquence autour d'une position 1-based.
    Longueur max ~window (<= window aux extrémités).
    """
    if not seq:
        return ""
    half = window // 2
    center = pos - 1  # pos est 1-based
    start = max(0, center - half)
    end = min(len(seq), center + half)
    return seq[start:end]


def _log_problem(
    problems: dict,
    problem_type: str,
    gene,
    protein_change,
    line_info=None,
    extra: dict | None = None,
):
    """
    Ajoute une entrée dans le dict de problèmes.
    """
    if problems is None:
        return
    entry = {
        "gene": gene,
        "protein_change": protein_change,
        "line": line_info,
    }
    if extra:
        entry.update(extra)
    problems.setdefault(problem_type, []).append(entry)


def apply_protein_change(
    protein_seq: str,
    protein_change: str,
    line_info=None,
    gene=None,
    problems: dict | None = None,
    context_window: int = 10,
) -> str:
    """
    Applique une mutation HGVS protéique sur une séquence d'acides aminés.
    Retourne la séquence mutée.
    """

    # On garde la valeur originale pour debug si besoin
    original_protein_change = protein_change

        # Normalisation "intelligente" du préfixe
    protein_change = _normalize_protein_change(protein_change)

    
    if not isinstance(protein_change, str) or not protein_change.startswith("p."):
        _log_problem(
            problems,
            "mutation_hgvs_non_valide",
            gene,
            original_protein_change,
            line_info=line_info,
            extra={"protein_change_normalized": protein_change},
        )
        return protein_seq

    # Cas "p.?" : effet protéique indéterminé → on ne touche pas à la séquence
    if protein_change.strip() in ("p.?", "?"):
        _log_problem(
            problems,
            "protein_change_indetermine",
            gene,
            original_protein_change,
            line_info=line_info,
        )
        return protein_seq


        # Cas d'annotation en réalité nucléotidique de type c.804+1G>A
    # mais préfixée par "p."
    m_splice_like = splice_like_re.match(protein_change)
    if m_splice_like:
        _log_problem(
            problems,
            "protein_change_indetermine",
            gene,
            original_protein_change,
            line_info=line_info,
            extra={"raison": "pattern_splice_nucleotidique"},
        )
        return protein_seq



    # 1. Missense / nonsense : p.R132C ou p.S770*
    m_missense = missense_or_nonsense_re.match(protein_change)
    if m_missense:
        aa_ref, pos_str, aa_alt = m_missense.groups()
        pos = int(pos_str)

        if pos < 1 or pos > len(protein_seq):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "seq_len": len(protein_seq),
                    "type": "missense/nonsense",
                },
            )
            return protein_seq

        if protein_seq[pos - 1] != aa_ref:
            ctx = _context_around(protein_seq, pos, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "aa_ref_hgvs": aa_ref,
                    "aa_seq": protein_seq[pos - 1],
                    "context": ctx,
                    "type": "missense/nonsense",
                },
            )
        if aa_alt == "*":
            return protein_seq[: pos - 1]
        return protein_seq[: pos - 1] + aa_alt + protein_seq[pos:]

    # 2. Frameshift : p.H1912fs*38, p.S810fs*?, p.Y631fs*>5, etc.
    m_fs = frameshift_re.match(protein_change)
    if m_fs:
        aa_ref, pos_str = m_fs.groups()
        pos = int(pos_str)

        if pos < 1 or pos > len(protein_seq):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "seq_len": len(protein_seq),
                    "type": "frameshift",
                },
            )
            return protein_seq

        if protein_seq[pos - 1] != aa_ref:
            ctx = _context_around(protein_seq, pos, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "aa_ref_hgvs": aa_ref,
                    "aa_seq": protein_seq[pos - 1],
                    "context": ctx,
                    "type": "frameshift",
                },
            )

        # Modèle simple : on tronque à l'AA précédent
        return protein_seq[: pos - 1]
    
        # Missense double : p.Q699H_K700E, p.F531L_P532S, etc.
    m_multi2 = multi_missense_2_re.match(protein_change)
    if m_multi2:
        aa1_ref, pos1_str, aa1_alt, aa2_ref, pos2_str, aa2_alt = m_multi2.groups()
        pos1 = int(pos1_str)
        pos2 = int(pos2_str)

        if (pos1 < 1 or pos1 > len(protein_seq) or
            pos2 < 1 or pos2 > len(protein_seq) or
            pos1 == pos2):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "seq_len": len(protein_seq),
                    "type": "missense_double",
                },
            )
            return protein_seq

        # Vérif AA de référence
        aa_seq1 = protein_seq[pos1 - 1]
        aa_seq2 = protein_seq[pos2 - 1]
        if aa_seq1 != aa1_ref or aa_seq2 != aa2_ref:
            ctx1 = _context_around(protein_seq, pos1, context_window)
            ctx2 = _context_around(protein_seq, pos2, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "aa1_hgvs": aa1_ref,
                    "aa2_hgvs": aa2_ref,
                    "aa1_seq": aa_seq1,
                    "aa2_seq": aa_seq2,
                    "context_pos1": ctx1,
                    "context_pos2": ctx2,
                    "type": "missense_double",
                },
            )

        # Application des deux substitutions
        seq_list = list(protein_seq)
        seq_list[pos1 - 1] = aa1_alt
        seq_list[pos2 - 1] = aa2_alt
        return "".join(seq_list)


    # Stop-loss / extension : p.*342S, p.*636C
    m_stoploss = stoploss_re.match(protein_change)
    if m_stoploss:
        pos_str, aa_alt = m_stoploss.groups()
        pos = int(pos_str)

        # On ne sait pas rallonger la séquence (pas de séquence aval),
        # donc on laisse la protéine inchangée mais on log un problème dédié.
        _log_problem(
            problems,
            "stop_loss_non_implemente",
            gene,
            protein_change,
            line_info=line_info,
            extra={
                "pos": pos,
                "aa_alt": aa_alt,
            },
        )
        return protein_seq
    
        # Substitution + deletion du codon suivant : p.K139N_T140delT
    m_sub_del = sub_plus_del_re.match(protein_change)
    if m_sub_del:
        aa1_ref, pos1_str, aa1_alt, aa2_ref, pos2_str, deleted_seq = m_sub_del.groups()
        pos1 = int(pos1_str)
        pos2 = int(pos2_str)

        if (pos1 < 1 or pos1 > len(protein_seq) or
            pos2 < 1 or pos2 > len(protein_seq) or
            pos2 <= pos1):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "seq_len": len(protein_seq),
                    "type": "sub_plus_del",
                },
            )
            return protein_seq

        aa_seq1 = protein_seq[pos1 - 1]
        aa_seq2 = protein_seq[pos2 - 1]
        region = protein_seq[pos2 - 1: pos2 - 1 + len(deleted_seq)]

        if aa_seq1 != aa1_ref or aa_seq2 != aa2_ref or region != deleted_seq:
            ctx1 = _context_around(protein_seq, pos1, context_window)
            ctx2 = _context_around(protein_seq, pos2, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "aa1_hgvs": aa1_ref,
                    "aa2_hgvs": aa2_ref,
                    "aa1_seq": aa_seq1,
                    "aa2_seq": aa_seq2,
                    "region_seq": region,
                    "deleted_seq_hgvs": deleted_seq,
                    "context_pos1": ctx1,
                    "context_pos2": ctx2,
                    "type": "sub_plus_del",
                },
            )

        # On applique les deux : substitution puis deletion
        seq_list = list(protein_seq)
        seq_list[pos1 - 1] = aa1_alt

        start_del = pos2 - 1
        end_del = start_del + len(deleted_seq)
        seq_list = seq_list[:start_del] + seq_list[end_del:]

        return "".join(seq_list)

        # Substitution + delins du codon suivant : p.R702G_T703delTinsG
    m_sub_delins = sub_plus_delins_re.match(protein_change)
    if m_sub_delins:
        aa1_ref, pos1_str, aa1_alt, aa2_ref, pos2_str, deleted_seq, inserted = m_sub_delins.groups()
        pos1 = int(pos1_str)
        pos2 = int(pos2_str)

        if (pos1 < 1 or pos1 > len(protein_seq) or
            pos2 < 1 or pos2 > len(protein_seq) or
            pos2 <= pos1):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "seq_len": len(protein_seq),
                    "type": "sub_plus_delins",
                },
            )
            return protein_seq

        aa_seq1 = protein_seq[pos1 - 1]
        aa_seq2 = protein_seq[pos2 - 1]
        region = protein_seq[pos2 - 1: pos2 - 1 + len(deleted_seq)]

        if aa_seq1 != aa1_ref or aa_seq2 != aa2_ref or region != deleted_seq:
            ctx1 = _context_around(protein_seq, pos1, context_window)
            ctx2 = _context_around(protein_seq, pos2, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "aa1_hgvs": aa1_ref,
                    "aa2_hgvs": aa2_ref,
                    "aa1_seq": aa_seq1,
                    "aa2_seq": aa_seq2,
                    "region_seq": region,
                    "deleted_seq_hgvs": deleted_seq,
                    "context_pos1": ctx1,
                    "context_pos2": ctx2,
                    "type": "sub_plus_delins",
                },
            )

        seq_list = list(protein_seq)
        seq_list[pos1 - 1] = aa1_alt

        start_del = pos2 - 1
        end_del = start_del + len(deleted_seq)

        # Cas STOP : comme pour les autres delins
        if inserted == "*":
            return "".join(seq_list[: pos1])  # STOP après pos1

        if inserted.endswith("*"):
            inserted_aa = inserted[:-1]
            seq_left = "".join(seq_list[: pos1]) + inserted_aa
            return seq_left  # tronqué au STOP

        # Cas sans STOP
        seq_left = "".join(seq_list[: start_del])
        seq_right = "".join(seq_list[end_del:])
        return seq_left + inserted + seq_right



        # 3. Insertion : p.L98_C99insNFL, p.G1192_C1193ins*, p.G1192_C1193insHR*
    m_ins = insertion_re.match(protein_change)
    if m_ins:
        aa1, pos1_str, aa2, pos2_str, inserted = m_ins.groups()
        pos1 = int(pos1_str)
        pos2 = int(pos2_str)

        if pos1 < 1 or pos1 > len(protein_seq) or pos2 < 1 or pos2 > len(protein_seq):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "seq_len": len(protein_seq),
                    "type": "insertion",
                },
            )
            return protein_seq

        if protein_seq[pos1 - 1] != aa1 or protein_seq[pos2 - 1] != aa2:
            ctx1 = _context_around(protein_seq, pos1, context_window)
            ctx2 = _context_around(protein_seq, pos2, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "aa1_hgvs": aa1,
                    "aa2_hgvs": aa2,
                    "aa1_seq": protein_seq[pos1 - 1],
                    "aa2_seq": protein_seq[pos2 - 1],
                    "context_pos1": ctx1,
                    "context_pos2": ctx2,
                    "type": "insertion",
                },
            )

        prefix = protein_seq[:pos1]

        # Cas insertion avec STOP : ins* ou insHR*
        if inserted == "*":
            # STOP directement après aa1 → on tronque après pos1
            return prefix
        if inserted.endswith("*"):
            inserted_aa = inserted[:-1]  # ex: "HR" dans "HR*"
            return prefix + inserted_aa  # STOP ensuite → on tronque ici

        # Cas classique sans STOP
        suffix = protein_seq[pos1:]
        return prefix + inserted + suffix


        # 4. delins : p.L614_K618delins*, p.L614_K618delinsABC, p.Q430_K434delinsHR*
    m_delins = delins_re.match(protein_change)
    if m_delins:
        aa1, pos1_str, aa2, pos2_str, inserted = m_delins.groups()
        pos1 = int(pos1_str)
        pos2 = int(pos2_str)

        if pos1 < 1 or pos2 > len(protein_seq) or pos1 > pos2:
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "seq_len": len(protein_seq),
                    "type": "delins",
                },
            )
            return protein_seq

        if protein_seq[pos1 - 1] != aa1 or protein_seq[pos2 - 1] != aa2:
            ctx1 = _context_around(protein_seq, pos1, context_window)
            ctx2 = _context_around(protein_seq, pos2, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "aa1_hgvs": aa1,
                    "aa2_hgvs": aa2,
                    "aa1_seq": protein_seq[pos1 - 1],
                    "aa2_seq": protein_seq[pos2 - 1],
                    "context_pos1": ctx1,
                    "context_pos2": ctx2,
                    "type": "delins",
                },
            )

        # Cas delins* → on met un STOP à pos1
        if inserted == "*":
            return protein_seq[: pos1 - 1]

        # Cas delinsXYZ* → on remplace par XYZ puis STOP
        if inserted.endswith("*"):
            inserted_aa = inserted[:-1]
            return protein_seq[: pos1 - 1] + inserted_aa

        # Cas classique sans STOP : on remplace [pos1, pos2] par inserted
        return protein_seq[: pos1 - 1] + inserted + protein_seq[pos2:]


        # 4bis. delins sur un seul AA : p.Y301delins*, p.Y301delinsHR*
    m_single_delins = single_delins_re.match(protein_change)
    if m_single_delins:
        aa_ref, pos_str, inserted = m_single_delins.groups()
        pos = int(pos_str)

        if pos < 1 or pos > len(protein_seq):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "seq_len": len(protein_seq),
                    "type": "delins_single",
                },
            )
            return protein_seq

        if protein_seq[pos - 1] != aa_ref:
            ctx = _context_around(protein_seq, pos, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "aa_ref_hgvs": aa_ref,
                    "aa_seq": protein_seq[pos - 1],
                    "context": ctx,
                    "type": "delins_single",
                },
            )

        if inserted == "*":
            return protein_seq[: pos - 1]

        if inserted.endswith("*"):
            inserted_aa = inserted[:-1]
            return protein_seq[: pos - 1] + inserted_aa

        return protein_seq[: pos - 1] + inserted + protein_seq[pos:]

    # 4ter. delins avec séquence explicite : p.V1900_Y1902delVFYinsH
    m_del_with_seq_and_ins = del_with_seq_and_ins_re.match(protein_change)
    if m_del_with_seq_and_ins:
        aa1, pos1_str, aa2, pos2_str, deleted_seq, inserted = m_del_with_seq_and_ins.groups()
        pos1 = int(pos1_str)
        pos2 = int(pos2_str)

        if pos1 < 1 or pos2 > len(protein_seq) or pos1 > pos2:
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "seq_len": len(protein_seq),
                    "type": "delins_avec_sequence",
                },
            )
            return protein_seq

        region = protein_seq[pos1 - 1:pos2]
        if protein_seq[pos1 - 1] != aa1 or protein_seq[pos2 - 1] != aa2 or region != deleted_seq:
            ctx = _context_around(protein_seq, pos1, context_window)
            _log_problem(
                problems,
                "discordance_sequence_supprimee",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "region_seq": region,
                    "deleted_seq_hgvs": deleted_seq,
                    "aa1_hgvs": aa1,
                    "aa2_hgvs": aa2,
                    "aa1_seq": protein_seq[pos1 - 1],
                    "aa2_seq": protein_seq[pos2 - 1],
                    "context": ctx,
                    "type": "delins_avec_sequence",
                },
            )

        # Cas delins* → STOP à pos1
        if inserted == "*":
            return protein_seq[: pos1 - 1]

        # Cas delinsXYZ* → remplacement par XYZ puis STOP
        if inserted.endswith("*"):
            inserted_aa = inserted[:-1]
            return protein_seq[: pos1 - 1] + inserted_aa

        # Cas classique sans STOP : on remplace [pos1, pos2] par inserted
        return protein_seq[: pos1 - 1] + inserted + protein_seq[pos2:]

    # 5. Deletion simple : p.A78_R83del
    m_del = simple_del_range_re.match(protein_change)
    if m_del:
        aa1, pos1_str, aa2, pos2_str = m_del.groups()
        pos1 = int(pos1_str)
        pos2 = int(pos2_str)

        if pos1 < 1 or pos2 > len(protein_seq) or pos1 > pos2:
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "seq_len": len(protein_seq),
                    "type": "deletion_simple",
                },
            )
            return protein_seq

        if protein_seq[pos1 - 1] != aa1 or protein_seq[pos2 - 1] != aa2:
            ctx1 = _context_around(protein_seq, pos1, context_window)
            ctx2 = _context_around(protein_seq, pos2, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "aa1_hgvs": aa1,
                    "aa2_hgvs": aa2,
                    "aa1_seq": protein_seq[pos1 - 1],
                    "aa2_seq": protein_seq[pos2 - 1],
                    "context_pos1": ctx1,
                    "context_pos2": ctx2,
                    "type": "deletion_simple",
                },
            )

        return protein_seq[: pos1 - 1] + protein_seq[pos2:]

    # 5bis. Deletion simple sur un seul AA : p.A123del
    m_del_single = simple_del_single_re.match(protein_change)
    if m_del_single:
        aa_ref, pos_str = m_del_single.groups()
        pos = int(pos_str)

        if pos < 1 or pos > len(protein_seq):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "seq_len": len(protein_seq),
                    "type": "deletion_simple_single",
                },
            )
            return protein_seq

        if protein_seq[pos - 1] != aa_ref:
            ctx = _context_around(protein_seq, pos, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "aa_ref_hgvs": aa_ref,
                    "aa_seq": protein_seq[pos - 1],
                    "context": ctx,
                    "type": "deletion_simple_single",
                },
            )

        # on enlève l'AA à la position pos
        return protein_seq[: pos - 1] + protein_seq[pos:]
    
    # 5ter. Deletion simple sur un seul AA avec AA répété : p.A123delA
    m_del_single_with_aa = single_del_with_aa_re.match(protein_change)
    if m_del_single_with_aa:
        aa_ref, pos_str, aa_del = m_del_single_with_aa.groups()
        pos = int(pos_str)

        if pos < 1 or pos > len(protein_seq):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "seq_len": len(protein_seq),
                    "type": "deletion_simple_single_with_aa",
                },
            )
            return protein_seq

        # Vérif : l'AA de référence et l'AA indiqué après 'del' devraient être identiques
        if aa_ref != aa_del:
            _log_problem(
                problems,
                "incoherence_aa_del",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "aa_ref_hgvs": aa_ref,
                    "aa_del_hgvs": aa_del,
                    "type": "deletion_simple_single_with_aa",
                },
            )

        # Vérif de cohérence avec la séquence
        if protein_seq[pos - 1] != aa_ref:
            ctx = _context_around(protein_seq, pos, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "aa_ref_hgvs": aa_ref,
                    "aa_seq": protein_seq[pos - 1],
                    "context": ctx,
                    "type": "deletion_simple_single_with_aa",
                },
            )

        # On enlève l'AA à la position pos
        return protein_seq[: pos - 1] + protein_seq[pos:]

    # 5quater. Deletion avec positions seules : p.58_59del, p.390_390del, p.123del
    m_pos_only_del_range = pos_only_del_range_re.match(protein_change)
    if m_pos_only_del_range:
        pos1_str, pos2_str = m_pos_only_del_range.groups()
        pos1 = int(pos1_str)
        pos2 = int(pos2_str)

        if pos1 < 1 or pos2 > len(protein_seq) or pos1 > pos2:
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "seq_len": len(protein_seq),
                    "type": "deletion_simple_pos_only",
                },
            )
            return protein_seq

        # Pas de vérif AA possible -> on applique juste la délétion
        return protein_seq[: pos1 - 1] + protein_seq[pos2:]

    m_pos_only_del_single = pos_only_del_single_re.match(protein_change)
    if m_pos_only_del_single:
        pos = int(m_pos_only_del_single.group(1))

        if pos < 1 or pos > len(protein_seq):
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "seq_len": len(protein_seq),
                    "type": "deletion_simple_pos_only_single",
                },
            )
            return protein_seq

        return protein_seq[: pos - 1] + protein_seq[pos:]

    # 6. Deletion avec séquence explicite : p.A78_R83delAVLDGR
    m_del_with_seq = del_with_seq_re.match(protein_change)
    if m_del_with_seq:
        aa1, pos1_str, aa2, pos2_str, deleted_seq = m_del_with_seq.groups()
        pos1 = int(pos1_str)
        pos2 = int(pos2_str)

        if pos1 < 1 or pos2 > len(protein_seq) or pos1 > pos2:
            _log_problem(
                problems,
                "position_hors_sequence",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "seq_len": len(protein_seq),
                    "type": "deletion_avec_sequence",
                },
            )
            return protein_seq

        region = protein_seq[pos1 - 1:pos2]
        if region != deleted_seq:
            ctx = _context_around(protein_seq, pos1, context_window)
            _log_problem(
                problems,
                "discordance_sequence_supprimee",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos1": pos1,
                    "pos2": pos2,
                    "region_seq": region,
                    "deleted_seq_hgvs": deleted_seq,
                    "context": ctx,
                    "type": "deletion_avec_sequence",
                },
            )

        return protein_seq[: pos1 - 1] + protein_seq[pos2:]

        # Stop conservé : p.*342*
    m_same_stop = same_stop_re.match(protein_change)
    if m_same_stop:
        pos_str = m_same_stop.group(1)
        pos = int(pos_str)

        # Optionnel : vérifier que la longueur de la protéine colle à peu près
        # Ici on considère que c'est "no-op"
        _log_problem(
            problems,
            "mutation_silencieuse_stop",
            gene,
            protein_change,
            line_info=line_info,
            extra={"pos": pos},
        )
        return protein_seq

    m_syn = synonymous_re.match(protein_change)
    if m_syn:
        aa_ref, pos_str = m_syn.groups()
        pos = int(pos_str)
        # Optionnel : vérifier que l’AA est bien le même
        if 1 <= pos <= len(protein_seq) and protein_seq[pos - 1] != aa_ref:
            ctx = _context_around(protein_seq, pos, context_window)
            _log_problem(
                problems,
                "discordance_aa_reference",
                gene,
                protein_change,
                line_info=line_info,
                extra={
                    "pos": pos,
                    "aa_ref_hgvs": aa_ref,
                    "aa_seq": protein_seq[pos - 1],
                    "context": ctx,
                    "type": "synonyme",
                },
            )
        # Pas d’effet protéique : on renvoie la séquence telle quelle
        return protein_seq


        # ITD de FLT3 sans détail : p.FLT3_ITD
    m_flt3_itd = flt3_itd_re.match(protein_change)
    if m_flt3_itd:
        _log_problem(
            problems,
            "itd_non_implemente",
            gene,
            protein_change,
            line_info=line_info,
            extra={"type": "FLT3_ITD"},
        )
        # Effet impossible à appliquer → on renvoie la séquence inchangée
        return protein_seq

    m_t549 = t549re_star_re.match(protein_change)
    if m_t549:
        _log_problem(
            problems,
            "notation_non_standard",
            gene,
            protein_change,
            line_info=line_info,
            extra={"comment": "pattern T549RE* non standard / ambigu"},
        )
        return protein_seq


    # 7. Cas non gérés
    _log_problem(
        problems,
        "pattern_hgvs_non_gere",
        gene,
        protein_change,
        line_info=line_info,
    )
    return protein_seq


In [47]:
def mutate_row(row, dataset):
    if pd.isna(row['protein_seq']) or pd.isna(row['PROTEIN_CHANGE']):
        return None
    
    return apply_protein_change(
        row['protein_seq'],
        row['PROTEIN_CHANGE'],
        line_info=(dataset, row.name),  # on stocke (dataset, index)
        gene=row['GENE'],
        problems=problems,
        context_window=10,
    )

# Application
problems = {}
maf_df["mutated_protein_seq"] = maf_df.apply(lambda r: mutate_row(r, "train"), axis=1)
maf_eval["mutated_protein_seq"] = maf_eval.apply(lambda r: mutate_row(r, "eval"), axis=1)


In [48]:
import pandas as pd

records = []
for problem_type, entries in problems.items():
    for e in entries:
        dataset, line_idx = e.get("line")  # tuple: ("train" ou "eval", index)
        records.append({
            "problem_type": problem_type,
            "GENE": e.get("gene"),
            "line": line_idx,
            "dataset": dataset,
        })

problems_df = pd.DataFrame(records)



# Extraire les sets de gènes
genes_train = set(problems_df.loc[problems_df["dataset"] == "train", "GENE"])
genes_eval  = set(problems_df.loc[problems_df["dataset"] == "eval",  "GENE"])

# Gènes communs et exclusifs
genes_communs = genes_train & genes_eval
genes_only_train = genes_train - genes_eval
genes_only_eval  = genes_eval - genes_train

print("Nombre de gènes problématiques dans TRAIN :", len(genes_train))
print("Nombre de gènes problématiques dans EVAL  :", len(genes_eval))
print("Nombre de gènes problématiques communs    :", len(genes_communs))
print()
print("Gènes problématiques communs :", genes_communs)
print()
print("Gènes problématiques uniquement dans TRAIN :", genes_only_train)
print()
print("Gènes problématiques uniquement dans EVAL :", genes_only_eval)

nb_train = (problems_df["dataset"] == "train").sum()
nb_eval = (problems_df["dataset"] == "eval").sum()

print("Nombre de lignes problématiques dans TRAIN :", nb_train)
print("Nombre de lignes problématiques dans EVAL  :", nb_eval)



Nombre de gènes problématiques dans TRAIN : 58
Nombre de gènes problématiques dans EVAL  : 22
Nombre de gènes problématiques communs    : 16

Gènes problématiques communs : {'TP53', 'ETV6', 'FLT3', 'DDX41', 'TET2', 'BCOR', 'PHF6', 'ZRSR2', 'EZH2', 'STAG2', 'CREBBP', 'DNMT3A', 'NF1', 'BCORL1', 'WT1', 'RUNX1'}

Gènes problématiques uniquement dans TRAIN : {'KMT2C', 'RAD50', 'CHEK2', 'STAG1', 'ATRX', 'ARID1A', 'CDKN2A', 'GATA2', 'SMG1', 'SETD2', 'SUZ12', 'KDM6A', 'MPL', 'PAPD5', 'ARID2', 'DDX54', 'SAMHD1', 'CSF3R', 'EP300', 'BAP1', 'KDM5C', 'SH2B3', 'BRCC3', 'LUC7L2', 'ROBO2', 'GATA1', 'MGA', 'SMC1A', 'NFE2', 'RB1', 'IRF1', 'CSNK1A1', 'SF1', 'CDKN2B', 'CTCF', 'CBL', 'ASXL1', 'CUX1', 'RAD21', 'KMT2D', 'CDKN1B', 'ABL1'}

Gènes problématiques uniquement dans EVAL : {'FANCG', 'GNAS', 'ETNK1', 'NSD1', 'SMC3', 'PIGA'}
Nombre de lignes problématiques dans TRAIN : 1551
Nombre de lignes problématiques dans EVAL  : 433


In [49]:
summary = (
    problems_df
    .groupby(["problem_type", "dataset"])
    .agg(
        n_genes=("GENE", "nunique"),   # nombre de gènes distincts
        n_lines=("line", "nunique"),   # nombre de lignes distinctes
    )
    .reset_index()
)

print(summary)


                      problem_type dataset  n_genes  n_lines
0         discordance_aa_reference    eval        9      310
1         discordance_aa_reference   train       22      966
2   discordance_sequence_supprimee   train        3        3
3               itd_non_implemente    eval        1       36
4               itd_non_implemente   train        1       26
5        mutation_silencieuse_stop   train        1        1
6            notation_non_standard   train        1        1
7           position_hors_sequence    eval        2        7
8           position_hors_sequence   train        5       22
9       protein_change_indetermine    eval       16       80
10      protein_change_indetermine   train       46      529
11        stop_loss_non_implemente   train        2        3


In [50]:
max_examples = 5

for problem_type, entries in problems.items():
    print(f"\n==============================")
    print(f"Problème : {problem_type}")
    print(f"Nombre total d'occurrences : {len(entries)}")

    # Filtrer les cas p.? et variantes équivalentes
    filtered = [e for e in entries if e.get("protein_change") not in ("p.?", "?")]

    # Filtre spécifique pour pattern_hgvs_non_gere
    if problem_type == "pattern_hgvs_non_gere":
        filtered = [e for e in filtered if e.get("protein_change") != "p.MLL_PTD"]

    # Si tout est filtré, le signaler explicitement
    if not filtered:
        print("Aucun exemple (tous les cas sont filtrés).")
        continue

    for e in filtered[:max_examples]:
        dataset, line_idx = e.get("line", (None, None))
        gene = e.get("gene")
        pc = e.get("protein_change")

        extra_keys = [k for k in e.keys() if k not in {"gene", "protein_change", "line"}]
        extra_info = {k: e[k] for k in extra_keys}

        print(f"- dataset={dataset}, ligne={line_idx}, GENE={gene}, protein_change={pc}")
        if extra_info:
            print(f"  → infos supplémentaires : {extra_info}")



Problème : protein_change_indetermine
Nombre total d'occurrences : 609
- dataset=eval, ligne=109, GENE=SMC3, protein_change=c.804+1G>A
  → infos supplémentaires : {'raison': 'pattern_splice_nucleotidique'}
- dataset=eval, ligne=125, GENE=WT1, protein_change=c.1198+1G>A
  → infos supplémentaires : {'raison': 'pattern_splice_nucleotidique'}
- dataset=eval, ligne=157, GENE=WT1, protein_change=c.872+1G>A
  → infos supplémentaires : {'raison': 'pattern_splice_nucleotidique'}
- dataset=eval, ligne=171, GENE=WT1, protein_change=c.646+1G>A
  → infos supplémentaires : {'raison': 'pattern_splice_nucleotidique'}
- dataset=eval, ligne=231, GENE=ETV6, protein_change=c.163+1G>A
  → infos supplémentaires : {'raison': 'pattern_splice_nucleotidique'}

Problème : discordance_aa_reference
Nombre total d'occurrences : 1276
- dataset=train, ligne=5, GENE=CHEK2, protein_change=p.W454*
  → infos supplémentaires : {'pos': 454, 'aa_ref_hgvs': 'W', 'aa_seq': 'E', 'context': 'PEVWAEVSEK', 'type': 'missense/nons

In [51]:
for problem_type, entries in problems.items():
    for e in entries:
        pc = e.get("protein_change")

        # On saute les cas p.MLL_PTD, p.? et ? 
        if pc in ("p.MLL_PTD", "p.?", "?"):
            continue

        dataset, line_idx = e.get("line", (None, None))
        gene = e.get("gene")

        print(
            f"problem_type={problem_type} | "
            f"dataset={dataset} | line={line_idx} | "
            f"GENE={gene} | protein_change={pc}"
        )


problem_type=protein_change_indetermine | dataset=eval | line=109 | GENE=SMC3 | protein_change=c.804+1G>A
problem_type=protein_change_indetermine | dataset=eval | line=125 | GENE=WT1 | protein_change=c.1198+1G>A
problem_type=protein_change_indetermine | dataset=eval | line=157 | GENE=WT1 | protein_change=c.872+1G>A
problem_type=protein_change_indetermine | dataset=eval | line=171 | GENE=WT1 | protein_change=c.646+1G>A
problem_type=protein_change_indetermine | dataset=eval | line=231 | GENE=ETV6 | protein_change=c.163+1G>A
problem_type=protein_change_indetermine | dataset=eval | line=232 | GENE=ETV6 | protein_change=c.164-2A>-
problem_type=protein_change_indetermine | dataset=eval | line=247 | GENE=ETV6 | protein_change=c.1153-1G>A
problem_type=protein_change_indetermine | dataset=eval | line=252 | GENE=ETV6 | protein_change=c.1254-2A>G
problem_type=protein_change_indetermine | dataset=eval | line=253 | GENE=ETV6 | protein_change=c.1254-2A>G
problem_type=protein_change_indetermine | dat

In [52]:
import pandas as pd

def _get_seq(dataset, idx):
    if dataset == "train":
        return maf_df.loc[idx, "protein_seq"]
    elif dataset == "eval":
        return maf_eval.loc[idx, "protein_seq"]
    return None

for e in problems.get("discordance_aa_reference", []):
    gene = e.get("gene")
    pc = e.get("protein_change")
    dataset, line_idx = e.get("line", (None, None))

    seq = _get_seq(dataset, line_idx)
    if seq is None or pd.isna(seq):
        print(f"{gene} {pc} : séquence introuvable (dataset={dataset}, line={line_idx})")
        continue

    # Récupérer les positions impliquées dans la discordance
    positions = []
    if "pos" in e:   # cas simples
        positions.append(int(e["pos"]))
    if "pos1" in e:  # cas doubles (missense double, sub+del, sub+delins…)
        positions.append(int(e["pos1"]))
    if "pos2" in e:
        positions.append(int(e["pos2"]))

    # On peut avoir plusieurs positions pour un même problème
    for pos in sorted(set(positions)):
        if pos < 1 or pos > len(seq):
            print(f"{gene} {pc} (pos {pos}) : position hors séquence")
            continue

        center = pos - 1  # 0-based
        start = max(center - 5, 0)
        end   = min(center + 5, len(seq) - 1)

        ctx = seq[start:end + 1]
        center_in_ctx = center - start

        # 5 AA à gauche / 5 à droite (moins si on est proche d'une extrémité)
        left  = ctx[max(0, center_in_ctx - 5): center_in_ctx]
        aa_c  = seq[center]
        right = ctx[center_in_ctx + 1: center_in_ctx + 1 + 5]

        print(f"{gene} {pc} (pos {pos}) : {left}[{aa_c}]{right}")


CHEK2 p.W454* (pos 454) : PEVWA[E]VSEKA
EZH2 p.N322S (pos 322) : NKPCG[P]QCYQH
CSF3R p.S810fs*6 (pos 810) : PSQED[D]CVFGP
RUNX1 p.K215fs*22 (pos 215) : RVSPH[H]PAPTP
RUNX1 p.S141W (pos 141) : FVGRS[G]RGKSF
RUNX1 p.R201* (pos 201) : ERLSE[L]EQLRR
RUNX1 p.Q335* (pos 335) : PTPVT[S]GIGIG
RUNX1 p.G170fs*7 (pos 170) : AIKIT[V]DGPRE
ROBO2 p.G1309E (pos 1309) : HRREG[M]TDEEA
WT1 p.P249S (pos 249) : GATLK[G]VAAGS
CUX1 p.R1223W (pos 1223) : SDSQP[C]EPPSV
RUNX1 p.H190fs*24 (pos 190) : DQTKP[G]SLSFS
EZH2 p.N376fs*11 (pos 376) : NVLES[K]DTDSD
WT1 p.A382fs*6 (pos 382) : HTGVK[P]FQCKT
RUNX1 p.D123fs*11 (pos 123) : RNATA[A]MKNQV
EZH2 p.T350fs*74 (pos 350) : TPPKR[P]GGRRR
EZH2 p.N699fs*8 (pos 699) : NCYAK[V]MMVNG
EZH2 p.S669C (pos 669) : SFLFN[L]NNDFV
EZH2 p.R583* (pos 583) : RECDP[D]LCLTC
LUC7L2 p.V128fs*6 (pos 128) : AERVH[E]LNEEI
RUNX1 p.D198N (pos 198) : SFSER[L]SELEQ
RUNX1 p.G199fs*17 (pos 199) : FSERL[S]ELEQL
RUNX1 p.R162K (pos 162) : PQVAT[Y]HRAIK
EZH2 p.R469* (pos 469) : RQVYE[F]RVKES
RUNX1 p.

In [53]:
maf_df["mutated"] = (maf_df["mutated_protein_seq"] != maf_df["protein_seq"]).astype(int)
maf_eval["mutated"] = (maf_eval["mutated_protein_seq"] != maf_eval["protein_seq"]).astype(int)

In [55]:
maf_df[["mutated"]].value_counts()

mutated
1          10342
0            593
Name: count, dtype: int64

In [54]:
maf_df.to_csv("../../data/molecular_train_with_mutations.csv")
maf_eval.to_csv("../../data/molecular_test_with_mutations.csv")