In [2]:
#packages
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

# SUPPLEMENTARY INFO DETAILS - MOESM2 to MOESM8

### MOESM2-MOESM8 dataset explanation: 

1. **MOESM2 serves as the training dataset for population-based predictors (ClinVar-derived deleterious vs neutral mutations).** Columns include mutation identifiers, gene names, genomic coordinates, and outputs (scores, binary calls) from the four population-based tools (VEST, MutPred, CONDEL, PredictSNP).
2. **MOESM3 is the testing dataset for population-based algorithms and DriverDetect.** Same format as MOESM2 but held out for validation.
3. **MOESM4 is the training dataset for cancer-based predictors (cBioPortal, OncoKB, IntOGen, gnomAD combined).** Columns include database source, gene/transcript IDs, genomic positions, Ref/Alt amino acids, and outputs from FATHMM, CHASM, TransFIC.
4. **MOESM5 is the testing dataset for cancer-based algorithms and DriverDetect.** Held-out dataset, same format as MOESM4.
6. **MOESM6 contains raw score outputs from all population-based tools for all training+testing mutations.** It has 4 numerical score columns and 4 binary classification columns.
7. **MOESM7 contains raw score outputs from all cancer-based tools plus combined tool accuracy metrics.** It contains FATHMM, CHASM, TransFIC sub-scores, and per-combination accuracy/F1/MCC metrics.
8. **MOESM8 is the summary of DriverDetect's own output scores (for population- and cancer-based datasets)**


## DATASET CURATION OF 41598_2024_71422_MOESM4_ESM.xlsx

In [20]:
# Load the MOESM4 workbook and keep only the rows whose 'Validity' column is
# not 'Invalid'; the index is rebuilt so it runs 0..n-1 after filtering.
MOESM_4 = '41598_2024_71422_MOESM4_ESM.xlsx'
df = pd.read_excel(MOESM_4)
df = df[df['Validity'] != 'Invalid']
# Fixed: the closing parenthesis was missing here, which made the whole cell
# a SyntaxError.
df = df.reset_index(drop=True)
# Check result
print(df['Source'].unique())
print(df)

['cBioPortal' 'GnomAD' 'IntOGen' 'OncoKB']
       Test No.      Source Validity    Gene                ENST Gene Code  \
0     TRAIN1256  cBioPortal    Valid   BRCA1   ENST00000357654.9    P38398   
1     TRAIN1148  cBioPortal    Valid   BRCA2   ENST00000380152.8    P51587   
2     TRAIN1227  cBioPortal    Valid    CDH1  ENST00000261769.10    P12830   
3     TRAIN1341  cBioPortal    Valid    CDH1  ENST00000261769.10    P12830   
4     TRAIN1485  cBioPortal    Valid    CDH1  ENST00000261769.10    P12830   
...         ...         ...      ...     ...                 ...       ...   
1565  TRAIN0863      OncoKB    Valid  PIK3CA   ENST00000263967.4    P42336   
1566  TRAIN0318      OncoKB    Valid    PTEN   ENST00000371953.8    P60484   
1567  TRAIN0352      OncoKB    Valid    PTEN   ENST00000371953.8    P60484   
1568  TRAIN0158      OncoKB    Valid   SMAD4   ENST00000342988.8    Q13485   
1569  TRAIN0061      OncoKB    Valid    TP53   ENST00000269305.9    P04637   

               ENST.

In [5]:
# Columns carried forward into the sequence-fetching step.
selected_columns = [
    'Source',
    'Gene',
    'ENST',
    'Gene Code',
    'ENST.1',
    'Gene Name',
    'Mutation',
    'Type',
]
df_filtered = df.loc[:, selected_columns]

# Persist the reduced table (without the index) and show a preview.
df_filtered.to_csv('filtered_mutation_data.csv', index=False)
print("Saved filtered data to 'filtered_mutation_data.csv'")
print(df_filtered.head())

Saved filtered data to 'filtered_mutation_data.csv'
       Source   Gene                ENST Gene Code           ENST.1  \
0  cBioPortal  BRCA1   ENST00000357654.9    P38398  ENST00000357654   
1  cBioPortal  BRCA2   ENST00000380152.8    P51587  ENST00000380152   
2  cBioPortal   CDH1  ENST00000261769.10    P12830  ENST00000261769   
3  cBioPortal   CDH1  ENST00000261769.10    P12830  ENST00000261769   
4  cBioPortal   CDH1  ENST00000261769.10    P12830  ENST00000261769   

     Gene Name Mutation    Type  
0  BRCA1_HUMAN   G1788V  Driver  
1  BRCA2_HUMAN   R2336C  Driver  
2  CADH1_HUMAN    D288N  Driver  
3  CADH1_HUMAN    D254Y  Driver  
4  CADH1_HUMAN    R732Q  Driver  


In [13]:
import time
import pandas as pd
import requests
from multiprocessing.dummy import Pool
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ——— Configuration ———
# Base URL of the Ensembl REST service and the Accept header sent with every
# request (plain-text responses).
ENSEMBL = "https://rest.ensembl.org"
HEADERS = {"Accept": "text/plain"}

# Retry policy: up to 3 retries with a 0.3 backoff factor whenever the server
# answers with one of the listed status codes.
retries = Retry(
    total=3,
    backoff_factor=0.3,
    status_forcelist=[429, 500, 502, 503, 504],
)

# One shared session; every https:// request goes through the retrying adapter.
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def fetch_protein_sequence(ensembl_id):
    """Request the protein sequence for one Ensembl ID.

    Queries ``{ENSEMBL}/sequence/id/<ensembl_id>?type=protein`` through the
    shared ``session`` with a 10-second timeout.

    Returns
    -------
    tuple
        ``(ensembl_id, sequence)`` when the server replies with HTTP 200,
        where ``sequence`` is the response text with line breaks removed and
        any line starting with ``>`` dropped. ``(ensembl_id, None)`` for any
        other status code or when a ``requests.RequestException`` is raised
        (the exception is swallowed on purpose so one failure does not abort
        the whole batch).
    """
    sequence = None
    endpoint = f"{ENSEMBL}/sequence/id/{ensembl_id}?type=protein"
    try:
        response = session.get(endpoint, headers=HEADERS, timeout=10)
        if response.status_code == 200:
            # Skip FASTA-style header lines and glue the rest together.
            sequence = "".join(
                chunk
                for chunk in response.text.splitlines()
                if not chunk.startswith(">")
            )
    except requests.RequestException:
        sequence = None
    return ensembl_id, sequence

# ——— 1) Load filtered mutation data ———
df = pd.read_csv("filtered_mutation_data.csv")

# ——— 2) Build transcript list ———
# Pool the non-null IDs from both transcript columns, de-duplicate them, and
# keep only the values that look like Ensembl transcript IDs.
enst_primary = df["ENST"].dropna()
enst_secondary = df["ENST.1"].dropna()
candidate_ids = pd.concat([enst_primary, enst_secondary]).astype(str)
transcripts = [tid for tid in pd.unique(candidate_ids) if tid.startswith("ENST")]

# ——— 3) Fetch sequences in parallel ———
# Thread pool of 10 workers; each call returns an (id, sequence-or-None) pair.
start = time.perf_counter()
with Pool(10) as pool:
    pairs = pool.map(fetch_protein_sequence, transcripts)
print(f"Fetched {len(pairs)} transcripts in {time.perf_counter() - start:.2f}s")

# ——— 4) Build cache ———
# Maps transcript ID -> sequence (None where the fetch failed).
protein_cache = {transcript_id: sequence for transcript_id, sequence in pairs}

# ——— 5) Mutation overlay ———
def overlay_mutation(row, cache=None):
    """Apply a single-residue substitution to a row's wild-type sequence.

    The mutation string is parsed as ``<ref><1-based position><alt>`` (for
    example ``G1788V``): first character, integer middle part, last character.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys ``"Mutation"``, ``"ENST"`` and ``"ENST.1"``
        (a DataFrame row or a plain dict both work).
    cache : dict, optional
        Transcript ID -> protein sequence. When omitted, the module-level
        ``protein_cache`` is used, so ``df.apply(overlay_mutation, axis=1)``
        keeps working unchanged. Passing it explicitly removes the reliance
        on notebook-global state.

    Returns
    -------
    pandas.Series
        Keys ``wild_seq``, ``mut_seq`` and ``error``. ``error`` is ``None`` on
        success, otherwise one of ``"bad_mutation_format"``,
        ``"no_transcript"``, ``"pos_oob"`` or ``"ref_mismatch(<residue>)"``;
        ``mut_seq`` is ``None`` whenever ``error`` is set.
    """
    if cache is None:
        cache = protein_cache

    mut = row["Mutation"]
    try:
        ref, pos, alt = mut[0], int(mut[1:-1]), mut[-1]
    except (TypeError, ValueError, IndexError):
        # Non-string values (NaN/None), empty strings, or a non-numeric
        # position all end up here; anything else is a real bug and should
        # surface instead of being swallowed.
        return pd.Series({"wild_seq": None, "mut_seq": None, "error": "bad_mutation_format"})

    # Try the "ENST" value first, then fall back to "ENST.1"; a missing or
    # empty sequence under the first key triggers the fallback.
    wt = cache.get(str(row["ENST"])) or cache.get(str(row["ENST.1"]))
    if not wt:
        return pd.Series({"wild_seq": None, "mut_seq": None, "error": "no_transcript"})

    if pos < 1 or pos > len(wt):
        return pd.Series({"wild_seq": wt, "mut_seq": None, "error": "pos_oob"})

    observed = wt[pos - 1]
    if observed != ref:
        # Report the residue actually found at that position.
        return pd.Series({"wild_seq": wt, "mut_seq": None, "error": f"ref_mismatch({observed})"})

    mutated = wt[:pos - 1] + alt + wt[pos:]
    return pd.Series({"wild_seq": wt, "mut_seq": mutated, "error": None})

# ——— 6) Apply mutation overlay ———
# Row-wise overlay; the three result columns are attached to df one by one.
start = time.perf_counter()
results = df.apply(overlay_mutation, axis=1)
for column in ("wild_seq", "mut_seq", "error"):
    df[column] = results[column]
print(f"Overlay completed in {time.perf_counter() - start:.2f}s")

# ——— 7) Drop failed rows ———
# Rows without a mutated sequence are the ones the overlay rejected.
df_success = df.dropna(subset=["mut_seq"]).reset_index(drop=True)
print(df_success.head())

# ——— 8) Save to new CSV ———
df_success.to_csv("mutation_with_sequences.csv", index=False)
print(f"Saved {len(df_success)} successful entries to 'mutation_with_sequences.csv'")

Fetched 138 transcripts in 21.86s
Overlay completed in 0.76s
       Source   Gene                ENST Gene Code           ENST.1  \
0  cBioPortal  BRCA1   ENST00000357654.9    P38398  ENST00000357654   
1  cBioPortal  BRCA2   ENST00000380152.8    P51587  ENST00000380152   
2  cBioPortal   CDH1  ENST00000261769.10    P12830  ENST00000261769   
3  cBioPortal   CDH1  ENST00000261769.10    P12830  ENST00000261769   
4  cBioPortal   CDH1  ENST00000261769.10    P12830  ENST00000261769   

     Gene Name Mutation    Type  \
0  BRCA1_HUMAN   G1788V  Driver   
1  BRCA2_HUMAN   R2336C  Driver   
2  CADH1_HUMAN    D288N  Driver   
3  CADH1_HUMAN    D254Y  Driver   
4  CADH1_HUMAN    R732Q  Driver   

                                            wild_seq  \
0  MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...   
1  MPIGSKERPTFFEIFKTRCNKADLGPISLNWFEELSSEAPPYNSEP...   
2  MGPWSRSLSALLLLLQVSSWLCQEPEPCHPGFDAESYTFTVPRRHL...   
3  MGPWSRSLSALLLLLQVSSWLCQEPEPCHPGFDAESYTFTVPRRHL...   
4  MGPWSRSLSALLLLLQVS