In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#file MOESM_3
MOESM_3 = '41598_2024_71422_MOESM3_ESM.xlsx'
df = pd.read_excel(MOESM_3)
print(df.head())

    Gene             ENSP        Variation      VAR             ENST Mutation  \
0   RAF1  ENSP00000251849              NaN  VAR1429  ENST00000251849    P261T   
1   HRAS  ENST00000311189  ENSP00000309845  VAR0775              NaN     G12D   
2   PTEN  ENSP00000361021              NaN  VAR0714  ENST00000371953    P204A   
3   KRAS              NaN  ENSP00000308495  VAR0966  ENST00000311936    F156I   
4  STAT3  ENSP00000264657              NaN  VAR1222  ENST00000264657    R382Q   

  Ref AA  Position Final AA  Randomized  ... PredictSNP  \
0      P       261        T    0.548635  ...  -0.718713   
1      G        12        D    0.293770  ...  -0.869084   
2      P       204        A    0.868490  ...  -0.654946   
3      F       156        I    0.675611  ...  -0.869084   
4      R       382        Q    0.754193  ...  -0.755661   

   PredictSNP Prediction MutPred  MutPred Prediction   VEST  Vest Prediction  \
0            Deleterious   0.595         Deleterious  0.785      Deleterious  

In [18]:
import time
import pandas as pd
import requests
from multiprocess import Pool
from time import sleep
from Bio.Seq import Seq

# ——— Setup a shared session ———
session = requests.Session()
session.headers.update({"Content-Type": "application/json", "Accept": "application/json"})

# ——— 1) Fetch protein by ENSP ———
def fetch_ensembl_protein_sequence(ensp_id, retry=2, pause=0.1):
    url = f"https://rest.ensembl.org/sequence/id/{ensp_id}"
    headers = {"Content-Type": "text/plain"}
    for _ in range(retry):
        r = session.get(url, headers=headers, timeout=5)
        if r.status_code == 200:
            return r.text.replace("\n", "")
        sleep(pause)
    return None

# ——— 2) Fetch cDNA by ENST and translate ———
def translate_cdna_from_enst(enst_id, retry=2, pause=0.1):
    url = f"https://rest.ensembl.org/sequence/id/{enst_id}?type=cdna"
    headers = {"Content-Type": "text/plain"}
    for _ in range(retry):
        r = session.get(url, headers=headers, timeout=5)
        if r.status_code == 200:
            cdna = r.text.replace("\n", "")
            # Trim to multiple of 3
            rem = len(cdna) % 3
            if rem:
                cdna = cdna[:-rem]
            protein = str(Seq(cdna).translate(to_stop=True))
            return protein
        sleep(pause)
    return None


# ——— 3) Fetch variation metadata by VAR → canonical transcript/protein ———
def fetch_variation_metadata(var_id):
    url = f"https://rest.ensembl.org/variation/human/{var_id}"
    r = session.get(url, timeout=5)
    if r.status_code == 200:
        return r.json()
    return {}

def select_canonical_from_variation(var_info):
    # pick the first transcript consequence marked canonical
    for tc in var_info.get("transcript_consequences", []):
        if tc.get("is_canonical"):
            return tc.get("protein_id") or tc.get("transcript_id")
    # fallback to any first transcript
    tc0 = var_info.get("transcript_consequences", [{}])[0]
    return tc0.get("protein_id") or tc0.get("transcript_id")

# ——— 4) Gene → ENSG → canonical transcript → translate ———
def map_symbol_to_ensg(gene_symbol):
    url = f"https://rest.ensembl.org/xrefs/symbol/human/{gene_symbol}"
    r = session.get(url, timeout=5)
    if r.status_code == 200:
        for entry in r.json():
            if entry.get("type") == "gene":
                return entry["id"]
    return None

def select_canonical_transcript_by_gene(ensg):
    url = f"https://rest.ensembl.org/lookup/id/{ensg}?expand=1"
    r = session.get(url, timeout=5)
    if r.status_code == 200:
        for t in r.json().get("Transcript", []):
            if t.get("is_canonical"):
                return t["id"]
    return None

# ——— Master retrieval logic ———
def retrieve_wt_protein(row):
    # 1) ENSP
    if pd.notna(row.get("ENSP")):
        seq = fetch_ensembl_protein_sequence(row["ENSP"])
        if seq: return seq

    # 2) ENST
    if pd.notna(row.get("ENST")):
        seq = translate_cdna_from_enst(row["ENST"])
        if seq: return seq

    # 3) VAR
    if pd.notna(row.get("VAR")):
        info = fetch_variation_metadata(row["VAR"])
        can_id = select_canonical_from_variation(info)
        if can_id and can_id.startswith("ENSP"):
            seq = fetch_ensembl_protein_sequence(can_id)
        else:
            seq = translate_cdna_from_enst(can_id)
        if seq: return seq

    # 4) Gene
    if pd.notna(row.get("Gene")):
        ensg = map_symbol_to_ensg(row["Gene"])
        if ensg:
            tx = select_canonical_transcript_by_gene(ensg)
            if tx:
                seq = translate_cdna_from_enst(tx)
                if seq: return seq

    return None

# ——— Variant processing ———
def process_variant(row):
    wt = retrieve_wt_protein(row)
    pos, ref, mut = int(row["Position"]), row["Ref AA"], row["Final AA"]
    if not wt or pos<1 or pos>len(wt) or wt[pos-1]!=ref:
        return {"wild_seq": wt, "mut_seq": None, "error":"fetch/mismatch"}
    m = list(wt); m[pos-1]=mut
    return {"wild_seq": wt, "mut_seq": "".join(m), "error":None}

# ——— Main ———
df = pd.read_excel("41598_2024_71422_MOESM3_ESM.xlsx")
cols = ["Gene","ENSP","Variation","VAR","ENST","Position","Ref AA","Final AA"]
df = df[cols].head(200).copy()

t0 = time.perf_counter()
with Pool(4) as pool:
    res = pool.map(process_variant, df.to_dict("records"))
print("Elapsed:", time.perf_counter()-t0)

out = pd.DataFrame(res)
df[["wild_seq","mut_seq","error"]] = out
df

Elapsed: 28.08272779599065


Unnamed: 0,Gene,ENSP,Variation,VAR,ENST,Position,Ref AA,Final AA,wild_seq,mut_seq,error
0,RAF1,ENSP00000251849,,VAR1429,ENST00000251849,261,P,T,MEHIQGAWKTISNGFGFKDAVFDGSSCISPTIVQQFGYQRRASDDG...,MEHIQGAWKTISNGFGFKDAVFDGSSCISPTIVQQFGYQRRASDDG...,
1,HRAS,ENST00000311189,ENSP00000309845,VAR0775,,12,G,D,AGGCCCGCCCGAGTCTCCGCCGCCCGTGCCCTGCGCCCGCAACCCG...,,fetch/mismatch
2,PTEN,ENSP00000361021,,VAR0714,ENST00000371953,204,P,A,MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVY...,MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVY...,
3,KRAS,,ENSP00000308495,VAR0966,ENST00000311936,156,F,I,LGGGRGGGGSSGGGSGGGEGGGGSASTPGPRHFGLGASAAQALKAA...,,fetch/mismatch
4,STAT3,ENSP00000264657,,VAR1222,ENST00000264657,382,R,Q,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,
...,...,...,...,...,...,...,...,...,...,...,...
195,ASH1L,ENSP00000376204,,VAR0593,ENST00000392403,1081,A,T,MDPRNTAMLGLGSDSEGFSRKSPSAISTGTLVSKREVELEKNTKEE...,MDPRNTAMLGLGSDSEGFSRKSPSAISTGTLVSKREVELEKNTKEE...,
196,ASXL2,ENSP00000391447,,VAR0522,ENST00000435504,1034,R,S,MREKGRRKKGRTWAEAAKTVLEKYPNTPMSHKEILQVIQREGLKEI...,MREKGRRKKGRTWAEAAKTVLEKYPNTPMSHKEILQVIQREGLKEI...,
197,BRCA2,ENST00000380152,ENSP00000369497,VAR0472,,1396,Q,R,AGAGGCGGAGCCGCTGTGGCACTGCTGCGCCTCTGCTGCGCCTCGG...,,fetch/mismatch
198,HRAS,ENST00000311189,ENSP00000309845,VAR0569,,86,N,T,AGGCCCGCCCGAGTCTCCGCCGCCCGTGCCCTGCGCCCGCAACCCG...,,fetch/mismatch


In [19]:
# Number of missing mutant sequences
missing = df["mut_seq"].isna().sum()
total   = len(df)
print(f"{missing} out of {total} variants have no mut_seq "
      f"({missing/total*100:.1f}%).")

86 out of 200 variants have no mut_seq (43.0%).
