In [1]:
pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests

def fetch_uniprot_fasta(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    r = requests.get(url)
    r.raise_for_status()  # catch errors early
    lines = r.text.split("\n")
    seq = "".join(line.strip() for line in lines if not line.startswith(">"))
    return seq


In [3]:
HLA_TO_UNIPROT = {
    "HLA-B8": "P01889",
    "HLA-B*08:01": "P01889",
    "HLA-E*01:03": "P30511",
    "HLA-E*01:01": "P30512"
}


In [4]:
print(fetch_uniprot_fasta("P01889")[:60])   # HLA-B*08:01
print(fetch_uniprot_fasta("P30511")[:60])   # HLA-E*01:03


MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRF
MAPRSLLLLLSGALALTDTWAGSHSLRYFSTAVSRPGRGEPRYIAVEYVDDTQFLRFDSD


In [7]:
import pandas as pd

# Load file
df = pd.read_excel("Reduced_Dataset.xlsx")

# Remove the IEDB header row
df_clean = df.iloc[1:].copy()

# Rename columns
df_clean.columns = ["peptide", "hla", "tcr_alpha", "tcr_beta"]

# Strip whitespace
df_clean = df_clean.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# -----------------------------------------------
# REMOVE rows missing TCR α or TCR β
# -----------------------------------------------
df_filtered = df_clean[
    df_clean["tcr_alpha"].notna() &
    df_clean["tcr_beta"].notna() &
    (df_clean["tcr_alpha"] != "") &
    (df_clean["tcr_beta"] != "")
]

# Save filtered CSV
df_filtered.to_csv("Reduced_Dataset_TCRonly.csv", index=False)

print("Original rows:", len(df_clean))
print("Rows with full TCR alpha + beta:", len(df_filtered))

df_filtered.head()


  df_clean = df_clean.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Original rows: 225505
Rows with full TCR alpha + beta: 1480


Unnamed: 0,peptide,hla,tcr_alpha,tcr_beta
1,VMAPRTLIL,HLA-E*01:03,KTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYII...,GVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLH...
2,VMAPRTLIL,"HLA-E*01:01, HLA-E*01:03",KTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYII...,GVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLH...
3,FLRGRFYGL,HLA-B8,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...
4,FLRGRAYGL,HLA-B8,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...
27,LLFGYPVYV,"HLA-A*02:01, HLA-A*02:01 K66A, E63Q mutant",QQVKQNSPSLSVQEGRISILNCDYTNSMFDYFLWYKKYPAEGPTFL...,NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLI...


In [16]:
# ============================================
# 1. IMPORTS
# ============================================
import pandas as pd
import requests
import json
import os

# ============================================
# 2. LOAD TCR-ONLY REDUCED DATASET
# ============================================
df = pd.read_csv("Reduced_Dataset_TCRonly.csv")

print("Loaded filtered dataset:")
display(df.head())

# Rename columns ONLY if needed
df.columns = ["peptide", "hla", "tcr_alpha", "tcr_beta"]

# Clean whitespace + convert NaN to empty strings
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
df["hla"] = df["hla"].fillna("").astype(str).str.strip()

# Extract first allele if multiple appear
df["hla"] = df["hla"].str.split(",").str[0]

print("Cleaned HLA values:")
display(df["hla"].head())

# ============================================
# 3. HLA → UNIPROT MAPPING TABLE
# ============================================
HLA_TO_UNIPROT = {
    "HLA-B8": "P01889",
    "HLA-B*08:01": "P01889",
    "HLA-E*01:03": "P30511",
    "HLA-E*01:01": "P30512"
}

# ============================================
# 4. FUNCTION TO FETCH FASTA FROM UNIPROT
# ============================================
def fetch_uniprot_fasta(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    r = requests.get(url)
    r.raise_for_status()
    lines = r.text.split("\n")
    seq = "".join([l.strip() for l in lines if not l.startswith(">")])
    return seq

# Fetch β2-microglobulin (same for all humans)
B2M_SEQ = fetch_uniprot_fasta("P61769")
print("β2m sequence length:", len(B2M_SEQ))

# ============================================
# 5. FETCH HLA HEAVY CHAIN SEQUENCES
# ============================================
mhc_seqs = []
b2m_seqs = []

for hla in df["hla"]:
    key = hla.replace(" ", "")
    uniprot_id = HLA_TO_UNIPROT.get(key, None)

    if uniprot_id:
        mhc_seq = fetch_uniprot_fasta(uniprot_id)
    else:
        mhc_seq = "UNKNOWN"  # mark invalid rows

    mhc_seqs.append(mhc_seq)
    b2m_seqs.append(B2M_SEQ)

df["mhc_heavy_chain"] = mhc_seqs
df["beta_2_microglobulin"] = b2m_seqs

print("Dataset after adding HLA sequences:")
display(df.head())

# ============================================
# 6. REMOVE ROWS WITH UNKNOWN MHC HEAVY CHAIN
# ============================================
df = df[df["mhc_heavy_chain"] != "UNKNOWN"].reset_index(drop=True)

print(f"Remaining rows after removing UNKNOWN MHC: {len(df)}")
display(df.head())

# ============================================
# 7. SAVE FINAL CSV
# ============================================
output_csv = "Processed_Dataset_With_UniProt.csv"
df.to_csv(output_csv, index=False)
print(f"Saved filtered processed dataset to: {output_csv}")

# ============================================
# 8. GENERATE AF3 JSON INPUT FILES
# ============================================
os.makedirs("AF3_inputs", exist_ok=True)

for i, row in df.iterrows():
    input_dict = {
        "protein_sequences": {
            "tcr_alpha": row["tcr_alpha"],
            "tcr_beta": row["tcr_beta"],
            "mhc_heavy_chain": row["mhc_heavy_chain"],
            "beta_2_microglobulin": row["beta_2_microglobulin"],
            "peptide": row["peptide"]
        }
    }

    with open(f"AF3_inputs/af3_input_{i}.json", "w") as f:
        json.dump(input_dict, f, indent=2)

print("Generated AF3 JSON files for all valid rows in: AF3_inputs/")


Loaded filtered dataset:


Unnamed: 0,peptide,hla,tcr_alpha,tcr_beta
0,VMAPRTLIL,HLA-E*01:03,KTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYII...,GVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLH...
1,VMAPRTLIL,"HLA-E*01:01, HLA-E*01:03",KTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYII...,GVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLH...
2,FLRGRFYGL,HLA-B8,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...
3,FLRGRAYGL,HLA-B8,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...
4,LLFGYPVYV,"HLA-A*02:01, HLA-A*02:01 K66A, E63Q mutant",QQVKQNSPSLSVQEGRISILNCDYTNSMFDYFLWYKKYPAEGPTFL...,NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLI...


Cleaned HLA values:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


0    HLA-E*01:03
1    HLA-E*01:01
2         HLA-B8
3         HLA-B8
4    HLA-A*02:01
Name: hla, dtype: object

β2m sequence length: 119
Dataset after adding HLA sequences:


Unnamed: 0,peptide,hla,tcr_alpha,tcr_beta,mhc_heavy_chain,beta_2_microglobulin
0,VMAPRTLIL,HLA-E*01:03,KTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYII...,GVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLH...,MAPRSLLLLLSGALALTDTWAGSHSLRYFSTAVSRPGRGEPRYIAV...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
1,VMAPRTLIL,HLA-E*01:01,KTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYII...,GVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLH...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
2,FLRGRFYGL,HLA-B8,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
3,FLRGRAYGL,HLA-B8,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
4,LLFGYPVYV,HLA-A*02:01,QQVKQNSPSLSVQEGRISILNCDYTNSMFDYFLWYKKYPAEGPTFL...,NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLI...,UNKNOWN,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...


Remaining rows after removing UNKNOWN MHC: 38


Unnamed: 0,peptide,hla,tcr_alpha,tcr_beta,mhc_heavy_chain,beta_2_microglobulin
0,VMAPRTLIL,HLA-E*01:03,KTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYII...,GVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLH...,MAPRSLLLLLSGALALTDTWAGSHSLRYFSTAVSRPGRGEPRYIAV...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
1,VMAPRTLIL,HLA-E*01:01,KTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYII...,GVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLH...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
2,FLRGRFYGL,HLA-B8,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
3,FLRGRAYGL,HLA-B8,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
4,FLRGRAYGL,HLA-B*08:01,HMRKEVEQDPGPFNVPEGATVAFNCTYSNSASQSFFWYRQDSRKEP...,HMNAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGMGLR...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...


Saved filtered processed dataset to: Processed_Dataset_With_UniProt.csv
Generated AF3 JSON files for all valid rows in: AF3_inputs/


In [15]:
# ============================================
# 8. GENERATE AF3 JSONL INPUT FILES (Vertex AI Format)
# ============================================
os.makedirs("AF3_reaL_inputs", exist_ok=True)

for i, row in df.iterrows():
    job = {
        "protein_sequences": {
            "tcr_alpha": row["tcr_alpha"],
            "tcr_beta": row["tcr_beta"],
            "peptide": row["peptide"],
            "mhc_heavy_chain": row["mhc_heavy_chain"],
            "beta_2_microglobulin": row["beta_2_microglobulin"]
        }
    }

    with open(f"AF3_real_inputs/af3_input_{i}.jsonl", "w") as f:
        f.write(json.dumps(job))
        
print("Generated Vertex-AI compatible JSONL files in AF3_real_inputs/")


Generated Vertex-AI compatible JSONL files in AF3_real_inputs/


In [17]:
# ============================================
# 8. GENERATE AlphaFold Server JSON FILES
# ============================================
os.makedirs("AFserver_inputs", exist_ok=True)

for i, row in df.iterrows():
    
    job = {
        "name": f"TCR_pMHC_job_{i}",
        "modelSeeds": [],
        "sequences": [
            { "proteinChain": {"sequence": row["tcr_alpha"], "count": 1 }},
            { "proteinChain": {"sequence": row["tcr_beta"], "count": 1 }},
            { "proteinChain": {"sequence": row["mhc_heavy_chain"], "count": 1 }},
            { "proteinChain": {"sequence": row["beta_2_microglobulin"], "count": 1 }},
            { "proteinChain": {"sequence": row["peptide"], "count": 1 }}
        ],
        "dialect": "alphafoldserver",
        "version": 1
    }
    
    # IMPORTANT: each file must be a LIST containing one job
    json_blob = [job]

    with open(f"AFserver_inputs/afserver_job_{i}.json", "w") as f:
        json.dump(json_blob, f, indent=2)

print("Generated AlphaFold Server JSON files: AFserver_inputs/")


Generated AlphaFold Server JSON files: AFserver_inputs/
