### Install Packages

In [18]:
pip install -r requirements.txt

Collecting biopython
  Downloading biopython-1.86-cp310-cp310-macosx_11_0_arm64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: biopython
Successfully installed biopython-1.86

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Data Cleanup: Removing Rows with Missing TCR α/β Chains


In [36]:
import pandas as pd

# Read the raw spreadsheet export.
df = pd.read_excel("updated_reduced_data.xlsx")

# Skip the first data row (the IEDB header row) and give the remaining
# columns short, fixed names.
df_clean = df.iloc[1:].copy()
df_clean.columns = ["peptide", "hla", "tcr_alpha", "tcr_beta"]


def _strip_if_text(value):
    """Trim surrounding whitespace from strings; return anything else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Element-wise clean-up of every cell in the frame.
df_clean = df_clean.map(_strip_if_text)

# Textual stand-ins for "missing" become real NA values in both TCR columns.
for chain_col in ("tcr_alpha", "tcr_beta"):
    df_clean[chain_col] = df_clean[chain_col].replace(["NaN", "nan", "None"], pd.NA)

# -----------------------------------------------
# Keep only rows where both TCR chains are present
# -----------------------------------------------
alpha = df_clean["tcr_alpha"]
beta = df_clean["tcr_beta"]
mask = (alpha.notna() & beta.notna()) & ((alpha != "") & (beta != ""))

df_filtered = df_clean[mask].copy()

# Write the filtered table for the next stage of the notebook.
df_filtered.to_csv("new_reduced_data_TCRonly.csv", index=False)

n_before = len(df_clean)
n_after = len(df_filtered)
print("Original rows:", n_before)
print("Rows removed:", n_before - n_after)
print("Rows with full TCR alpha + beta:", n_after)

df_filtered.head()


Original rows: 6
Rows removed: 0
Rows with full TCR alpha + beta: 6


Unnamed: 0,peptide,hla,tcr_alpha,tcr_beta
1,IMDQVPFSV,HLA-A*02:01,MAQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQ...,MGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQGLRLIY...
2,TRLALIAPK,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...
3,TRLALIAPK,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...
4,LRVMMLAPF,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...
5,TRLALIAPK,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...


### Generating AlphaFold 3 (AlphaFold Server) Input Files

In [38]:
# ============================================
# 1. IMPORTS
# ============================================
import pandas as pd
import requests
import json
import os

# ============================================
# 2. LOAD TCR-ONLY REDUCED DATASET
# ============================================
# Load the TCR-complete table written by the clean-up step.
df = pd.read_csv("new_reduced_data_TCRonly.csv")

print("Loaded filtered dataset:")
display(df.head())

# Rename to standardised column names
df.columns = ["peptide", "hla", "tcr_alpha", "tcr_beta"]

# Strip whitespace from every string cell
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

# Normalise the HLA column: missing values become "", only the first entry of
# a comma-separated list is kept, and the result is stripped *after* the split
# so that an input such as "A , B" does not leave a trailing space on "A".
df["hla"] = (
    df["hla"]
    .fillna("")
    .astype(str)
    .str.split(",")
    .str[0]
    .str.strip()
)

print("Cleaned HLA values:")
display(df["hla"].head())

# ============================================
# 3. EXPANDED HLA → UNIPROT MAPPING TABLE
# ============================================

# Lookup from HLA allele name to the UniProt accession whose FASTA is
# downloaded as that allele's MHC heavy chain. Alleles that are not listed
# here resolve to no sequence, and their rows are dropped further down.
# NOTE(review): a UniProt accession is not necessarily allele-specific —
# confirm that each accession returns the sequence of the named allele.
HLA_TO_UNIPROT = {
    # B27 family
    "HLA-B*27:05": "P03989",
    "HLA-B*27:09": "P30480",

    # A02 family
    "HLA-A*02:01": "P01892",
    # NOTE(review): P30512 is also used for HLA-E*01:01 below; two different
    # genes sharing one accession looks like a copy/paste slip — verify.
    "HLA-A*02:05": "P30512",

    # B8 family (example)
    # Both spellings map to the same accession.
    "HLA-B8": "P01889",
    "HLA-B*08:01": "P01889",

    # HLA-E
    # NOTE(review): confirm both accessions really are HLA-E entries (see the
    # duplicate P30512 noted above).
    "HLA-E*01:03": "P30511",
    "HLA-E*01:01": "P30512"
}

# ============================================
# 4. FUNCTION TO FETCH FASTA FROM UNIPROT
# ============================================
def fetch_uniprot_fasta(uniprot_id, timeout=30):
    """Download a UniProtKB entry as FASTA and return its bare sequence.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession, e.g. "P61769".
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str or None
        The sequence with the header line and line breaks removed, or None
        when the request fails, the server answers with a non-200 status, or
        the response contains no sequence lines. Every failure is reported
        with a printed "[ERROR]" message.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"[ERROR] Could not fetch UniProt {uniprot_id}: {exc}")
        return None
    if r.status_code != 200:
        print(f"[ERROR] Could not fetch UniProt {uniprot_id}")
        return None
    # Drop the ">" header line(s) and join the wrapped sequence lines.
    seq = "".join(
        line.strip() for line in r.text.splitlines() if not line.startswith(">")
    )
    if not seq:
        # An empty string would otherwise pass downstream not-null filters.
        print(f"[ERROR] Empty sequence returned for UniProt {uniprot_id}")
        return None
    return seq

# Fetch β2-microglobulin
# The same β2-microglobulin sequence is attached to every row, so it is
# fetched once. Fail loudly here: continuing with None would crash on len()
# below and would put a missing chain into every row.
B2M_SEQ = fetch_uniprot_fasta("P61769")
if B2M_SEQ is None:
    raise RuntimeError("Could not fetch the β2-microglobulin sequence (UniProt P61769)")
print("β2m sequence length:", len(B2M_SEQ))

# ============================================
# 5. FETCH MHC HEAVY CHAIN SEQUENCES
# ============================================
# Rows frequently share an allele, so each UniProt accession is requested at
# most once and the result (including a failed fetch, stored as None) is reused.
fetched_by_accession = {}
unmapped_alleles = set()
heavy_chain_seqs = []

for hla in df["hla"]:
    # Lookup keys contain no spaces.
    key = hla.replace(" ", "")
    uniprot_id = HLA_TO_UNIPROT.get(key)

    if not uniprot_id:
        unmapped_alleles.add(key)
        seq = None
    else:
        if uniprot_id not in fetched_by_accession:
            fetched_by_accession[uniprot_id] = fetch_uniprot_fasta(uniprot_id)
        seq = fetched_by_accession[uniprot_id]

    heavy_chain_seqs.append(seq)

# Make the reason for dropped rows visible instead of discarding them silently.
if unmapped_alleles:
    print(f"[WARN] No UniProt mapping for HLA value(s): {sorted(unmapped_alleles)}")

df["mhc_heavy_chain"] = heavy_chain_seqs
df["beta_2_microglobulin"] = B2M_SEQ

# Remove entries with missing MHC sequence
df = df[df["mhc_heavy_chain"].notna()].reset_index(drop=True)

print("Dataset after adding MHC heavy chain sequences:")
display(df.head())

print(f"Remaining valid rows: {len(df)}")

# ============================================
# 6. SAVE FINAL CSV
# ============================================
# Persist the table that now carries the MHC heavy chain and β2m columns.
output_csv = "Processed_Human_TCR_MHC_Dataset.csv"
df.to_csv(output_csv, index=False)
print(f"Saved dataset to: {output_csv}")

# ============================================
# 7. GENERATE VALID ALPHAFOLD 3 JSON FILES
# ============================================

os.makedirs("Human_AF3_inputs", exist_ok=True)

# Column order defines the chain order inside every job:
# TCR alpha, TCR beta, MHC heavy chain, β2m, peptide.
af3_chain_columns = (
    "tcr_alpha",
    "tcr_beta",
    "mhc_heavy_chain",
    "beta_2_microglobulin",
    "peptide",
)

for i, row in df.iterrows():
    # One single-copy protein chain entry per column.
    chains = [
        {"proteinChain": {"sequence": row[column], "count": 1}}
        for column in af3_chain_columns
    ]

    # Each file holds a one-element list containing the job description.
    job_json = [
        {
            "name": f"TCR_MHC_job_{i}",
            "modelSeeds": [],
            "sequences": chains,
            "dialect": "alphafoldserver",
            "version": 1,
        }
    ]

    out_path = os.path.join("Human_AF3_inputs", f"af3_job_{i}.json")
    with open(out_path, "w") as f:
        json.dump(job_json, f, indent=2)

print("✔️ Generated AlphaFold 3 JSON files (AlphaFold Server format).")


Loaded filtered dataset:


Unnamed: 0,peptide,hla,tcr_alpha,tcr_beta
0,IMDQVPFSV,HLA-A*02:01,MAQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQ...,MGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQGLRLIY...
1,TRLALIAPK,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...
2,TRLALIAPK,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...
3,LRVMMLAPF,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...
4,TRLALIAPK,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...


Cleaned HLA values:


0    HLA-A*02:01
1    HLA-B*27:05
2    HLA-B*27:05
3    HLA-B*27:05
4    HLA-B*27:05
Name: hla, dtype: object

β2m sequence length: 119
Dataset after adding MHC heavy chain sequences:


Unnamed: 0,peptide,hla,tcr_alpha,tcr_beta,mhc_heavy_chain,beta_2_microglobulin
0,IMDQVPFSV,HLA-A*02:01,MAQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQ...,MGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQGLRLIY...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
1,TRLALIAPK,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
2,TRLALIAPK,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
3,LRVMMLAPF,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...
4,TRLALIAPK,HLA-B*27:05,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,GVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQ...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...


Remaining valid rows: 6
Saved dataset to: Processed_Human_TCR_MHC_Dataset.csv
✔️ Generated AlphaFold 3 JSON files (AlphaFold Server format).


## Data Transformation to Boltz-2 Format

In [None]:
# ============================================
# 1. CONVERT TO BOLTZ-2 YAML FORMAT
# ============================================
import yaml
import json
import os

# The AlphaFold Server jobs are written earlier in this notebook to
# "Human_AF3_inputs/af3_job_<i>.json", so read them from that same location.
AF3_INPUT_DIR = "Human_AF3_inputs"
BOLTZ_OUTPUT_DIR = "boltz2_inputs"

# Create output directory for Boltz-2 inputs
os.makedirs(BOLTZ_OUTPUT_DIR, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for reproducible runs.
json_files = sorted(f for f in os.listdir(AF3_INPUT_DIR) if f.endswith(".json"))

# Chain IDs, in the order in which the sequences appear in each AF3 job.
# NOTE(review): Boltz examples use short chain IDs (A, B, ...); confirm that
# descriptive IDs like these are accepted by the Boltz version in use.
chain_names = ["tcr_alpha", "tcr_beta", "mhc_heavy_chain",
               "beta_2_microglobulin", "peptide"]

for json_file in json_files:
    # Read the AlphaFold Server JSON
    with open(os.path.join(AF3_INPUT_DIR, json_file), "r") as f:
        af_data = json.load(f)

    # AlphaFold Server format is a list with one job
    job = af_data[0]

    # Convert to Boltz-2 format: one "protein" entry per AF3 chain.
    # NOTE(review): no "msa" field is written — confirm how MSAs are supplied
    # when Boltz is run on these files.
    boltz_input = {"sequences": []}

    for idx, seq_entry in enumerate(job["sequences"]):
        protein_seq = seq_entry["proteinChain"]["sequence"]
        # Fall back to a generic ID if a job has more chains than expected.
        chain_id = chain_names[idx] if idx < len(chain_names) else f"chain_{idx}"

        boltz_input["sequences"].append({
            "protein": {
                "id": chain_id,
                "sequence": protein_seq
            }
        })

    # Build the output name from the file stem. "af3_job" is the prefix used
    # by this notebook; "afserver_job" is still handled for older files.
    stem = os.path.splitext(json_file)[0]
    yaml_filename = (
        stem.replace("afserver_job", "boltz2_job").replace("af3_job", "boltz2_job")
        + ".yaml"
    )

    # Write YAML file, keeping the key order as constructed above.
    with open(os.path.join(BOLTZ_OUTPUT_DIR, yaml_filename), "w") as f:
        yaml.dump(boltz_input, f, default_flow_style=False, sort_keys=False)

print(f"Converted {len(json_files)} JSON files to YAML format")