## This notebook parses the raw designs for statistical analysis

In [1]:
import os
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import sys
import pandas as pd
import json
from pathlib import Path
import abnumber
from collections import Counter

# Put the repository root on sys.path so the project's `scripts` package can be imported.
# The root is taken to be the parent of the current working directory (the notebook is
# expected to be run from a sub-folder of the repository).
REPO_ROOT = os.path.abspath(os.pardir)
if not (REPO_ROOT in sys.path):
    sys.path.insert(0, REPO_ROOT)

from scripts.data_processing import Inverse_Folding_Design, Batch_Designs, Dataset, Exps
from scripts.utils import get_cdr_residue_idx_list, calculate_seq_identity, fasta2seq, calculate_seq_similarity_blosum62, parse_blosum62, restype_1to3, get_sequence_by_biopython
# Keys view of the `restype_1to3` mapping (presumably the one-letter residue codes -- confirm in scripts.utils).
aa_list = restype_1to3.keys()
# Substitution matrix built by the project helper from the BLOSUM62 resource file (relative path).
BLOSUM62_MATRIX = parse_blosum62('../data/resources/BLOSUM62.txt')


# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)


The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.




In [3]:
## load data
# Location of the experiment metadata JSON; passed to `Exps` when the experiments are defined.
EXP_INFO_JSON = "../data/metadata.json"

# SKEMPI v2 table (semicolon-separated file).
df_skempi = pd.read_csv("../data/resources/skempi_v2.csv", sep=';')

# Fab and VHH info tables; the first CSV column becomes the index of each frame.
df_fab_info, df_vhh_info = (
    pd.read_csv(info_csv, index_col=0)
    for info_csv in (
        "../data/resources/df_fab_info.csv",
        "../data/resources/df_vhh_info.csv",
    )
)

## Define exps

In [4]:
# Build the experiment collection from the Fab/VHH info tables and the metadata JSON,
# then collect the results of every experiment it describes.
exps = Exps(
    df_fab=df_fab_info,
    df_vhh=df_vhh_info,
    exp_info_json=EXP_INFO_JSON,
)
exps.collect_results()


Got the following exp_info
design id: lm_design_vanilla-fab-fullseq
{'results_dir': '../data/design_raw/lm_design_fab_0916_full_seq', 'method': 'lm_design_vanilla', 'antibody_type': 'fab', 'info': 'full antibody seq design using lm_design vanilla on Github, with T=0.2', 'NAME': 'LM Design'}
design id: lm_design_api-fab-fullseq_T02
{'results_dir': '../data/design_raw/lm_design_fab_0913_full_seq', 'method': 'lm_design_biomap', 'antibody_type': 'fab', 'info': 'full antibody seq design using BioMap api, with T=0.2', 'NAME': 'LM Design'}
design id: lm_design_api_noAG-fab-fullseq
{'results_dir': '../data/design_raw/lm_design_fab_0919_noAgChain', 'method': 'lm_design_biomap', 'antibody_type': 'fab', 'info': 'full antibody seq design w/o ag chain in pdb complex using BioMap api', 'NAME': 'LM Design'}
design id: antifold-fab-fullseq
{'results_dir': '../data/design_raw/AntiFold_Batch_fab_0903', 'method': 'antifold', 'antibody_type': 'fab', 'info': 'antifold github version', 'NAME': 'AntiFold'}
d

### Save and load exps

In [19]:
import pickle

# Snapshot the fully collected `exps` object so later sessions can reload it
# instead of re-parsing every raw design directory.
pkl_dir = "../data/processed/exps_pickle"
version = '250217'  # snapshot tag used as the pickle file name

version_path = Path(pkl_dir) / f"{version}.pkl"
# open(..., 'wb') does not create missing parent folders, so make sure the
# target directory exists before writing (no-op when it is already there).
version_path.parent.mkdir(parents=True, exist_ok=True)
print(f"saving to {version_path}")
with open(version_path, 'wb') as f:
    pickle.dump(exps, f)


saving to ../data/processed/exps_pickle/250217.pkl


In [9]:
# load from saved
# Uncomment to restore `exps` from the pickle at `version_path` instead of rebuilding it.
# NOTE: only unpickle files you wrote yourself -- pickle.load can execute arbitrary code.
# with open(version_path, 'rb') as f:
#     exps = pickle.load(f)

### Save df_pos, df_pdb and df_info

In [6]:
def collect_dfs(exps):
    """Stack the per-experiment result tables of ``exps`` into three combined frames.

    For every key ``k`` in ``exps.results`` the ``df_pdb``, ``df_pos`` and
    ``df_info`` tables are tagged with a ``task_id`` column holding ``k``.
    ``df_pdb`` additionally gets a ``pdb_name`` column copied from its index,
    because the index is discarded when the tables are concatenated.

    The tables stored on ``exps`` are NOT modified: the extra columns are added
    to copies, so calling this function leaves ``exps`` untouched and the call
    can be repeated safely.

    Parameters
    ----------
    exps : object
        Must expose ``results``, a mapping from task id to an object with
        ``df_pdb``, ``df_pos`` and ``df_info`` DataFrame attributes.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_pdb_all, df_pos_all, df_info_all)``, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``exps.results`` is empty.
    """
    df_pdbs, df_poses, df_infos = [], [], []
    for k in exps.results.keys():
        print(f"working on {k}")
        result = exps.results[k]

        # .assign returns a new frame, so the tables cached on `exps` keep
        # their original columns.
        df_pdbs.append(result.df_pdb.assign(task_id=k, pdb_name=result.df_pdb.index))
        df_poses.append(result.df_pos.assign(task_id=k))
        df_infos.append(result.df_info.assign(task_id=k))

    df_pdb_all = pd.concat(df_pdbs, ignore_index=True)
    df_pos_all = pd.concat(df_poses, ignore_index=True)
    df_info_all = pd.concat(df_infos, ignore_index=True)
    return df_pdb_all, df_pos_all, df_info_all

In [7]:
# Combine the df_pdb / df_pos / df_info tables of every experiment in `exps`.
df_pdb_all, df_pos_all, df_info_all = collect_dfs(exps)

working on lm_design_vanilla-fab-fullseq
working on lm_design_api-fab-fullseq_T02
working on lm_design_api_noAG-fab-fullseq
working on antifold-fab-fullseq
working on antifold_noAG-fab-fullseq
working on antifold_relaxed-fab-fullseq
working on esm_if-fab-fullseq
working on esm_if_noAG-fab-fullseq
working on mpnn-fab-fullseq
working on abmpnn-fab-fullseq
working on lm_design_vanilla-vhh-fullseq
working on antifold-vhh-fullseq
working on esm_if-vhh-fullseq
working on mpnn-vhh-fullseq
working on abmpnn-vhh-fullseq


In [23]:
# Write each combined table to its CSV file under ../data/processed
# (the frame index is written as the first column, the to_csv default).
csv_targets = {
    "../data/processed/df_pos_all_XXX.csv": df_pos_all,
    "../data/processed/df_pdb_all_XXX.csv": df_pdb_all,
    "../data/processed/df_info_all_XXX.csv": df_info_all,
}
for csv_path, combined_df in csv_targets.items():
    combined_df.to_csv(csv_path)




### Collect sequences for re-folding

In [13]:
class Design_methods:
    """Named groups of task ids (keys of ``exps.results``) used to select experiments."""

    # Full-sequence design methods on Fab structures.
    design_methods_fab = [
        'esm_if-fab-fullseq',
        'mpnn-fab-fullseq',
        'lm_design_vanilla-fab-fullseq',
        'antifold-fab-fullseq',
    ]

    # Full-sequence design methods on VHH structures.
    design_methods_vhh = [
        'esm_if-vhh-fullseq',
        'mpnn-vhh-fullseq',
        'lm_design_vanilla-vhh-fullseq',
        'antifold-vhh-fullseq',
        'abmpnn-vhh-fullseq',
    ]

    # Same as design_methods_vhh but without 'abmpnn-vhh-fullseq'.
    design_methods_vhh2 = [
        'esm_if-vhh-fullseq',
        'mpnn-vhh-fullseq',
        'lm_design_vanilla-vhh-fullseq',
        'antifold-vhh-fullseq',
    ]

    # LM Design variants compared against each other (Fab).
    design_methods_lm_fab_compare = [
        'lm_design_api-fab-fullseq_T07',
        'lm_design_api-fab-fullseq_T02',
        'lm_design_vanilla-fab-fullseq',
        'lm_design_api-fab-cdronly',
        'lm_design_api-fab-noag',
    ]

    # LM Design variants compared against each other (VHH).
    design_methods_lm_vhh_compare = [
        'lm_design_api-vhh-fullseq_T07',
        'lm_design_api-vhh-fullseq_T02',
        'lm_design_vanilla-vhh-fullseq',
        'lm_design_api-vhh-cdronly',
        'lm_design_api-vhh-noag',
    ]

    # Pairs of the same method run with and without the "noAG" variant.
    antifold_ag_noag = [
        'antifold-fab-fullseq',
        'antifold_noAG-fab-fullseq',
    ]
    lm_design_ag_noag = [
        'lm_design_api-fab-fullseq_T02',
        'lm_design_api_noAG-fab-fullseq',
    ]
    esm_if_ag_noag = [
        'esm_if-fab-fullseq',
        'esm_if_noAG-fab-fullseq',
    ]

In [11]:
def collect_seqs(exps, methods, num=5):
    """Collect the last ``num`` designed sequences per structure for each method.

    For every task id in ``methods`` the raw design table
    ``exps.results[k].df_raw`` is tagged with ``task_id`` and a unique
    ``design_seq_id`` of the form ``"<task_id>--<pdb_name>--<design_id>"``.
    Within each task, the last ``num`` rows of every ``pdb_name`` group are kept.

    The ``df_raw`` tables stored on ``exps`` are not modified.

    Parameters
    ----------
    exps : object
        Must expose ``results``, a mapping from task id to an object with a
        ``df_raw`` DataFrame holding at least the columns ``pdb_name``,
        ``design_id``, ``design_seq_H`` and ``design_seq_L``.
        ``pdb_name`` and ``design_id`` must be strings, since they are
        concatenated into ``design_seq_id``.
    methods : iterable of str
        Task ids to include; the output keeps this order.
    num : int, default 5
        Maximum number of rows kept per ``(task_id, pdb_name)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``task_id, design_seq_id, pdb_name, design_seq_H, design_seq_L``
        with a fresh 0..n-1 index. Rows are ordered by method, then by
        ``pdb_name`` (sorted), then by their original order in ``df_raw``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``methods`` is empty.
    """
    # collect raw sequences
    raw_frames = []
    for k in methods:
        print(f"working on {k}")
        # .assign returns a copy, so the table cached on `exps` is left as-is.
        raw_frames.append(exps.results[k].df_raw.assign(task_id=k))

    df_seqs = pd.concat(raw_frames, ignore_index=True)
    df_seqs['design_seq_id'] = df_seqs['task_id'] + "--" + df_seqs['pdb_name'] + "--" + df_seqs['design_id']

    cols = ['task_id', 'design_seq_id', 'pdb_name', 'design_seq_H', 'design_seq_L']
    df_seqs = df_seqs[cols]

    # sample raw seqs
    sampled_frames = []
    for k in methods:
        df_task = df_seqs[df_seqs['task_id'] == k]
        # GroupBy.tail keeps every column (including the grouping key) on all
        # pandas versions, unlike groupby.apply whose handling of the grouping
        # column changed between releases. tail() returns rows in their original
        # order, so a stable sort on pdb_name restores the "grouped by structure"
        # ordering while preserving the within-structure order.
        df_tail = df_task.groupby("pdb_name").tail(num)
        sampled_frames.append(df_tail.sort_values("pdb_name", kind="mergesort"))

    df_seqs_sample = pd.concat(sampled_frames, ignore_index=True)
    return df_seqs_sample

In [14]:
# Sample the designed sequences (default: last 5 per structure) for the
# Fab methods and for the VHH methods listed in `design_methods_vhh2`.
df_seqs_fab = collect_seqs(exps, methods=Design_methods.design_methods_fab)
df_seqs_vhh = collect_seqs(exps, methods=Design_methods.design_methods_vhh2)

print(df_seqs_fab.shape, df_seqs_vhh.shape)

working on esm_if-fab-fullseq
working on mpnn-fab-fullseq
working on lm_design_vanilla-fab-fullseq
working on antifold-fab-fullseq
working on esm_if-vhh-fullseq
working on mpnn-vhh-fullseq
working on lm_design_vanilla-vhh-fullseq
working on antifold-vhh-fullseq
(4060, 5) (1220, 5)


In [16]:
# Attach the VHH / Fab info columns to the sampled sequences (inner join on pdb_name).
# NOTE: the results are bound back to the same names, so running this cell a second
# time merges the info tables in again.
df_seqs_vhh = pd.merge(df_seqs_vhh, df_vhh_info, on="pdb_name")
df_seqs_fab = pd.merge(df_seqs_fab, df_fab_info, on="pdb_name")

print(df_seqs_fab.shape, df_seqs_vhh.shape)



(4060, 26) (1220, 19)
