In [2]:
!pip install hmmlearn biopython

Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython, hmmlearn
Successfully installed biopython-1.86 hmmlearn-0.3.3


In [3]:
from google.colab import files
uploaded = files.upload()

Saving CoV-AbDab_080224.csv to CoV-AbDab_080224.csv


In [4]:
import pandas as pd
df = pd.read_csv("/content/CoV-AbDab_080224.csv")

In [5]:
#create training labels
def label_neutralization(row):
    neut = str(row["Neutralising Vs"]).strip()
    non = str(row["Not Neutralising Vs"]).strip()

    if neut not in ["", "nan", "NaN"] and neut != "None":
        return 1
    if non not in ["", "nan", "NaN"] and non != "None":
        return 0
    return None   # unknown

df["label"] = df.apply(label_neutralization, axis=1)
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

In [6]:
#extracting the CDRH3 sequences
df = df[df["CDRH3"].notna()] #remove rows with missing CDRH3
neutralizing_seqs = df[df["label"] == 1]["CDRH3"].tolist() #looks at label =1 (neutralizing), remove rows with CDRH3, makes it into list
non_neut_seqs = df[df["label"] == 0]["CDRH3"].tolist() #looks at label =0 (non- neutralizing), remove rows CDRH3, makes it into list

In [7]:
#converting amino acid sequences to integers
import numpy as np

AA = "ACDEFGHIKLMNPQRSTVWY"
aa_to_idx = {aa:i for i,aa in enumerate(AA)}

def one_hot(seq):
    arr = np.zeros((len(seq), 20), dtype=int)
    for i, aa in enumerate(seq):
        if aa in aa_to_idx:
            arr[i, aa_to_idx[aa]] = 1
    return arr


In [8]:
neut_encoded = [one_hot(s) for s in neutralizing_seqs]
non_neut_encoded = [one_hot(s) for s in non_neut_seqs]

In [9]:
#default trained for 10 iterations
from hmmlearn.hmm import MultinomialHMM

def train_hmm(seqs, n_states=6):
    model = MultinomialHMM(n_components=n_states)
    model.n_features = 20  # 20 amino acids

    lengths = [len(s) for s in seqs]
    X = np.vstack(seqs)  # shape = (total_len, 20)

    model.fit(X, lengths)
    return model


In [10]:
HMM_neut = train_hmm(neut_encoded, n_states=6)
HMM_non = train_hmm(non_neut_encoded, n_states=6)

https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


In [None]:
# Print emission matrices
print("Neutralizing HMM emission matrix:")
print(HMM_neut.emissionprob_)


Neutralizing HMM emission matrix:
[[1.02059428e-02 5.09948239e-03 4.01058882e-02 3.26798873e-02
  3.08089086e-01 1.27914138e-01 2.74024675e-02 1.59443939e-02
  2.24958771e-03 8.25119996e-02 1.31362170e-01 1.75569828e-02
  2.58922017e-02 6.24944491e-03 4.06739227e-03 4.93130842e-02
  1.72346482e-02 4.50458374e-02 1.07086617e-04 5.09682786e-02]
 [2.00057914e-01 5.91321235e-04 7.32517602e-04 6.49976125e-03
  4.74819840e-04 1.69684192e-02 1.91552676e-02 5.83694350e-02
  9.26459467e-03 4.57657109e-02 3.22017792e-03 1.05131160e-04
  5.64259295e-02 1.52038951e-03 2.48958579e-01 1.87149739e-02
  7.24312731e-02 1.07027546e-01 5.69807403e-02 7.67354989e-02]
 [3.41020696e-01 4.95815206e-04 3.65433405e-01 2.94760498e-02
  3.63856430e-03 4.05905895e-02 1.91222792e-02 1.15612461e-02
  2.98613530e-03 1.85117429e-02 7.24280326e-04 1.61666569e-02
  1.21705888e-02 1.28834114e-02 4.97316158e-03 1.26924590e-02
  3.24496772e-02 6.06332150e-02 9.19222383e-03 5.27780356e-03]
 [4.30219565e-04 3.83836074e-02 5

In [None]:
print("\nNon-neutralizing HMM emission matrix:")
print(HMM_non.emissionprob_)


Non-neutralizing HMM emission matrix:
[[3.61935302e-02 3.81230838e-02 6.72860187e-02 1.00503220e-01
  4.08002494e-02 1.93907270e-01 3.75047250e-03 1.61925384e-02
  9.40470590e-03 8.06952296e-02 4.47789792e-03 1.23548144e-02
  6.89561795e-02 1.09346824e-03 2.07915330e-03 1.26315306e-01
  1.22071157e-01 4.33660918e-02 1.36717127e-02 1.87579007e-02]
 [4.54391887e-02 8.88324027e-04 3.57320129e-01 1.12434554e-02
  1.47072225e-02 1.40931475e-01 1.97329464e-02 4.35395727e-03
  3.57103798e-03 1.36002953e-02 1.12824136e-03 1.83684808e-02
  1.42815844e-02 3.16848404e-03 7.96993766e-03 4.91986465e-02
  1.30998593e-02 2.13314846e-02 3.36370914e-02 2.26028158e-01]
 [9.03953175e-01 3.87796535e-12 5.02763949e-11 3.65104194e-09
  4.66060738e-05 3.23360868e-03 2.37407731e-04 2.30956351e-03
  1.65531863e-06 5.09207550e-07 1.40151435e-12 3.82169134e-03
  2.17025522e-09 3.49161998e-04 3.72123264e-04 5.92836263e-03
  3.37482991e-02 4.51821103e-02 5.79624176e-05 7.57758080e-04]
 [1.89120108e-02 1.50246562e

In [11]:
import pandas as pd
import numpy as np

# List of amino acids
AA = "ACDEFGHIKLMNPQRSTVWY"
aa_to_idx = {aa:i for i,aa in enumerate(AA)}

# Function to create labeled DataFrame for emission matrix
def create_emission_df(emission_matrix, amino_acids=AA):
    n_states = emission_matrix.shape[0]
    df = pd.DataFrame(emission_matrix, columns=list(amino_acids))
    df.index = [f"State_{i+1}" for i in range(n_states)]
    return df

# Create labeled DataFrames
em_neut_df = create_emission_df(HMM_neut.emissionprob_)
em_non_df = create_emission_df(HMM_non.emissionprob_)

# Display the DataFrames
print("Neutralizing Antibodies Emission Probability Matrix:")
display(em_neut_df)

print("\nNon-Neutralizing Antibodies Emission Probability Matrix:")
display(em_non_df)


Neutralizing Antibodies Emission Probability Matrix:


Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
State_1,0.009905,0.031783,0.022549,0.002506,0.067712,0.244319,0.017232,0.007711,2.3e-05,0.114312,0.024206,0.061799,0.017343,0.017522,0.000234,0.130897,0.037397,0.035741,0.052295,0.104514
State_2,0.032137,0.014997,0.47421,0.015029,0.003779,0.071144,0.018945,0.000617,0.000705,0.007573,0.004594,0.022342,0.004966,0.011872,0.007342,0.008337,0.017308,0.000915,0.069408,0.213781
State_3,0.004312,0.009543,0.004849,0.006409,0.212806,0.059598,0.01607,0.061566,2.4e-05,0.04117,0.057676,0.014109,0.069732,0.003522,7.2e-05,0.055653,0.007356,0.077898,0.000292,0.297342
State_4,0.006427,0.00779,0.138647,0.042418,0.015633,0.066463,0.019749,0.022226,0.05732,0.038819,0.013897,0.01808,0.040482,0.016697,0.267403,0.087818,0.074637,0.042743,0.007773,0.014979
State_5,0.770547,0.000729,0.000172,0.001566,0.002391,0.015667,0.002994,0.003112,0.007471,0.009492,0.008116,0.008859,0.004396,0.001809,0.043739,0.009723,0.052263,0.05457,0.00072,0.001663
State_6,0.068128,0.017029,0.085138,0.055637,0.008695,0.211618,0.021823,0.030465,0.00129,0.062553,0.001456,0.018376,0.053364,0.018278,0.007632,0.112397,0.042082,0.075862,0.030541,0.077635



Non-Neutralizing Antibodies Emission Probability Matrix:


Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
State_1,0.004723,6.6e-05,0.613614,0.041597,0.014963,0.088159,0.013017,0.018989,0.000279,0.027361,0.005223,0.005182,0.014053,0.012704,0.001901,0.010922,0.004495,0.073956,0.005856,0.042941
State_2,0.001068,0.000194,0.001039,0.000278,0.359945,0.01141,0.001298,0.005372,0.087277,0.035585,0.164403,0.001518,0.002169,0.000137,0.269991,0.009809,0.008936,0.002472,0.002688,0.034413
State_3,0.001888,0.015836,0.043764,0.015546,0.033315,0.102385,0.015224,0.050119,0.00167,0.058949,0.008876,0.032427,0.082259,0.015054,0.005682,0.099467,0.034656,0.09432,0.037107,0.251457
State_4,0.009659,0.029068,0.01807,0.027502,0.031513,0.218403,0.029708,0.025962,0.005339,0.068702,0.007494,0.048293,0.045449,0.021243,0.019152,0.143907,0.050895,0.041821,0.024791,0.133027
State_5,0.132777,0.010879,0.030579,0.042718,0.001427,0.065411,0.020509,0.023227,0.05197,0.026793,0.001607,0.00245,0.015897,0.01566,0.291418,0.111075,0.106162,0.039117,0.002371,0.007953
State_6,0.504533,0.002596,0.016828,0.00264,0.003666,0.092608,0.015246,0.003152,0.001981,0.005114,0.001108,0.014341,0.006746,0.000341,0.024418,0.014008,0.030034,0.033589,0.039596,0.187455
