In [10]:
import os
import pandas as pd

# --- Configuration: input and output files all live in one folder ---
output_folder        = '../../AIzymes_IC50'
df_out               = 'OXA48_IC50.csv'
prep_df_in           = 'OXA48_IC50_prep.csv'
reference_seq        = 'reference_sequence.seq'

os.makedirs(output_folder, exist_ok=True)

# Construct the full file paths
file_path           = os.path.join(output_folder, prep_df_in)
reference_file_path = os.path.join(output_folder, reference_seq)
df_out_path         = os.path.join(output_folder, df_out)

# Fail fast when an input is missing. Merely printing a message and carrying on
# would either crash later with a NameError or, worse, silently reuse a stale
# `df` / `reference_sequence` left in the kernel by an earlier run.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")
df = pd.read_csv(file_path)

if not os.path.exists(reference_file_path):
    raise FileNotFoundError(f"Reference sequence file not found: {reference_file_path}")
with open(reference_file_path, 'r') as f:
    reference_sequence = f.read().strip()  # Strip any extra whitespace/newlines


def apply_mutations(reference, mutations):
    """Return `reference` with the point mutations in `mutations` applied.

    Parameters
    ----------
    reference : str
        The unmodified sequence.
    mutations : str or NaN
        '/'-separated point mutations such as 'F72L/S212A': original residue,
        1-based position, new residue. NaN means "no mutations".

    Returns
    -------
    str
        The mutated sequence. A mutation whose position lies outside the
        reference, or whose original residue does not match the reference at
        that position, is skipped and a warning is printed.

    Raises
    ------
    ValueError
        If the position part of a mutation is not an integer.
    """
    if pd.isna(mutations):
        return reference

    residues = list(reference)
    for mutation in mutations.split('/'):
        mutation = mutation.strip()  # tolerate 'F72L / S212A'
        if not mutation:
            continue  # tolerate a leading/trailing/double '/'

        original_aa = mutation[0]           # The original amino acid
        position = int(mutation[1:-1]) - 1  # The position (convert to 0-based index)
        new_aa = mutation[-1]               # The new amino acid

        # Without this check a position of 0 would become index -1 and silently
        # address the LAST residue, and a too-large position would raise a bare
        # IndexError with no hint of which mutation caused it.
        if not 0 <= position < len(reference):
            print(f"Warning: Mutation {mutation} is outside the reference sequence (length {len(reference)})")
            continue

        # Only apply the mutation if the stated original residue matches the reference
        if reference[position] == original_aa:
            residues[position] = new_aa
        else:
            print(f"Warning: Mutation {mutation} does not match the reference sequence at position {position + 1}")

    return ''.join(residues)


# One mutated sequence per row, in row order
sequences = [apply_mutations(reference_sequence, mutations) for mutations in df['mutations']]

# Add the new 'sequence' column to the DataFrame
df['sequence'] = sequences

# Display the updated DataFrame
display(df)

# Save the updated DataFrame to a new CSV file
df.to_csv(df_out_path, index=False)
print(f"Updated DataFrame saved to {df_out_path}")

Unnamed: 0,name,mutations,substrate,kcat,dkcat,KM,dKM,kcat/KM,dkcat/KM,short code,no. mutations,IC50,dIC50,MIC,sequence
0,wtOXA-48,,CAZ,0.0028,0.0008,247.0,140.0,11.0,11.0,AFST,0,0.013,0.002,0.03,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...
1,,A33V,CAZ,,,,,,,VFST,1,0.017,0.004,0.03,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...
2,,F72L,CAZ,0.0049,0.0004,18.0,5.0,281.0,281.0,ALST,1,0.029,0.006,0.12,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...
3,,S212A,CAZ,,,,,24.0,24.0,AFAT,1,0.015,0.002,0.03,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...
4,,T213A,CAZ,,,,,31.0,31.0,AFSA,1,0.012,0.001,0.03,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...
5,,F72L/S212A,CAZ,0.0037,0.0003,7.0,3.0,565.0,565.0,ALSA,2,0.177,0.063,1.0,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...
6,,F72L/T213A,CAZ,0.018,0.0005,53.0,5.0,339.0,339.0,ALAT,2,0.14,0.003,0.5,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...
7,,A33V/F72L,CAZ,,,,,,,VLST,2,0.034,0.002,0.12,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...
8,,S212A/T213A,CAZ,,,,,98.0,98.0,AFAA,2,0.023,0.005,0.06,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...
9,,A33V/S212A,CAZ,,,,,,,VFAT,2,0.014,0.001,0.03,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...


Updated DataFrame saved to ../../AIzymes_IC50\OXA48_IC50.csv


In [None]:
%run src/plm_trainer_multi_small.py
from src.tools import reset_cuda
reset_cuda()

output_folder        = '../../AIzymes_IC50'
df_path               = output_folder+'/OXA48_IC50.csv'

reset_cuda()

dataset = PLM_trainer(
    output_folder   = output_folder,
    verbose         = False
    )

PLM_trainer.load_dataset(    
    dataset,            
    df_path         = df_path,
    scores          = ['IC50','kcat','kcat/KM'],
    labels          = [],
    select_unique   = True,
    print_testtrain = True,
    normalize       = 'minmax'
    )

PLM_trainer.train_PLM( 
    dataset,
    epochs          = 100,
    esm2_model_name = "facebook/esm2_t6_8M_UR50D",
    p_loss          = 0.3,
    liveplot        = False,
    overwrite       = True
)


### PLM trainer loaded. ###
### Data loaded from: ../../AIzymes_IC50/OXA48_IC50.csv ###
### Data normalized. ###
train_df


Unnamed: 0,sequence,IC50,kcat,kcat/KM,norm_IC50,norm_kcat,norm_kcat/KM
2,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...,0.012,,31.0,0.0,,0.006229
14,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...,0.148,,,0.271457,,
3,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...,0.013,0.0028,11.0,0.001996,0.0,0.0
10,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...,0.015,,,0.005988,,
0,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...,0.023,,98.0,0.021956,,0.027094
5,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...,0.177,0.0037,565.0,0.329341,0.059211,0.172532
4,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...,0.389,0.0068,3222.0,0.752495,0.263158,1.0
15,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...,0.034,,,0.043912,,
13,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...,0.141,,,0.257485,,
12,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...,0.513,0.0101,1555.0,1.0,0.480263,0.480847


test_df


Unnamed: 0,sequence,IC50,kcat,kcat/KM,norm_IC50,norm_kcat,norm_kcat/KM
7,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...,0.029,0.0049,281.0,0.033932,0.138158,0.084086
11,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...,0.017,,,0.00998,,
6,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNAHFTEHKSQGVVVL...,0.14,0.018,339.0,0.255489,1.0,0.102149
9,MRVLALSAVFLVASIIGMPAVAKEWQENKSWNVHFTEHKSQGVVVL...,0.014,,,0.003992,,


 16%|█▌        | 16/100 [00:54<05:57,  4.25s/it]

In [20]:
%run src/plm_trainer_multi.py
output_folder = '../../AIzymes_resi99_multi'
plot_summary(output_folder, models = ['facebook/esm2_t6_8M_UR50D', 'facebook/esm2_t12_35M_UR50D', 'facebook/esm2_t30_150M_UR50D'])

### PLM trainer loaded. ###
