In [None]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from evolvepro.src.process import process_dataset, plot_mutations_per_position, plot_histogram_of_readout

In [None]:
# ---- Locations ----
# Every path below is built from project_root; the empty string keeps them
# relative to the current working directory.
project_root = ''
dms_data_dir = os.path.join(project_root, 'data', 'dms')
file_path = os.path.join(dms_data_dir, 'activity', 'Source.xlsx')
wt_fasta_path = os.path.join(dms_data_dir, 'wt_fasta', 'jones_WT.fasta')
output_dir = os.path.join(project_root, 'output', 'dms')

# ---- Dataset identity ----
dataset_name = 'jones'
sheet_name = 'ADRB2'            # worksheet inside the Excel workbook at file_path
activity_column = 'DMS_0.625'   # column holding the activity readout

# ---- Activity cutoff ----
cutoff_value = 2.8
cutoff_rule = 'greater_than'
cutoff_percentiles = [90, 95]

# ---- Remaining process_dataset options ----
AA_shift = None
drop_columns = True

In [None]:
# Run process_dataset on the configured dataset. Every option is forwarded by
# keyword, using the values set in the configuration cell.
# NOTE(review): the result names say "brenan" while dataset_name is 'jones' --
# presumably carried over from another notebook. They are kept because the
# plotting cell below reads brenan_df.
dataset_options = dict(
    file_path=file_path,
    dataset_name=dataset_name,
    wt_fasta_path=wt_fasta_path,
    activity_column=activity_column,
    cutoff_value=cutoff_value,
    output_dir=output_dir,
    sheet_name=sheet_name,
    cutoff_rule=cutoff_rule,
    cutoff_percentiles=cutoff_percentiles,
    AA_shift=AA_shift,
    drop_columns=drop_columns,
)
brenan_df, brenan_frac = process_dataset(**dataset_options)

In [None]:
# Visual checks on brenan_df, the first value returned by process_dataset above.
# NOTE(review): presumably plots how many mutations fall on each sequence
# position -- confirm against evolvepro.src.process.
plot_mutations_per_position(brenan_df)
# Readout plot; activity_column and cutoff_value are the same values that were
# passed to process_dataset, so the figure reflects the cutoff used there.
plot_histogram_of_readout(brenan_df, activity_column, cutoff_value)

In [None]:
# Generate ESM embeddings for all mutants.
import sys
import evolvepro.plm.esm.extract as esm_extract

# Derive the paths from the configuration cell instead of repeating them as
# literals, so changing project_root / output_dir / dataset_name there keeps
# this cell in step. With the current configuration these evaluate to the
# same values that were previously hard-coded here.
# NOTE(review): assumes process_dataset wrote <dataset_name>.fasta into
# output_dir -- confirm against evolvepro.src.process.
mutant_fasta_path = os.path.join(output_dir, dataset_name + '.fasta')
embeddings_dir = os.path.join(output_dir, 'embeddings')
concatenate_dir = os.path.join(project_root, 'output')

# esm_extract.main() takes its arguments from the command line, so the
# command-line input is simulated through sys.argv.
extract_argv = [
    'evolvepro/plm/esm/extract.py',
    'esm1b_t33_650M_UR50S',  # model name
    mutant_fasta_path,       # input sequence fasta file
    embeddings_dir,          # output location
    '--toks_per_batch', '512',
    '--include', 'mean',
    '--concatenate_dir', concatenate_dir,
]

# Swap sys.argv only for the duration of the call and always restore it:
# sys.argv is process-wide, and leaving the fake arguments in place would
# leak into every later cell run in this kernel (even if main() raises).
saved_argv = sys.argv
sys.argv = extract_argv
try:
    esm_extract.main()
finally:
    sys.argv = saved_argv