##### Import libraries

In [1]:
import pandas as pd 
from Bio import SeqIO
from tqdm import tqdm
from evo import Evo, score_sequences
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


##### Read model

In [2]:
# Load the pretrained Evo checkpoint and pull out its network and tokenizer.
evo_model = Evo('evo-1-131k-base')
tokenizer = evo_model.tokenizer
model = evo_model.model

# Device string reused by the scoring cell further down.
device = 'cuda:0'

# Move the network to that device and switch it to evaluation mode.
# model.eval() is kept as the last expression so the cell displays the module.
model.to(device)
model.eval()

Loading checkpoint shards: 100%|██████████| 3/3 [01:22<00:00, 27.52s/it]


StripedHyena(
  (embedding_layer): VocabParallelEmbedding(512, 4096)
  (norm): RMSNorm()
  (unembed): VocabParallelEmbedding(512, 4096)
  (blocks): ModuleList(
    (0-7): 8 x ParallelGatedConvBlock(
      (pre_norm): RMSNorm()
      (post_norm): RMSNorm()
      (filter): ParallelHyenaFilter()
      (projections): Linear(in_features=4096, out_features=12288, bias=True)
      (out_filter_dense): Linear(in_features=4096, out_features=4096, bias=True)
      (mlp): ParallelGatedMLP(
        (l1): Linear(in_features=4096, out_features=10928, bias=False)
        (l2): Linear(in_features=4096, out_features=10928, bias=False)
        (l3): Linear(in_features=10928, out_features=4096, bias=False)
      )
    )
    (8): AttentionBlock(
      (pre_norm): RMSNorm()
      (post_norm): RMSNorm()
      (inner_mha_cls): MHA(
        (Wqkv): Linear(in_features=4096, out_features=12288, bias=True)
        (inner_attn): FlashSelfAttention(
          (drop): Dropout(p=0.0, inplace=False)
        )
        

##### Read FASTA (`alt_output.fasta` is generated by running `evo_fasta_generator.ipynb` in the same folder.)

In [None]:
# Materialise every FASTA record in a list so it can be iterated more than once.
records = [fasta_record for fasta_record in SeqIO.parse("alt_output.fasta", "fasta")]

In [6]:
# FASTA header (record.description) -> sequence as a plain string.
seq_dict = {rec.description: str(rec.seq) for rec in records}

# One NaN placeholder per distinct sequence; filled in by the scoring cell.
seq_dict_score = dict.fromkeys(seq_dict.values(), np.nan)

##### Calculate Evo score

In [13]:
# Score every distinct sequence with Evo, one sequence per call.
# Each call receives a one-element list and its return value is stored unchanged,
# so downstream cells see exactly the same values as before.
# Progress is reported with the tqdm bar imported at the top of the notebook
# instead of a manual counter that printed a line every 500 sequences.
# Only existing keys are reassigned, so iterating the dict while updating it is safe.
for seq in tqdm(seq_dict_score, desc="Evo scoring"):
    seq_dict_score[seq] = score_sequences(
        [seq], model, tokenizer, device=device)
    

completed  500
completed  1000
completed  1500
completed  2000
completed  2500
completed  3000
completed  3500
completed  4000
completed  4500
completed  5000
completed  5500
completed  6000
completed  6500
completed  7000
completed  7500
completed  8000
completed  8500
completed  9000
completed  9500
completed  10000
completed  10500
completed  11000
completed  11500
completed  12000
completed  12500
completed  13000
completed  13500
completed  14000
completed  14500
completed  15000
completed  15500
completed  16000
completed  16500
completed  17000
completed  17500
completed  18000
completed  18500
completed  19000
completed  19500
completed  20000
completed  20500
completed  21000
completed  21500
completed  22000
completed  22500
completed  23000
completed  23500
completed  24000
completed  24500
completed  25000
completed  25500
completed  26000
completed  26500
completed  27000
completed  27500
completed  28000
completed  28500
completed  29000
completed  29500
completed  30000


##### Collect the Evo score of each unique mutation into a dataframe

In [40]:
# Map each FASTA header to the Evo score of its sequence.
# seq_dict_score is keyed by plain strings, so the Seq object is converted with
# str() before the lookup rather than relying on Seq hashing/comparing like str.
dct_result = {
    record.description: seq_dict_score[str(record.seq)]
    for record in records
}

# One row per header: the header becomes the "key" column, the score the second column.
alt_df = pd.DataFrame.from_dict(dct_result, orient='index').reset_index()
alt_df.columns = ["key", "alt_evo_score"]

In [48]:
# Show the (rows, columns) size of alt_df as a quick check.
alt_df.shape

(36922, 2)

##### Read kGain data to obtain unique id

In [42]:
# Load the "Gain score" sheet of the metadata workbook and preview its first rows.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
df_gain = pd.read_excel(
    "/home/bernadettem/bernadettenotebook/evo_16_nov/MetaData_ecoli_final.xlsx",
    sheet_name="Gain score",
)
df_gain.head()

Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,accumulated_gain
0,63,intergenic,A->C,A,C,noncoding,p6,-10.385914
1,201,thrL,T->G,T,G,missense,p6,-1.750693
2,241,thrL,A->C,A,C,missense,m1,-0.911836
3,309,thrA,T->G,T,G,noncoding,m1,4.009052
4,322,thrA,A->G,A,G,noncoding,p3,1.532477


In [43]:
# Create a single key column by concatenating all specified columns as strings
# Build one string key per row by joining the identifying columns with "_".
# The values are cast to str first so the numeric Position can be joined too.
df_gain['key'] = (
    df_gain.loc[:, ['Position', 'Gene', 'Allele', 'Ref_allele', 'Alt_allele',
                    'Annotation', 'label']]
    .astype(str)
    .agg('_'.join, axis=1)
)

In [38]:
# Display the gain table for inspection.
df_gain

Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,accumulated_gain,key
0,63,intergenic,A->C,A,C,noncoding,p6,-10.385914,63_ intergenic_ A->C_ A_C_ noncoding_p6
1,201,thrL,T->G,T,G,missense,p6,-1.750693,201_ thrL_ T->G_ T_G_ missense_p6
2,241,thrL,A->C,A,C,missense,m1,-0.911836,241_ thrL_ A->C_ A_C_ missense_m1
3,309,thrA,T->G,T,G,noncoding,m1,4.009052,309_ thrA_ T->G_ T_G_ noncoding_m1
4,322,thrA,A->G,A,G,noncoding,p3,1.532477,322_ thrA_ A->G_ A_G_ noncoding_p3
...,...,...,...,...,...,...,...,...,...
36917,4629541,lasT,C->T,C,T,missense,m2,-4.635292,4629541_ lasT_ C->T_ C_T_ missense_m2
36918,4629573,lasT,T->C,T,C,synonymous,m4,-7.657991,4629573_ lasT_ T->C_ T_C_ synonymous_m4
36919,4629600,lasT,A->G,A,G,missense,p3,2.403010,4629600_ lasT_ A->G_ A_G_ missense_p3
36920,4629600,lasT,A->G,A,G,missense,p6,2.403010,4629600_ lasT_ A->G_ A_G_ missense_p6


##### Merge with gain dataframe

In [44]:
# Inner join on "key": keep only the gain rows that have a matching Evo score.
df_combined_kgain_evo = df_gain.merge(alt_df, how='inner', on='key')
df_combined_kgain_evo

Unnamed: 0,Position,Gene,Allele,Ref_allele,Alt_allele,Annotation,label,accumulated_gain,key,alt_evo_score
0,63,intergenic,A->C,A,C,noncoding,p6,-10.385914,63_ intergenic_ A->C_ A_C_ noncoding_p6,-2.219959
1,201,thrL,T->G,T,G,missense,p6,-1.750693,201_ thrL_ T->G_ T_G_ missense_p6,-1.671160
2,241,thrL,A->C,A,C,missense,m1,-0.911836,241_ thrL_ A->C_ A_C_ missense_m1,-2.014411
3,309,thrA,T->G,T,G,noncoding,m1,4.009052,309_ thrA_ T->G_ T_G_ noncoding_m1,-1.905999
4,322,thrA,A->G,A,G,noncoding,p3,1.532477,322_ thrA_ A->G_ A_G_ noncoding_p3,-1.845984
...,...,...,...,...,...,...,...,...,...,...
36917,4629541,lasT,C->T,C,T,missense,m2,-4.635292,4629541_ lasT_ C->T_ C_T_ missense_m2,-2.286406
36918,4629573,lasT,T->C,T,C,synonymous,m4,-7.657991,4629573_ lasT_ T->C_ T_C_ synonymous_m4,-1.922138
36919,4629600,lasT,A->G,A,G,missense,p3,2.403010,4629600_ lasT_ A->G_ A_G_ missense_p3,-2.212620
36920,4629600,lasT,A->G,A,G,missense,p6,2.403010,4629600_ lasT_ A->G_ A_G_ missense_p6,-2.212620


##### Save dataframe for future use

In [None]:
# Write the per-mutation Evo scores to evo_score.csv.
# The merged frame built above has no "CHROM" column, so selecting it directly
# raises KeyError. The previously saved output shows "chrI" on every row, so that
# constant is added when the column is missing.
# NOTE(review): confirm "chrI" is the intended chromosome label.
evo_score_export = df_combined_kgain_evo
if "CHROM" not in evo_score_export.columns:
    evo_score_export = evo_score_export.assign(CHROM="chrI")

evo_score_export[["CHROM", "Position", "Gene", "Allele", "alt_evo_score"]].to_csv("evo_score.csv", index=False)

Unnamed: 0,CHROM,Position,Gene,Allele,alt_evo_score
0,chrI,63,intergenic,A->C,-2.219959
1,chrI,201,thrL,T->G,-1.671160
2,chrI,241,thrL,A->C,-2.014411
3,chrI,309,thrA,T->G,-1.905999
4,chrI,322,thrA,A->G,-1.845984
...,...,...,...,...,...
36917,chrI,4629541,lasT,C->T,-2.286406
36918,chrI,4629573,lasT,T->C,-1.922138
36919,chrI,4629600,lasT,A->G,-2.212620
36920,chrI,4629600,lasT,A->G,-2.212620
