# Notebook corresponding to case studies taken from MaveDB

In [1]:
from pathlib import Path

In [2]:
import os
base_path = Path("/cta/share/users/ProteinGym/Clinical_Variants(Substitutions)")
list(base_path.glob("*.csv"))[:5]

[PosixPath('/cta/share/users/ProteinGym/Clinical_Variants(Substitutions)/NP_003110.1.csv'),
 PosixPath('/cta/share/users/ProteinGym/Clinical_Variants(Substitutions)/NP_005076.3.csv'),
 PosixPath('/cta/share/users/ProteinGym/Clinical_Variants(Substitutions)/NP_005493.2.csv'),
 PosixPath('/cta/share/users/ProteinGym/Clinical_Variants(Substitutions)/NP_057097.2.csv'),
 PosixPath('/cta/share/users/ProteinGym/Clinical_Variants(Substitutions)/NP_055790.1.csv')]

In [3]:
import pandas as pd
# Use a generator expression to read and concatenate CSVs
df = pd.concat((pd.read_csv(file) for file in base_path.rglob("*.csv")), ignore_index=True)
# df = pd.read_csv('/cta/share/users/ProteinGym/Clinical_Variants(Substitutions)/NP_055790.1.csv')
df[df["DMS_bin_score"] == "Benign"]

Unnamed: 0.1,Unnamed: 0,protein,protein_sequence,mutant,mutated_sequence,DMS_bin_score
0,67567,NP_003110.1,MAVLLLLLRALRRGPGPGPRPLWGPGPAWSPGFPARPGRGRPYMAS...,R13G,MAVLLLLLRALRGGPGPGPRPLWGPGPAWSPGFPARPGRGRPYMAS...,Benign
1,67568,NP_003110.1,MAVLLLLLRALRRGPGPGPRPLWGPGPAWSPGFPARPGRGRPYMAS...,G32E,MAVLLLLLRALRRGPGPGPRPLWGPGPAWSPEFPARPGRGRPYMAS...,Benign
2,67569,NP_003110.1,MAVLLLLLRALRRGPGPGPRPLWGPGPAWSPGFPARPGRGRPYMAS...,K113R,MAVLLLLLRALRRGPGPGPRPLWGPGPAWSPGFPARPGRGRPYMAS...,Benign
5,67572,NP_003110.1,MAVLLLLLRALRRGPGPGPRPLWGPGPAWSPGFPARPGRGRPYMAS...,V183I,MAVLLLLLRALRRGPGPGPRPLWGPGPAWSPGFPARPGRGRPYMAS...,Benign
6,67573,NP_003110.1,MAVLLLLLRALRRGPGPGPRPLWGPGPAWSPGFPARPGRGRPYMAS...,R294H,MAVLLLLLRALRRGPGPGPRPLWGPGPAWSPGFPARPGRGRPYMAS...,Benign
...,...,...,...,...,...,...
62715,18955,NP_004647.1,MNKGWLELESDPGLFTLLVEDFGVKGVQVEEIYDLQSKCQGPVYGF...,S596G,MNKGWLELESDPGLFTLLVEDFGVKGVQVEEIYDLQSKCQGPVYGF...,Benign
62717,18957,NP_004647.1,MNKGWLELESDPGLFTLLVEDFGVKGVQVEEIYDLQSKCQGPVYGF...,V476A,MNKGWLELESDPGLFTLLVEDFGVKGVQVEEIYDLQSKCQGPVYGF...,Benign
62718,18958,NP_004647.1,MNKGWLELESDPGLFTLLVEDFGVKGVQVEEIYDLQSKCQGPVYGF...,T423K,MNKGWLELESDPGLFTLLVEDFGVKGVQVEEIYDLQSKCQGPVYGF...,Benign
62719,18959,NP_004647.1,MNKGWLELESDPGLFTLLVEDFGVKGVQVEEIYDLQSKCQGPVYGF...,N290S,MNKGWLELESDPGLFTLLVEDFGVKGVQVEEIYDLQSKCQGPVYGF...,Benign


# Apply segmentation

In [4]:
import numpy as np
from tokenizers import Tokenizer
from vocabulary_functions import calc_dice_from_encodings

In [5]:
tkz_path = "/cta/share/users/mutbpe/tokenizers/blosum62/hf_uniref50_bpe_51200.json"
tkz1 = Tokenizer.from_file(tkz_path)
tkz_path = "/cta/share/users/mutbpe/tokenizers/blosum62/hf_uniref50_mutbpe_0.7_3_12_0.05_51200.json"
tkz2 = Tokenizer.from_file(tkz_path)

In [None]:
import numpy as np
from scipy.spatial.distance import pdist





'''
Given a Tokenizers encoding offsets offs
Assign each symbol in the sequence its token id
Return the numpy list containing assignments
(Like cluster assignments)
'''
def _offsets_to_cluster(offs):
    seq_len = offs[-1][-1]
    cluster_assign = np.zeros((seq_len, 1))
    for i, (b, e) in enumerate(offs):
        cluster_assign[b:e] = i
    return cluster_assign


# Cluster based rand-index
def calc_rand_index(offs1, offs2):
    clusters1 = _offsets_to_cluster(offs1)
    clusters2 = _offsets_to_cluster(offs2)
    assign_eq = pdist(clusters1, metric='hamming')  # 0 if same, 1 if different
    label_eq = pdist(clusters2, metric='hamming') # 0 if same, 1 if different
    rand_index = np.mean(assign_eq == label_eq)
    return rand_index

# Cluster based dice index
def calc_dice_index(offs1, offs2):
    clusters1 = _offsets_to_cluster(offs1)
    clusters2 = _offsets_to_cluster(offs2)
    assign_eq = pdist(clusters1, metric='hamming')  # 0 if same, 1 if different
    label_eq = pdist(clusters2, metric='hamming') # 0 if same, 1 if different
    # Boolean masks
    same_assign = assign_eq == 0
    same_label = label_eq == 0
    TP = np.sum(same_assign & same_label)
    FP = np.sum(same_assign & ~same_label)
    FN = np.sum(~same_assign & same_label)
    # Jaccard index (ignores TN)
    dice_index = 2*TP / (2*TP + FP + FN + 1e-10)
    return dice_index

# Cluster based jaccard index
def calc_jaccard_index(offs1, offs2):
    clusters1 = _offsets_to_cluster(offs1)
    clusters2 = _offsets_to_cluster(offs2)
    assign_eq = pdist(clusters1, metric='hamming')  # 0 if same, 1 if different
    label_eq = pdist(clusters2, metric='hamming') # 0 if same, 1 if different
    # Boolean masks
    same_assign = assign_eq == 0
    same_label = label_eq == 0
    TP = np.sum(same_assign & same_label)
    FP = np.sum(same_assign & ~same_label)
    FN = np.sum(~same_assign & same_label)
    # Jaccard index (ignores TN)
    jaccard_index = TP / (TP + FP + FN + 1e-10)
    return jaccard_index

# Calc all indices at the same time to avoid redundancy
# Returns rand, dice, jaccard
def calc_all_indices(offs1, offs2):
    clusters1 = _offsets_to_cluster(offs1)
    clusters2 = _offsets_to_cluster(offs2)
    assign_eq = pdist(clusters1, metric='hamming')  # 0 if same, 1 if different
    label_eq = pdist(clusters2, metric='hamming') # 0 if same, 1 if different
    same_assign = assign_eq == 0
    same_label = label_eq == 0
    TP = np.sum(same_assign & same_label)
    FP = np.sum(same_assign & ~same_label)
    FN = np.sum(~same_assign & same_label)
    TN = np.sum(~same_assign & ~same_label)


    jaccard_index = TP / (TP + FP + FN + 1e-10)
    dice_index = 2*TP / (2*TP + FP + FN + 1e-10)
    rand_index = (TP + TN) / (TP + FP + FN + TN)
    
    return rand_index, dice_index, jaccard_index




In [8]:
def batch_calc(tkz, ref_list, target_list):
    scores = np.zeros((len(target_list), 4))
    for i, seq in enumerate(target_list):
        ref_seq = ref_list[i]
        ref_enc = tkz.encode(ref_seq)
        mutated_enc = tkz.encode(seq)
        scores[i, :] = np.array([
                calc_rand_index(ref_enc, mutated_enc),
                calc_dice_from_encodings((ref_enc, mutated_enc)),
                calc_dice_index(ref_enc, mutated_enc),
                calc_jaccard_index(ref_enc, mutated_enc)
        ])
    return scores.mean(axis = 0)

def batch_calc_eq(tkz, ref_list, target_list):
    ct = 0
    for i, seq in enumerate(target_list):
        ref_seq = ref_list[i]
        ref_enc = tkz.encode(ref_seq)
        mutated_enc = tkz.encode(seq)
        if ref_enc.offsets == mutated_enc.offsets:
            ct += 1
    return ct
        
df_benign = df[df["DMS_bin_score"] == "Benign"]
df_patho = df[df["DMS_bin_score"] == "Pathogenic"]
print("BPE Path", batch_calc_eq(tkz1, df_patho["protein_sequence"].to_list()[:30000], df_patho["mutated_sequence"].to_list()[:30000]))
print("PUMA Path", batch_calc_eq(tkz2, df_patho["protein_sequence"].to_list()[:30000], df_patho["mutated_sequence"].to_list()[:30000]))
print("BPE Benign", batch_calc_eq(tkz1, df_benign["protein_sequence"].to_list()[:30000], df_benign["mutated_sequence"].to_list()[:30000]))
print("PUMA Benign", batch_calc_eq(tkz2, df_benign["protein_sequence"].to_list()[:30000], df_benign["mutated_sequence"].to_list()[:30000]))


BPE Path 14329
PUMA Path 15045
BPE Benign 16018
PUMA Benign 16794


# Notlar
- Ayni kalan proteinler ne kadar ayni iki algoritma arasinda.
- DMS datasindaki, fonksiyonel olarak coherent butun parcalar ile bizim tokenlarimiz arasinda bir uyusma var mi diye incele

In [16]:
df[df["DMS_bin_score"] == "Pathogenic"].__len__()
#

32000

In [19]:
encs1 = tkz1.encode_batch(df_patho["protein_sequence"])
encs2 = tkz1.encode_batch(df_patho["mutated_sequence"])
from tqdm import tqdm
data = []
for enc1, enc2 in tqdm(zip(encs1, encs2)):
    data.append(calc_all_indices(enc1, enc2))

32000it [01:28, 360.28it/s] 


In [22]:
type(encs1[1].offsets[0])

tuple

In [34]:
def take_offsets_subset(offs, lower_idx, upper_idx):
    new_offsets = []
    for s,e in offs:
        start, end = 0, 0
        if e <= lower_idx:
            continue
        if s > upper_idx:
            continue
        if s < lower_idx:
            start = 0
        else:
            start = s - lower_idx
        if e > upper_idx:
            end = upper_idx - lower_idx + 1
        else:
            end = e - lower_idx
        new_offsets.append((start, end))
    return new_offsets


# Instead of calculating the metrics through whole sequence, calculate them on a short window around the mutation.
mutants = df_patho["mutant"].to_list()
mutants = [int(mut[1:-1]) for mut in mutants]
for i, (enc1, enc2) in tqdm(enumerate(zip(encs1, encs2))):
    offsets1 = enc1.offsets.copy()
    offsets2 = enc2.offsets.copy()
    if offsets1 == offsets2:
        print(calc_all_indices(enc1, enc2))
    seq_len = offsets1[-1][-1]
    mutation_pos = mutants[i]
    if seq_len <= 100:
        print("less than 101")
        pass # stop calculating
    else:
        # Window size 101
        upper_idx = mutation_pos + 50
        lower_idx = mutation_pos - 50
        if upper_idx >= seq_len:
            lower_idx += -(upper_idx - seq_len + 1)
            upper_idx = seq_len - 1
        if lower_idx < 0:
            upper_idx -= lower_idx
            lower_idx = 0
        new1 = take_offsets_subset(offsets1, lower_idx, upper_idx)
        new2 = take_offsets_subset(offsets2, lower_idx, upper_idx)
        enc1.offsets = new1
        enc2.offsets = new2
        print(calc_all_indices(enc1, enc2))
        

            

0it [00:00, ?it/s]


AttributeError: attribute 'offsets' of 'tokenizers.Encoding' objects is not writable

In [37]:
a = [1, 2, 3]
a.clear()
a

[]