In [7]:
#0 Module import

import numpy as np
import pandas as pd
import random 
import time
import os
import pickle
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

In [8]:
#1 Validation Dataset Preparation

##1.1 Dataset input 
file_path = 'CRISPRoffT_filtered.csv'
df = pd.read_csv(file_path, sep=',')

##1.2 Dataset filtering 
filtered_df = df[(df['Species'] == "Homo sapiens") & \
                 (df['Cas9_type'] == "SpCas9") & \
                 (df['gRNA'].isin(["sgRNA","Alt-R XT 2-part gRNA"]))
                ]

##1.3 Dataset subsetting

validated_df = filtered_df[(filtered_df['Validation'].notna()) & (filtered_df['Identity'] == "OFF")]
validated_df = validated_df[~validated_df['Target_sequence'].str.contains('I', case=False, na=False)]

validated_onlyMis_df = validated_df[(validated_df['Bulge'].isna()) & (validated_df['Bulge2'].isna())]
non_validated_df = filtered_df[filtered_df['Validation'].isna()]

#### 1.3.1 Drop duplicates as some pairs were identical but retrieved in different conditions 
validated_onlyMis_df.to_csv('validated_onlyMis_df.csv', index=False)

validated_onlyMis_df = validated_onlyMis_df.drop_duplicates(
    subset=['Guide_sequence', 'Target_sequence', 'Validation'], 
    keep='first'
)
validated_onlyMis_df.to_csv('validated_onlyMis_df_removed_duplicates.csv', index=False)

validated_df_sgRNA_DNA = validated_df[['Guide_sequence', 'Target_sequence','Validation']]
validated_df_sgRNA_DNA = validated_df_sgRNA_DNA.drop_duplicates()
validated_df_sgRNA_DNA_tp = validated_df_sgRNA_DNA[validated_df_sgRNA_DNA['Validation'] == True ]
validated_df_sgRNA_DNA['Validation'] = validated_df_sgRNA_DNA['Validation'].apply(lambda x: 1 if x else 0)


validated_onlyMis_df_sgRNA_DNA = validated_onlyMis_df[['Guide_sequence', 'Target_sequence','Validation']]
validated_onlyMis_df_sgRNA_DNA = validated_onlyMis_df_sgRNA_DNA.drop_duplicates()
validated_onlyMis_df_sgRNA_DNA_tp = validated_onlyMis_df_sgRNA_DNA[validated_onlyMis_df_sgRNA_DNA['Validation'] == True ]
validated_onlyMis_df_sgRNA_DNA['Validation'] = validated_onlyMis_df_sgRNA_DNA['Validation'].apply(lambda x: 1 if x else 0)

validated_onlyMis_df_sgRNA_DNA_tp.to_csv('validated_onlyMis_df_sgRNA_DNA_tp.csv', index=False)
validated_df_sgRNA_DNA_tp.to_csv('validated_df_sgRNA_DNA_tp.csv', index=False)
validated_onlyMis_df_sgRNA_DNA.to_csv('validated_onlyMis_df_sgRNA_DNA.csv', index=False)


print(len(validated_df),len(validated_onlyMis_df),len(non_validated_df))

print(len(validated_df_sgRNA_DNA), len(validated_onlyMis_df_sgRNA_DNA), len(validated_df_sgRNA_DNA_tp), len(validated_onlyMis_df_sgRNA_DNA_tp))

  df = pd.read_csv(file_path, sep=',')


5040 2764 228277
2869 2764 447 417


In [12]:
len(set(validated_df_sgRNA_DNA['Guide_sequence']))

55

In [3]:
#2 Public Dataset Preparation

with open('./SM_OT_Review/dataset_benchmarking2023/HEK293T.pkl', "rb") as f:
    data_HEK293T = pickle.load(f)
    
with open('./SM_OT_Review/dataset_benchmarking2023/K562.pkl', "rb") as f:
    data_K562 = pickle.load(f)  
    
with open('./SM_OT_Review/dataset_benchmarking2023/II3.pkl', "rb") as f:
    data_II3 = pickle.load(f)
    
with open('./SM_OT_Review/dataset_benchmarking2023/II4.pkl', "rb") as f:
    data_II4 = pickle.load(f)  
    
with open('./SM_OT_Review/dataset_benchmarking2023/II5.pkl', "rb") as f:
    data_II5 = pickle.load(f)
    
with open('./SM_OT_Review/dataset_benchmarking2023/II6.pkl', "rb") as f:
    data_II6 = pickle.load(f)    
     

In [54]:
data_HEK293T

Unnamed: 0,sgRNA,DNA,label
0,GCCTCTTTCCCACCCACCTTGGG,GTCTCTTTCCCAGCGACCTGGGG,0
1,GACTTGTTTTCATTGTTCTCAGG,GAGTCATTTTCATTGTCTTCATG,0
2,GGTGAGTGAGTGTGTGCGTGTGG,TGTGAGTGTGTGTGTGTGTGTGT,0
3,GGTGAGTGAGTGTGTGCGTGTGG,TGTGTGTTCGTGTGTGCGTGTGT,0
4,GCCTCCCCAAAGCCTGGCCAGGG,GCTTCTCCAAAGCCTTCAGAGGG,0
...,...,...,...
132909,GGTGAGTGAGTGTGTGCGTGTGG,TGTGTGTGTGTGTGTATGTGTGC,0
132910,GGTGAGTGAGTGTGTGCGTGTGG,TGTGAGTGTGTGTGTGTGTGTGT,0
132911,GCCTCCCCAAAGCCTGGCCAGGG,TCAGCCCCAAAGCCTGGCCTGTT,0
132912,GGTGAGTGAGTGTGTGCGTGTGG,AGCGTGTGCGTGTGTGTGTGTGT,0


In [4]:
data_HEK293T_filtered = data_HEK293T[
    ~data_HEK293T.apply(lambda row: (row['sgRNA'], row['DNA']) in sgRNAs_val, axis=1)
]

data_K562_filtered = data_K562[
    ~data_K562.apply(lambda row: (row['sgRNA'], row['DNA']) in sgRNAs_val, axis=1)
]

data_II3_filtered = data_II3[
    ~data_II3.apply(lambda row: (row['sgRNA'], row['DNA']) in sgRNAs_val, axis=1)
]

data_II4_filtered = data_II4[
    ~data_II4.apply(lambda row: (row['sgRNA'], row['DNA']) in sgRNAs_val, axis=1)
]

data_II5_filtered = data_II5[
    ~data_II5.apply(lambda row: (row['sgRNA'], row['DNA']) in sgRNAs_val, axis=1)
]

data_II6_filtered = data_II6[
    ~data_II6.apply(lambda row: (row['sgRNA'], row['DNA']) in sgRNAs_val, axis=1)
]

print(len(data_HEK293T) - len(data_HEK293T_filtered))
print(len(data_K562) - len(data_K562_filtered))
print(len(data_II3) - len(data_II3_filtered))
print(len(data_II4) - len(data_II4_filtered))
print(len(data_II5) - len(data_II5_filtered))
print(len(data_II6) - len(data_II6_filtered))

NameError: name 'sgRNAs_val' is not defined

In [6]:
data_sets = [data_HEK293T, data_K562, data_II3, data_II4, data_II5, data_II6]
data_sets_names = ['HEK293T','K562','II3','II4','II5','II6']
i = 0

def read_npz(file_path):
    data  = np.load(file_path, allow_pickle=True)
    ont   = data['ont']
    offt  = data['offt']
    label = data['label']
    return ont, offt, label

sgRNAs_val = []
for index, row in validated_df_sgRNA_DNA.iterrows():
    sgRNAs_val.append([row['Guide_sequence'], row['Target_sequence']])
sgRNAs_val = set(zip(
    validated_df_sgRNA_DNA['Guide_sequence'],
    validated_df_sgRNA_DNA['Target_sequence']
))

for data_set in data_sets:
    print(len(data_set))
    
    df_filtered = data_set[
    ~data_set.apply(lambda row: (row['sgRNA'], row['DNA']) in sgRNAs_val, axis=1)
    ]
    print(len(df_filtered))

    np.savez('./Datasets/training_sets_' + data_sets_names[i] + '_ori_filtered.npz', 
                 ont=df_filtered['sgRNA'].to_numpy(), 
                 offt=df_filtered['DNA'].to_numpy(),
                label=df_filtered['label'].to_numpy())
    i += 1

132914
132516
20319
20242
217733
217217
294534
294316
95829
95824
383463
383463


In [52]:
## BACKUP CODES
## remove duplicates in public_datasets
data_sets = ['HEK293T','K562','II3','II4','II5','II6']

def read_npz(file_path):
    data  = np.load(file_path, allow_pickle=True)
    ont   = data['ont']
    offt  = data['offt']
    label = data['label']
    return ont, offt, label

sgRNAs_val = []
for index, row in validated_df_sgRNA_DNA.iterrows():
    sgRNAs_val.append([row['Guide_sequence'], row['Target_sequence']])
sgRNAs_val = set(zip(
    validated_df_sgRNA_DNA['Guide_sequence'],
    validated_df_sgRNA_DNA['Target_sequence']
))


for data_set in data_sets:
    filename = './Datasets/training_sets_' + data_set + '_1.npz'
    ont, offt, label = read_npz(filename)
    data = {
        'ont': ont,
        'offt': offt,
        'label':label
    }
    
    df = pd.DataFrame(data)
    
    df_filtered = df[
    ~df.apply(lambda row: (row['ont'], row['offt']) in sgRNAs_val, axis=1)
    ]


    np.savez('./Datasets/training_sets_' + data_set + '_filtered.npz', 
                 ont=df_filtered['ont'].to_numpy(), 
                 offt=df_filtered['offt'].to_numpy(),
                label=df_filtered['label'].to_numpy())
    #print(len(df))
    print(len(df) - len(df_filtered))
    print("filtered:{}".format(len(df_filtered)))

21872
filtered:236851
2732
filtered:39282
4902
filtered:399735
21441
filtered:529626
5960
filtered:187009
18473
filtered:750230


In [53]:
## BACKUP CODES
## remove duplicates in public_datasets
data_sets = ['HEK293T','K562','II3','II4','II5','II6']

def read_npz(file_path):
    data  = np.load(file_path, allow_pickle=True)
    ont   = data['ont']
    offt  = data['offt']
    label = data['label']
    return ont, offt, label

sgRNAs_val = []
for index, row in validated_df_sgRNA_DNA.iterrows():
    sgRNAs_val.append([row['Guide_sequence'], row['Target_sequence']])
sgRNAs_val = set(zip(
    validated_df_sgRNA_DNA['Guide_sequence'],
    validated_df_sgRNA_DNA['Target_sequence']
))


for data_set in data_sets:
    filename = './Datasets/training_sets_' + data_set + '_3.npz'
    ont, offt, label = read_npz(filename)
    data = {
        'ont': ont,
        'offt': offt,
        'label':label
    }
    
    df = pd.DataFrame(data)
    
    df_filtered = df[
    ~df.apply(lambda row: (row['ont'], row['offt']) in sgRNAs_val, axis=1)
    ]


    np.savez('./Datasets/training_sets_' + data_set + '_filtered2.npz', 
                 ont=df_filtered['ont'].to_numpy(), 
                 offt=df_filtered['offt'].to_numpy(),
                label=df_filtered['label'].to_numpy())
    #print(len(df))
    print(len(df) - len(df_filtered))
    print("filtered:{}".format(len(df_filtered)))

19908
filtered:241930
2615
filtered:39122
4592
filtered:390306
22829
filtered:548364
5928
filtered:186636
18352
filtered:749966


In [None]:
#3 Integrate validation dataset

##3.1 Exclude idential pairs in validated dataset
sgRNAs_val = set(validated_onlyMis_df_sgRNA_DNA['Guide_sequence'])
selected_sgRNAs_list = []
for i in range(5):
    random.seed(42+i)
    selected_sgRNAs = random.sample(list(sgRNAs_val), 5)
    selected_sgRNAs_list.append(selected_sgRNAs)
    
data_HEK293Ts = []
data_K562s = []
data_II3s = []
data_II4s = []
data_II5s = []
data_II6s = []

data_testing = []

##3.2 Combine validation dataset with public datasets
q = 0
for sgRNAs_val in selected_sgRNAs_list:
    
    ## Validated_only dataset

    """ Generate the validated_only datasets for training a
    nd testing, the validated_only dataset contains same sgRNA as 
    public datasets
    """  

    data_val = validated_onlyMis_df_sgRNA_DNA[~validated_onlyMis_df_sgRNA_DNA['Guide_sequence'].isin(sgRNAs_val)]
    data_val.rename(columns={'Guide_sequence': 'sgRNA',
                               'Target_sequence': 'DNA',
                               'Validation': 'label'}, inplace=True)
    
    ## To save the validation datasets for training 
    data_val.to_csv('./Datasets/Val_only' + str(q) + '.csv', index = False) 
    
    data_test = validated_onlyMis_df_sgRNA_DNA[validated_onlyMis_df_sgRNA_DNA['Guide_sequence'].isin(sgRNAs_val)]
    data_test.rename(columns={'Guide_sequence': 'sgRNA',
                               'Target_sequence': 'DNA',
                               'Validation': 'label'}, inplace=True)
    
    ## To save the testing datasets
    data_test.to_csv('./Datasets/Testing' + str(q) + '.csv', index = False)
    q += 1
    data_testing.append(data_test)
    
    print(len(data_val),len(data_test))
    

    ## filter sgRNAs from original dataset
    ##? update: 2025-03-31 to remove all duplicates in validation pool?(all pairs)
    
    data_HEK293T_filtered = data_HEK293T[~data_HEK293T['sgRNA'].isin(sgRNAs_val)]
    data_K562_filtered = data_K562[~data_K562['sgRNA'].isin(sgRNAs_val)]
    data_II3_filtered = data_II3[~data_II3['sgRNA'].isin(sgRNAs_val)]
    data_II4_filtered = data_II4[~data_II4['sgRNA'].isin(sgRNAs_val)]
    data_II5_filtered = data_II5[~data_II5['sgRNA'].isin(sgRNAs_val)]
    data_II6_filtered = data_II6[~data_II6['sgRNA'].isin(sgRNAs_val)]

    
    data_HEK293Ts.append(data_HEK293T_filtered)
    data_K562s.append(data_K562_filtered)
    data_II3s.append(data_II3_filtered)
    data_II4s.append(data_II4_filtered)
    data_II5s.append(data_II5_filtered)
    data_II6s.append(data_II6_filtered)

    
    data_K562_combined = pd.concat([data_K562_filtered, data_val], ignore_index=True)
    data_HEK293T_combined = pd.concat([data_HEK293T_filtered, data_val], ignore_index=True)
    data_II3_combined = pd.concat([data_II3_filtered, data_val], ignore_index=True)
    data_II4_combined = pd.concat([data_II4_filtered, data_val], ignore_index=True)
    data_II5_combined = pd.concat([data_II5_filtered, data_val], ignore_index=True)
    data_II6_combined = pd.concat([data_II6_filtered, data_val], ignore_index=True)

    data_HEK293Ts.append(data_HEK293T_combined)
    data_K562s.append(data_K562_combined)
    data_II3s.append(data_II3_combined)
    data_II4s.append(data_II4_combined)
    data_II5s.append(data_II5_combined)
    data_II6s.append(data_II6_combined)

In [None]:
#4 Resampling using SMOTEENN

##4.1 Class and function for DNA sequence conversion 
class SeqTranslate:
    def __init__(self):
        # Define the mapping for encoding and decoding
        self.nucleotide_to_number = {'A': 1, 'C': 2, 'G': 3, 'T': 4}
        self.number_to_nucleotide = {v: k for k, v in self.nucleotide_to_number.items()}

    def encode(self, sequence):
        """
        Encode a nucleotide sequence into numeric representation.

        Args:
            sequence (str): The nucleotide sequence (e.g., "ACGT").

        Returns:
            list: A list of integers representing the sequence.
        """
        try:
            return [self.nucleotide_to_number[nuc] for nuc in sequence]
        except KeyError:
            raise ValueError("Invalid nucleotide found in sequence. Allowed: A, C, G, T")

    def decode(self, numeric_sequence):
        """
        Decode a numeric sequence back into nucleotide representation.

        Args:
            numeric_sequence (list): A list of integers (e.g., [1, 2, 3, 4]).

        Returns:
            str: The decoded nucleotide sequence.
        """
        try:
            return ''.join(self.number_to_nucleotide[num] for num in numeric_sequence)
        except KeyError:
            raise ValueError("Invalid number found in sequence. Allowed: 1, 2, 3, 4")

# Example usage
translator = SeqTranslate()

# Encoding
sequence = "ACGTACG"
encoded = translator.encode(sequence)
print("Encoded sequence:", encoded)

# Decoding
decoded = translator.decode(encoded)
print("Decoded sequence:", decoded)


def SMOTEENN_resampling(df, random_num):
    
    Encoded_seqs = []
    Labels = []

    for index, row in df.iterrows():

        on_target = row['sgRNA']
        off_target = row['DNA']
        label = row['label']

        translator = SeqTranslate()
        encoded_seq = translator.encode(on_target + off_target)

        Encoded_seqs.append(encoded_seq)
        Labels.append(int(label))

    X = pd.DataFrame(Encoded_seqs)
    y = pd.DataFrame(Labels)
    
    print(len(X),len(y))

    ###1.5.2 resampling with SMOTEENN

    smote_enn = SMOTEENN(random_state=random_num)

    # Resample the data
    X_resampled, y_resampled = smote_enn.fit_resample(X, y)

    resampled_ont = []
    resampled_offt = []
    
    print(len(X_resampled),len(y_resampled))

    for index, row in X_resampled.iterrows():

        translator = SeqTranslate()
        decoded_seq = translator.decode(row)

        on_target = decoded_seq[:23]
        off_target = decoded_seq[23:]

        resampled_ont.append(on_target)
        resampled_offt.append(off_target)
    
    return resampled_ont,resampled_offt,y_resampled

In [None]:
##4.2 Dataset resampling using SMOTEENN


training_sets_HEK293T = []
training_sets_K562 = []
training_sets_II3 = []
training_sets_II4 = []
training_sets_II5 = []
training_sets_II6 = []
label_sets_train_HEK293T = []
label_sets_train_K562 = []
label_sets_train_II3 = []
label_sets_train_II4 = []
label_sets_train_II5 = []
label_sets_train_II6 = []

# resampling and save the training datasets
for i in range(10):
    
    data_HEK293T_filtered = data_HEK293Ts[i]
    data_K562_filtered = data_K562s[i]
    data_II3_filtered = data_II3s[i]
    data_II4_filtered = data_II4s[i]
    data_II5_filtered = data_II5s[i]
    data_II6_filtered = data_II6s[i]

    random_num = 42

    resampled_ont_HEK293T, resampled_offt_HEK293T, y_resampled_HEK293T = SMOTEENN_resampling(data_HEK293T_filtered, random_num)
    np.savez('training_sets_HEK293T_' + str(i) + '.npz', 
             ont=pd.DataFrame(resampled_ont_HEK293T)[0].to_numpy(), 
             offt=pd.DataFrame(resampled_offt_HEK293T)[0].to_numpy(),
             label=y_resampled_HEK293T[0].to_numpy())
    resampled_ont_K562, resampled_offt_K562, y_resampled_K562 = SMOTEENN_resampling(data_K562_filtered, random_num)
    np.savez('training_sets_K562_' + str(i) + '.npz', 
             ont=pd.DataFrame(resampled_ont_K562)[0].to_numpy(), 
             offt=pd.DataFrame(resampled_offt_K562)[0].to_numpy(),
            label=y_resampled_K562[0].to_numpy())
    resampled_ont_II3, resampled_offt_II3, y_resampled_II3 = SMOTEENN_resampling(data_II3_filtered, random_num)
    np.savez('training_sets_II3_' + str(i) + '.npz', 
             ont=pd.DataFrame(resampled_ont_II3)[0].to_numpy(), 
             offt=pd.DataFrame(resampled_offt_II3)[0].to_numpy(),
            label=y_resampled_II3[0].to_numpy())
    resampled_ont_II4, resampled_offt_II4, y_resampled_II4 = SMOTEENN_resampling(data_II4_filtered, random_num)
    np.savez('training_sets_II4_' + str(i) + '.npz', 
             ont=pd.DataFrame(resampled_ont_II4)[0].to_numpy(), 
             offt=pd.DataFrame(resampled_offt_II4)[0].to_numpy(),
            label=y_resampled_II4[0].to_numpy())
    resampled_ont_II5, resampled_offt_II5, y_resampled_II5 = SMOTEENN_resampling(data_II5_filtered, random_num)
    np.savez('training_sets_II5_' + str(i) + '.npz', 
             ont=pd.DataFrame(resampled_ont_II5)[0].to_numpy(), 
             offt=pd.DataFrame(resampled_offt_II5)[0].to_numpy(),
            label=y_resampled_II5[0].to_numpy())
    resampled_ont_II6, resampled_offt_II6, y_resampled_II6 = SMOTEENN_resampling(data_II6_filtered, random_num)
    np.savez('training_sets_II6_' + str(i) + '.npz', 
             ont=pd.DataFrame(resampled_ont_II6)[0].to_numpy(), 
             offt=pd.DataFrame(resampled_offt_II6)[0].to_numpy(),
            label=y_resampled_II6[0].to_numpy())

    training_sets_HEK293T.append([pd.DataFrame(resampled_ont_HEK293T)[0].to_numpy(),pd.DataFrame(resampled_offt_HEK293T)[0].to_numpy()])
    training_sets_K562.append([pd.DataFrame(resampled_ont_K562)[0].to_numpy(),pd.DataFrame(resampled_offt_K562)[0].to_numpy()])
    training_sets_II3.append([pd.DataFrame(resampled_ont_II3)[0].to_numpy(),pd.DataFrame(resampled_offt_II3)[0].to_numpy()])
    training_sets_II4.append([pd.DataFrame(resampled_ont_II4)[0].to_numpy(),pd.DataFrame(resampled_offt_II4)[0].to_numpy()])
    training_sets_II5.append([pd.DataFrame(resampled_ont_II5)[0].to_numpy(),pd.DataFrame(resampled_offt_II5)[0].to_numpy()])
    training_sets_II6.append([pd.DataFrame(resampled_ont_II6)[0].to_numpy(),pd.DataFrame(resampled_offt_II6)[0].to_numpy()])


In [7]:
##5 Val only dataset w/real testing 

val_only_ori = pd.read_csv('./Datasets/val_only/validated_onlyMis_df_sgRNA_DNA.csv', sep = ',')


testing_files = ['./Datasets/real_testing/Testing0.csv',
                 './Datasets/real_testing//Testing1.csv',
                 './Datasets/real_testing//Testing2.csv',
                 './Datasets/real_testing//Testing3.csv',
                 './Datasets/real_testing//Testing4.csv'
]


i = 0
for filename in testing_files:
    df = pd.read_csv(filename, sep=',')
    ont = df['sgRNA'].to_numpy()
    offt = df['DNA'].to_numpy()
    label = df['label'].to_numpy()
    testing_set = set(ont)
    training_set = val_only_ori[~val_only_ori['Guide_sequence'].isin(testing_set)]
    
    filename = './Datasets/val_only/val_only_training_' + str(i) + '.csv'
    
    training_set.to_csv(filename, index = False)
    
    i += 1


In [None]:
## BACKUP CODES
## remove duplicates in public_datasets

def read_npz(file_path):
    data  = np.load(file_path, allow_pickle=True)
    ont   = data['ont']
    offt  = data['offt']
    label = data['label']
    return ont, offt, label
filename = './Datasets/training_sets_HEK293T_3.npz'
ont, offt, label = read_npz(filename)
data = {
    'ont': ont,
    'offt': offt,
    'label':label
}

data_HEK293T = pd.DataFrame(data)

len(data_HEK293T)

sgRNAs_val = []
for index, row in validated_df_sgRNA_DNA.iterrows():
    sgRNAs_val.append([row['Guide_sequence'], row['Target_sequence']])
sgRNAs_val = set(zip(
    validated_df_sgRNA_DNA['Guide_sequence'],
    validated_df_sgRNA_DNA['Target_sequence']
))

data_HEK293T_filtered = data_HEK293T[
    ~data_HEK293T.apply(lambda row: (row['ont'], row['offt']) in sgRNAs_val, axis=1)
]

np.savez('./Datasets/training_sets_HEK293T_2.npz', 
             ont=data_HEK293T_filtered['ont'].to_numpy(), 
             offt=data_HEK293T_filtered['offt'].to_numpy(),
            label=data_HEK293T_filtered['label'].to_numpy())