In [2]:
# Install the exact library versions from the GitHub repository
!pip install tensorflow
!pip install keras
!pip install pandas



In [3]:
import pandas as pd

# Load the dataset you just uploaded
df = pd.read_csv('GUIDE-Seq.csv')

# Display the first 5 rows to see the format
print("--- First 5 rows of the dataset ---")
print(df.head())

# Display the column names so we know what to use
print("\n--- Column Names ---")
print(df.columns.tolist())

--- First 5 rows of the dataset ---
                        DNA                     crRNA  label  read  \
0  GCTGCCAGTACAGGCTCCCCCTCG  GCAGCCAGTACA_GCTCACCATGG    0.0   0.0   
1  GCTGCCAGTACAGGCTCCCCCTCG  GCAGCCAGTACAG_CTCACCATGG    0.0   0.0   
2  TACTAGAGTGACAAGTCACACAAT  G_CTAGAGTCACAAGTCCCACAGG    0.0   0.0   
3  -C_TAGAGTGACAAGTCACACAAT  -GCTAGAGTCACAAGTCCCACAGG    0.0   0.0   
4  ACAGCGAGTACAAGCTCATCATGA  GCAGCCAGTAC_AGCTCACCATGG    0.0   0.0   

                                                pair  
0  GCAGCCAGTACA_GCTCACCATGG|GCTGCCAGTACAGGCTCCCCCTCG  
1  GCAGCCAGTACAG_CTCACCATGG|GCTGCCAGTACAGGCTCCCCCTCG  
2  G_CTAGAGTCACAAGTCCCACAGG|TACTAGAGTGACAAGTCACACAAT  
3  -GCTAGAGTCACAAGTCCCACAGG|-C_TAGAGTGACAAGTCACACAAT  
4  GCAGCCAGTAC_AGCTCACCATGG|ACAGCGAGTACAAGCTCATCATGA  

--- Column Names ---
['DNA', 'crRNA', 'label', 'read', 'pair']


In [4]:
import pandas as pd

# --- Column names identified from your output ---
SGRNA_COLUMN = 'crRNA'
OFFTARGET_COLUMN = 'DNA'
LABEL_COLUMN = 'label'

# --- You shouldn't need to change anything below this line ---

# Create a copy to work with
clean_df = df.copy()

# 1. Clean the sequence data by removing any character that is not A, C, G, or T
clean_df[SGRNA_COLUMN] = clean_df[SGRNA_COLUMN].str.replace('[^ACGT]', '', regex=True)
clean_df[OFFTARGET_COLUMN] = clean_df[OFFTARGET_COLUMN].str.replace('[^ACGT]', '', regex=True)

# 2. NEW STEP: Truncate any sequences longer than 23 bp to exactly 23 bp
clean_df[SGRNA_COLUMN] = clean_df[SGRNA_COLUMN].str[:23]
clean_df[OFFTARGET_COLUMN] = clean_df[OFFTARGET_COLUMN].str[:23]

# 3. Now, filter to keep ONLY the rows where both sequences are exactly 23 characters long
initial_rows = len(clean_df)
clean_df = clean_df[
    (clean_df[SGRNA_COLUMN].str.len() == 23) &
    (clean_df[OFFTARGET_COLUMN].str.len() == 23)
]
print(f"Standardized sequence lengths. Kept {len(clean_df)} out of {initial_rows} rows.")

# 4. Select and rename the columns to the standard format
final_df = clean_df[[SGRNA_COLUMN, OFFTARGET_COLUMN, LABEL_COLUMN]].copy()
final_df.rename(columns={
    SGRNA_COLUMN: 'sgRNA',
    OFFTARGET_COLUMN: 'off_target',
    LABEL_COLUMN: 'label'
}, inplace=True)

# 5. Ensure the label is an integer (0 or 1)
final_df['label'] = final_df['label'].astype(int)

print("\n--- First 5 rows of cleaned and formatted data ---")
print(final_df.head())

# Save the final, clean data to a new CSV file
final_df.to_csv('cleaned_benchmark_data.csv', index=False)
print("\n✅ Success! Your benchmark dataset is ready and saved as 'cleaned_benchmark_data.csv'")

Standardized sequence lengths. Kept 149483 out of 213933 rows.

--- First 5 rows of cleaned and formatted data ---
                     sgRNA               off_target  label
0  GCAGCCAGTACAGCTCACCATGG  GCTGCCAGTACAGGCTCCCCCTC      0
1  GCAGCCAGTACAGCTCACCATGG  GCTGCCAGTACAGGCTCCCCCTC      0
2  GCTAGAGTCACAAGTCCCACAGG  TACTAGAGTGACAAGTCACACAA      0
4  GCAGCCAGTACAGCTCACCATGG  ACAGCGAGTACAAGCTCATCATG      0
5  GCAGCCAGTACAGCTCACCATGG  ACAGCGAGTACAAGCTCATCATG      0

✅ Success! Your benchmark dataset is ready and saved as 'cleaned_benchmark_data.csv'


**CnnCRISPR**

In [None]:
!git clone https://github.com/LQYoLH/CnnCrispr.git

Cloning into 'CnnCrispr'...
remote: Enumerating objects: 122, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 122 (delta 10), reused 0 (delta 0), pack-reused 84 (from 1)[K
Receiving objects: 100% (122/122), 24.81 MiB | 19.13 MiB/s, done.
Resolving deltas: 100% (38/38), done.


In [None]:
!ls CnnCrispr/

CnnCrispr_code	 images		 offtarget_data.rar  test1
CnnCrispr_final  offtarget_data  README.md


In [None]:
# Find any file in the repository containing 'GloVe' in its name
!find ./CnnCrispr -name "*GloVe*"

./CnnCrispr/CnnCrispr_final/Encoded_data/Class/keras_GloVeVec_5_100_10000.csv


In [None]:
import pandas as pd
import numpy as np

# --- 1. Load benchmark dataset ---
df = pd.read_csv('cleaned_benchmark_data.csv')
print(f"📄 Loaded {len(df)} samples.")

# --- 2. Load the pre-trained GloVe embeddings ---
glove_path = 'CnnCrispr/CnnCrispr_final/Encoded_data/Class/keras_GloVeVec_5_100_10000.csv'
glove_matrix = np.loadtxt(glove_path, delimiter=',')
glove_dict = {int(row[0]): row[1:] for row in glove_matrix}
print("✅ Loaded GloVe embeddings of shape:", np.array(list(glove_dict.values())).shape)

# --- 3. Define nucleotide-to-index mapping ---
nu_dict = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# --- 4. Encode sgRNA–off-target pairs using GloVe embeddings ---
encoded_sequences = []
labels = []

for _, row in df.iterrows():
    sgrna_seq = row['sgRNA'].strip()
    offtarget_seq = row['off_target'].strip()
    label = int(row['label'])

    seq_embeddings = []
    for i in range(23):
        s = sgrna_seq[i]
        o = offtarget_seq[i]
        pair_index = nu_dict[s] * 4 + nu_dict[o]  # 0–15 mapping
        seq_embeddings.append(glove_dict[pair_index])

    encoded_sequences.append(seq_embeddings)
    labels.append(label)

# Convert to NumPy arrays
X = np.array(encoded_sequences)  # shape: (N, 23, 100)
y = np.array(labels)
print("✅ Final encoded data shape:", X.shape)

# --- 5. Save for prediction ---
np.savez('cnncrispr_benchmark_encoded.npz', X=X, y=y)
print("💾 Saved preprocessed benchmark data for CNNCRISPR.")


📄 Loaded 149483 samples.
✅ Loaded GloVe embeddings of shape: (16, 100)
✅ Final encoded data shape: (149483, 23, 100)
💾 Saved preprocessed benchmark data for CNNCRISPR.


In [None]:
"""
====================================================================
CRITICAL: RESTART YOUR KERNEL BEFORE RUNNING THIS CODE!
====================================================================

In Google Colab: Runtime -> Restart runtime (then run all cells)
In Jupyter: Kernel -> Restart (then run all cells)

This ensures TensorFlow starts in eager execution mode from the beginning.
====================================================================
"""

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score
import tensorflow as tf

# Verify eager execution
print(f"TensorFlow version: {tf.__version__}")
print(f"Eager execution enabled: {tf.executing_eagerly()}")

if not tf.executing_eagerly():
    print("\n" + "="*70)
    print("❌ ERROR: Eager execution is NOT enabled!")
    print("="*70)
    print("\nYou MUST restart your kernel/runtime before running this code.")
    print("\nIn Google Colab: Runtime -> Restart runtime")
    print("In Jupyter: Kernel -> Restart")
    print("\nThen run this cell again.")
    print("="*70)
    raise RuntimeError("Please restart kernel and try again")

print("✅ Eager execution is enabled. Proceeding...\n")

# --- 1. Perform simple integer encoding ---
print("Step 1: Loading and encoding data...")
df = pd.read_csv('cleaned_benchmark_data.csv')
nu_dict = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
encoded_vectors, labels = [], []

for _, row in df.iterrows():
    sgrna_seq, offtarget_seq = row['sgRNA'], row['off_target']
    label, vector = row['label'], []

    for i in range(23):
        try:
            s_nuc, o_nuc = sgrna_seq[i], offtarget_seq[i]
            vector.append(nu_dict[s_nuc] * 4 + nu_dict[o_nuc])
        except KeyError:
            vector = []
            break

    if len(vector) == 23:
        encoded_vectors.append(vector)
        labels.append(label)

X_test = np.array(encoded_vectors, dtype=np.int32)
y_test = np.array(labels)

print(f"✅ Data loaded: {X_test.shape}\n")

# --- 2. Build and load model ---
print("Step 2: Building model architecture...")

from keras.models import Model
from keras.layers import Embedding, Bidirectional, LSTM, Conv1D, BatchNormalization, Flatten, Dense, Dropout, Input

inputs = Input(shape=(23,), dtype='int32')
x = Embedding(input_dim=16, output_dim=100)(inputs)
x = Bidirectional(LSTM(40, return_sequences=True))(x)
x = Conv1D(10, 5, activation='relu')(x)
x = BatchNormalization()(x)
x = Conv1D(20, 5, activation='relu')(x)
x = BatchNormalization()(x)
x = Conv1D(40, 5, activation='relu')(x)
x = BatchNormalization()(x)
x = Conv1D(80, 5, activation='relu')(x)
x = BatchNormalization()(x)
x = Conv1D(100, 5, activation='relu')(x)
x = BatchNormalization()(x)
x = Flatten()(x)
x = Dropout(0.4)(x)
x = Dense(20, activation='relu')(x)
outputs = Dense(2, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

print("Step 3: Loading model weights...")
weights_path = 'CnnCrispr/CnnCrispr_final/Model_save/CnnCrispr_weights.h5'
model.load_weights(weights_path)
print("✅ Model loaded successfully\n")

# --- 3. Make predictions in eager mode ---
print("Step 4: Making predictions...")
print(f"Processing {len(X_test)} samples in batches...\n")

batch_size = 256
all_predictions = []

for i in range(0, len(X_test), batch_size):
    batch = X_test[i:i+batch_size]
    batch_pred = model(batch, training=False).numpy()
    all_predictions.append(batch_pred)

    if (i // batch_size + 1) % 50 == 0:
        print(f"Processed {min(i+batch_size, len(X_test))}/{len(X_test)} samples")

y_pred_probs = np.vstack(all_predictions)
y_scores = y_pred_probs[:, 1]

print(f"\n✅ Predictions completed! Shape: {y_pred_probs.shape}\n")

# --- 4. Calculate and display performance ---
auroc = roc_auc_score(y_test, y_scores)
auprc = average_precision_score(y_test, y_scores)

print("="*70)
print("🎯 CnnCRISPR Performance Results")
print("="*70)
print(f"Area Under ROC Curve (AUROC): {auroc:.4f}")
print(f"Area Under PR Curve (AUPRC):  {auprc:.4f}")
print("="*70)

TensorFlow version: 2.19.0
Eager execution enabled: True
✅ Eager execution is enabled. Proceeding...

Step 1: Loading and encoding data...
✅ Data loaded: (149483, 23)

Step 2: Building model architecture...
Step 3: Loading model weights...
✅ Model loaded successfully

Step 4: Making predictions...
Processing 149483 samples in batches...

Processed 12800/149483 samples
Processed 25600/149483 samples
Processed 38400/149483 samples
Processed 51200/149483 samples
Processed 64000/149483 samples
Processed 76800/149483 samples
Processed 89600/149483 samples
Processed 102400/149483 samples
Processed 115200/149483 samples
Processed 128000/149483 samples
Processed 140800/149483 samples

✅ Predictions completed! Shape: (149483, 2)

🎯 CnnCRISPR Performance Results
Area Under ROC Curve (AUROC): 0.8871
Area Under PR Curve (AUPRC):  0.0050


**PI-CRISPR**

In [None]:
!git clone https://github.com/florianst/picrispr.git

Cloning into 'picrispr'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 61 (delta 5), reused 10 (delta 3), pack-reused 46 (from 1)[K
Receiving objects: 100% (61/61), 58.65 MiB | 26.38 MiB/s, done.
Resolving deltas: 100% (13/13), done.
Updating files: 100% (42/42), done.
Filtering content: 100% (2/2), 1000.32 MiB | 51.15 MiB/s, done.


In [None]:

!pip install tensorflow
!pip install torch
!pip install xgboost
!pip install scikit-learn pandas numpy matplotlib scipy



In [None]:
!ls -R picrispr/

picrispr/:
default_vals  models			    picrispr.py       test_input.csv
encoding.py   models.py			    README.md
load_data.py  offtarget_260520_nuc.csv.zip  requirements.txt

picrispr/default_vals:
defaultvals_tf_interface_type_s2_class.pickle
defaultvals_tf_interface_type_s2.pickle
defaultvals_tf_interface_type_s4_class.pickle
defaultvals_tf_interface_type_s4.pickle
defaultvals_tf_s2_class.pickle
defaultvals_tf_s2.pickle
defaultvals_tf_s4_class.pickle
defaultvals_tf_s4.pickle
defaultvals_torch_interface_type_s2_class.pickle
defaultvals_torch_interface_type_s2.pickle
defaultvals_torch_interface_type_s4_class.pickle
defaultvals_torch_interface_type_s4.pickle
defaultvals_torch_s2_class.pickle
defaultvals_torch_s2.pickle
defaultvals_torch_s4_class.pickle
defaultvals_torch_s4.pickle

picrispr/models:
models_torch.zip
trainresult_tf_interface_type_s2_class.pickle
trainresult_tf_interface_type_s2_class_weights.pickle
trainresult_tf_interface_type_s2.pickle
trainresult_tf_interface_type_s2_weig

In [None]:
# Unzip the pre-trained models into the 'models' directory
!unzip -q picrispr/models/models_torch.zip -d picrispr/models/
print("✅ Pre-trained models unzipped successfully.")

✅ Pre-trained models unzipped successfully.


In [None]:
import pandas as pd
df = pd.read_csv('cleaned_benchmark_data.csv')
predict_only_df = df.rename(columns={
    'sgRNA': 'grna_target_sequence',
    'off_target': 'target_sequence'
}).drop(columns=['label'])
predict_only_df.to_csv('picrispr_predict_only.csv', index=False)
print("✅ Created 'picrispr_predict_only.csv'")

# === STEP 5: RUN PREDICTION ===
print("\n--- Running piCRISPR prediction ---")
# Using model 2 (RNN 6x23 nuc), the best performer from the paper
!python picrispr/picrispr.py picrispr_predict_only.csv 2 picrispr/models False

# === STEP 6: CALCULATE AND DISPLAY RESULTS ===
print("\n--- Calculating final performance metrics ---")
from sklearn.metrics import roc_auc_score, average_precision_score
try:
    results_df = pd.read_csv('output.csv')
    ground_truth_df = pd.read_csv('cleaned_benchmark_data.csv')

    y_test = ground_truth_df['label']
    y_scores = results_df['piCRISPR prediction']

    auroc = roc_auc_score(y_test, y_scores)
    auprc = average_precision_score(y_test, y_scores)

    print("\n" + "="*70)
    print("🎯 piCRISPR Performance Results on Your Benchmark Dataset")
    print("="*70)
    print(f"Area Under ROC Curve (AUROC): {auroc:.4f}")
    print(f"Area Under PR Curve (AUPRC):  {auprc:.4f}")
    print("="*70)
except FileNotFoundError:
    print("❌ Error: 'output.csv' was not created. Please review the prediction logs above for errors.")
except Exception as e:
    print(f"An error occurred during evaluation: {e}")


✅ Created 'picrispr_predict_only.csv'

--- Running piCRISPR prediction ---
E0000 00:00:1760360243.149045    1707 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760360243.159426    1707 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1760360243.194651    1707 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760360243.194692    1707 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760360243.194701    1707 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760360243.194708    1707 computati

 **deep crispr**

In [None]:
# === STEP 1: Set Up the DeepCRISPR Environment (TF2 Compatibility) ===

# Install modern, compatible libraries
!pip install tensorflow
!pip install dm-sonnet
!pip install pandas scikit-learn

print("\n✅ Modern environment for DeepCRISPR is ready.")

Collecting dm-sonnet
  Downloading dm_sonnet-2.0.2-py3-none-any.whl.metadata (12 kB)
Downloading dm_sonnet-2.0.2-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.4/268.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dm-sonnet
Successfully installed dm-sonnet-2.0.2

✅ Modern environment for DeepCRISPR is ready.


In [None]:
import pandas as pd
import numpy as np

print("Step 1: Loading your benchmark dataset...")
df = pd.read_csv('cleaned_benchmark_data.csv')

# --- Define the encoding logic for DeepCRISPR ---
nuc_map = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
# The model expects input shape: [channels, 1, sequence_length]
# For sequence-only, channels = 4 and sequence_length = 23
channels = 4
seq_len = 23

def one_hot_encode(sequence):
    """Encodes a 23-bp DNA sequence into the 4x1x23 format for DeepCRISPR."""
    # Initialize a matrix of zeros with the required shape
    encoded_matrix = np.zeros((channels, 1, seq_len), dtype=np.uint8)

    for i, nucleotide in enumerate(sequence):
        if nucleotide in nuc_map:
            channel_index = nuc_map[nucleotide]
            encoded_matrix[channel_index, 0, i] = 1
    return encoded_matrix

# Lists to store the processed data
sg_features = []
ot_features = []
labels = []

print("Step 2: Encoding data into the two-part, 4-channel format...")

for _, row in df.iterrows():
    sgRNA_seq = row['sgRNA']
    off_target_seq = row['off_target']
    label = row['label']

    # Encode both sequences
    sg_encoded = one_hot_encode(sgRNA_seq)
    ot_encoded = one_hot_encode(off_target_seq)

    # Add the encoded matrices and the label to our lists
    sg_features.append(sg_encoded)
    ot_features.append(ot_encoded)
    labels.append(label)

# Convert lists to final NumPy arrays
X_sg = np.array(sg_features)
X_ot = np.array(ot_features)
y = np.array(labels)

print("\n✅ Preprocessing complete!")
print(f"Shape of sgRNA features: {X_sg.shape}")
print(f"Shape of Off-target features: {X_ot.shape}")
print(f"Shape of Labels: {y.shape}")

# Save the preprocessed data into a single, organized file
np.savez(
    'deepcrispr_benchmark_encoded.npz',
    X_sg=X_sg,
    X_ot=X_ot,
    y=y
)
print("\n💾 Saved preprocessed data to 'deepcrispr_benchmark_encoded.npz'")

Step 1: Loading your benchmark dataset...
Step 2: Encoding data into the two-part, 4-channel format...

✅ Preprocessing complete!
Shape of sgRNA features: (149483, 4, 1, 23)
Shape of Off-target features: (149483, 4, 1, 23)
Shape of Labels: (149483,)

💾 Saved preprocessed data to 'deepcrispr_benchmark_encoded.npz'


In [None]:
# Remove the old directory to ensure a clean start
!rm -rf DeepCRISPR/trained_models/offtar_pt_cnn

# Re-create the directory
!mkdir -p DeepCRISPR/trained_models/offtar_pt_cnn

# Unpack the model files into the new directory
!tar -xzf DeepCRISPR/trained_models/offtar_pt_cnn.tar.gz -C DeepCRISPR/trained_models/offtar_pt_cnn/

print("✅ Pre-trained off-target model unpacked successfully.")

tar (child): DeepCRISPR/trained_models/offtar_pt_cnn.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now
✅ Pre-trained off-target model unpacked successfully.


In [13]:
import os
import tarfile
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, average_precision_score

# === STEP 1: INSTALL LIBRARIES ===
print("--- Installing libraries ---")
# Use a compatible TensorFlow version
!pip install tensorflow scikit-learn pandas -q
print("✅ Libraries installed.\n")

# === STEP 2: CLONE REPOSITORY ===
print("--- Cloning DeepCRISPR repository ---")
if not os.path.exists('DeepCRISPR'):
    !git clone https://github.com/bm2-lab/DeepCRISPR.git
    print("✅ Repository cloned.")
else:
    print("✅ Repository already exists.")
print()

# === STEP 3: CHECK AND LIST AVAILABLE MODEL FILES ===
print("--- Checking available model files ---")
trained_models_dir = 'DeepCRISPR/trained_models/'

if not os.path.exists(trained_models_dir):
    print(f"❌ Error: Directory '{trained_models_dir}' not found!")
    print("Please check if the repository was cloned correctly.")
else:
    print(f"Contents of {trained_models_dir}:")
    for item in os.listdir(trained_models_dir):
        full_path = os.path.join(trained_models_dir, item)
        if os.path.isfile(full_path):
            size = os.path.getsize(full_path)
            print(f"  - {item} ({size/1024:.1f} KB)")
        else:
            print(f"  - {item}/ (directory)")
    print()

# === STEP 4: FIND AND UNPACK THE MODEL ===
print("--- Finding and unpacking the pre-trained model ---")

# Try to find the correct archive file
possible_archives = [
    'DeepCRISPR/trained_models/offtar_pt_cnn.tar.gz',
    'DeepCRISPR/trained_models/off_target_pt_cnn.tar.gz',
    'DeepCRISPR/trained_models/offtarget_pt_cnn.tar.gz'
]

archive_path = None
for path in possible_archives:
    if os.path.exists(path):
        archive_path = path
        print(f"✅ Found model archive: {path}")
        break

if archive_path is None:
    # List all .tar.gz files
    print("\n⚠️ Standard model file not found. Searching for any .tar.gz files...")
    tar_gz_files = []
    for root, dirs, files in os.walk(trained_models_dir):
        for file in files:
            if file.endswith('.tar.gz'):
                tar_gz_files.append(os.path.join(root, file))

    if tar_gz_files:
        print("\nFound the following .tar.gz files:")
        for i, f in enumerate(tar_gz_files):
            print(f"  {i+1}. {f}")
        archive_path = tar_gz_files[0]
        print(f"\n✅ Using: {archive_path}")
    else:
        print("\n❌ No .tar.gz model files found!")
        print("Please download the pre-trained model manually.")
        print("Check: https://github.com/bm2-lab/DeepCRISPR/tree/master/trained_models")
        exit()

# Unpack the model
model_dir = 'DeepCRISPR/trained_models/model_unpacked/'
print(f"\nUnpacking model to: {model_dir}")
!rm -rf {model_dir}
!mkdir -p {model_dir}

try:
    with tarfile.open(archive_path, 'r:gz') as tar:
        tar.extractall(path=model_dir)
    print("✅ Model unpacked successfully.")
except Exception as e:
    print(f"❌ Error unpacking model: {e}")
    exit()

# List contents of unpacked directory
print(f"\nContents of {model_dir}:")
for item in os.listdir(model_dir):
    print(f"  - {item}")
print()

# === STEP 5: FIND THE .META FILE ===
print("--- Finding the model's checkpoint files ---")
meta_files = []
for root, dirs, files in os.walk(model_dir):
    for file in files:
        if file.endswith('.meta'):
            meta_files.append(os.path.join(root, file))

if not meta_files:
    print(f"❌ No .meta file found in {model_dir}")
    print("\nAll files in the directory:")
    for root, dirs, files in os.walk(model_dir):
        for file in files:
            print(f"  - {os.path.join(root, file)}")
    exit()

model_meta_path = meta_files[0]
model_checkpoint_dir = os.path.dirname(model_meta_path)

print(f"✅ Found meta file: {model_meta_path}")
print(f"✅ Checkpoint directory: {model_checkpoint_dir}\n")

# === STEP 6: PREPARE THE BENCHMARK DATA ===
print("--- Preparing benchmark data ---")

if not os.path.exists('cleaned_benchmark_data.csv'):
    print("❌ Error: 'cleaned_benchmark_data.csv' not found!")
    print("Please upload your benchmark dataset.")
    exit()

df = pd.read_csv('cleaned_benchmark_data.csv')
print(f"Loaded {len(df)} samples from benchmark data")

nuc_map = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# Model expects shape: (batch, 1, 23, 8)
# The 8 likely represents [sgRNA_base * 4 + offtarget_base]
def encode_pair(sg_seq, ot_seq):
    """
    Encode sgRNA-offTarget pair in the format expected by DeepCRISPR.
    Shape: (1, 23, 8) where 8 = 4 one-hot channels for each of sg and ot
    """
    encoded = np.zeros((1, 23, 8), dtype=np.float32)

    for i in range(min(23, len(sg_seq), len(ot_seq))):
        sg_nuc = sg_seq[i].upper()
        ot_nuc = ot_seq[i].upper()

        # First 4 channels: sgRNA one-hot encoding
        if sg_nuc in nuc_map:
            encoded[0, i, nuc_map[sg_nuc]] = 1

        # Last 4 channels: off-target one-hot encoding
        if ot_nuc in nuc_map:
            encoded[0, i, 4 + nuc_map[ot_nuc]] = 1

    return encoded

# Prepare features
sg_features, ot_features, labels = [], [], []

print("Encoding sequences...")
for idx, row in df.iterrows():
    encoded = encode_pair(row['sgRNA'], row['off_target'])
    # DeepCRISPR likely uses the same input for both placeholders
    sg_features.append(encoded)
    ot_features.append(encoded)
    labels.append(row['label'])

    if (idx + 1) % 10000 == 0:
        print(f"  Encoded {idx + 1}/{len(df)} samples")

X_sg = np.array(sg_features, dtype=np.float32)
X_ot = np.array(ot_features, dtype=np.float32)
y_test = np.array(labels)

print(f"\n✅ Data prepared successfully")
print(f"   Input shape: {X_sg.shape}")
print(f"   Labels shape: {y_test.shape}")
print(f"   Expected model input shape: (batch, 1, 23, 8)")
print(f"   Our shape matches: {X_sg.shape[1:] == (1, 23, 8)}\n")

# === STEP 7: LOAD MODEL AND PREDICT ===
print("--- Loading model and running prediction ---")

# Disable eager execution for TF1 compatibility
tf.compat.v1.disable_eager_execution()

graph = tf.Graph()
with graph.as_default():
    sess = tf.compat.v1.Session()

    # Load the model
    print("Loading model checkpoint...")
    saver = tf.compat.v1.train.import_meta_graph(model_meta_path)
    saver.restore(sess, tf.train.latest_checkpoint(model_checkpoint_dir))
    print("✅ Model loaded successfully")

    # Get input and output tensors
    print("\nSearching for input/output tensors...")

    # Try common tensor names
    possible_sg_names = ['sg_input:0', 'sgRNA_input:0', 'input_1:0', 'Placeholder:0']
    possible_ot_names = ['ot_input:0', 'offtarget_input:0', 'input_2:0', 'Placeholder_1:0']
    possible_pred_names = ['prediction:0', 'output:0', 'Softmax:0', 'predictions:0']
    possible_training_names = ['is_training:0', 'training:0', 'Placeholder_2:0', 'Placeholder_3:0', 'dropout:0']

    x_sg_tensor = None
    x_ot_tensor = None
    prediction_tensor = None
    training_tensor = None

    # Try to find tensors
    for name in possible_sg_names:
        try:
            x_sg_tensor = graph.get_tensor_by_name(name)
            print(f"✅ Found sgRNA input: {name}")
            break
        except:
            pass

    for name in possible_ot_names:
        try:
            x_ot_tensor = graph.get_tensor_by_name(name)
            print(f"✅ Found off-target input: {name}")
            break
        except:
            pass

    for name in possible_pred_names:
        try:
            prediction_tensor = graph.get_tensor_by_name(name)
            print(f"✅ Found prediction output: {name}")
            break
        except:
            pass

    # Try to find training/dropout placeholder
    training_placeholders = []

    # Check all numbered placeholders
    for i in range(2, 20):  # Check Placeholder_2 through Placeholder_19
        name = f'Placeholder_{i}:0'
        try:
            tensor = graph.get_tensor_by_name(name)
            training_placeholders.append((name, tensor))
            print(f"✅ Found placeholder: {name} (dtype: {tensor.dtype})")
        except KeyError:
            # This placeholder doesn't exist, stop searching
            if i > 5:  # Only break after checking at least a few
                break
        except Exception as e:
            print(f"  Error checking {name}: {e}")

    # Also try common names
    for name in ['is_training:0', 'training:0', 'dropout:0', 'keep_prob:0']:
        try:
            tensor = graph.get_tensor_by_name(name)
            if (name, tensor) not in training_placeholders:
                training_placeholders.append((name, tensor))
                print(f"✅ Found training mode placeholder: {name}")
        except:
            pass

    if not training_placeholders:
        print("⚠️ No training mode placeholders found")

    # If not found, list all available tensors
    if x_sg_tensor is None or x_ot_tensor is None or prediction_tensor is None:
        print("\n⚠️ Could not find expected tensors. Listing all tensors in the graph:")
        all_tensors = [n.name for n in graph.as_graph_def().node]
        for tensor_name in sorted(all_tensors)[:50]:  # Show first 50
            print(f"  - {tensor_name}")
        print(f"\n(Showing first 50 of {len(all_tensors)} total tensors)")

        print("\n❌ Could not automatically identify the correct tensor names.")
        print("Please check the model architecture and update the tensor names manually.")
        sess.close()
        exit()

    # Run prediction
    print("\nRunning predictions...")

    # Build feed dict
    feed_dict = {
        x_sg_tensor: X_sg,
        x_ot_tensor: X_ot
    }

    # Add all training placeholders
    if training_placeholders:
        print(f"  Setting {len(training_placeholders)} placeholder(s) for inference mode")
        for name, tensor in training_placeholders:
            # Check the shape and dtype to provide the right value
            try:
                shape = tensor.get_shape().as_list()
            except ValueError:
                # Unknown shape - assume scalar
                shape = []

            dtype = tensor.dtype

            value_set = False

            # Determine if this is a scalar or array placeholder
            is_scalar = (shape == [] or len(shape) == 0)
            needs_batch = (len(shape) > 0 and shape[0] is None) or (shape == [None])

            if dtype == tf.bool:
                # Boolean placeholder
                if is_scalar:
                    feed_dict[tensor] = False
                elif needs_batch:
                    feed_dict[tensor] = np.array([False] * X_sg.shape[0], dtype=bool)
                else:
                    feed_dict[tensor] = np.array([False] * X_sg.shape[0], dtype=bool)
                value_set = True
            elif dtype == tf.float32 or dtype == tf.float64:
                # Float placeholder (like keep_prob)
                if is_scalar:
                    feed_dict[tensor] = 1.0  # No dropout during inference
                elif needs_batch:
                    feed_dict[tensor] = np.ones(X_sg.shape[0], dtype=np.float32)
                else:
                    feed_dict[tensor] = np.ones(X_sg.shape[0], dtype=np.float32)
                value_set = True
            elif dtype == tf.uint8 or dtype == tf.int32 or dtype == tf.int64:
                # Integer placeholder - likely a flag or index
                if is_scalar:
                    feed_dict[tensor] = 0
                elif needs_batch:
                    feed_dict[tensor] = np.zeros(X_sg.shape[0], dtype=np.uint8 if dtype == tf.uint8 else np.int32)
                else:
                    feed_dict[tensor] = np.zeros(X_sg.shape[0], dtype=np.uint8 if dtype == tf.uint8 else np.int32)
                value_set = True

            if value_set:
                print(f"    {name}: shape={shape}, dtype={dtype}, value_shape={np.array(feed_dict[tensor]).shape}")
            else:
                print(f"    {name}: shape={shape}, dtype={dtype} - SKIPPED (unhandled type)")

    y_pred_probs = sess.run(prediction_tensor, feed_dict=feed_dict)

    # Handle different output formats
    if y_pred_probs.shape[-1] == 2:
        y_scores = y_pred_probs[:, 1]
    elif y_pred_probs.shape[-1] == 1:
        y_scores = y_pred_probs[:, 0]
    else:
        y_scores = y_pred_probs.flatten()

    print(f"✅ Predictions completed. Shape: {y_pred_probs.shape}")

    # Calculate metrics
    auroc = roc_auc_score(y_test, y_scores)
    auprc = average_precision_score(y_test, y_scores)

    print("\n" + "="*70)
    print("🎯 DeepCRISPR Performance Results on Your Benchmark Dataset")
    print("="*70)
    print(f"Area Under ROC Curve (AUROC): {auroc:.4f}")
    print(f"Area Under PR Curve (AUPRC):  {auprc:.4f}")
    print("="*70)

    sess.close()

--- Installing libraries ---
✅ Libraries installed.

--- Cloning DeepCRISPR repository ---
✅ Repository already exists.

--- Checking available model files ---
Contents of DeepCRISPR/trained_models/:
  - offtar_pt_cnn.tar.gz (25819.8 KB)
  - model_unpacked/ (directory)
  - offtar_pt_cnn_reg.tar.gz (28476.8 KB)
  - ontar_ptaug_cnn.tar.gz (19929.8 KB)
  - ontar_pt_cnn_reg.tar.gz (27686.4 KB)
  - ontar_cnn_reg_seq.tar.gz (25832.7 KB)

--- Finding and unpacking the pre-trained model ---
✅ Found model archive: DeepCRISPR/trained_models/offtar_pt_cnn.tar.gz

Unpacking model to: DeepCRISPR/trained_models/model_unpacked/


  tar.extractall(path=model_dir)


✅ Model unpacked successfully.

Contents of DeepCRISPR/trained_models/model_unpacked/:
  - offtar_pt_cnn

--- Finding the model's checkpoint files ---
✅ Found meta file: DeepCRISPR/trained_models/model_unpacked/offtar_pt_cnn/model.ckpt-off.meta
✅ Checkpoint directory: DeepCRISPR/trained_models/model_unpacked/offtar_pt_cnn

--- Preparing benchmark data ---
Loaded 149483 samples from benchmark data
Encoding sequences...
  Encoded 10000/149483 samples
  Encoded 20000/149483 samples
  Encoded 30000/149483 samples
  Encoded 40000/149483 samples
  Encoded 50000/149483 samples
  Encoded 60000/149483 samples
  Encoded 70000/149483 samples
  Encoded 80000/149483 samples
  Encoded 90000/149483 samples
  Encoded 100000/149483 samples
  Encoded 110000/149483 samples
  Encoded 120000/149483 samples
  Encoded 130000/149483 samples
  Encoded 140000/149483 samples

✅ Data prepared successfully
   Input shape: (149483, 1, 23, 8)
   Labels shape: (149483,)
   Expected model input shape: (batch, 1, 23, 8

**CRISOT**

In [14]:
!git clone https://github.com/bm2-lab/CRISOT.git

Cloning into 'CRISOT'...
remote: Enumerating objects: 83, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 83 (delta 15), reused 19 (delta 6), pack-reused 53 (from 1)[K
Receiving objects: 100% (83/83), 157.68 MiB | 22.88 MiB/s, done.
Resolving deltas: 100% (27/27), done.


In [15]:
!ls CRISOT/

crisot_framework.png  CRISOT.py  example  models     script
crisot_modules.py     data	 LICENSE  README.md  utils.py


In [16]:
# Install the specific libraries required by CRISOT
!pip install xgboost==1.7.3 pandas numpy

print("✅ Environment for CRISOT is ready.")

Collecting xgboost==1.7.3
  Downloading xgboost-1.7.3-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.3-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 3.0.5
    Uninstalling xgboost-3.0.5:
      Successfully uninstalled xgboost-3.0.5
Successfully installed xgboost-1.7.3
✅ Environment for CRISOT is ready.


In [2]:
# Find and unzip the model files
!unzip -q -o CRISOT/models/crisot-fp_xgb_models.zip -d CRISOT/models/

print("✅ Pre-trained models for CRISOT unzipped successfully.")

unzip:  cannot find or open CRISOT/models/crisot-fp_xgb_models.zip, CRISOT/models/crisot-fp_xgb_models.zip.zip or CRISOT/models/crisot-fp_xgb_models.zip.ZIP.
✅ Pre-trained models for CRISOT unzipped successfully.


In [1]:
import pandas as pd

print("Loading our benchmark dataset...")
df = pd.read_csv('cleaned_benchmark_data.csv')

# Rename the columns to match the required format for CRISOT
crisot_input_df = df.rename(columns={
    'sgRNA': 'On',
    'off_target': 'Off'
})

# Save the formatted data to a new CSV file
crisot_input_df.to_csv('crisot_benchmark_input.csv', index=False)

print("✅ Benchmark data formatted for CRISOT and saved to 'crisot_benchmark_input.csv'")

Loading our benchmark dataset...
✅ Benchmark data formatted for CRISOT and saved to 'crisot_benchmark_input.csv'


In [5]:
# Display the first 5 lines of the output file, including the header
!head -n 5 crisot_output.csv

On,Off,label,CRISOT_Score
GCAGCCAGTACAGCTCACCATGG,GCTGCCAGTACAGGCTCCCCCTC,0,0.19869676422952098
GCAGCCAGTACAGCTCACCATGG,GCTGCCAGTACAGGCTCCCCCTC,0,0.19869676422952098
GCTAGAGTCACAAGTCCCACAGG,TACTAGAGTGACAAGTCACACAA,0,0.0
GCAGCCAGTACAGCTCACCATGG,ACAGCGAGTACAAGCTCATCATG,0,0.0


In [6]:
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score

print("--- Calculating final performance metrics ---")
try:
    # Load the prediction results generated by the script
    results_df = pd.read_csv('crisot_output.csv')

    # Load your original benchmark data to get the ground truth labels
    ground_truth_df = pd.read_csv('cleaned_benchmark_data.csv')

    # Extract the true labels and the predicted scores using the CORRECT column name
    y_test = ground_truth_df['label']
    y_scores = results_df['CRISOT_Score'] # Corrected from 'CRISOT-Score'

    # Calculate the performance metrics
    auroc = roc_auc_score(y_test, y_scores)
    auprc = average_precision_score(y_test, y_scores)

    print("\n" + "="*70)
    print("🎯 CRISOT Performance Results on Your Benchmark Dataset")
    print("="*70)
    print(f"Area Under ROC Curve (AUROC): {auroc:.4f}")
    print(f"Area Under PR Curve (AUPRC):  {auprc:.4f}")
    print("="*70)

except FileNotFoundError:
    print("❌ Error: 'crisot_output.csv' was not created. Please review the prediction logs above for errors.")
except KeyError:
    print("❌ Error: Could not find the 'CRISOT_Score' column. Please check the output of '!head crisot_output.csv' again.")
except Exception as e:
    print(f"An error occurred during evaluation: {e}")

--- Calculating final performance metrics ---

🎯 CRISOT Performance Results on Your Benchmark Dataset
Area Under ROC Curve (AUROC): 0.9888
Area Under PR Curve (AUPRC):  0.0723


 **CRISPR IP** *italicized text*

In [7]:
!git clone https://github.com/BioinfoVirgo/CRISPR-IP.git

Cloning into 'CRISPR-IP'...
remote: Enumerating objects: 51, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 51 (delta 16), reused 45 (delta 10), pack-reused 0 (from 0)[K
Receiving objects: 100% (51/51), 19.55 MiB | 4.88 MiB/s, done.
Resolving deltas: 100% (16/16), done.
Updating files: 100% (20/20), done.


In [1]:
# Install modern, compatible versions of the required libraries
!pip install tensorflow pandas scikit-learn

print("✅ Environment for CRISPR-IP is ready.")

✅ Environment for CRISPR-IP is ready.


In [2]:
import pandas as pd
import numpy as np

print("--- Preparing benchmark data for CRISPR-IP ---")

# --- Encoding logic from the encoding.py file ---
encoded_dict = {'A': [1, 0, 0, 0], 'T': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'C': [0, 0, 0, 1], '_': [0, 0, 0, 0], '-': [0, 0, 0, 0]}
pos_dict = {'A':1, 'T':2, 'G':3, 'C':4, '_':5, '-':5}

def encode_sequence_pair(sgRNA_seq, off_target_seq):
    tlen = 24
    # Pad sequences to 24 characters
    target_seq = "_"*(tlen-len(sgRNA_seq)) + sgRNA_seq
    off_target_seq = "_"*(tlen-len(off_target_seq)) + off_target_seq

    target_seq_code = np.array([encoded_dict[base] for base in list(target_seq)])
    off_target_seq_code = np.array([encoded_dict[base] for base in list(off_target_seq)])

    on_off_dim6_codes = []
    for i in range(tlen):
        diff_code = np.bitwise_or(target_seq_code[i], off_target_seq_code[i])
        dir_code = np.zeros(2)
        if pos_dict[target_seq[i]] == pos_dict[off_target_seq[i]]:
            diff_code = diff_code * -1
            dir_code[0] = 1
            dir_code[1] = 1
        elif pos_dict[target_seq[i]] < pos_dict[off_target_seq[i]]:
            dir_code[0] = 1
        elif pos_dict[target_seq[i]] > pos_dict[off_target_seq[i]]:
            dir_code[1] = 1

        on_off_dim6_codes.append(np.concatenate((diff_code, dir_code)))

    on_off_dim6_codes = np.array(on_off_dim6_codes)
    isPAM = np.zeros((24, 1))
    isPAM[-3:, :] = 1
    on_off_code = np.concatenate((on_off_dim6_codes, isPAM), axis=1)
    return on_off_code
# --- End of encoding logic ---

# Load your benchmark data
df = pd.read_csv('cleaned_benchmark_data.csv')

encoded_features = []
labels = []

for _, row in df.iterrows():
    sgRNA_seq = row['sgRNA']
    off_target_seq = row['off_target']

    # Encode the pair and add it to our list
    encoded_pair = encode_sequence_pair(sgRNA_seq, off_target_seq)
    encoded_features.append(encoded_pair)
    labels.append(row['label'])

# Convert to final NumPy arrays
X = np.array(encoded_features)
y = np.array(labels)

print("\n✅ Preprocessing complete!")
print(f"Shape of encoded features: {X.shape}")
print(f"Shape of Labels: {y.shape}")

# Save the processed data
np.savez('crispr_ip_benchmark_encoded.npz', X=X, y=y)
print("\n💾 Saved preprocessed data to 'crispr_ip_benchmark_encoded.npz'")

--- Preparing benchmark data for CRISPR-IP ---

✅ Preprocessing complete!
Shape of encoded features: (149483, 24, 7)
Shape of Labels: (149483,)

💾 Saved preprocessed data to 'crispr_ip_benchmark_encoded.npz'


In [3]:
!ls CRISPR-IP/example_saved/

example+crispr_ip.h5	    example-test-data.csv
example-predict-result.csv  example-train-data.csv


In [5]:
!ls CRISPR-IP/codes/

CRISPR_IP.py  encoding.py  __init__.py	__pycache__


In [6]:
!cat CRISPR-IP/codes/CRISPR_IP.py

import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import model_from_json, load_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Attention, Dense, Conv2D, Bidirectional, LSTM, Flatten, Input, Activation, Reshape, Dropout, Concatenate, AveragePooling1D, MaxPool1D, BatchNormalization, Attention, GlobalAveragePooling1D, GlobalMaxPool1D, GRU, AdditiveAttention, AlphaDropout, LeakyReLU
from tensorflow.keras.initializers import VarianceScaling
from tensorflow.keras.utils import to_categorical

def transformIO(xtrain, xtest, ytrain, ytest, seq_len , coding_dim, num_classes):
    xtrain = xtrain.reshape(xtrain.shape[0], 1, seq_len, coding_dim)
    xtest = xtest.reshape(xtest.shape[0], 1, seq_len, coding_dim)
    input_shape = (1, seq_len, coding_dim)
    xtrain = xtrain.astype('float32')
    xtest = xtest.astype('float32')
    print('xtrain shape:', xtrain.shape)
    print(xtrain.shape[0], 'train samples')
    print(xtest.shape[

In [11]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report

# === STEP 1: INSTALL LIBRARIES ===
print("--- Installing libraries ---")
!pip install tensorflow pandas scikit-learn -q
print("✅ Libraries installed.\n")

# === STEP 2: CLONE REPOSITORY ===
print("--- Cloning CRISPR-IP repository ---")
if not os.path.exists('CRISPR-IP'):
    !git clone https://github.com/BioinfoVirgo/CRISPR-IP.git
else:
    print("✅ Repository already exists.\n")

# === STEP 3: PREPARE DATA WITH ORIGINAL ENCODING ===
print("--- Preparing benchmark data with ORIGINAL CRISPR-IP encoding ---")
if not os.path.exists('cleaned_benchmark_data.csv'):
    print("❌ Critical Error: 'cleaned_benchmark_data.csv' not found. Please upload it first.")
    exit()

df = pd.read_csv('cleaned_benchmark_data.csv')

# === ORIGINAL ENCODING FROM CRISPR-IP ===
encoded_dict = {'A': [1,0,0,0], 'T': [0,1,0,0], 'G': [0,0,1,0], 'C': [0,0,0,1], '_': [0,0,0,0], '-': [0,0,0,0]}
pos_dict = {'A':1, 'T':2, 'G':3, 'C':4, '_':5, '-':5}

def my_encode_on_off_dim(target_seq, off_target_seq):
    """Original encoding function from CRISPR-IP"""
    tlen = 24
    # IMPORTANT: Uses "-" for padding, not "_"
    target_seq = "-" * (tlen - len(target_seq)) + target_seq
    off_target_seq = "-" * (tlen - len(off_target_seq)) + off_target_seq

    target_seq_code = np.array([encoded_dict[base] for base in list(target_seq)])
    off_target_seq_code = np.array([encoded_dict[base] for base in list(off_target_seq)])

    on_off_dim6_codes = []
    for i in range(len(target_seq)):
        diff_code = np.bitwise_or(target_seq_code[i], off_target_seq_code[i])
        dir_code = np.zeros(2)

        if pos_dict[target_seq[i]] == pos_dict[off_target_seq[i]]:
            diff_code = diff_code * -1
            dir_code[0] = 1
            dir_code[1] = 1
        elif pos_dict[target_seq[i]] < pos_dict[off_target_seq[i]]:
            dir_code[0] = 1
        elif pos_dict[target_seq[i]] > pos_dict[off_target_seq[i]]:
            dir_code[1] = 1
        else:
            raise Exception("Invalid seq!", target_seq, off_target_seq)

        on_off_dim6_codes.append(np.concatenate((diff_code, dir_code)))

    on_off_dim6_codes = np.array(on_off_dim6_codes)
    isPAM = np.zeros((24, 1))
    isPAM[-3:, :] = 1
    on_off_code = np.concatenate((on_off_dim6_codes, isPAM), axis=1)
    return on_off_code

# Encode data
print("Encoding sequences...")
encoded_features, labels = [], []
for _, row in df.iterrows():
    encoded_features.append(my_encode_on_off_dim(row['sgRNA'], row['off_target']))
    labels.append(row['label'])

X_test = np.array(encoded_features, dtype=np.float32).reshape(-1, 1, 24, 7)
y_test = np.array(labels)
print(f"✅ Data encoded. Shape: {X_test.shape}\n")

# === STEP 4: BUILD MODEL WITH ORIGINAL ARCHITECTURE ===
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, Reshape, AveragePooling1D, MaxPool1D, Bidirectional, LSTM, Attention, GlobalAveragePooling1D, GlobalMaxPool1D, Flatten, BatchNormalization, Dense, Dropout, Concatenate, Permute
from tensorflow.keras.initializers import VarianceScaling

print("--- Building model with ORIGINAL CRISPR-IP architecture ---")
input_shape = (1, 24, 7)
initializer = VarianceScaling(mode='fan_avg', distribution='uniform')

input_value = Input(shape=input_shape)
conv_1_output = Conv2D(60, (1, 7), padding='valid', data_format='channels_first', kernel_initializer=initializer)(input_value)

# After Conv2D: shape is (None, 60, 1, 24)
# Reshape to (60, 24) - keeping channels first
conv_1_output_reshape = Reshape((60, 24))(conv_1_output)

# Transpose to (24, 60) - swapping dimensions
conv_1_output_reshape2 = Permute((2, 1))(conv_1_output_reshape)

# Now with channels_first pooling on (24, 60):
# This treats 24 as channels and pools along the 60 dimension
# Result: (24, 30) for each pooling operation
conv_1_output_reshape_average = AveragePooling1D(pool_size=2, data_format='channels_first')(conv_1_output_reshape2)
conv_1_output_reshape_max = MaxPool1D(pool_size=2, data_format='channels_first')(conv_1_output_reshape2)

# Concatenate along last axis: (24, 30) + (24, 30) = (24, 60)
# This gives us 60 input features to the LSTM, which matches the saved weights!

# Concatenate and continue
bidirectional_1_output = Bidirectional(LSTM(30, return_sequences=True, dropout=0.25, kernel_initializer=initializer))(
    Concatenate(axis=-1)([conv_1_output_reshape_average, conv_1_output_reshape_max])
)

attention_1_output = Attention()([bidirectional_1_output, bidirectional_1_output])
average_1_output = GlobalAveragePooling1D(data_format='channels_last')(attention_1_output)
max_1_output = GlobalMaxPool1D(data_format='channels_last')(attention_1_output)
concat_output = Concatenate(axis=-1)([average_1_output, max_1_output])
flatten_output = Flatten()(concat_output)
linear_1_output = BatchNormalization()(Dense(200, activation='relu', kernel_initializer=initializer)(flatten_output))
linear_2_output = Dense(100, activation='relu', kernel_initializer=initializer)(linear_1_output)
linear_2_output_dropout = Dropout(0.9)(linear_2_output)
linear_3_output = Dense(2, activation='softmax', kernel_initializer=initializer)(linear_2_output_dropout)

model = Model(input_value, linear_3_output)
print("✅ Model architecture built.\n")

# === STEP 5: LOAD WEIGHTS AND PREDICT ===
model_path = 'CRISPR-IP/example_saved/example+crispr_ip.h5'
print(f"--- Loading weights from {model_path} ---")
model.load_weights(model_path)
print("✅ Weights loaded successfully.\n")

print("--- Making predictions ---")
y_pred_probs = model.predict(X_test, batch_size=1024)
y_scores = y_pred_probs[:, 1]
y_pred_labels = (y_scores >= 0.5).astype(int)

# === STEP 6: EVALUATION ===
print("\n" + "="*70)
print("🎯 CRISPR-IP PERFORMANCE WITH CORRECT ENCODING")
print("="*70)

auroc = roc_auc_score(y_test, y_scores)
auprc = average_precision_score(y_test, y_scores)

print(f"AUROC: {auroc:.4f}")
print(f"AUPRC: {auprc:.4f}")

print("\n📊 Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred_labels)
print(cm)

print("\n📋 Classification Report:")
print(classification_report(y_test, y_pred_labels, target_names=['No Activity (0)', 'Activity (1)']))

print("\n📈 Prediction Score Statistics:")
print(f"Min:  {y_scores.min():.6f}")
print(f"Max:  {y_scores.max():.6f}")
print(f"Mean: {y_scores.mean():.6f}")
print(f"Median: {np.median(y_scores):.6f}")

if (y_test == 1).sum() > 0:
    positive_scores = y_scores[y_test == 1]
    print(f"\n🎯 TRUE POSITIVE scores (label=1):")
    print(f"  Mean: {positive_scores.mean():.6f}")
    print(f"  Median: {np.median(positive_scores):.6f}")

if (y_test == 0).sum() > 0:
    negative_scores = y_scores[y_test == 0]
    print(f"\n🎯 TRUE NEGATIVE scores (label=0):")
    print(f"  Mean: {negative_scores.mean():.6f}")
    print(f"  Median: {np.median(negative_scores):.6f}")

print("="*70)

--- Installing libraries ---
✅ Libraries installed.

--- Cloning CRISPR-IP repository ---
✅ Repository already exists.

--- Preparing benchmark data with ORIGINAL CRISPR-IP encoding ---
Encoding sequences...
✅ Data encoded. Shape: (149483, 1, 24, 7)

--- Building model with ORIGINAL CRISPR-IP architecture ---
✅ Model architecture built.

--- Loading weights from CRISPR-IP/example_saved/example+crispr_ip.h5 ---
✅ Weights loaded successfully.

--- Making predictions ---
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step

🎯 CRISPR-IP PERFORMANCE WITH CORRECT ENCODING
AUROC: 0.7812
AUPRC: 0.0009

📊 Confusion Matrix:
[[130730  18711]
 [    13     29]]

📋 Classification Report:
                 precision    recall  f1-score   support

No Activity (0)       1.00      0.87      0.93    149441
   Activity (1)       0.00      0.69      0.00        42

       accuracy                           0.87    149483
      macro avg       0.50      0.78      0.47    149483
   weigh