In [1]:
pip install torch fair-esm numpy pandas scikit-learn

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0


In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Step 1: Load Dataset
def load_dataset(file_path):
    """
    Read an AMP dataset CSV and return its sequences and labels.

    Args:
        file_path (str): Path to a CSV file with a "sequence" column and a
            "label" column.
    Returns:
        tuple: ``(sequences, labels)`` as two lists in file row order.
            Labels are expected to be 1 for AMP and 0 for non-AMP.
    """
    frame = pd.read_csv(file_path)
    return frame["sequence"].tolist(), frame["label"].tolist()

# Step 3: Calculate Amino Acid Percentages
def calculate_C_amino_acid_percentage(sequence):
    """
    Calculate the percentage of the amino acid 'C' (Cysteine) in a single peptide sequence.

    Args:
        sequence (str): A single peptide sequence.

    Returns:
        float: Percentage (0-100) of 'C' residues in the sequence, or 0.0
            when the sequence is empty.
    """
    if not sequence:  # Empty input: nothing to count, and avoids dividing by zero
        return 0.0

    # str.count is case-sensitive, so only upper-case 'C' is counted.
    cysteine_count = sequence.count('C')

    return (cysteine_count / len(sequence)) * 100  # Fraction -> percentage


def calculate_L_amino_acid_percentage(sequence):
    """
    Compute how much of a peptide sequence is Leucine ('L'), as a percentage.

    Args:
        sequence (str): A single peptide sequence.

    Returns:
        float: Percentage (0-100) of 'L' residues, or 0.0 for an empty sequence.
    """
    if sequence:
        # Share of positions that are an upper-case 'L', scaled to percent.
        return (sequence.count('L') / len(sequence)) * 100

    # Empty input: report zero instead of dividing by zero.
    return 0.0


def calculate_hydrophobicity(sequence):
    """
    Calculate the mean hydrophobicity of a peptide sequence on the Kyte-Doolittle scale.

    Args:
        sequence (str): Peptide sequence.
    Returns:
        float: Sum of the per-residue scale values divided by the sequence
            length. Characters that are not in the scale contribute 0 but
            still count towards the length. Returns 0.0 for an empty sequence.
    """
    if not sequence:  # Empty input would otherwise divide by zero
        return 0.0

    kd_scale = {
        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
    }
    return sum(kd_scale.get(aa, 0) for aa in sequence) / len(sequence)


def calculate_charge(sequence, pH=7.4):
    """
    Estimate the net charge of a peptide sequence at a given pH.

    Only the side chains of K, R, H (positive) and D, E (negative) are
    considered; every other character contributes nothing.

    Args:
        sequence (str): Peptide sequence.
        pH (float): pH at which the charge is evaluated.
    Returns:
        Positive contribution minus negative contribution (0 when the
        sequence has no K/R/H/D/E residues).
    """
    pKa = {'K': 10.5, 'R': 12.5, 'H': 6.0, 'D': 3.9, 'E': 4.1}

    def protonated_fraction(residue):
        # Fraction of a basic side chain that carries +1 at this pH.
        return 1 / (1 + 10**(pH - pKa[residue]))

    def deprotonated_fraction(residue):
        # Fraction of an acidic side chain that carries -1 at this pH.
        return 1 / (1 + 10**(pKa[residue] - pH))

    basic_total = sum(protonated_fraction(res) for res in sequence if res in 'KRH')
    acidic_total = sum(deprotonated_fraction(res) for res in sequence if res in 'DE')
    return basic_total - acidic_total

# Step 4: Calculate Polarity
def calculate_polarity(sequence):
    """
    Calculate the polarity of a peptide sequence using the Grantham polarity scale.

    Args:
        sequence (str): Peptide sequence.
    Returns:
        float: Mean polarity over the residues that appear in the scale.
            Characters outside the scale are ignored. Returns 0.0 when the
            sequence is empty or contains no recognised residue.
    """
    grantham_polarity = {
        'A': 8.1, 'R': 10.5, 'N': 11.6, 'D': 13.0, 'C': 5.5, 'Q': 10.5, 'E': 12.3,
        'G': 9.0, 'H': 10.4, 'I': 5.2, 'L': 4.9, 'K': 11.3, 'M': 5.7, 'F': 5.2,
        'P': 8.0, 'S': 9.2, 'T': 8.6, 'W': 5.4, 'Y': 6.2, 'V': 5.9
    }

    if not sequence:
        return 0.0

    residue_values = [grantham_polarity[aa] for aa in sequence if aa in grantham_polarity]
    if not residue_values:
        # np.mean([]) would return NaN and emit a RuntimeWarning; 0.0 matches
        # the empty-input convention of the percentage helpers in this notebook.
        return 0.0

    return float(np.mean(residue_values))


In [5]:
import torch
import esm
print("✅ ESM is installed correctly!")
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


# Step 1: Load Dataset
def load_dataset(file_path):
    """
    Load peptide sequences and their labels from a CSV file.

    Args:
        file_path (str): Path to the CSV file; it must provide "sequence"
            and "label" columns.
    Returns:
        sequences (list): Values of the "sequence" column, in row order.
        labels (list): Values of the "label" column, in row order
            (expected to be 1 for AMP, 0 for non-AMP).
    """
    table = pd.read_csv(file_path)
    # Pull both columns out as plain Python lists, in (sequence, label) order.
    sequences, labels = (table[column].tolist() for column in ("sequence", "label"))
    return sequences, labels


# Step 2: Extract Features Using ESM-2
def extract_features(sequences, batch_size=32, model_name="esm2_t6_8M_UR50D"):
    """
    Extract one ESM-2 embedding per peptide sequence, processing in batches.

    Each embedding is the mean of the final-layer token representations over
    the residue positions of that sequence only. Padding, BOS (<cls>) and EOS
    tokens are excluded, so a sequence's embedding does not depend on which
    other sequences share its batch.

    Args:
        sequences (list): List of peptide sequences.
        batch_size (int): Number of sequences to process per batch.
        model_name (str): Name of the pre-trained ESM-2 model to use; looked
            up as an attribute of ``esm.pretrained``.
    Returns:
        np.ndarray: One row of embeddings per input sequence, in input order.
    Raises:
        ValueError: If ``sequences`` is empty.
        KeyError: If ``model_name`` is not found in ``esm.pretrained``.
    """
    if len(sequences) == 0:
        # Fail early with a clear message instead of loading the model and
        # then hitting np.vstack([]) ("need at least one array").
        raise ValueError("sequences must contain at least one sequence")

    print("Loading ESM-2 model...")
    model, alphabet = esm.pretrained.__dict__[model_name]()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.eval().to(device)
    batch_converter = alphabet.get_batch_converter()

    # Use the model's own depth so other checkpoints also yield their final
    # layer (for the default 6-layer model this is layer 6, as before).
    final_layer = model.num_layers
    # Token ids that do not correspond to a residue and must not be pooled.
    special_token_ids = (alphabet.padding_idx, alphabet.cls_idx, alphabet.eos_idx)

    embeddings = []
    print("Extracting features...")
    for i in tqdm(range(0, len(sequences), batch_size), desc="Batches"):
        batch_sequences = [(str(j), seq) for j, seq in enumerate(sequences[i:i + batch_size])]
        _, _, batch_tokens = batch_converter(batch_sequences)
        batch_tokens = batch_tokens.to(device)
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[final_layer])
            token_reps = results["representations"][final_layer]

            # True where a position holds a real residue token.
            residue_mask = torch.ones_like(batch_tokens, dtype=torch.bool)
            for token_id in special_token_ids:
                residue_mask &= batch_tokens != token_id

            # Masked mean over the token axis; clamp keeps a sequence with no
            # residue tokens from dividing by zero (it yields a zero vector).
            weights = residue_mask.unsqueeze(-1).to(token_reps.dtype)
            pooled = (token_reps * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        embeddings.append(pooled.cpu().numpy())

    return np.vstack(embeddings)

def extract_simple_features(sequences):
    """
    Build a small hand-crafted feature matrix for a collection of peptide sequences.

    Columns, in order: net charge, mean hydrophobicity, percentage of 'L',
    percentage of 'C'. Polarity is currently not part of the feature set.

    Args:
        sequences (list): Peptide sequences.
    Returns:
        np.ndarray: Array with one row per sequence and four columns.
    """
    # One extractor per output column; the tuple order fixes the column order.
    column_extractors = (
        calculate_charge,
        calculate_hydrophobicity,
        calculate_L_amino_acid_percentage,
        calculate_C_amino_acid_percentage,
    )
    feature_columns = tuple(
        [extractor(seq) for seq in sequences] for extractor in column_extractors
    )
    return np.column_stack(feature_columns)

def train_and_evaluate(X, y, random_state=42):
    """
    Train Random Forest and XGBoost classifiers and print their test-set metrics.

    The data is split 80/20 into train and test sets. Both classifiers are
    fitted on the training part, and accuracy, precision, recall and F1 on
    the test part are printed for each.

    Args:
        X (np.ndarray): Feature matrix (one row per sequence).
        y (list): Labels for each sequence.
        random_state (int): Seed used for the split and for both classifiers,
            so that repeated runs print the same metrics.
    """
    # Split data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Random Forest; seeded because bootstrap and feature sampling are random
    # and would otherwise make the reported scores change between runs.
    print("\n🔹 Training Random Forest classifier...")
    rf_clf = RandomForestClassifier(n_estimators=5, random_state=random_state)
    rf_clf.fit(X_train, y_train)
    rf_pred = rf_clf.predict(X_test)

    # XGBoost, seeded the same way for consistency
    print("\n🔹 Training XGBoost classifier...")
    xgb_clf = xgb.XGBClassifier(
        objective="binary:logistic", eval_metric="logloss", random_state=random_state
    )
    xgb_clf.fit(X_train, y_train)
    xgb_pred = xgb_clf.predict(X_test)

    def evaluate_model(name, y_true, y_pred):
        """Print accuracy, precision, recall and F1 for one set of predictions."""
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        print(f"\n🔹 {name} Model Evaluation:")
        print(f"✅ Accuracy: {accuracy:.2f}")
        print(f"✅ Precision: {precision:.2f}")
        print(f"✅ Recall: {recall:.2f}")
        print(f"✅ F1 Score: {f1:.2f}")

    evaluate_model("Random Forest", y_test, rf_pred)
    evaluate_model("XGBoost", y_test, xgb_pred)

# Main Function
# if __name__ == "__main__":
#     # File path to your dataset
#     file_path = "/content/sample_data/total_dataset.csv"  # Ensure the CSV has 'sequence' and 'label' columns
#     file_path2 = "/content/sample_data/merged_dataset.csv"
#     file_path3 = r"/content/sample_data/merged_dataset_animal.csv"

#     # Step 1: Load Dataset
#     print("Loading dataset...")
#     sequences, labels = load_dataset(file_path3)

#     # Step 2: Extract Features Using ESM-2
#     embeddings = extract_features(sequences)

#     # Step 3: Train and Evaluate Classifier
#     train_and_evaluate(embeddings, labels)

✅ ESM is installed correctly!


In [8]:
# File paths to the available datasets (each CSV needs 'sequence' and 'label' columns)
data_file_path = r"/content/sample_data/total_dataset.csv"
data_file_path2 = r"/content/sample_data/merged_dataset.csv"
data_file_path3 = r"/content/sample_data/merged_dataset_animal.csv"

# Single seed shared by the split and the model so the printed metrics are reproducible
RANDOM_SEED = 42

# Step 1: Load the dataset used in this experiment (sequences with their labels)
print("Loading datasets...")
sequences, labels = load_dataset(data_file_path3)

# Step 2: Build the hand-crafted feature matrix (one row per sequence)
X = extract_simple_features(sequences)
print(f'shape of X is : {np.shape(X)}')
y = labels
print(f'shape of y is : {np.shape(y)}')

# Step 3: Split into training and test sets (80/20). The split is seeded;
# without a seed every run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

# Step 4: Create an XGBoost classifier
model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", random_state=RANDOM_SEED)

# Step 5: Train the model
model.fit(X_train, y_train)

# Step 6: Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Step 7: Report test-set metrics
name = "our model XGBoost"
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"\n🔹 {name} Model Evaluation:")
print(f"✅ Accuracy: {accuracy:.2f}")
print(f"✅ Precision: {precision:.2f}")
print(f"✅ Recall: {recall:.2f}")
print(f"✅ F1 Score: {f1:.2f}")


Loading datasets...
shape of X is : (5160, 4)
shape of y is : (5160,)

🔹 our model XGBoost Model Evaluation:
✅ Accuracy: 0.87
✅ Precision: 0.88
✅ Recall: 0.86
✅ F1 Score: 0.87
