# Feature Calculations for Peptides (DAPAM and Peptipedia)
## Code 3

Make sure to change paths accordingly.

In [None]:
# import libraries
import time
import random
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # this is used for the plot the graph

# Sklearn classes
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    KFold,
)
from sklearn import metrics
from sklearn.metrics import confusion_matrix, silhouette_score
import sklearn.metrics.cluster as smc
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    MinMaxScaler,
)
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import tree
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_blobs

# import the provided helper functions
from google.colab import drive
drive.mount('/content/drive')
import sys

# change path
sys.path.append('/content/drive/My Drive/Chen Lab/Antimicrobial Peptide Project/Data')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Sets random seed for reproducibility
SEED = 42
random.seed(SEED)

Mounted at /content/drive


In [None]:
# Location of the annotated results spreadsheet on the mounted Drive (change path)
file_path = (
    '/content/drive/My Drive/Chen Lab/Antimicrobial Peptide Project/AnnotatedResults copy.csv'
)

# Parse the spreadsheet into a DataFrame and preview its first five rows
valid_sequences_df = pd.read_csv(filepath_or_buffer=file_path)

valid_sequences_df.head(5)


Unnamed: 0,PMC ID,Sequence,Antibacterial,Mechanism,Potency,Extra Information,gram,Alternate phrase for mechanism,Precise sentence found by algorithm,Keyword,Sentence,Unnamed: 11,Notes on article,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,PMC6705042,LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES,1.0,Pore,high,Large pore,neg,,Y,Pore,The featured peptides and their characteristic...,,"had to see what sequences Cla, Cec, etc referr...",,,,,
1,,SLGNFFRKARKKIGEEFKRIVQRIKDFLQHLIPRTEA,1.0,Pore,low,,neg,,Y,Pore,The featured peptides and their characteristic...,,,,,,,
2,,GIGKFLHSAGKFGKAFVGEIMKS,1.0,Pore,low,,neg,,Y,Pore,The featured peptides and their characteristic...,,,,,,,
3,,GFFALIPKIISSPLFKTLLSAVGSALSSSGDQE,1.0,Pore,high,Large pore,neg,,Y,Pore,The featured peptides and their characteristic...,,,,,,,
4,,KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK,1.0,Pore,high,Small pore,neg,,Y,Pore,"With respect to Cec behavior, the precise anti...",The featured peptides and their characteristic...,,,,,,


In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import peptides

# Load the CSV file (change path)
file_path = "/content/drive/My Drive/Chen Lab/Antimicrobial Peptide Project/AnnotatedResults copy.csv"
df = pd.read_csv(file_path)

# Forward-fill PMC ID so each row inherits the most recent non-missing value
# above it. Series.ffill() replaces the deprecated fillna(method='ffill')
# spelling, which emits a FutureWarning on current pandas.
df["PMC ID"] = df["PMC ID"].ffill()

# Drop unnecessary columns by position (0-based positions 4, 5 and 7-17).
# NOTE(review): judging by the header shown in the preview cell above, this
# keeps PMC ID, Sequence, Antibacterial, Mechanism and gram -- confirm if the
# spreadsheet layout changes, since positional drops silently shift with it.
columns_to_drop = df.columns[[4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]]  # Adjusted to match column index positions
df = df.drop(columns=columns_to_drop)

# Filter only antibacterial peptides and reset index
df_filtered = df[df["Antibacterial"] == 1].copy().reset_index(drop=True)


# Convert 'gram' column values to symbols.
# Series.map with a dict turns any value that is not a key (including missing
# values and unexpected spellings) into NaN.
gram_mapping = {
    "pos": "+",
    "neg": "-",
    "pos & neg": "+/-"
}
df_filtered["gram"] = df_filtered["gram"].map(gram_mapping)

# Peptide Properties Calculation Class
class PeptidePropertiesTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives per-peptide descriptors.

    For every entry of the ``Sequence`` column it computes a set of
    properties using Biopython's ``ProteinAnalysis`` and the ``peptides``
    package, and returns them as one row of a new DataFrame.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X, y=None):
        """Compute the property table for ``X["Sequence"]``.

        Parameters
        ----------
        X : object supporting ``X["Sequence"]`` (e.g. a DataFrame)
            Each value is passed to ``calculate_peptide_properties``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            One row per sequence, indexed like ``X["Sequence"]``, one column
            per property (in the order the properties dict defines them).
        """
        sequences = X["Sequence"]
        # Build the frame once from a list of dicts. The previous
        # `.apply(...).apply(pd.Series)` created a Series per row, which is
        # slow and upcast the integer length column to float.
        records = [self.calculate_peptide_properties(seq) for seq in sequences]
        properties_df = pd.DataFrame(records, index=sequences.index)
        return properties_df

    def calculate_peptide_properties(self, sequence):
        """Return a dict of descriptors for a single peptide sequence.

        Parameters
        ----------
        sequence : str
            Amino-acid sequence; it is handed unchanged to
            ``ProteinAnalysis`` and ``peptides.Peptide``.

        Returns
        -------
        dict
            Property name -> value, including one ``"Percent <aa>"`` entry
            for each of the 20 residue letters listed below.
        """
        # Biopython-derived descriptors
        analyzed_seq = ProteinAnalysis(sequence)
        molecular_weight = analyzed_seq.molecular_weight()
        isoelectric_point = analyzed_seq.isoelectric_point()
        aromaticity = analyzed_seq.aromaticity()
        instability_index = analyzed_seq.instability_index()
        # NOTE(review): this is the value of gravy(); it is stored under the
        # "Hydrophobic Ratio" key below. The key is kept so existing output
        # column names do not change -- confirm the label is the intended one.
        hydrophobic_ratio = analyzed_seq.gravy()
        net_charge = analyzed_seq.charge_at_pH(7.0)
        # NOTE(review): presumably per-residue fractions rather than 0-100
        # percentages -- confirm against the installed Biopython version.
        amino_acid_percent = analyzed_seq.get_amino_acids_percent()

        # Descriptors from the `peptides` package (library default arguments)
        peptide = peptides.Peptide(sequence)
        boman_index = peptide.boman()
        aliphatic_index = peptide.aliphatic_index()
        charge = peptide.charge()

        # Guard against division by zero when the computed weight is 0
        charge_density = net_charge / molecular_weight if molecular_weight != 0 else 0

        properties = {
            "Length (number of amino acids)": len(sequence),
            "Molecular Weight (Daltons)": molecular_weight,
            "Net Charge": net_charge,  # Biopython charge_at_pH(7.0)
            "Charge": charge,  # peptides.Peptide.charge()
            "Charge Density (Charge/Dalton)": charge_density,
            "Instability Index": instability_index,
            "Aromaticity": aromaticity,
            "Aliphatic Index": aliphatic_index,
            "Boman Index (kcal/mol)": boman_index,
            "Isoelectric Point (pH)": isoelectric_point,
            "Hydrophobic Ratio": hydrophobic_ratio,
            # Residues missing from the composition dict default to 0
            **{f"Percent {aa}": amino_acid_percent.get(aa, 0) for aa in "ARNDCEQGHILKMFPSTWYV"}
        }

        return properties

# Compute Peptide Properties
transformer = PeptidePropertiesTransformer()
properties_df = transformer.transform(df_filtered)

# Merge new properties with original dataframe after resetting indices so the
# two frames align row-by-row on a fresh 0..n-1 index
df_processed = pd.concat([df_filtered.reset_index(drop=True), properties_df.reset_index(drop=True)], axis=1)

# Missing values are deliberately left as NaN in memory. Filling them with ""
# in place made the later `pd.notna(...)` check on "Mechanism" always true and
# could turn numeric columns into object dtype. The CSV still has no gaps:
# to_csv writes missing values as empty fields (na_rep="").

# Save processed data
df_processed.to_csv(
    "/content/drive/My Drive/Chen Lab/Antimicrobial Peptide Project/processed_annotatedcopy_peptide_data_updated.csv",
    index=False,
    na_rep="",
)



  df["PMC ID"] = df["PMC ID"].fillna(method='ffill')


In [None]:
from collections import defaultdict

# Standardize mechanism names (lower-case, surrounding whitespace removed)
df_processed["Mechanism"] = df_processed["Mechanism"].str.lower().str.strip()

# Ensure we handle multiple mechanisms correctly
all_mechanisms = set()
mechanism_dict = defaultdict(list)

# Iterate over each row and split comma-separated mechanisms
for idx, row in df_processed.iterrows():
    if pd.notna(row["Mechanism"]):  # Ignore NaN values
        mechanisms = [m.strip() for m in row["Mechanism"].split(",")]
        for mech in mechanisms:
            if not mech:
                # Skip blank tokens (empty cell stored as "", trailing or
                # doubled commas) so they do not become a "" mechanism.
                continue
            all_mechanisms.add(mech)
            mechanism_dict[mech].append(idx)  # Store row indices for each mechanism

# Normalize some names (fix typos, merge similar terms)
mechanism_mapping = {
    "pore": "Pore",
    "torroidal pore": "Toroidal pore",
    "toroidal pore": "Toroidal pore",
    "carpet": "Carpet",
    "barrel-stave": "Barrel-stave",
    "membrane permeabiility": "Membrane disruption",
    "membrane permeability": "Membrane disruption",
    "permealize membrane": "Membrane disruption",
    "membrane disruption": "Membrane disruption",
    "membrane damage": "Membrane disruption",
    "bacterial membrane external portrusion": "Membrane disruption",
    "non-lytic membrane permeabilization": "Membrane disruption",
    "bacterial membrane": "Membrane disruption",
    "disrupting membrane": "Membrane disruption",
    "membrane": "Membrane disruption",
    "disrupt cell membrane": "Membrane disruption",
    "disrupting the membrane": "Membrane disruption",
    "biofilm destruction": "Biofilm destruction",
    "antibiofilm": "Biofilm destruction",
    "biofilm": "Biofilm destruction",
}

# Apply mapping; names without an entry are kept as-is
mechanism_dict_cleaned = defaultdict(list)
for mech, indices in mechanism_dict.items():
    clean_mech = mechanism_mapping.get(mech, mech)  # Map to corrected names if available
    mechanism_dict_cleaned[clean_mech].extend(indices)

# A row that lists several synonyms of one mechanism (or repeats a name) would
# otherwise appear more than once under the merged name and be double-counted
# in the mean/std. Keep each row index once, preserving first-seen order.
for clean_mech in list(mechanism_dict_cleaned):
    mechanism_dict_cleaned[clean_mech] = list(dict.fromkeys(mechanism_dict_cleaned[clean_mech]))

# Compute statistics per mechanism and structure the output
mechanism_stats = {}
for mech, indices in mechanism_dict_cleaned.items():
    subset = df_processed.loc[indices]  # Get relevant rows
    numeric_data = subset.select_dtypes(include=[np.number])  # Only numerical columns
    stats = numeric_data.agg(["mean", "std"])  # Compute mean and std
    mechanism_stats[mech] = stats  # Store stats

# Combine statistics into a single DataFrame: columns become a
# (mechanism, feature) MultiIndex, rows are "mean" and "std"
mechanism_stats_df = pd.concat(mechanism_stats, axis=1)

# Transpose: rows become (mechanism, feature) pairs, columns are mean / std
mechanism_stats_df = mechanism_stats_df.T

# Name the two index levels.
# NOTE(review): after the transpose the second level holds the feature names
# (mean/std are the columns), so "Statistic" is a misnomer. The label is left
# unchanged here to keep the CSV header stable for existing consumers.
mechanism_stats_df.index.set_names(["Mechanism", "Statistic"], inplace=True)

# Save to CSV (change path)
mechanism_stats_path = "/content/drive/My Drive/Chen Lab/Antimicrobial Peptide Project/annotatedcopy_mechanismstats.csv"
mechanism_stats_df.to_csv(mechanism_stats_path)

# Report the file that was actually written (the old message named a different file)
print(f"Processing complete. Mechanism statistics saved to '{mechanism_stats_path}'.")


Processing complete. Transposed mechanism statistics saved as 'mechanism_statistics_cleaned_transposed.csv'.
