In [3]:
import pandas as pd
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Read dataset
df = pd.read_csv("inhibits.csv")

# Calculate properties safely
charges = []
hydros = []
isopoints = []
aroms = []

for seq in df["Sequence"]:
    if len(seq) == 0:  
        charges.append(None)
        hydros.append(None)
        isopoints.append(None)
        aroms.append(None)
        continue

    analysis = ProteinAnalysis(seq)
    charges.append(analysis.charge_at_pH(7.4))
    hydros.append(analysis.gravy())
    isopoints.append(analysis.isoelectric_point())
    aroms.append(analysis.aromaticity())

# Add results to DataFrame
df["Charge"] = charges
df["Hydrophobicity"] = hydros
df["Isoelectric_Point"] = isopoints
df["Aromaticity"] = aroms

# Save cleaned results
df.to_csv("inhibits_final.csv", index=False)


âœ… Cleaned and analyzed peptide properties saved!


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler,StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split ,cross_val_score
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
df.columns = df.columns.str.strip()        # remove spaces
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

In [23]:
df=pd.read_csv('inhibits_final.csv')

df.set_index('Sequence', inplace=True)
df["Inhibits"] = df["Inhibits"].str.strip()

#encoding
encoder = LabelEncoder()
df['Inhibits'] = encoder.fit_transform(df['Inhibits'])
y=df['Inhibits']

df.drop(columns=['Inhibits'],inplace=True)
df.columns = [str(i) for i in range(df.shape[1])]
X=df
y





Sequence
VQIVYPGKL        1
VQIVYPGKV        1
VQIVYPGRL        1
YAAKK            0
GCTKSIPPICFPD    1
                ..
LIDRIIKRK        1
IIDRLIDRKK       1
LIDRIIDRKK       1
IIDRLIDRKE       1
LIDRIIDRKE       1
Name: Inhibits, Length: 118, dtype: int32

In [55]:
#  classification model (random forest)
#using smote as an way to add more instances of all the classes
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)


X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)


# Training the Random Forest Classifier
clf=RandomForestClassifier(n_estimators= 100 ,max_depth= 5,class_weight='balanced',random_state=42)


# Perform 5-fold cross-validation
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')


clf.fit(X_train, y_train)
# Print results
print("Cross-validation scores:", scores)
print("Average accuracy:", scores.mean())

# Making predictions on the test set
y_pred = clf.predict(X_test)

# Evaluating the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Cross-validation scores: [0.66666667 0.58333333 0.75       0.95652174 0.91304348]
Average accuracy: 0.7739130434782608
Confusion Matrix:
[[17  2]
 [ 2 18]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        19
           1       0.90      0.90      0.90        20

    accuracy                           0.90        39
   macro avg       0.90      0.90      0.90        39
weighted avg       0.90      0.90      0.90        39



In [63]:
#svm classifier
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear', C=1 , random_state=42)
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.87
              precision    recall  f1-score   support

           0       0.89      0.84      0.86        19
           1       0.86      0.90      0.88        20

    accuracy                           0.87        39
   macro avg       0.87      0.87      0.87        39
weighted avg       0.87      0.87      0.87        39

