# 🔬 Cytotoxicity-Based Potency Classification
This notebook simulates a cytotoxicity potency assay (% cell killing relative to a reference standard) and trains a classification model to predict batch-release pass/fail from process attributes. A batch passes when its cytotoxicity falls within the regulatory range of **70–130%**.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

# Seed the legacy global NumPy generator so every draw below is reproducible.
np.random.seed(303)
n_samples = 300

# Simulated process attributes. The columns are drawn in exactly this order, so
# the random stream is consumed in a fixed sequence.
df = pd.DataFrame(dict(
    donor_age=np.random.randint(20, 65, n_samples),
    passage_number=np.random.randint(1, 5, n_samples),
    MOI=np.random.uniform(2, 10, n_samples),
    culture_days=np.random.randint(7, 15, n_samples),
    transduction_efficiency=np.random.uniform(30, 90, n_samples),
    viability_percent=np.random.uniform(75, 99, n_samples),
    activation_marker_percent=np.random.uniform(20, 95, n_samples),
))

# Gaussian assay noise (mean 0, standard deviation 10), drawn after the features.
noise = np.random.normal(loc=0, scale=10, size=n_samples)

# Noise-free linear signal: three attributes contribute positively, passage
# number contributes negatively.
potency_signal = (
    0.6 * df['transduction_efficiency']
    + 0.4 * df['viability_percent']
    + 0.2 * df['activation_marker_percent']
    - 5.0 * df['passage_number']
)

# Simulated assay readout, bounded to the interval [20, 150].
df['cytotoxicity_percent'] = (potency_signal + noise).clip(lower=20, upper=150)

# Release label: 1 when the readout lies in [70, 130] (both ends inclusive), else 0.
within_spec = df['cytotoxicity_percent'].ge(70) & df['cytotoxicity_percent'].le(130)
df['release_pass'] = within_spec.astype(int)

df.head(5)

In [None]:
# Model inputs are the simulated process attributes only. The assay readout and
# the label derived from it are dropped so the target cannot leak into X.
X = df.drop(columns=['cytotoxicity_percent', 'release_pass'])
y = df['release_pass']

# 80/20 hold-out split. Stratifying on the label keeps the pass/fail ratio the
# same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Random forest with a fixed seed so the fitted model is reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# Pin the label order so the matrix is always 2x2 with rows/columns ordered
# [fail (0), pass (1)], even if one class is missing from y_test or y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
accuracy

In [None]:
# Text summary of per-class metrics for the test-set predictions. It is printed
# (not left as the cell's last expression) so the newlines in the string render.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Two-panel figure: confusion matrix on the left, ROC curve on the right.
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
ax_cm, ax_roc = axs

# --- Left panel: confusion matrix heatmap with integer cell annotations ---
class_labels = ['Fail (0)', 'Pass (1)']
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set(
    title='Confusion Matrix',
    xlabel='Predicted Label',
    ylabel='True Label',
)
ax_cm.set_xticklabels(class_labels)
ax_cm.set_yticklabels(class_labels)

# --- Right panel: ROC curve ---
# Column 1 of predict_proba is the score for the second entry of clf.classes_
# (label 1, assuming both classes were present in the training data).
y_probs = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

ax_roc.plot(fpr, tpr, color='darkorange', label=f'AUC = {roc_auc:.2f}')
# Diagonal reference line from (0, 0) to (1, 1).
ax_roc.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax_roc.set(
    title='ROC Curve',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
)
ax_roc.legend(loc='lower right')

fig.tight_layout()
plt.show()

In [None]:
# Feature importances from the fitted forest, ordered from largest to smallest.
importances = clf.feature_importances_
features = X.columns
indices = np.flip(np.argsort(importances))

# Horizontal bar chart. The y-axis is inverted so the first (largest) bar is
# drawn at the top.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(features[indices], importances[indices])
importance_ax.set_xlabel('Importance Score')
importance_ax.set_title('Feature Importance - Random Forest')
importance_ax.invert_yaxis()
importance_fig.tight_layout()
plt.show()