# SUPPORT VECTOR MACHINE – DRUG RESPONSE CLASSIFICATION

## Dataset Description

### Target: Drug Response (0 = No Response, 1 = Positive Response)

In [None]:
# Import Required Libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
from sklearn.decomposition import PCA

## Task 1: Exploratory Data Analysis

### Load Dataset

In [None]:
# Read the raw CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Pharma_Industry.csv")
df.head(n=5)

### Basic Exploration

In [None]:
# Print the column/dtype/non-null overview, then show the summary statistics
# transposed so that each original column becomes one row of the table.
df.info()
df.describe().transpose()

### Feature Distributions (Histograms)

In [None]:
# Draw one 30-bin histogram per column on a shared 15x10 figure.
df.hist(
    figsize=(15, 10),
    bins=30,
)
plt.suptitle("Feature Distributions")
plt.show()

### Box Plots (Outlier Detection)

In [None]:
# Box plots of every column except the target, with x tick labels rotated
# 90 degrees.
feature_frame = df.drop(columns="Drug Response")
feature_frame.boxplot(rot=90, figsize=(10, 6))
plt.title("Box Plot of Features")
plt.tight_layout()
plt.show()

In [None]:
# Density (KDE) plots: one panel per column, three panels per row.
# layout=(-1, 3) asks pandas to derive the number of rows from the number of
# columns being plotted. A fixed 3x3 grid cannot hold more than nine panels
# and leaves empty panels when the frame has fewer columns.
df.plot(
    kind="density",
    subplots=True,
    layout=(-1, 3),
    figsize=(14, 10),
    sharex=False,  # each feature lives on its own scale, so keep x axes independent
)
plt.suptitle("Density Plots of Features")
plt.show()

### Correlation Analysis

In [None]:
# Pairwise correlation of all columns (target included), drawn as an
# annotated heatmap on a 10x8 figure.
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()

* Some biological features show moderate correlation with Drug Response
* No extremely high multicollinearity observed
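* Correlation only measures linear association, so the moderate values above do not rule out non-linear relationships; this motivates trying a kernel-based SVM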

## Task 2: Data Preprocessing

* No categorical features, so no LabelEncoder is needed
* All features are already numeric, so encoding is not required

### Separate Features and Target

In [None]:
# The "Drug Response" column is the label; every remaining column is a feature.
y = df["Drug Response"]
X = df.drop(columns="Drug Response")

### Train-Test Split (Stratified)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

### Feature Scaling (Critical for SVM)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test rows never influence them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

* Scaling prevents feature dominance and improves SVM margin optimization

## Task 3: Data Visualization

### PCA Scatter Plot

In [None]:
# Standardize the full dataset with a dedicated scaler for this plot.
# Re-fitting the shared `scaler` here would overwrite the statistics it learned
# from the training split with statistics that include the test rows, so any
# later `scaler.transform(...)` call would silently leak test information.
X_scaled = StandardScaler().fit_transform(X)

# Project the standardized features onto the first two principal components.
X_pca = PCA(n_components=2).fit_transform(X_scaled)

# Scatter of the 2-D projection, colored by the target value.
plt.figure()
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA Projection of Dataset")
plt.show()

### Pair Plot

In [None]:
# Pairwise scatter/distribution grid over all columns, colored by the target.
sns.pairplot(data=df, hue="Drug Response")
plt.show()

### Class Distribution

In [None]:
# Bar chart of how many rows fall into each target class.
sns.countplot(data=df, x="Drug Response")
plt.title("Class Distribution of Drug Response")
plt.show()

* Dataset is reasonably balanced, making accuracy a reliable metric.

## Task 4: SVM Implementation

### Implementing SVM from scikit-learn & Training

In [None]:
# Baseline classifier: RBF-kernel SVM with fixed C and gamma, trained on the
# standardized training split.
svm_model = SVC(
    C=1,
    kernel="rbf",
    gamma=0.1,
    random_state=42,
)
svm_model.fit(X_train_scaled, y_train)

### Evaluate Model Metrics

In [None]:
# Predict on the standardized test split, then report overall accuracy
# followed by the full classification report.
y_pred = svm_model.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

report_text = classification_report(y_test, y_pred)
print(report_text)

### Confusion Matrix

In [None]:
# Confusion matrix of the baseline predictions, drawn as an integer-annotated
# heatmap.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, cmap="Blues", annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

## Task 5: Visualization of SVM Results

### PCA for Visualization

In [None]:
# Reduce the standardized features to two principal components so the
# classifier's output can be drawn on a 2-D plot.
pca = PCA(n_components=2)
# Fit the projection on the training split only...
X_train_pca = pca.fit_transform(X_train_scaled)
# ...and reuse that same fitted projection for the test split.
X_test_pca = pca.transform(X_test_scaled)

### Train SVM on PCA Data

In [None]:
# Separate linear-kernel SVM trained on the two PCA components. This model is
# used only for the 2-D visualization below.
svm_vis = SVC(kernel="linear").fit(X_train_pca, y_train)

# Predicted class for each test point in PCA space.
y_pred_vis = svm_vis.predict(X_test_pca)

### Classification Visualization

In [None]:
# Test points in PCA space, colored by the class the PCA-space SVM predicted.
pc1, pc2 = X_test_pca[:, 0], X_test_pca[:, 1]

plt.figure(figsize=(8, 6))
plt.scatter(pc1, pc2, c=y_pred_vis, edgecolor="k", cmap="coolwarm")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("SVM Classification Results (PCA)")
plt.show()

## Task 6: Parameter Tuning and Optimization

In [None]:
# Search space for the grid search: three values of C crossed with three
# gamma settings, RBF kernel only.
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=["scale", 0.1, 0.01],
    kernel=["rbf"],
)

In [None]:
# Exhaustive 5-fold cross-validated search over `param_grid`, scored by
# accuracy, run on the standardized training split.
grid = GridSearchCV(SVC(), param_grid, cv=5, scoring="accuracy")
grid.fit(X_train_scaled, y_train)

print("Best Parameters:", grid.best_params_)
# NOTE: best_score_ is the mean cross-validated accuracy on the training folds,
# not accuracy on the held-out test split.
print("Best Accuracy:", grid.best_score_)

# Evaluate the tuned model on the held-out test split so it can be compared
# with the baseline model's test accuracy. With the default refit=True,
# grid.predict delegates to the best estimator refit on the whole training split.
y_pred_tuned = grid.predict(X_test_scaled)
print("Test Accuracy (tuned model):", accuracy_score(y_test, y_pred_tuned))

## Task 7: Kernel Comparison and Analysis

In [None]:
# Train one default-parameter SVM per kernel and record its test accuracy.
kernels = ["linear", "poly", "rbf"]
results = {}

for k in kernels:
    model = SVC(kernel=k).fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    results[k] = accuracy_score(y_test, preds)

# Two-column summary table: one row per kernel, in the order they were tried.
pd.DataFrame(
    {
        "Kernel": list(results.keys()),
        "Accuracy": list(results.values()),
    }
)

### Strengths & Weaknesses of SVM

#### Strengths

* Effective for high-dimensional biomedical data
* Handles non-linear drug-response patterns
* Strong generalization with proper kernels

#### Weaknesses

* Sensitive to hyperparameters
* Less interpretable than logistic regression
* Computationally expensive for large datasets

### Real-World Pharmaceutical Implications

* Early drug response prediction
* Reduced failed clinical trials
* Supports personalized medicine
* Improves patient safety and treatment outcomes