
# Module 9: Supervised Learning II
## Case Study – 3

### Objective: 
 * Employ SVM from scikit learn for binary classification. 
 * Impact of preprocessing data and hyper parameter search using grid search. 

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score



# Step 1: Load and Explore the Data
# We start by examining the dataset for missing values, data types, and general structure. 
# From the given description, no missing values are present.

# Load the dataset
data = pd.read_csv('College.csv')

# Check basic info
print(data.info())
print(data.isnull().sum())

# Preview the data
print(data.head())

print(data.describe())

# Count of each class in the target
print(data['Private'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Private      777 non-null    object 
 1   Apps         777 non-null    int64  
 2   Accept       777 non-null    int64  
 3   Enroll       777 non-null    int64  
 4   Top10perc    777 non-null    int64  
 5   Top25perc    777 non-null    int64  
 6   F.Undergrad  777 non-null    int64  
 7   P.Undergrad  777 non-null    int64  
 8   Outstate     777 non-null    int64  
 9   Room.Board   777 non-null    int64  
 10  Books        777 non-null    int64  
 11  Personal     777 non-null    int64  
 12  PhD          777 non-null    int64  
 13  Terminal     777 non-null    int64  
 14  S.F.Ratio    777 non-null    float64
 15  perc.alumni  777 non-null    int64  
 16  Expend       777 non-null    int64  
 17  Grad.Rate    777 non-null    int64  
dtypes: float64(1), int64(16), object(1)
memory usage: 

In [39]:
# Step 2: Encode the Target Variable and Split the Data
# Use LabelEncoder to convert the Private column into numerical form and split the data into training and testing sets.

# Encode the target variable
le = LabelEncoder()
data['Private'] = le.fit_transform(data['Private'])  # 'Yes' -> 1, 'No' -> 0

# Split the data into features and target
X = data.drop(columns=['Private'])
y = data['Private']


'''
# Oversampling the Minority Class
#We can use oversampling techniques to generate synthetic samples for the minority class ("No").
#Common methods include:
#Random Oversampling: Duplicates random samples from the minority class.
#SMOTE (Synthetic Minority Oversampling Technique): Generates synthetic samples by interpolating between existing minority class samples.

from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)
'''

'''
# Undersampling the Majority Class
#Undersampling reduces the number of samples in the majority class to match the minority class size. 
#While this can improve balance, it risks losing valuable information
from imblearn.under_sampling import RandomUnderSampler

# Apply undersampling
undersampler = RandomUnderSampler(random_state=42)
X, y = undersampler.fit_resample(X, y)
'''


# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [40]:
# Step 3: Fit a Linear SVM and Observe Accuracy
# We’ll fit a LinearSVC without any preprocessing to establish a baseline accuracy.

# Fit LinearSVC
linear_svc = LinearSVC(max_iter=10000, random_state=42)
linear_svc.fit(X_train, y_train)

# Predict and evaluate
y_pred = linear_svc.predict(X_test)
# Cross-validation accuracy
cv_scores = cross_val_score(linear_svc, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (Linear SVM, no scaling): {cv_scores.mean() * 100:.2f}%")

# Evaluate metrics
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Test Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Cross-Validation Accuracy (Linear SVM, no scaling): 93.44%
Test Accuracy: 92.95%
Precision: 0.95, Recall: 0.94, F1-Score: 0.95
Confusion Matrix:
[[ 42   5]
 [  6 103]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.88        47
           1       0.95      0.94      0.95       109

    accuracy                           0.93       156
   macro avg       0.91      0.92      0.92       156
weighted avg       0.93      0.93      0.93       156



In [41]:
# Step 4: Preprocess the Data with StandardScaler
# Standardize the features and fit the same LinearSVC model again to observe changes in accuracy.

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_scaled = scaler.fit_transform(X)

# Fit LinearSVC with scaled data
linear_svc_scaled = LinearSVC(max_iter=10000, random_state=42)
linear_svc_scaled.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred_scaled = linear_svc_scaled.predict(X_test_scaled)
# Cross-validation accuracy
cv_scores_scaled = cross_val_score(linear_svc_scaled, X_scaled, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (Linear SVM, with scaling): {cv_scores_scaled.mean() * 100:.2f}%")

# Evaluate metrics
precision_scaled, recall_scaled, f1_scaled, _ = precision_recall_fscore_support(y_test, y_pred_scaled, average='binary')
conf_matrix_scaled = confusion_matrix(y_test, y_pred_scaled)

print(f"Test Accuracy (Scaled): {accuracy_score(y_test, y_pred_scaled) * 100:.2f}%")
print(f"Precision: {precision_scaled:.2f}, Recall: {recall_scaled:.2f}, F1-Score: {f1_scaled:.2f}")
print("Confusion Matrix (Scaled):")
print(conf_matrix_scaled)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_scaled))



Cross-Validation Accuracy (Linear SVM, with scaling): 93.31%
Test Accuracy (Scaled): 92.95%
Precision: 0.95, Recall: 0.94, F1-Score: 0.95
Confusion Matrix (Scaled):
[[ 42   5]
 [  6 103]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.88        47
           1       0.95      0.94      0.95       109

    accuracy                           0.93       156
   macro avg       0.91      0.92      0.92       156
weighted avg       0.93      0.93      0.93       156



In [43]:
# Step 5: Hyperparameter Search for Non-Linear SVM
# Perform a grid search to find the best hyperparameters for a non-linear SVM (e.g., RBF kernel).

# Define the SVM model
svm_model = SVC()

# Define parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly']
}

# Perform grid search
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', verbose=0)
grid_search.fit(X_train_scaled, y_train)

# Print best parameters and accuracy
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_ * 100:.2f}%")

# Evaluate the best model on the test data
y_pred_best = best_model.predict(X_test_scaled)
# Cross-validation accuracy
cv_scores_best = cross_val_score(best_model, X_scaled, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (Best Non-Linear SVM): {cv_scores_best.mean() * 100:.2f}%")

# Evaluate metrics
precision_best, recall_best, f1_best, _ = precision_recall_fscore_support(y_test, y_pred_best, average='binary')
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"Test Accuracy (Best Non-Linear SVM): {accuracy_score(y_test, y_pred_best) * 100:.2f}%")
print(f"Precision: {precision_best:.2f}, Recall: {recall_best:.2f}, F1-Score: {f1_best:.2f}")
print("Confusion Matrix (Best Non-Linear SVM):")
print(conf_matrix_best)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))


Best Parameters: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Best Cross-Validation Score: 95.17%
Cross-Validation Accuracy (Best Non-Linear SVM): 94.08%
Test Accuracy (Best Non-Linear SVM): 92.31%
Precision: 0.94, Recall: 0.94, F1-Score: 0.94
Confusion Matrix (Best Non-Linear SVM):
[[ 41   6]
 [  6 103]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87        47
           1       0.94      0.94      0.94       109

    accuracy                           0.92       156
   macro avg       0.91      0.91      0.91       156
weighted avg       0.92      0.92      0.92       156

