In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score 
import warnings
from sklearn.exceptions import ConvergenceWarning

# Load datasets
train_df = pd.read_csv("../data/processed/train_set_SMOTEd.csv")
test_df = pd.read_csv("../data/processed/test_set.csv")


In [9]:
# Define feature columns (all except 'id' and 'credit_status')
feature_columns = [col for col in train_df.columns if col not in ["credit_status", "id"]]

# Split features and target
X_train = train_df[feature_columns]
y_train = LabelEncoder().fit_transform(train_df["credit_status"])
X_test = test_df[feature_columns]
y_test = LabelEncoder().fit_transform(test_df["credit_status"])

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Testing different kernel functions

In [10]:
# Suppress ConvergenceWarnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# List of kernels to compare
kernels = ['linear', 'rbf', 'poly', 'sigmoid']

# Dictionary to store results
f1_scores = {}

# Train and evaluate SVM for each kernel
for kernel in kernels:
    print(f" Training SVM with kernel = '{kernel}'")
    model = SVC(kernel=kernel, probability=False, random_state=42, max_iter=1000)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    score = f1_score(y_test, y_pred)
    f1_scores[kernel] = score
    print(f" F1 Score ({kernel}): {score:.4f}\n")

# Summary
print(" F1 Score Comparison:")
for kernel, score in f1_scores.items():
    print(f" - {kernel}: {score:.4f}")

 Training SVM with kernel = 'linear'
 F1 Score (linear): 0.7705

 Training SVM with kernel = 'rbf'
 F1 Score (rbf): 0.5575

 Training SVM with kernel = 'poly'
 F1 Score (poly): 0.9511

 Training SVM with kernel = 'sigmoid'
 F1 Score (sigmoid): 0.8016

 F1 Score Comparison:
 - linear: 0.7705
 - rbf: 0.5575
 - poly: 0.9511
 - sigmoid: 0.8016


The polynomial kernel yielded the best performance based on F1 score, suggesting it may be better suited to the complexity of this dataset compared to the other kernels tested.

### SVM model with poly kernel

Find the best regularization parameter for the polynomial kernel.

In [11]:
# Train and evaluate SVM with poly kernel for each C value
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}

grid_search = GridSearchCV(
    estimator = SVC(kernel = 'poly', probability = False, random_state = 42, max_iter = 1000),
    param_grid = param_grid,
    cv = 3,
    scoring = 'f1',
    n_jobs = 2,
    verbose = 2
)

grid_search.fit(X_train_scaled, y_train)
print(" Best C:", grid_search.best_params_['C'])

Fitting 3 folds for each of 5 candidates, totalling 15 fits
 Best C: 10


Based on the cross-validation results, we selected C = 10 to train our final SVM baseline model.

In [12]:
# Train final SVM model using best C and kernel
svm_final = SVC(kernel = 'poly', C = 10, random_state = 42, max_iter = 5000)
svm_final.fit(X_train_scaled, y_train);

### Evaluation

In [13]:
# Predict on test data
y_pred = svm_final.predict(X_test_scaled)

# Attach predictions to customer IDs
results_df = test_df[["id"]].copy()
results_df["predicted_credit_status"] = y_pred

# Display results
print(results_df.head())

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

        id  predicted_credit_status
0  5052720                        1
1  5087861                        0
2  5068206                        1
3  5137255                        1
4  5023163                        1
Accuracy: 0.812260010970927

Confusion Matrix:
 [[  76  565]
 [ 804 5847]]

Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.12      0.10       641
           1       0.91      0.88      0.90      6651

    accuracy                           0.81      7292
   macro avg       0.50      0.50      0.50      7292
weighted avg       0.84      0.81      0.83      7292

