In [None]:
# load library

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score 

# Load trainset
train_set_df = pd.read_csv('../data/processed/train_set.csv')

# Load testset
test_set_df = pd.read_csv('../data/processed/test_set.csv')


In [15]:
# Inspect dataframe
train_set_df.head()

Unnamed: 0,gender,own_car,own_realty,annual_income,work_phone,phone,email,family_size,age,years_employed,...,occupation_Managers,occupation_Medicine staff,occupation_Others,occupation_Private service staff,occupation_Realty agents,occupation_Sales staff,occupation_Secretaries,occupation_Security staff,occupation_Waiters/barmen staff,credit_status
0,0,0,0,67500.0,0,1,0,3.0,0.204082,1.0,...,False,False,False,False,False,True,False,False,False,Good
1,0,0,1,675000.0,0,1,0,3.0,0.469388,3.8,...,False,False,False,False,False,True,False,False,False,Good
2,0,0,0,180000.0,0,0,0,2.0,0.571429,29.5,...,False,True,False,False,False,False,False,False,False,Good
3,0,1,0,387000.0,0,0,0,3.0,0.326531,7.1,...,True,False,False,False,False,False,False,False,False,Good
4,0,1,0,337500.0,1,0,1,1.0,0.122449,0.8,...,False,False,True,False,False,False,False,False,False,Good


In [16]:
# Inspect dataframe
test_set_df.head()

Unnamed: 0,id,gender,own_car,own_realty,annual_income,work_phone,phone,email,family_size,age,...,occupation_Managers,occupation_Medicine staff,occupation_Others,occupation_Private service staff,occupation_Realty agents,occupation_Sales staff,occupation_Secretaries,occupation_Security staff,occupation_Waiters/barmen staff,credit_status
0,5052720,0,0,0,126000.0,0,0,0,2.0,0.795918,...,False,False,True,False,False,False,False,False,False,Good
1,5087861,0,0,1,112500.0,0,0,0,3.0,0.22449,...,False,False,False,True,False,False,False,False,False,Good
2,5068206,0,0,0,166500.0,0,1,0,2.0,0.836735,...,False,False,True,False,False,False,False,False,False,Good
3,5137255,1,1,0,135000.0,0,0,0,1.0,0.612245,...,False,False,False,False,False,False,False,False,False,Bad
4,5023163,0,0,1,135000.0,0,1,0,1.0,0.795918,...,False,False,True,False,False,False,False,False,False,Good


In [17]:
# Separate features from credit_label

X_train = train_set_df.drop(['credit_status'], axis=1)
y_train = train_set_df['credit_status']

X_test = test_set_df.drop(['credit_status', 'id'], axis=1)
y_test = test_set_df['credit_status']


In [18]:
# Convert string labels to numeric
y_train = y_train.map({'Bad': 0, 'Good': 1})
y_test = y_test.map({'Bad': 0, 'Good': 1})

# List of kernels to compare
kernels = ['linear', 'rbf', 'poly', 'sigmoid']

# Dictionary to store results
f1_scores = {}

# Train and evaluate SVM for each kernel
for kernel in kernels:
    print(f" Training SVM with kernel = '{kernel}'")
    model = SVC(kernel = kernel, probability = True, random_state = 42, max_iter=5000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred)
    f1_scores[kernel] = score
    print(f" F1 Score ({kernel}): {score:.4f}\n")

# Summary
print(" F1 Score Comparison:")
for kernel, score in f1_scores.items():
    print(f" - {kernel}: {score:.4f}")
    

 Training SVM with kernel = 'linear'




 F1 Score (linear): 0.6051

 Training SVM with kernel = 'rbf'




 F1 Score (rbf): 0.8592

 Training SVM with kernel = 'poly'




 F1 Score (poly): 0.9535

 Training SVM with kernel = 'sigmoid'




 F1 Score (sigmoid): 0.8810

 F1 Score Comparison:
 - linear: 0.6051
 - rbf: 0.8592
 - poly: 0.9535
 - sigmoid: 0.8810


In [19]:
# Train and evaluate SVM with poly kernel for each C value

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}

grid_search = GridSearchCV(
    estimator = SVC(kernel = 'poly', probability = True, random_state = 42, max_iter = 1000),
    param_grid = param_grid,
    cv = 3,
    scoring = 'f1',
    n_jobs = -1,
    verbose = 2
)

grid_search.fit(X_train, y_train)
print(" Best C:", grid_search.best_params_['C'])

Fitting 3 folds for each of 5 candidates, totalling 15 fits




[CV] END ................................................C=1; total time=  29.9s
[CV] END ................................................C=1; total time=  30.6s




[CV] END ..............................................C=0.1; total time=  38.9s
[CV] END ..............................................C=0.1; total time=  39.1s




[CV] END ..............................................C=0.1; total time=  41.7s




[CV] END .............................................C=0.01; total time=  46.2s
[CV] END .............................................C=0.01; total time=  46.4s
[CV] END .............................................C=0.01; total time=  46.6s




[CV] END ................................................C=1; total time=  31.7s
[CV] END ...............................................C=10; total time=  31.2s




[CV] END ...............................................C=10; total time=  30.5s




[CV] END ...............................................C=10; total time=  31.5s
[CV] END ..............................................C=100; total time=  29.5s




[CV] END ..............................................C=100; total time=  27.6s
[CV] END ..............................................C=100; total time=  27.4s
 Best C: 0.01




In [21]:
# Train final SVM model using best C and kernel
svm_final = SVC(kernel = 'poly', C = 0.01, random_state = 42, max_iter = 5000)
svm_final.fit(X_train, y_train)

# Predict on test data
y_pred = svm_final.predict(X_test)

# Evaluate model
print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print(" Final F1 Score:", f1_score(y_test, y_pred))



 Confusion Matrix:
 [[   0  641]
 [   7 6644]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       641
           1       0.91      1.00      0.95      6651

    accuracy                           0.91      7292
   macro avg       0.46      0.50      0.48      7292
weighted avg       0.83      0.91      0.87      7292

Accuracy: 0.9111354909489852
 Final F1 Score: 0.9535017221584385
