In [126]:
# load library

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score 

# Load trainset
train_set_df = pd.read_csv('data/train_set.csv')

# Load testset
test_set_df = pd.read_csv('data/test_set.csv')


In [128]:
# Inspect dataframe
train_set_df.head()

Unnamed: 0,gender,own_car,own_realty,annual_income,work_phone,phone,email,family_size,age,years_employed,...,occupation_Managers,occupation_Medicine staff,occupation_Others,occupation_Private service staff,occupation_Realty agents,occupation_Sales staff,occupation_Secretaries,occupation_Security staff,occupation_Waiters/barmen staff,credit_status
0,0,0,0,67500.0,0,1,0,3.0,0.204082,1.0,...,False,False,False,False,False,True,False,False,False,Good
1,0,0,1,675000.0,0,1,0,3.0,0.469388,3.8,...,False,False,False,False,False,True,False,False,False,Good
2,0,0,0,180000.0,0,0,0,2.0,0.571429,29.5,...,False,True,False,False,False,False,False,False,False,Good
3,0,1,0,387000.0,0,0,0,3.0,0.326531,7.1,...,True,False,False,False,False,False,False,False,False,Good
4,0,1,0,337500.0,1,0,1,1.0,0.122449,0.8,...,False,False,True,False,False,False,False,False,False,Good


In [130]:
# Inspect dataframe
test_set_df.head()

Unnamed: 0,id,gender,own_car,own_realty,annual_income,work_phone,phone,email,family_size,age,...,occupation_Managers,occupation_Medicine staff,occupation_Others,occupation_Private service staff,occupation_Realty agents,occupation_Sales staff,occupation_Secretaries,occupation_Security staff,occupation_Waiters/barmen staff,credit_status
0,5052720,0,0,0,126000.0,0,0,0,2.0,0.795918,...,False,False,True,False,False,False,False,False,False,Good
1,5087861,0,0,1,112500.0,0,0,0,3.0,0.22449,...,False,False,False,True,False,False,False,False,False,Good
2,5068206,0,0,0,166500.0,0,1,0,2.0,0.836735,...,False,False,True,False,False,False,False,False,False,Good
3,5137255,1,1,0,135000.0,0,0,0,1.0,0.612245,...,False,False,False,False,False,False,False,False,False,Bad
4,5023163,0,0,1,135000.0,0,1,0,1.0,0.795918,...,False,False,True,False,False,False,False,False,False,Good


In [136]:
# Separate features from credit_label

X_train = train_set_df.drop('credit_status', axis=1)
y_train = train_set_df['credit_status']

X_test = test_set_df.drop('credit_status', axis=1)
y_test = test_set_df['credit_status']


In [None]:
# List of kernels to compare
kernels = ['linear', 'rbf', 'poly', 'sigmoid']

# Dictionary to store results
f1_scores = {}

# Train and evaluate SVM for each kernel
for kernel in kernels:
    print(f" Training SVM with kernel = '{kernel}'")
    model = SVC(kernel = kernel, probability = True, random_state = 42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred)
    f1_scores[kernel] = score
    print(f" F1 Score ({kernel}): {score:.4f}\n")

# Summary
print(" F1 Score Comparison:")
for kernel, score in f1_scores.items():
    print(f" - {kernel}: {score:.4f}")
    

 Training SVM with kernel = 'linear'


In [115]:
# Train and evaluate SVM for each C value

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}

grid_search = GridSearchCV(
    estimator=SVC(kernel='rbf', probability=True, random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)
print(" Best C:", grid_search.best_params_['C'])

Fitting 3 folds for each of 5 candidates, totalling 15 fits


python(50907) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(50908) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(50909) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(50910) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(50911) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(50912) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(50913) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(50914) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV] END ..............................................C=0.1; total time=10.8min
[CV] END ..............................................C=0.1; total time= 9.8min
[CV] END ...............................................C=10; total time=21.7min
[CV] END .............................................C=0.01; total time=10.2min
[CV] END ...............................................C=10; total time=21.5min
[CV] END ................................................C=1; total time= 9.3min
[CV] END ...............................................C=10; total time=21.4min
[CV] END ................................................C=1; total time= 9.1min
[CV] END ................................................C=1; total time=21.6min


KeyboardInterrupt: 

In [118]:
# Train final SVM model using best C and kernel
svm_final = SVC(kernel='rbf', C=0.1, probability=True, random_state=42)
svm_final.fit(X_train, y_train)

# Predict on test data
y_pred = svm_final.predict(X_test)

# Evaluate model
print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))
print(" Final F1 Score:", f1_score(y_test, y_pred))

📊 Confusion Matrix:
 [[2019 2370]
 [1439 2950]]

📄 Classification Report:
               precision    recall  f1-score   support

         0.0       0.58      0.46      0.51      4389
         1.0       0.55      0.67      0.61      4389

    accuracy                           0.57      8778
   macro avg       0.57      0.57      0.56      8778
weighted avg       0.57      0.57      0.56      8778

🏁 Final F1 Score: 0.6076835925430013
