In [1]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np

adult = fetch_ucirepo(id=2)
X = adult.data.features
y = adult.data.targets

In [2]:
X1= X.copy()

imputer = SimpleImputer(strategy='most_frequent')
X1[['workclass', 'occupation', 'native-country']] = imputer.fit_transform(X1[['workclass', 'occupation', 'native-country']])

le = LabelEncoder()
X1['workclass'] = le.fit_transform(X1['workclass'])
X1['education'] = le.fit_transform(X1['education'])
X1['marital-status'] = le.fit_transform(X1['marital-status'])
X1['occupation'] = le.fit_transform(X1['occupation'])
X1['relationship'] = le.fit_transform(X1['relationship'])
X1['race'] = le.fit_transform(X1['race'])
X1['sex'] = le.fit_transform(X1['sex'])
X1['native-country'] = le.fit_transform(X1['native-country'])

y1 = y.copy()
y1['income'].replace('<=50K', 0, inplace=True)
y1['income'].replace('<=50K.', 0, inplace=True)
y1['income'].replace('>50K', 1, inplace=True)
y1['income'].replace('>50K.', 1, inplace=True)

In [3]:
scaler = StandardScaler()
X1 = scaler.fit_transform(X1)
y2 = y1.values.ravel()
X_train, X_test, y_train, y_test = train_test_split(X1, y2, test_size=0.3, random_state=1, stratify=y2)
X_subset, _, y_subset, _ = train_test_split(X1, y2, test_size=0.7, random_state=42, stratify=y2)

In [4]:
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification accuracy (Adult test set, SVC): {accuracy:.2f}\n")
print(f"Confusion matrix (Adult test set, SVC):\n{confusion_matrix(y_test, y_pred)}\n")
print(f"Classification report (Adult test set, SVC):\n{classification_report(y_test, y_pred)}")

Classification accuracy (Adult test set, SVC): 0.85

Confusion matrix (Adult test set, SVC):
[[10529   618]
 [ 1559  1947]]

Classification report (Adult test set, SVC):
              precision    recall  f1-score   support

           0       0.87      0.94      0.91     11147
           1       0.76      0.56      0.64      3506

    accuracy                           0.85     14653
   macro avg       0.82      0.75      0.77     14653
weighted avg       0.84      0.85      0.84     14653



In [5]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)

cm_sum = np.zeros((2,2))
total_score = 0
f1_scores = []
precision_scores = []
recall_scores = [] 

for train, test in kf.split(X_subset, y_subset):
    SVM_clf = SVC(C=1.0, kernel='rbf', gamma='scale', degree=3, probability=True)
    SVM_clf.fit(X_subset[train], y_subset[train])
    classifier_score = SVM_clf.score(X_subset[test], y_subset[test])
    total_score += classifier_score

    y_pred_fold = SVM_clf.predict(X_subset[test])
    cm_sum += confusion_matrix(y_subset[test], y_pred_fold)

    f1 = f1_score(y_subset[test], y_pred_fold, average='weighted')
    f1_scores.append(f1)

    precision = precision_score(y_subset[test], y_pred_fold, average='weighted')
    precision_scores.append(precision)

    recall = recall_score(y_subset[test], y_pred_fold, average='weighted')
    recall_scores.append(recall)

average_score = total_score / kf.get_n_splits() 
cm_avg = cm_sum / kf.get_n_splits()
cm_avg_int = cm_avg.astype(int)
  
print(f"Classification accuracy (Adult set, SVC): {average_score:.2f}\n")
print(f"Confusion matrix (Adult set, SVC):\n{cm_avg_int}\n")
print(f"F1 score (Adult set, SVC): {np.mean(f1_scores):.2f}\n")
print(f"Precision score (Adult set, SVC): {np.mean(precision_scores):.2f}\n")
print(f"Recall score (Adult set, SVC): {np.mean(recall_scores):.2f}")

Classification accuracy (Adult set, SVC): 0.85

Confusion matrix (Adult set, SVC):
[[2108  120]
 [ 325  375]]

F1 score (Adult set, SVC): 0.84

Precision score (Adult set, SVC): 0.84

Recall score (Adult set, SVC): 0.85
