SVM Classification Tutorial

In [320]:
from ucimlrepo import fetch_ucirepo
income=fetch_ucirepo(id=2)
x=income.data.features
x["target"]=income.data.targets
income.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age,Feature,Integer,Age,,,no
1,workclass,Feature,Categorical,Income,"Private, Self-emp-not-inc, Self-emp-inc, Feder...",,yes
2,fnlwgt,Feature,Integer,,,,no
3,education,Feature,Categorical,Education Level,"Bachelors, Some-college, 11th, HS-grad, Prof-...",,no
4,education-num,Feature,Integer,Education Level,,,no
5,marital-status,Feature,Categorical,Other,"Married-civ-spouse, Divorced, Never-married, S...",,no
6,occupation,Feature,Categorical,Other,"Tech-support, Craft-repair, Other-service, Sal...",,yes
7,relationship,Feature,Categorical,Other,"Wife, Own-child, Husband, Not-in-family, Other...",,no
8,race,Feature,Categorical,Race,"White, Asian-Pac-Islander, Amer-Indian-Eskimo,...",,no
9,sex,Feature,Binary,Sex,"Female, Male.",,no


In [321]:
data=x

EDA

In [322]:
data["target"].unique()

array(['<=50K', '>50K', '<=50K.', '>50K.'], dtype=object)

In [323]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  target          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [324]:
data.shape

(48842, 15)

In [325]:
data.isna().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
target              0
dtype: int64

In [326]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [327]:
for col in data.select_dtypes(["object"]).columns:
    print(col,data[col].unique())

workclass ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked' nan]
education ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']
marital-status ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
occupation ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv' nan]
relationship ['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']
race ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
sex ['Male' 'Female']
native-country ['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' '

In [309]:
# clearly except for sex and target all are one hot encodings

In [328]:
data["target"]=data["target"].map({'<=50K':0,'<=50K.':0,'>50K':1,'>50K.':1})

In [329]:
data["target"].value_counts()

target
0    37155
1    11687
Name: count, dtype: int64

In [313]:
# our data is imabalanced this might be an issue, but let's proceed

Preprocessing

In [413]:
from sklearn.model_selection import train_test_split
xT,xt,yT,yt=train_test_split(data.drop("target",axis=1),data["target"],random_state=1,stratify=data["target"],test_size=0.2)
catcols=xT.select_dtypes(["object"]).columns.to_list()
numcols=xT.select_dtypes(["number"]).columns.to_list()

In [414]:
def todataframe(transformeddata,newfeaturenames):
    return pd.DataFrame(transformeddata, columns=newfeaturenames)

In [415]:
from sklearn.impute import SimpleImputer
catimputer=SimpleImputer(strategy="constant",fill_value="?")
numimputer=SimpleImputer(strategy="constant",fill_value=-1)

In [416]:
from sklearn.preprocessing import OneHotEncoder
onehot=OneHotEncoder(sparse_output=False, drop=None, handle_unknown='ignore')

In [417]:
from sklearn.preprocessing import MinMaxScaler
minmax=MinMaxScaler()

In [418]:
from sklearn.compose import ColumnTransformer

imputing = ColumnTransformer([
    ("cat_imputer",catimputer,catcols),
    ("num_imputer",numimputer,numcols)
])

preprocessing = ColumnTransformer([
    ("one_hot",onehot,catcols),
    ("min_max",minmax,numcols)
])

In [419]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
preprocessing_pipeline=Pipeline([
    ("step1",imputing),
    ("fix_names_1", FunctionTransformer(lambda x: todataframe(x,catcols+numcols))),
    ("step2",preprocessing),
    ("fix_names_2", FunctionTransformer(lambda x: todataframe(x,list(preprocessing.named_transformers_["one_hot"].get_feature_names_out(catcols))+numcols)))
])

In [None]:
xT=preprocessing_pipeline.fit_transform(xT)
xt=preprocessing_pipeline.transform(xt)

Plain SVC

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=1)
svm_model.fit(xT, yT)
yt_pred = svm_model.predict(xt)
print("Accuracy:", accuracy_score(yt, yt_pred))
print(classification_report(yt, yt_pred))

Accuracy: 0.8464530658204524
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      7431
           1       0.73      0.56      0.64      2338

    accuracy                           0.85      9769
   macro avg       0.80      0.75      0.77      9769
weighted avg       0.84      0.85      0.84      9769



Class Weight Balanced SVC

In [424]:
svm_model_balanced = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=1, class_weight="balanced")
svm_model_balanced.fit(xT, yT)
yt_pred = svm_model_balanced.predict(xt)
print("Accuracy:", accuracy_score(yt, yt_pred))
print(classification_report(yt, yt_pred))

Accuracy: 0.7843177397891289
              precision    recall  f1-score   support

           0       0.95      0.76      0.84      7431
           1       0.53      0.86      0.66      2338

    accuracy                           0.78      9769
   macro avg       0.74      0.81      0.75      9769
weighted avg       0.85      0.78      0.80      9769



Under Sampling SVC

In [None]:
train_df = pd.DataFrame(xT)
train_df["target"] = yT.values if hasattr(yT, 'values') else yT
class_counts = train_df["target"].value_counts()
minority_class = class_counts.idxmin()
majority_class = class_counts.idxmax()
minority_df = train_df[train_df["target"] == minority_class]
majority_df = train_df[train_df["target"] == majority_class].sample(n=len(minority_df), random_state=42)
balanced_df = pd.concat([minority_df, majority_df]).sample(frac=1, random_state=42)

xT_balanced = balanced_df.drop("target", axis=1)
yT_balanced = balanced_df["target"]

svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=1)
svm_model.fit(xT_balanced, yT_balanced)
yt_pred = svm_model.predict(xt)
print("Accuracy:", accuracy_score(yt, yt_pred))
print(classification_report(yt, yt_pred))

Accuracy: 0.7758214760978606
              precision    recall  f1-score   support

           0       0.95      0.75      0.84      7431
           1       0.52      0.87      0.65      2338

    accuracy                           0.78      9769
   macro avg       0.73      0.81      0.74      9769
weighted avg       0.84      0.78      0.79      9769



Plain SVC with Grid Search

In [427]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}
svm = SVC()

grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='f1',   # or 'f1', 'roc_auc', etc.
    cv=5,                 # 5-fold cross-validation
    verbose=1,
    n_jobs=-1             # Use all CPU cores
)
grid_search.fit(xT, yT)
best_model = grid_search.best_estimator_
yt_pred = best_model.predict(xt)
print("Accuracy:", accuracy_score(yt, yt_pred))
print(classification_report(yt, yt_pred))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Accuracy: 0.8555635172484389
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      7431
           1       0.76      0.58      0.66      2338

    accuracy                           0.86      9769
   macro avg       0.82      0.76      0.78      9769
weighted avg       0.85      0.86      0.85      9769



In [None]:
SVM Tutorial Done 