In [3]:
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
adult = fetch_ucirepo(id=2) 
data = adult.data.original
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
data.dropna(inplace=True)

In [6]:
X = data.drop(columns='income')
y = data['income']

In [7]:
categorical_columns = X.select_dtypes(include=['object']).columns
numeric_columns = X.select_dtypes(exclude=['object']).columns

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False))
])

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

X_encoded = pd.DataFrame(preprocessor.fit_transform(X), columns=numeric_columns.tolist() + preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out().tolist())
X_encoded.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.026501,-1.062924,1.132729,0.144629,-0.217456,-0.048943,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.837781,-1.008031,1.132729,-0.145735,-0.217456,-2.251188,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.047252,0.245517,-0.424726,-0.145735,-0.217456,-0.048943,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.059039,0.426206,-1.203454,-0.145735,-0.217456,-0.048943,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.78478,1.408394,1.132729,-0.145735,-0.217456,-0.048943,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=45)

In [9]:
def show_metrics(y_pred, X_test, y_test):
    print('Classification report:')
    report = classification_report(y_test, y_pred, zero_division=1)
    print(report)

    print('Confusion matrix:')
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

In [10]:
dt = DecisionTreeClassifier(random_state=45)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

       <=50K       0.62      0.60      0.61      7442
      <=50K.       0.29      0.30      0.30      3396
        >50K       0.42      0.41      0.42      2368
       >50K.       0.18      0.19      0.18      1081

    accuracy                           0.47     14287
   macro avg       0.38      0.38      0.38     14287
weighted avg       0.47      0.47      0.47     14287

Confusion matrix:
[[4472 2076  604  290]
 [1918 1031  286  161]
 [ 580  295  974  519]
 [ 275  156  442  208]]


In [11]:
svm = SVC(random_state=45)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

       <=50K       0.60      0.94      0.73      7442
      <=50K.       1.00      0.00      0.00      3396
        >50K       0.53      0.58      0.55      2368
       >50K.       1.00      0.00      0.00      1081

    accuracy                           0.59     14287
   macro avg       0.78      0.38      0.32     14287
weighted avg       0.71      0.59      0.47     14287

Confusion matrix:
[[7032    0  410    0]
 [3197    1  198    0]
 [1006    0 1362    0]
 [ 469    0  612    0]]
