In [31]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

In [17]:
# Read the raw table from disk; the path is relative to the notebook's folder.
DATA_PATH = '../data/adult.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [18]:
# '?' is the placeholder for a missing value in this file: turn it into a real
# NA marker, then discard every row that contains at least one NA.
df = df.replace('?', pd.NA).dropna()

In [19]:
# Encode the target as 0/1: 1 for the ">50K" label, 0 for everything else.
# The dtype guard keeps this cell idempotent: once the column is already
# integer-coded, comparing it with ">50K" again would silently turn every
# label into 0.
if not pd.api.types.is_integer_dtype(df['income']):
    df['income'] = (df['income'] == ">50K").astype('int64')
print(df['income'].value_counts(normalize=True))

# Independent working copy used by the feature-preparation cells.
new_df = df.copy()

income
0    0.751078
1    0.248922
Name: proportion, dtype: float64


In [20]:
# Keep the label as its own Series, then remove that column from the
# feature frame (in place) so it cannot be used as a predictor.
target = new_df['income']
new_df.drop(columns=['income'], inplace=True)

In [21]:
# Display the feature frame to check its current columns and row count.
new_df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States


In [22]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both parts; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    new_df,
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=42,
)

In [23]:
# Group the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numeric.
cat_cols = new_df.select_dtypes(include=['object']).columns
num_cols = new_df.select_dtypes(include=['int64', 'float64']).columns

In [24]:
# Scale the numeric columns. The scaler is fitted on the training split only
# and then reused for the test split, so no test statistics leak into it.
scaler = MinMaxScaler()

X_train_num = scaler.fit_transform(X_train.loc[:, num_cols])
X_test_num = scaler.transform(X_test.loc[:, num_cols])

In [26]:
# One-hot encode the categorical columns into a dense array. The encoder is
# fitted on the training split only; handle_unknown='ignore' is the setting
# for categories that appear only in the test split.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

X_train_cat = ohe.fit_transform(X_train.loc[:, cat_cols])
X_test_cat = ohe.transform(X_test.loc[:, cat_cols])

In [27]:
# Final design matrices: scaled numeric columns first, one-hot columns after.
X_train_final = np.hstack([X_train_num, X_train_cat])
X_test_final = np.hstack([X_test_num, X_test_cat])

### Balance data

In [29]:
# Resample the training split only; the test split is never passed to SMOTE.
smote = SMOTE(random_state=42)

X_train_balanced, y_train_balanced = smote.fit_resample(X_train_final, y_train)

# Print the per-class counts after resampling.
balanced_class_counts = pd.Series(y_train_balanced).value_counts()
print(balanced_class_counts)

income
0    18123
1    18123
Name: count, dtype: int64


### Baseline model

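1. XGBoost
2. TODO: add a second baseline model (e.g. `LogisticRegression` or `RandomForestClassifier`, both already imported)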

### 1 Модель XGBoost

In [30]:
def calc_metrics(y_true, y_pred, name, y_score=None):
    """Print the main binary-classification metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted hard class labels; used for accuracy, recall, precision
        and F1.
    name : str
        Model name shown in the report header.
    y_score : array-like, optional
        Continuous scores for the positive class, for example
        ``model.predict_proba(X)[:, 1]``. When given, ROC AUC is computed
        from these scores. When omitted, ``y_pred`` is used instead, which
        keeps existing three-argument calls working but evaluates the ROC
        curve at a single threshold only.

    Returns
    -------
    None
        The metrics are printed, rounded to four decimals.
    """
    # ROC AUC is a ranking metric: it needs scores, not thresholded labels,
    # to trace the whole curve.
    auc_input = y_pred if y_score is None else y_score

    print(f'Посчитанные метрики для {name}: ')
    print("Accuracy: ", round(accuracy_score(y_true, y_pred), 4))
    print("Recall: ", round(recall_score(y_true, y_pred), 4))
    print("Precision: ", round(precision_score(y_true, y_pred), 4))
    print("F1: ", round(f1_score(y_true, y_pred), 4))
    print("ROC/AUC: ", round(roc_auc_score(y_true, auc_input), 4))

In [34]:
# Baseline: XGBoost with log-loss as evaluation metric and a fixed seed.
model_XGBoost = XGBClassifier(random_state=42, eval_metric='logloss')

# Train on the resampled training data.
model_XGBoost.fit(X_train_balanced, y_train_balanced)

# Hard class predictions for the test split, which was not resampled.
y_pred = model_XGBoost.predict(X_test_final)

In [35]:
# Report test-set metrics for the XGBoost baseline.
# NOTE(review): y_pred comes from predict(), i.e. hard class labels, so the
# ROC/AUC figure printed here is computed from labels, not from probabilities.
calc_metrics(y_true=y_test, y_pred=y_pred, name="XGBoost")

Посчитанные метрики для XGBoost: 
Accuracy:  0.8581
Recall:  0.7736
Precision:  0.6925
F1:  0.7308
ROC/AUC:  0.8299
