# Income Prediction

## Init

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import mlflow

## Configuration

In [None]:
# Input columns, grouped by the kind of preprocessing they receive.
NUMERICAL_FEATURES = [
    "age",
    "education_num",
    "hours_per_week",
    "net_capital",
]
CATEGORICAL_FEATURES = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
]

# Name of the label column.
TARGET = "income"

# Seed reused by the train/test split and by the classifier.
RANDOM_STATE = 42

# Point MLflow at the tracking server and select the experiment to log under.
mlflow.set_tracking_uri("http://localhost:4040")
mlflow.set_experiment("Income prediction")

# Enable autologging for both model flavors, with dataset logging turned off.
for _flavor in (mlflow.sklearn, mlflow.xgboost):
    _flavor.autolog(log_datasets=False)

## Data Ingestion & Preparation

In [None]:
# Load the dataset from disk; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer="../data/income_data_processed.csv")

# Preview the first rows as the cell output.
data.head(n=5)

In [None]:
# Model inputs: numerical columns first, then categorical ones.
feature_columns = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]

# Keep only the input columns plus the label column.
selected_columns = [*feature_columns, TARGET]
features = data[selected_columns]

# Preview the reduced frame as the cell output.
features.head(n=5)

In [None]:
# Split the rows into a training part and a held-out part; the fixed seed
# makes the split reproducible. test_data keeps the label column because
# the evaluation step reads the targets from it.
train_data, test_data = train_test_split(features, random_state=RANDOM_STATE)

# Separate the training rows into model inputs and labels.
train_input, train_output = train_data[feature_columns], train_data[TARGET]

## XGBoost Classification

In [None]:
# Numerical columns are standardized.
numerical_transformer = Pipeline([
    ("scaler", StandardScaler()),
])

# Categorical columns are one-hot encoded. handle_unknown="ignore" makes the
# encoder emit an all-zero row for a category it did not see during fit;
# with the default ("error") predict raises as soon as the held-out split
# contains a value that is absent from the training split.
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns to its transformer. Columns not listed here
# are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer([
    ("numerical", numerical_transformer, NUMERICAL_FEATURES),
    ("categorical", categorical_transformer, CATEGORICAL_FEATURES),
])

# Full model: preprocessing followed by a seeded XGBoost classifier.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(random_state=RANDOM_STATE)),
])

In [None]:
with mlflow.start_run():
    # Fit the preprocessing + classifier pipeline on the training split.
    pipeline.fit(train_input, train_output)

    # Evaluate the fitted pipeline's predict function on the held-out rows;
    # the label is read from the TARGET column of test_data.
    evaluation_settings = {
        "model": pipeline.predict,
        "data": test_data,
        "targets": TARGET,
        "model_type": "classifier",
    }
    mlflow.evaluate(**evaluation_settings)