In [11]:
from sklearn.datasets import fetch_openml
import pandas as pd
from xgboost import XGBClassifier

adult = fetch_openml('adult', version=2)
df = adult.frame
df.info()

# Create the output class

# Output class is 'class' column
df['class'] = df['class'].map({'<=50K' : 0, '>50K' : 1})

# Create one hot encoding for workclass

ohe_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
     'relationship', 'race', 'sex', 'native-country' ]

def create_ohe_columns(df, columns):
    for column in columns:
        ohe = pd.get_dummies(df[column], prefix=column, drop_first=True,
                             dummy_na=True)

        if isinstance(ohe.columns, pd.CategoricalIndex):
            ohe.columns = ohe.columns.astype(object)    # becomes array of Python objects
            ohe.columns = pd.Index(ohe.columns)         # plain Index

        # Replace NaN column label(s)
        ohe.columns = ohe.columns.where(~ohe.columns.isna(), column + 'missing')

        # (optional) ensure they're strings
        ohe.columns = pd.Index(map(str, ohe.columns))

        # Join OHE back with the rest of the columns
        df = df.join(ohe, how='left')

    # Remove all categorical columns
    new_columns = [col for col in df.columns if col not in columns]
    df = df[new_columns]

    return df
    
# df = create_ohe_columns(df, ohe_columns)

# new_columns = [col for col in df.columns if col != 'education-num']
# df = df[new_columns]

# print(df.columns)
# df.info()

df.iloc[1]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             48842 non-null  int64   
 1   workclass       46043 non-null  category
 2   fnlwgt          48842 non-null  int64   
 3   education       48842 non-null  category
 4   education-num   48842 non-null  int64   
 5   marital-status  48842 non-null  category
 6   occupation      46033 non-null  category
 7   relationship    48842 non-null  category
 8   race            48842 non-null  category
 9   sex             48842 non-null  category
 10  capital-gain    48842 non-null  int64   
 11  capital-loss    48842 non-null  int64   
 12  hours-per-week  48842 non-null  int64   
 13  native-country  47985 non-null  category
 14  class           48842 non-null  category
dtypes: category(9), int64(6)
memory usage: 2.7 MB


age                               38
workclass                    Private
fnlwgt                         89814
education                    HS-grad
education-num                      9
marital-status    Married-civ-spouse
occupation           Farming-fishing
relationship                 Husband
race                           White
sex                             Male
capital-gain                       0
capital-loss                       0
hours-per-week                    50
native-country         United-States
class                              0
Name: 1, dtype: object

In [9]:
df

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,class,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,native-country_nan
0,25,226802,0,0,40,0,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
1,38,89814,0,0,50,0,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
2,28,336951,0,0,40,1,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,44,160323,7688,0,40,1,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
4,18,103497,0,0,30,0,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,0,0,38,0,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
48838,40,154374,0,0,40,1,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
48839,58,151910,0,0,40,0,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
48840,22,201490,0,0,20,0,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import mlflow
from mlflow.models import infer_signature

y = df['class']
X = df[[col for col in df.columns if col != 'class']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Define the model hyperparameters
params = {
    'n_estimators' : 2,
    'max_depth' : 2, 
    'learning_rate' : 1, 
    'objective' : 'binary:logistic',
    'eval_metric' : 'logloss',
}

bst = XGBClassifier(**params)

bst.fit(X_train, y_train)

y_pred = bst.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Configure MLFlow
mlflow.set_tracking_uri(uri='http://127.0.0.1:8082')

# Create a new MLflow Experiment
mlflow.set_experiment("Adult income classifier")

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the loss metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1', f1)

    # Infer the model signature
    signature = infer_signature(X_train, bst.predict(X_train))

    # Log the model, which inherits the parameters and metric
    model_info = mlflow.sklearn.log_model(
        sk_model=bst,
        name="adult_income",
        signature=signature,
        input_example=X_train,
        registered_model_name="tracking-quickstart",
    )

    # Set a tag that we can use to remind ourselves what this model was for
    mlflow.set_logged_model_tags(
        model_info.model_id, {"Training Info": "Basic XGBoost classifier for adult income"}
    )

Registered model 'tracking-quickstart' already exists. Creating a new version of this model...
2025/10/28 17:56:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-quickstart, version 14
Created version '14' of model 'tracking-quickstart'.


🏃 View run abrasive-ram-233 at: http://127.0.0.1:8082/#/experiments/673422008185851218/runs/9a3fadd608054241b7d783bb00dcc97d
🧪 View experiment at: http://127.0.0.1:8082/#/experiments/673422008185851218


In [5]:
X_test

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,native-country_nan
1172,44,188278,0,0,45,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
14157,38,125645,0,0,50,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
32,24,188274,0,0,50,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
8832,24,152724,0,0,40,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
40647,26,211424,0,0,40,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6567,50,24790,0,0,40,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
14331,38,52263,0,0,40,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
24043,18,423024,0,0,20,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
48754,41,160467,1506,0,40,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [1]:
import pandas as pd

In [6]:
df1 = pd.DataFrame(data={'A': [1,1,1], 'B': [2,2,2]})
df2 = pd.DataFrame(data={'C': [3,3,3], 'D': [4,4,4]})
df1.merge(df2, left_index=True, right_index=True)

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,1,2,3,4
2,1,2,3,4


In [None]:
s1 = pd.Series([1,1,1,1])
s2 = pd.Series([2,2,2,2])
s1.name = 'A'
s2.name = 'B'

pd.merge(s1, s2, left_index=True, right_index=True)

ValueError: Cannot merge a Series without a name