In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import set_config

import shap

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Download the Titanic dataset (OpenML, version 1) and unpack it into features X and
# target y as pandas objects.
# NOTE: the `parser` keyword is only accepted by scikit-learn >= 1.2; on an older
# release this call fails with "unexpected keyword argument 'parser'".
X, y = fetch_openml(
    name="titanic",
    version=1,
    parser="pandas",
    as_frame=True,
    return_X_y=True,
)

TypeError: fetch_openml() got an unexpected keyword argument 'parser'

In [None]:
# Hold out a test set. Stratifying on y keeps the class proportions the same in both
# splits, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    stratify=y,
)

# Peek at the first training rows.
X_train.head()

In [None]:
# Columns routed to the numerical preprocessing branch.
num_features = [
    "age",
    "fare",
]

# Columns routed to the categorical (one-hot) preprocessing branch.
cat_features = [
    "embarked",
    "sex",
    "pclass",
]

In [None]:
# Ask scikit-learn to return pandas DataFrames (instead of NumPy arrays) from every
# transformer, so column names survive preprocessing. This is a global setting and,
# like `sparse_output` below, needs scikit-learn >= 1.2.
set_config(transform_output="pandas")

# Numerical branch: fill missing values (SimpleImputer defaults to the column mean),
# then standardise.
num_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Categorical branch: dense one-hot encoding (pandas output cannot hold sparse
# matrices). Two-category features keep a single column, and categories unseen
# during fit are encoded as all zeros instead of raising.
cat_encoder = OneHotEncoder(
    sparse_output=False,
    drop="if_binary",
    handle_unknown="ignore",
)

# Column-wise preprocessing of the data. `transformers` is given as a list of
# (name, transformer, columns) tuples, which is the documented form of the argument.
# verbose_feature_names_out=False keeps output column names free of the
# "numerical__" / "categorical__" prefixes.
transformer = ColumnTransformer(
    [
        ('numerical', num_pipe, num_features),
        ("categorical", cat_encoder, cat_features),
    ],
    verbose_feature_names_out=False,
)

In [None]:
# Random-forest classifier that forms the final step of the model pipeline.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=123,
    verbose=0,
)

# Full model: data preprocessing (`transformer`) followed by the random forest.
rf_pipeline = Pipeline(steps=[
    ('dataprep', transformer),
    ('rf_clf', forest),
])

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
rf_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test split with the fitted pipeline.
y_pred = rf_pipeline.predict(X=X_test)

In [None]:
# Model performance on the test split: per-class precision, recall and F1.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Pull the fitted RF classifier out of the pipeline: it is the estimator stored in
# the last (name, estimator) step.
clf = rf_pipeline.steps[-1][1]

In [None]:
# Because the preprocessing step hands a DataFrame to the classifier, the classifier
# keeps the names of the features it was trained on, even inside a Pipeline.
# Print those names first, then the matching importances.
for fitted_attribute in (clf.feature_names_in_, clf.feature_importances_):
    print(fitted_attribute)

In [None]:
# Pair each feature name with its importance ...
data = [
    (feature, importance)
    for feature, importance in zip(clf.feature_names_in_, clf.feature_importances_)
]

# ... and tabulate the pairs, most important feature first.
importance_table = pd.DataFrame(data, columns=['Feature', 'Importance'])
df_importances = importance_table.sort_values(by='Importance', ascending=False)
df_importances

In [None]:
# Horizontal bar chart of the feature importances. The legend is dropped because the
# x-axis label already names the plotted quantity.
ax = df_importances.plot.barh(x='Feature', y='Importance', legend=False)
ax.set(
    title='Random forest feature importances',
    xlabel='Importance',
    ylabel='Feature',
)
# barh draws the first row at the bottom; the frame is sorted in descending order,
# so flip the y axis to show the most important feature at the top.
ax.invert_yaxis()