# <span style="font-size: 50%;">Level 6 Data Science - Software Engineering</span><br>Topic 8 - Extreme Gradient Boosting (XGBoost)

First, let's install the necessary libraries. Their versions (included in `requirements.txt`) are the ones that we have tested, and we can use `pip install -q` to install them without showing the output.

In [None]:
# Install the pinned dependencies listed in ../requirements.txt.
# `%pip` (rather than `!pip`) targets the environment of the running kernel; `-q` keeps the output quiet.
%pip install -q -r ../requirements.txt

In [None]:
import logging
import warnings

# Silence all Python warnings for the rest of the session
# (they are mostly deprecation notices that only add noise here).
warnings.filterwarnings(action="ignore")

# Raise the matplotlib font-manager logger to ERROR so that its
# lower-severity log messages (irrelevant for this notebook) are not shown.
_font_manager_logger = logging.getLogger("matplotlib.font_manager")
_font_manager_logger.setLevel(logging.ERROR)

In [None]:
from pandas import read_csv

# HR employee attrition dataset (CSV) hosted in the course's public dataset repository.
DATA_URL = 'https://raw.githubusercontent.com/BPP-Digital-Advanced-Data-Analytics/public_datasets/main/WA_Fn-UseC_-HR-Employee-Attrition.csv'

data = read_csv(DATA_URL)

# Encode the target column as integers: 'Yes' -> 1, 'No' -> 0.
# `map` is used instead of `replace` because swapping strings for ints via
# `replace` relies on implicit object -> int downcasting, which recent pandas
# versions deprecate. With `map`, any label other than 'Yes'/'No' becomes NaN,
# so the `astype(int)` below fails loudly instead of letting bad values through.
data['Attrition'] = data['Attrition'].map({'Yes': 1, 'No': 0}).astype(int)

In [None]:
from pycaret.classification import setup
from sklearn import set_config

# Turn on scikit-learn's metadata routing before the experiment is configured.
# NOTE(review): presumably needed by a downstream pycaret/scikit-learn call - confirm.
set_config(enable_metadata_routing=True)

# Columns excluded from modelling because they are not useful as predictors.
ignored_features = ["EmployeeCount", "EmployeeNumber", "Over18", "StandardHours"]

# Configure the PyCaret classification experiment:
#   - `data` is our dataframe
#   - `target` is the feature that we want to predict
#   - `session_id` fixes the random seed so results are reproducible
s = setup(
    data,
    target="Attrition",
    ignore_features=ignored_features,
    session_id=123,
)

In [None]:
from pycaret.classification import create_model, plot_model, predict_model

# Probability threshold used to turn predicted probabilities into class labels.
# Again, select a threshold with your desired balance of Precision and Recall.
thresh = 0.5

# Train an XGBoost classifier, evaluated with 5-fold cross-validation.
xg = create_model(
    "xgboost",
    fold=5,
    probability_threshold=thresh,
)

# Show the confusion matrix for the trained model, then score it with
# `predict_model` (no explicit data passed) and keep the resulting predictions.
plot_model(xg, plot="confusion_matrix")
holdout_predict_xg = predict_model(xg)

In [None]:
from pycaret.classification import interpret_model

# Summary interpretation plot for the trained XGBoost model.
interpret_model(xg, plot="summary")

In [None]:
observation = 434  # row number (position within the test set) of the observation to explain

# Keep the double brackets: they select the row as a single-row DataFrame
# (not a Series), which is the form passed to `predict_model` below.
df = s.X_test.iloc[[observation]]

# Use the trained model to make a prediction for that single observation.
prediction = predict_model(xg, data=df)

# Report the predicted label ...
predicted_label = prediction.prediction_label.values[0]
print(f"The model's prediction is: {predicted_label}")

# ... and compare it with what actually happened (target value 1 means the person left).
actually_left = s.y_test.iloc[observation] == 1
print(
    "The person DID churn in reality"
    if actually_left
    else "The person did NOT churn in reality"
)

# Show the per-observation "reason" plot, followed by the raw feature values of that row.
display(interpret_model(xg, plot="reason", observation=observation))
display(s.X_test.iloc[observation])


In [None]:
from pycaret.classification import check_fairness

# Compare the model's behaviour across the groups of the 'Gender' feature.
check_fairness(xg, sensitive_features=["Gender"])