In [1]:
import shap
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.inspection import PartialDependenceDisplay
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from imblearn.combine import SMOTETomek
from sklearn.model_selection import RandomizedSearchCV

  from .autonotebook import tqdm as notebook_tqdm


### Reading the dataset

In [2]:
file_name = '2206MCPC_VA (1).xlsx'
df = pd.read_excel('2206MCPC_VA (1).xlsx')

In [3]:
df

Unnamed: 0,Age,Age Group,Ethnicity,Race,Clinic assigned to,Zip Code,City,Year,Text Follow up Prior Scheduling Completed PrEP Appointment,Call Follow up Prior Scheduling Completed PrEP Appointment,...,Month,Waiting time(Days),Time spent at clinic(Min),First Filled Date After Completed Appointment,First Shipped Date After Completed Appointment,Insurance Status(First Filled Date After Completed Appointment),Insurance Status Source(First Filled Date After Completed Appointment),Fill shipment waiting period,First Filled Drug Prescribed,First Filled Pharmacy used
0,22,18-24,Not Hispanic or Latino,White,MCPC West,37129,MURFREESBORO,2021,0,0,...,June,3.0,67.0,2021-06-08,2021-06-09,Insured,Active Insurance In bwell,1.0,DESCOVY TAB 30,Avita
1,38,35-44,Not Hispanic or Latino,Black,MCPC West,37205,NASHVILLE,2021,0,0,...,June,3.0,74.0,2021-06-15,2021-06-16,,,8.0,DESCOVY TAB 30,Avita
2,34,25-34,Not Hispanic or Latino,White,MCPC West,75204,DALLAS,2021,0,0,...,June,0.0,67.0,2021-06-10,2021-06-10,,,5.0,DESCOVY TAB 30,Avita
3,25,25-34,Puerto Rican,Native American,MCPC West,37211,NASHVILLE,2021,0,0,...,August,2.0,80.0,2022-06-01,2022-06-02,,,304.0,EMTRIC TENO TB200 300MG LAU30@,Avita
4,28,25-34,Not Hispanic or Latino,White,MCPC West,37207-4842,NASHVILLE,2021,0,0,...,August,0.0,54.0,NaT,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10413,23,18-24,Not Hispanic or Latino,White,MCPC East,37130,MURFREESBORO,2024,0,0,...,March,4.0,24.0,NaT,NaT,,,,,
10414,26,25-34,Not Hispanic or Latino,White,MCPC East,37203,NASHVILLE,2024,0,0,...,March,1.0,78.0,2024-03-28,2024-04-02,Insured,Insured Fill,8.0,DESCOVY TAB 30,Avita
10415,37,35-44,Not Hispanic or Latino,White,MCPC East,37212,NASHVILLE,2024,0,0,...,May,0.0,31.0,2024-05-23,2024-05-24,Insured,Insured Fill,1.0,DESCOVY TAB 30,Avita
10416,42,35-44,Not Hispanic or Latino,White,Louisville,40206,LOUISVILLE,2024,0,0,...,March,2.0,88.0,2024-03-28,2024-03-28,Insured,Insured Fill,1.0,EMTRIC TENO TB200 300MG LAU30@,Avita


### Analyze the dataset

In [4]:
# Display initial information about the dataset
df_info = df.info()
df_summary = df.describe(include='all')
df_nulls = df.isna().sum()

df_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10418 entries, 0 to 10417
Data columns (total 34 columns):
 #   Column                                                                  Non-Null Count  Dtype         
---  ------                                                                  --------------  -----         
 0   Age                                                                     10418 non-null  int64         
 1   Age Group                                                               10418 non-null  object        
 2   Ethnicity                                                               10418 non-null  object        
 3   Race                                                                    10418 non-null  object        
 4   Clinic assigned to                                                      10418 non-null  object        
 5   Zip Code                                                                10397 non-null  object        
 6   City                  

In [5]:
df_summary

Unnamed: 0,Age,Age Group,Ethnicity,Race,Clinic assigned to,Zip Code,City,Year,Text Follow up Prior Scheduling Completed PrEP Appointment,Call Follow up Prior Scheduling Completed PrEP Appointment,...,Month,Waiting time(Days),Time spent at clinic(Min),First Filled Date After Completed Appointment,First Shipped Date After Completed Appointment,Insurance Status(First Filled Date After Completed Appointment),Insurance Status Source(First Filled Date After Completed Appointment),Fill shipment waiting period,First Filled Drug Prescribed,First Filled Pharmacy used
count,10418.0,10418,10418,10418,10418,10397.0,10396,10418.0,10418.0,10418.0,...,10418,10417.0,8241.0,8657,8657,3137,3137,8657.0,8657,8657
unique,,8,13,12,4,1556.0,832,,,,...,12,,,,,1,2,,32,4
top,,25-34,Not Hispanic or Latino,White,MCPC West,37013.0,NASHVILLE,,,,...,June,,,,,Insured,Insured Fill,,DESCOVY TAB 30,Avita
freq,,4816,8315,6869,7159,478.0,4077,,,,...,1041,,,,,3137,2545,,4775,8343
mean,35.440488,,,,,,,2021.52774,0.368977,0.154444,...,,5.654699,89.797112,2022-03-01 05:20:02.217858560,2022-03-02 12:33:41.023449088,,,46.866582,,
min,4.0,,,,,,,2018.0,0.0,0.0,...,,-738.0,0.0,2019-07-02 00:00:00,2019-07-02 00:00:00,,,0.0,,
25%,27.0,,,,,,,2021.0,0.0,0.0,...,,0.0,44.0,2021-03-01 00:00:00,2021-03-02 00:00:00,,,3.0,,
50%,33.0,,,,,,,2022.0,0.0,0.0,...,,1.0,58.0,2022-03-14 00:00:00,2022-03-15 00:00:00,,,9.0,,
75%,41.0,,,,,,,2023.0,0.0,0.0,...,,4.0,74.0,2023-02-13 00:00:00,2023-02-13 00:00:00,,,27.0,,
max,88.0,,,,,,,2024.0,28.0,9.0,...,,93.0,31719.0,2024-06-21 00:00:00,2024-06-21 00:00:00,,,1786.0,,


In [6]:
df_nulls

Age                                                                          0
Age Group                                                                    0
Ethnicity                                                                    0
Race                                                                         0
Clinic assigned to                                                           0
Zip Code                                                                    21
City                                                                        22
Year                                                                         0
Text Follow up Prior Scheduling Completed PrEP Appointment                   0
Call Follow up Prior Scheduling Completed PrEP Appointment                   0
Email Follow up Prior Scheduling Completed PrEP Appointment                  0
Other Follow up Prior Scheduling Completed PrEP Appointment                  0
Text Follow up Between Scheduling and Completed PrEP


### Initial Overview of the Dataset

- The dataset contains 10,418 rows and 34 columns.
- The data includes a mix of numerical, categorical, and date-related features.
- Several columns have missing values that will need to be addressed during preprocessing.
- The dataset will undergo a detailed column-by-column analysis to identify any issues and prepare it for further modeling.



### Column-by-Column Analysis
 

#### **Age**
- **Type**: Numerical (int)
- **Analysis**: `Age` is a continuous variable representing the patient's age.

#### **Age Group**
- **Type**: Categorical (str)
- **Analysis**: Represents predefined age groups.

#### **Ethnicity**
- **Type**: Categorical (str)
- **Analysis**: Ethnicity of the patient.

#### **Race**
- **Type**: Categorical (str)
- **Analysis**: Racial group of the patient.

#### **Clinic assigned to**
- **Type**: Categorical (str)
- **Analysis**: The clinic where the patient is registered.

#### **Zip Code**
- **Type**: Categorical (str)
- **Analysis**: Patient's zip code.

#### **City**
- **Type**: Categorical (str)
- **Analysis**: Patient's city.

#### **Year**
- **Type**: Numerical (int)
- **Analysis**: Year of the appointment or event.

#### **Text Follow up Prior Scheduling Completed PrEP Appointment**
- **Type**: Categorical (int, treated as binary)
- **Analysis**: Indicates whether a text follow-up occurred before scheduling a completed PrEP appointment.

#### **Call Follow up Prior Scheduling Completed PrEP Appointment**
- **Type**: Categorical (int, treated as binary)
- **Analysis**: Indicates whether a call follow-up occurred before scheduling a completed PrEP appointment.

#### **Email Follow up Prior Scheduling Completed PrEP Appointment**
- **Type**: Categorical (int, treated as binary)
- **Analysis**: Indicates whether an email follow-up occurred before scheduling a completed PrEP appointment.

#### **Other Follow up Prior Scheduling Completed PrEP Appointment**
- **Type**: Categorical (int, treated as binary)
- **Analysis**: Indicates whether other types of follow-ups occurred before scheduling a completed PrEP appointment.

#### **Text Follow up Between Scheduling and Completed PrEP Appointment Date**
- **Type**: Categorical (int, treated as binary)
- **Analysis**: Indicates whether a text follow-up occurred between scheduling and the completed PrEP appointment date.

#### **Call Follow up Between Scheduling and Completed PrEP Appointment Date**
- **Type**: Categorical (int, treated as binary)
- **Analysis**: Indicates whether a call follow-up occurred between scheduling and the completed PrEP appointment date.

#### **Email Follow up Between Scheduling and Completed PrEP Appointment Date**
- **Type**: Categorical (int, treated as binary)
- **Analysis**: Indicates whether an email follow-up occurred between scheduling and the completed PrEP appointment date.

#### **Other Follow up Between Scheduling and Completed PrEP Appointment Date**
- **Type**: Categorical (int, treated as binary)
- **Analysis**: Indicates whether other types of follow-ups occurred between scheduling and the completed PrEP appointment date.

#### **First Completed PrEP Appointment Scheduled Date**
- **Type**: Date (or potentially categorical if encoded differently)
- **Analysis**: The date when the first PrEP appointment was scheduled.

#### **First Completed PrEP Appointment Date**
- **Type**: Date (or potentially categorical if encoded differently)
- **Analysis**: The date when the first PrEP appointment was completed.

#### **Telehealth type**
- **Type**: Categorical (str)
- **Analysis**: Indicates whether the appointment was telehealth or in-person.

#### **Number of Previous Incomplete appointments**
- **Type**: Numerical (int)
- **Analysis**: Number of previous incomplete appointments.

#### **Previous Incomplete appointments type**
- **Type**: Categorical (str)
- **Analysis**: The type of previous incomplete appointments.

#### **Provider**
- **Type**: Categorical (str)
- **Analysis**: The healthcare provider for the appointment.

#### **Timeslot**
- **Type**: Categorical (str)
- **Analysis**: The time of day when the appointment occurred.

#### **Day of week**
- **Type**: Categorical (str)
- **Analysis**: The day of the week when the appointment occurred.

#### **Month**
- **Type**: Categorical (str)
- **Analysis**: The month when the appointment occurred.

#### **Waiting time(Days)**
- **Type**: Numerical (int)
- **Analysis**: The number of days the patient waited for the appointment.

#### **Time spent at clinic(Min)**
- **Type**: Numerical (int)
- **Analysis**: The number of minutes the patient spent at the clinic.


### Creating binary indicator for latefill

The binary indicator for 'latefill' is created to classify whether a "Fill shipment waiting period" exceeds a certain threshold, in this case, 30 days. This threshold is used to determine if the shipment was "late" or "on time." This binary classification can be useful in identifying patterns or relationships that are associated with late shipments.. The analysis is also simplified.

In [7]:
# Create the binary indicator for latefill
df['latefill'] = df['Fill shipment waiting period'].apply(lambda x: 1 if x > 30 else (0 if x <= 30 else np.nan))

### Creating the clinic column

We replace occurrences of the clinic name 'Woodland' with 'Louisville'.

In [8]:
df['clinic'] = df['Clinic assigned to'].replace('Woodland', 'Louisville')

### Filling in missing values

We fill in missing values for 'Zip Code' and 'City' using the mode (most common value). This is done to ensure data completeness and avoid issues during analysis or model training.

In [9]:
# Check and fill missing values in 'Zip Code'
if int(df['Zip Code'].isna().sum()) > 0:
    print(f"Nulls found in Zip Code column: {int(df['Zip Code'].isna().sum())}")
    print(f"Filling missing values with mode...")
    df['Zip Code'] = df['Zip Code'].fillna(df['Zip Code'].mode()[0])
    print(f"Nulls in Zip Code column after imputing: {int(df['Zip Code'].isna().sum())}")
else:
    print(f"No nulls found in Zip Code column.")

# Check and fill missing values in 'City'
if int(df['City'].isna().sum()) > 0:
    print(f"\nNulls found in City column: {int(df['City'].isna().sum())}")
    print(f"Filling missing values with mode...")
    df['City'] = df['City'].fillna(df['City'].mode()[0])
    print(f"Nulls in City column after imputing: {int(df['City'].isna().sum())}")
else:
    print(f"\nNo nulls found in City column.")

# If there are nulls in Provider column then set them to "Unknown"
if int(df['Provider'].isna().sum()) > 0:
    print(f"\nNulls founds in Provider column: {int(df['Provider'].isna().sum())}")
    print(f"Filling nulls with 'Unknown")
    df['Provider'] = df['Provider'].fillna('Unknown')
    print(f"Nulls in Provider column after imputing: {int(df['Provider'].isna().sum())}")
else:
    print(f"\nNo nulls found in Provider column.")
    
# Check and fill missing values in 'Time spent at clinic(Min)'
if int(df['Time spent at clinic(Min)'].isna().sum()) > 0:
    print(f"\nNulls found in 'Time spent at clinic(Min)' column: {int(df['Time spent at clinic(Min)'].isna().sum())}")
    print(f"Filling missing values with median...")
    df['Time spent at clinic(Min)'] = df['Time spent at clinic(Min)'].fillna(df['Time spent at clinic(Min)'].median())
    print(f"Nulls in 'Time spent at clinic(Min)' column after imputing: {int(df['Time spent at clinic(Min)'].isna().sum())}")
else:
    print(f"\nNo nulls found in 'Time spent at clinic(Min)' column.")

Nulls found in Zip Code column: 21
Filling missing values with mode...
Nulls in Zip Code column after imputing: 0

Nulls found in City column: 22
Filling missing values with mode...
Nulls in City column after imputing: 0

Nulls founds in Provider column: 1867
Filling nulls with 'Unknown
Nulls in Provider column after imputing: 0

Nulls found in 'Time spent at clinic(Min)' column: 2177
Filling missing values with median...
Nulls in 'Time spent at clinic(Min)' column after imputing: 0


Categorize the 'Time spent at clinic(Min)' column into defined time intervals (bins). We group the continuous values into categories such as '< 30', '30 to 60', '60 to 90', '90 to 120', and '> 120'. The pd.cut function creates a new column 'Minutes_Spent_at_Clinic' representing these intervals.

In [10]:
df['Minutes_Spent_at_Clinic'] = pd.cut(
    df['Time spent at clinic(Min)'],
    bins=[-float('inf'), 30, 60, 90, 120, float('inf')],
    labels=['< 30', '30 to 60', '60 to 90', '90 to 120', '> 120'],
    include_lowest=True
)

df['Minutes_Spent_at_Clinic'].value_counts(dropna=False, normalize=True)

Minutes_Spent_at_Clinic
30 to 60     0.582165
60 to 90     0.275869
90 to 120    0.063256
< 30         0.057305
> 120        0.021405
Name: proportion, dtype: float64

### Create the target column Shipment Occurred

In [11]:
target_column = "First Shipped Date After Completed Appointment"
df['Shipment_Occurred'] = df[target_column].apply(lambda x: 0 if pd.isnull(x) else 1)

In [12]:
# Convert date columns to datetime format
df['First Completed PrEP Appointment Scheduled Date'] = pd.to_datetime(df['First Completed PrEP Appointment Scheduled Date'])
df['First Completed PrEP Appointment Date'] = pd.to_datetime(df['First Completed PrEP Appointment Date'])

# Display the data types to confirm conversion
df[['First Completed PrEP Appointment Scheduled Date', 'First Completed PrEP Appointment Date']].dtypes


First Completed PrEP Appointment Scheduled Date    datetime64[ns]
First Completed PrEP Appointment Date              datetime64[ns]
dtype: object

### Extract Date Components

In [13]:

# Extract date components (month, day of the week) from the date columns
df['Scheduled_Month'] = df['First Completed PrEP Appointment Scheduled Date'].dt.month
df['Scheduled_DayOfWeek'] = df['First Completed PrEP Appointment Scheduled Date'].dt.dayofweek
df['Appointment_Month'] = df['First Completed PrEP Appointment Date'].dt.month
df['Appointment_DayOfWeek'] = df['First Completed PrEP Appointment Date'].dt.dayofweek

# Display the first few rows to verify the new columns
df[['Scheduled_Month', 'Scheduled_DayOfWeek', 'Appointment_Month', 'Appointment_DayOfWeek']].head()


Unnamed: 0,Scheduled_Month,Scheduled_DayOfWeek,Appointment_Month,Appointment_DayOfWeek
0,6.0,5.0,6,1
1,6.0,5.0,6,1
2,6.0,5.0,6,5
3,7.0,5.0,8,0
4,8.0,1.0,8,1


In [14]:
df_filtered = (
    df.dropna(subset=['Provider'])
      .loc[lambda x: x['Provider'].map(x['Provider'].value_counts()) >= 250]
)

In [15]:
df_filtered.groupby('Provider').agg(
    total=pd.NamedAgg(column='Shipment_Occurred', aggfunc='count'),
    shipped=pd.NamedAgg(column='Shipment_Occurred', aggfunc='sum'),
    percentage_shipped=pd.NamedAgg(column='Shipment_Occurred', aggfunc='mean')
).reset_index()


Unnamed: 0,Provider,total,shipped,percentage_shipped
0,"Angela, Nunn",757,634,0.837517
1,"Angelica, Grooms",520,454,0.873077
2,"Anne, Sizemore",632,525,0.830696
3,"Elissa, Pelton",581,494,0.850258
4,"Emma, Metz",506,411,0.812253
5,"Joshua, Castlen",919,727,0.791077
6,"Korevina, Armstrong",252,196,0.777778
7,"Kristen, Spano",567,480,0.846561
8,"Kyle, Mullins",1264,1092,0.863924
9,"Stephen, Sharpe",295,254,0.861017


In [16]:
city_counts = df['City'].value_counts()
provider_counts = df['Provider'].value_counts()
zip_counts = df['Zip Code'].value_counts()

df['city_ft'] = df['City'].apply(lambda x: 'rare_city' if pd.notna(x) and city_counts.loc[x] < 50 else x)
df['provider_ft'] = df['Provider'].apply(lambda x: 'rare_provider' if pd.notna(x) and provider_counts.loc[x] < 50 else x)
df['zipcode_ft'] = df['Zip Code'].apply(lambda x: 'rare_zipcode' if pd.notna(x) and zip_counts.loc[x] < 50 else x)


In [17]:
categorical_columns = [
'Ethnicity', 
'Race', 
'clinic', 
'Telehealth type', 
'city_ft',
'provider_ft']

df_encoded = pd.get_dummies(df[categorical_columns], drop_first=True)
other_columns = [
'Age',
'Waiting time(Days)',
'Time spent at clinic(Min)',
'Text Follow up Prior Scheduling Completed PrEP Appointment',               
'Call Follow up Prior Scheduling Completed PrEP Appointment', 
'Email Follow up Prior Scheduling Completed PrEP Appointment',                  
'Other Follow up Prior Scheduling Completed PrEP Appointment',                 
'Text Follow up Between Scheduling and Completed PrEP Appointment Date',      
'Call Follow up Between Scheduling and Completed PrEP Appointment Date',       
'Email Follow up Between Scheduling and Completed PrEP Appointment Date' ,     
'Other Follow up Between Scheduling and Completed PrEP Appointment Date',
'Number of Previous Incomplete appointments']


df_other = df[other_columns]


### Class Balancing using SMOTE + Tomek Links

In [18]:
# Define the label column name
label_column = 'Shipment_Occurred'

# Combine relevant features from df_other (numerical), df_encoded (categorical), and the target label column
feature_set = pd.concat([df_other, df_encoded, df[[label_column]]], axis=1)

# Separate the feature matrix (X) and the target variable (y)
X = feature_set.drop(label_column, axis=1)  # Feature matrix
y = feature_set[label_column]               # Target variable

# Handle missing values in X:
X.fillna(X.median(numeric_only=True), inplace=True)

# Fill NaN values in categorical columns with the mode (most frequent value) of each column.
X.fillna(X.mode().iloc[0], inplace=True)


# Show the distribution of the target variable before applying hybrid sampling
print("Before Hybrid Sampling (Target Distribution):")
print(y.value_counts())  # Shows the count of each class (e.g., 0s and 1s)


# Perform hybrid sampling using SMOTE + Tomek Links to balance the classes
# SMOTE generates synthetic samples for the minority class, and Tomek Links removes noisy examples.
smote_tomek = SMOTETomek(random_state=42)   # Initialize SMOTE + Tomek Links with a fixed random seed for reproducibility
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)  # Apply hybrid sampling to balance the dataset

# Show the distribution of the target variable after hybrid sampling
print("\nAfter Hybrid Sampling (Target Distribution):")
print(y_resampled.value_counts())  # Shows the balanced class distribution



Before Hybrid Sampling (Target Distribution):
Shipment_Occurred
1    8657
0    1761
Name: count, dtype: int64

After Hybrid Sampling (Target Distribution):
Shipment_Occurred
1    8567
0    8567
Name: count, dtype: int64


### XGBoost Model Training

In [27]:
# Define the hyperparameter grid with ranges for random search
param_distributions = {
    'n_estimators': [50, 100, 200],            # Number of trees (boosting rounds)
    'max_depth': [3, 5, 7],                    # Maximum depth of the trees
    'learning_rate': [0.01, 0.1, 0.2],         # Step size shrinkage
    'subsample': [0.8, 1.0],                   # Proportion of samples used for training each tree
    'colsample_bytree': [0.8, 1.0],            # Proportion of features used for each tree
    'gamma': [0, 0.1, 0.2],                    # Minimum loss reduction required for further partition
    'reg_alpha': [0, 0.1, 1],                  # L1 regularization
    'reg_lambda': [1, 0.1, 0],                 # L2 regularization
}

# Initialize the XGBoost classifier model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=100,  # Increase this number for more fits
    cv=3,
    n_jobs=-1,
    random_state=42,
    scoring='accuracy',
    verbose=1
)

# Split the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Fit the model using random search for hyperparameter tuning (no early stopping during search)
random_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = random_search.best_params_
best_score = random_search.best_score_

# Print the best hyperparameters and score
print(f"Best Hyperparameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score:.4f}")

# Use the best parameters to initialize the final model with early stopping
final_xgb_model = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')

# Train the final model on the entire training data with early stopping
final_xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)

# Evaluate the final model on the test data
y_pred = final_xgb_model.predict(X_test)




Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [22]:
xgb_model = final_xgb_model

### Model Evaluation (Accuracy, Precision, Recall, etc.)

In [23]:
# Make predictions on the test data
y_pred = xgb_model.predict(X_test)  # Predicted class labels
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]  # Predicted probabilities for the positive class (1)

# Evaluate the model's performance using various metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Recall: {recall:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.87
Recall: 0.92
Precision: 0.85
Confusion Matrix:
[[1389  280]
 [ 149 1609]]


### Feature Importance Analysis

In [None]:
# Plot the feature importance using XGBoost's built-in method
xgb_importances = xgb_model.feature_importances_

# Sort and plot feature importance
sorted_idx = np.argsort(xgb_importances)
plt.figure(figsize=(12, 15))  # Increase figure size for better readability
plt.barh(range(len(sorted_idx)), xgb_importances[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), X_train.columns[sorted_idx])
plt.xlabel('Feature Importance')
plt.title('Feature Importance from XGBoost Model')

# Apply tight layout to avoid overlap of labels
plt.tight_layout()
plt.show()

### SHAP Analysis for Model Explainability

In [None]:
# Initialize SHAP explainer with the trained XGBoost model
explainer = shap.TreeExplainer(xgb_model)

# Calculate SHAP values for the test dataset
shap_values = explainer.shap_values(X_test)

# Plot summary plot for SHAP values (global feature importance)
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)

# Plot individual SHAP explanation for a single prediction
shap.force_plot(explainer.expected_value, shap_values[0, :], X_test.iloc[0, :], matplotlib=True)

### Partial Dependence Plot

In [None]:
# Plot partial dependence for features
features_to_plot = [0, 1, 2]  # Features to plot
fig, ax = plt.subplots(figsize=(12, 6))
PartialDependenceDisplay.from_estimator(xgb_model, X_train, features_to_plot, ax=ax)
plt.suptitle('Partial Dependence Plot')
plt.subplots_adjust(top=0.9) 
plt.show()


### ROC Curve

In [None]:
# Compute the Receiver Operating Characteristic (ROC) curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')  # ROC curve
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line representing random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()