In [3]:
#General packages
import pandas as pd
import numpy as np
import sklearn as sns
import matplotlib.pyplot as plt

#sklearn packages
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import classification_report, RocCurveDisplay, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline

#Gradient Boosted CARTs:
#import xgboost as xgb
from catboost import CatBoostClassifier, Pool

#SHAP
import shap


### Read CSV file:


In [None]:
# Path of the input dataset (comma-separated values).
filename = 'xyz.csv'

# Load the raw data; ',' is already pandas' default separator.
df = pd.read_csv(filename)

# Print column names, dtypes and non-null counts.
df.info()

### Data pre-processing

In [None]:
### Count the missing values in every feature:
df.isnull().sum()

In [None]:
### Fill null values with 0 (other imputation methods could also be explored).
# NOTE(review): 0 is also written into non-numeric columns, which would leave them
# holding mixed str/int values - confirm this is acceptable for the later encoding.
df = df.fillna(0)

# Re-count the missing values to verify that none are left.
df.isnull().sum()

In [None]:
### Sometimes columns hold values that should be replaced with other values.
# Every entry of column 'A' equal to 'xyz' becomes 'string'; all other entries are kept.
is_xyz = df['A'] == 'xyz'
df['A'] = df['A'].mask(is_xyz, 'string')

### NLP techniques could also be used here

In [None]:
### Drop features not used for training (e.g. identification keys, names, ...).
# The identifiers are kept aside in df_ids before being removed from df.
id_columns = ['IDENTIFICATION_KEY']
df_ids = df[id_columns]
df = df.drop(columns = id_columns)

In [None]:
# Simple label encoder, created once here and re-fitted on each categorical
# column by the label-encoding loop further down:
le = LabelEncoder()

In [None]:
### One-hot encoding of nominal categories (no ordinal relationship between the values).
one_hot_columns = ['A', 'B', 'C']

# Build the dummy columns from the selected features only.
df_one_hot_encoded = pd.get_dummies(df[one_hot_columns])

# Swap the original features for their dummies, which are appended on the right.
df = pd.concat([df.drop(columns = one_hot_columns), df_one_hot_encoded], axis = 1)

In [None]:
# Confirm the one-hot encoding by printing the dtype of every column
# (the dummy columns created by pd.get_dummies appear alongside the remaining ones):
print(df.dtypes)

In [None]:
# Split the dataset into categorical and numerical features.
### In some datasets it might be necessary to perform reset_index() and then delete the index column created:

# Categorical features: string (object) columns and boolean columns
# (pd.get_dummies yields bool dummies on recent pandas versions).
df_cat = df.select_dtypes(include = ['object', 'bool'])
df_cat_columns = df_cat.columns

# Numerical features: every numeric dtype. Listing only int8/int32/float64 would
# silently drop int64 columns (the default integer dtype produced by read_csv) and
# the uint8 dummies produced by older pandas versions. 'number' does not match
# bool, so no column ends up in both groups.
df_num = df.select_dtypes(include = 'number')
df_num_columns = df_num.columns

In [None]:
### Some categories need simple label encoding:
# Each categorical/boolean column is converted to integer codes. The codes are
# collected in a dict and turned into a DataFrame once, instead of growing a frame
# with pd.concat on every iteration.
encoded_columns = {}

for category in df_cat_columns:
    le.fit(df[category])
    # Show the mapping between the original values and their integer codes.
    print("Original category values for {}: {}".format(category, le.classes_))
    print(le.transform(le.classes_))
    encoded_columns[category] = le.transform(df[category])

# Reuse df's index so the concat below aligns row by row with df_num even when
# df does not carry a default RangeIndex (e.g. after rows were filtered out).
df_encoded = pd.DataFrame(encoded_columns, index = df.index)
df_encoded = pd.concat([df_encoded, df_num], axis = 1)

# The original 'A', 'B' and 'C' columns are removed from df by the one-hot encoding
# step, so they are normally absent here; errors = 'ignore' keeps this clean-up from
# raising KeyError in that case while still dropping them if they are present.
df_encoded = df_encoded.drop(['A', 'B', 'C'], axis = 1, errors = 'ignore')

### Select and train a model

In [None]:
### Stratified train/test split (the test size is usually 20%):
# Separate the features from the target column.
X = df_encoded.drop(columns = 'TARGET')
y = df_encoded['TARGET']

# Stratifying on y keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 21,
    stratify = y,
)

# Report the size of each partition.
for features in (X_train, X_test):
    print(features.shape)



In [None]:
### Check whether the target classes are balanced (number of rows per class):
target_counts = df_encoded['TARGET'].value_counts()
target_counts

### Create a dictionary with several different models to be tested:

In [None]:
### Define the cross-validation used to analyse the bias-variance of the results
### and to guide the choice of regularization:
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# Candidate classifiers, keyed by name; add more entries here to test other models.
models = {
    'cat_boost': CatBoostClassifier(
        silent = True,
        n_estimators = 1000,
        learning_rate = 0.045,
        max_depth = 3,
        random_state = 42,
        l2_leaf_reg = 0.1,
    ),
}

In [None]:
### Fit and evaluate each model:
for model in models.values():
    # isinstance avoids building a throwaway CatBoostClassifier just to compare types.
    if isinstance(model, CatBoostClassifier):
        clf = model
        print(model)
        steps = [('classifier', clf)]
        pipeline = Pipeline(steps)
        pipeline.fit(X_train, y_train)

        # k-fold cross-validation on the selected metric (e.g. "Recall"). The scores
        # are computed once and reused for the mean, rather than running the whole
        # cross-validation a second time.
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv = kf, scoring = 'recall')
        print(cv_scores)
        print(np.mean(cv_scores))

        # Predictions and classification report on the training set:
        y_train_pred = pipeline.predict(X_train)
        print(classification_report(y_train, y_train_pred))

### After evaluating the results above, select a model:

In [None]:
### If the data is imbalanced, adjust the class_weights parameter.
class_weights = [1,1]
n_estimators = 1000
learning_rate = 0.05

# Gather the hyper-parameters in one place before building the classifier.
selected_model_params = {
    'silent': True,
    'n_estimators': n_estimators,
    'learning_rate': learning_rate,
    'class_weights': class_weights,
}
selected_model = CatBoostClassifier(**selected_model_params)

# Train the selected model on the training partition.
selected_model.fit(X_train, y_train)

In [None]:
### Classification performance and Confusion Matrix

# Evaluate the selected model on the held-out test set.
y_pred = selected_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Confusion matrix of the test predictions (previously referenced the undefined
# name `y_prest`, which raised NameError).
cm = confusion_matrix(y_test, y_pred, labels = selected_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=selected_model.classes_)

# Draw on an explicitly created axes so that only one figure is produced
# (plt.clf() followed by disp.plot() left an extra empty figure behind).
fig, ax = plt.subplots()
disp.plot(ax = ax)
plt.show()

### The final part of this notebook is to calculate the Feature Importance.
### SHAP scores will be used to do that:

In [None]:
### Using SHAP to calculate features importance:

# Initialise SHAP's JavaScript support for notebook visualisations.
shap.initjs()
# Build a tree explainer for the fitted model and compute the SHAP values
# of the test-set features.
explainer = shap.TreeExplainer(selected_model)
shap_values = explainer.shap_values(X_test)

In [None]:
# Arguments shared by both summary plots.
summary_kwargs = dict(features = X_test, feature_names = X_test.columns)

# Default summary plot, followed by the bar-chart variant.
shap.summary_plot(shap_values, **summary_kwargs)
shap.summary_plot(shap_values, plot_type = 'bar', **summary_kwargs)

### Finally, AUC-ROC will be used as a final evaluation of model performance:

In [None]:
### Calculating the AUC-ROC curve for testing and training sets:

# predict_proba returns one column per class; column 1 holds the probability of
# the second class in selected_model.classes_ (the positive class of a binary
# target). Indexing with [[1]] would select the second *row* instead.
y_probs_test = selected_model.predict_proba(X_test)[:, 1]
y_probs_train = selected_model.predict_proba(X_train)[:, 1]

# Both curves are drawn on the same axes so they can be compared directly.
fig, ax = plt.subplots()

train = RocCurveDisplay.from_predictions(
    y_train,
    y_probs_train,
    name = "ROC train",
    color = "blue",
    ax = ax,
)

test = RocCurveDisplay.from_predictions(
    y_test,
    y_probs_test,
    name = "ROC test",
    color = "darkorange",
    ax = ax,
)

# Diagonal reference line of a random classifier.
ax.plot([0,1], [0,1], "k--", label = "chance level (AUC = 0.5)")
ax.axis("square")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC curves:")
ax.legend()
ax.grid()
plt.show()

# Area under the ROC curve on the test set, rounded to two decimals.
auc = np.round(roc_auc_score(y_test, y_probs_test), 2)
print(auc)