In [3]:
#General packages
import pandas as pd
import numpy as np
import sklearn as sns
import matplotlib.pyplot as plt

#sklearn packages
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import classification_report, RocCurveDisplay, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline

#Gradient Boosted CARTs:
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

#SHAP
import shap


### Read CSV file:


In [None]:
# Path of the CSV file to load (placeholder name; point it at the real dataset).
filename = 'xyz.csv'

# Parse the comma-separated file into a DataFrame, then print its summary.
df = pd.read_csv(filepath_or_buffer = filename, sep = ',')
df.info()

### Data pre-processing

In [None]:
### Check for features with null values:
# Column-wise count of missing entries (axis 0 sums down the rows of each column).
df.isna().sum(axis = 0)

In [None]:
### Fill null values with 0 (other imputation methods could also be explored)
# The replacement is applied to every column of df, and df itself is modified in place.
df.fillna(value = 0, inplace = True)

In [None]:
### Sometimes columns can have values that should be replaced with other values
# Boolean-mask assignment: only the rows of column 'A' equal to 'xyz' are overwritten with 'string'.
df.loc[df['A'] == 'xyz', 'A'] = 'string'

### NLP techniques could also be used here

In [None]:
### One-hot encoding:
# Pass the whole frame and name the columns to encode. Indexing df down to
# ['A', 'B', 'C'] first would overwrite df with only the dummy columns and
# discard every other column (including TARGET, which later cells rely on).
df = pd.get_dummies(df, columns = ['A', 'B', 'C'])
  

In [None]:
# Confirm the one-hot encoding by listing the dtype of every column currently in df
df.dtypes

In [None]:
# Splitting the dataset into categorical and numerical features:
# Categorical side: string-like (object) and boolean columns.
df_cat = df.select_dtypes(include = ['object', 'bool'])
df_cat_columns = df_cat.columns

# Numerical side: 'number' selects every integer and float width. An explicit
# list such as ['int8', 'int32', 'float64'] omits int64 (the pandas default
# integer dtype), so those columns would silently be left out. Boolean columns
# are not matched by 'number', so the two groups do not overlap.
df_num = df.select_dtypes(include = 'number')
df_num_columns = df_num.columns

In [None]:
### Some categories need simple label encoding:
# One LabelEncoder is fitted per categorical column and kept in
# `label_encoders`, so the integer codes can be mapped back to the original
# labels later with inverse_transform.
label_encoders = {}
encoded_columns = {}

for category in df_cat_columns:
    le = LabelEncoder()
    # Cast to str first: the earlier fillna(0) can leave an object column with
    # both strings and the number 0, and LabelEncoder needs uniformly typed input.
    encoded_columns[category] = le.fit_transform(df[category].astype(str))
    label_encoders[category] = le

# Build the encoded frame in one step and reuse df's index so that the
# column-wise concat with df_num lines up row by row.
df_encoded = pd.DataFrame(encoded_columns, index = df.index)
df_encoded = pd.concat([df_encoded, df_num], axis = 1)

# Drop the raw 'A', 'B', 'C' columns if they are still present. They no longer
# exist once they have been one-hot encoded, so a missing column is not an error.
df_encoded = df_encoded.drop(columns = ['A', 'B', 'C'], errors = 'ignore')

### Select and train a model

In [None]:
### Train-test-split
# Features are every column except the label; the label is the TARGET column.
X = df_encoded.drop(columns = ['TARGET'])
y = df_encoded['TARGET']

# Hold out 20% of the rows for testing, stratified on y, with a fixed
# random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 21,
    stratify = y,
)

# Report the (rows, columns) shape of the train and test feature matrices.
for features_split in (X_train, X_test):
    print(features_split.shape)


In [None]:
### Check if the target values are balanced:
# Counts the rows per distinct TARGET value in the full encoded frame (train and test together).
df_encoded['TARGET'].value_counts()

### Using a CatBoost classifier:

In [None]:
### If data is imbalanced you can adjust the class_weights parameter
# [1, 1] gives both classes the same weight (assumes a binary target - TODO confirm).
class_weights = [1,1]

catboost = CatBoostClassifier(silent = True, n_estimators = 100, learning_rate = 0.05, class_weights = class_weights)

# Train on the training split. Without this call the model is unfitted and the
# predict / predict_proba / SHAP cells that follow cannot run.
catboost.fit(X_train, y_train)

In [None]:
### Classification performance and Confusion Matrix

# Predicted class labels for the held-out test rows.
y_pred = catboost.predict(X_test)
print(classification_report(y_test, y_pred))

# Use the model's own class order for both the matrix rows/columns and the
# axis labels so the two always agree.
cm = confusion_matrix(y_test, y_pred, labels = catboost.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=catboost.classes_)
disp.plot()
plt.show()

In [None]:
### Using SHAP to calculate feature importance:

# SHAP notebook setup call, run before any plotting.
shap.initjs()
# Build a tree explainer around the CatBoost model and compute SHAP values for the test rows.
explainer = shap.TreeExplainer(catboost)
shap_values = explainer.shap_values(X_test)

In [None]:
# Both summary plots share the same data and column labels.
summary_kwargs = dict(features = X_test, feature_names = X_test.columns)

# First the default summary plot, then the same data as a bar chart.
shap.summary_plot(shap_values, **summary_kwargs)
shap.summary_plot(shap_values, plot_type = 'bar', **summary_kwargs)

In [None]:
### Calculating the AUC-ROC curve:

# predict_proba returns one row per sample and one column per class. Take
# column 1 for every row - presumably the positive class, i.e.
# catboost.classes_[1]; confirm against the target encoding. Indexing with
# [[1]] would instead pick the single row at position 1.
y_probs = catboost.predict_proba(X_test)[:, 1]

RocCurveDisplay.from_predictions(
    y_test,
    y_probs,
    name = "ROC",
    color = "darkorange",
)
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0,1], [0,1], "k--", label = "chance level (AUC = 0.5)")
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curves:")
plt.legend()
plt.grid()
plt.show()

# Area under the ROC curve, rounded to two decimals for display.
auc = np.round(roc_auc_score(y_test, y_probs), 2)
print(auc)