# 1- Imports

In [324]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator
import plotly.graph_objects as go
from IPython.display import display, HTML

import nannyml as nml

import evidently as evi
from evidently import ColumnMapping


from evidently.report import Report
from evidently.metrics.base_metric import generate_column_metrics
from evidently.metric_preset import DataQualityPreset, DataDriftPreset, TargetDriftPreset
# from evidently.metrics import *

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataQualityTestPreset, DataStabilityTestPreset, NoTargetPerformanceTestPreset, DataDriftTestPreset, RegressionTestPreset, BinaryClassificationTestPreset
from evidently.tests import TestColumnShareOfMissingValues, TestColumnNumberOfMissingValues, TestMostCommonValueShare, TestNumberOfConstantColumns, TestNumberOfDuplicatedRows

import joblib
import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, mean_absolute_percentage_error
from sklearn.exceptions import UndefinedMetricWarning
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import warnings
# Silence the warning families below for the whole notebook. Each entry is the
# keyword set for one "ignore" filter; they are registered in the listed order.
for _filter_kwargs in (
    {"category": FutureWarning},
    {"category": UserWarning, "module": "lightgbm"},
    {"category": UndefinedMetricWarning},
    {"message": "An input array is constant; the correlation coefficient is not defined."},
):
    warnings.filterwarnings("ignore", **_filter_kwargs)

# Ignore floating-point overflow in numpy; as the last expression of the cell,
# the previous error settings returned by seterr are displayed.
np.seterr(over="ignore")

{'divide': 'warn', 'over': 'ignore', 'under': 'ignore', 'invalid': 'warn'}

In [357]:
## Helpers
def dataFetchingFunc(
    Query, SERVER, DATABASE, USERNAME=None, PASSWORD=None, returnFlag=True):
    """
    Fetches data from a SQL database using a provided SQL query.

    Args:
        Query (str): SQL text passed unchanged to ``cursor.execute``.
        SERVER (str): Server name placed in the ODBC connection string.
        DATABASE (str): Database name placed in the ODBC connection string.
        USERNAME (str, optional): SQL login. Only used when PASSWORD is also given;
            otherwise a trusted (integrated) connection is requested.
        PASSWORD (str, optional): Password for USERNAME.
        returnFlag (bool): Not used by the function; kept so existing calls stay valid.

    Returns:
        pd.DataFrame: The fetched rows, with columns named after ``cursor.description``.
    """
    # pyodbc is not imported by the notebook's import cell, so it is imported here;
    # without this the first call fails with NameError.
    import pyodbc

    connection_string = (
        "DRIVER={ODBC Driver 17 for SQL Server};SERVER="
        + SERVER
        + ";DATABASE="
        + DATABASE
    )
    if USERNAME and PASSWORD:
        connection_string += ";UID=" + USERNAME + ";PWD=" + PASSWORD
    else:
        connection_string += ";Trusted_Connection=yes;"

    conn = pyodbc.connect(connection_string)
    try:
        cursor = conn.cursor()
        cursor.execute(Query)
        rows = cursor.fetchall()
        names = [column[0] for column in cursor.description]
    finally:
        # Always release the connection, including when the query fails.
        conn.close()

    return pd.DataFrame.from_records(rows, columns=names)

def expl_pd_model_output(explainer, data_instance, plot=False):
    """
    Builds a per-feature table of SHAP contributions for a single instance of the
    probability-of-default (PD) model and optionally shows the SHAP waterfall plot.

    Args:
        explainer (shap.TreeExplainer): The SHAP explainer object. It is called with
            ``data_instance`` and must return an object exposing ``.values``.
        data_instance (pd.DataFrame): A single row DataFrame containing the features for the prediction.
        plot (bool): If True, displays the SHAP waterfall plot. Default is False.

    Returns:
        pd.DataFrame: One row per feature with the columns 'Feature Name', 'Feature Value',
        'Contribution to Prediction' (the raw SHAP value), 'Relative Impact (%)' (share of
        the summed absolute SHAP values; 0 for every feature when that sum is 0) and
        'Explanation' ("Increases PD", "Decreases PD" or "No effect on PD"). Rows are
        ordered by absolute contribution, largest first.

    Raises:
        ValueError: If the provided data_instance does not have exactly one row.
    """
    # Check if the data_instance has exactly one row
    if data_instance.shape[0] != 1:
        raise ValueError(f"Expected data_instance to have exactly 1 row, but got {data_instance.shape[0]} rows.")

    # Get SHAP values for the data instance
    shap_values = explainer(data_instance)

    # Optionally plot the waterfall plot
    if plot:
        shap.waterfall_plot(shap_values[0])

    # Feature names, values and SHAP values.
    # NOTE(review): the contributions are in the explainer's output units (presumably
    # log-odds for a tree classifier) - confirm before presenting them as PD points.
    shap_contributions = pd.DataFrame({
        'Feature Name': data_instance.columns,
        'Feature Value': data_instance.values[0],
        'Contribution to Prediction': shap_values.values[0]
    })

    # Relative impact = |contribution| / sum(|contributions|). When every contribution
    # is zero the ratio would be 0/0 (NaN), so the impact is reported as 0 instead.
    abs_contributions = shap_contributions['Contribution to Prediction'].abs()
    abs_shap_sum = abs_contributions.sum()
    if abs_shap_sum > 0:
        shap_contributions['Relative Impact (%)'] = (abs_contributions / abs_shap_sum) * 100
    else:
        shap_contributions['Relative Impact (%)'] = 0.0

    # Simple explanation based on the sign of the SHAP value; a value that is neither
    # positive nor negative is reported as neutral instead of as a decrease.
    def simple_explanation(shap_value):
        if shap_value > 0:
            return "Increases PD"
        if shap_value < 0:
            return "Decreases PD"
        return "No effect on PD"

    shap_contributions['Explanation'] = shap_contributions['Contribution to Prediction'].apply(simple_explanation)

    # Most impactful features first
    shap_contributions = shap_contributions.reindex(abs_contributions.sort_values(ascending=False).index)

    return shap_contributions

def generate_combined_drift_rank(dataframes, aliases, join_column, simplify=True):
    """
    Combine several per-feature ranking tables into one table keyed by 'feature_name'.

    In every input frame ``join_column`` is renamed to 'feature_name' and each other
    column is prefixed with ``"<alias>_"``. The frames are then inner-merged on
    'feature_name', so only features present in every frame survive.

    Args:
        dataframes (list[pd.DataFrame]): Ranking tables, each containing ``join_column``.
        aliases (list[str]): One prefix per frame, in the same order as ``dataframes``.
        join_column (str): Name of the feature-name column in the input frames.
        simplify (bool): If True, return only the drifting-feature summary described
            below; if False, return the full merged frame. The summary reads the columns
            'pd_has_drifted', 'pd_rank', 'alerts_rank' and 'alerts_number_of_alerts',
            so it needs frames aliased 'pd' and 'alerts' that provide them.

    Returns:
        pd.DataFrame: With ``simplify=True``, the columns 'feature_name',
        'uni_drift_alerts' and 'drifting_rank' (1..n) for rows whose 'pd_has_drifted'
        is True and whose feature is not 'final_income', ordered by 'pd_rank' then
        'alerts_rank'. Otherwise the merged frame with all prefixed columns.

    Raises:
        ValueError: If ``dataframes`` and ``aliases`` differ in length.
    """
    if len(dataframes) != len(aliases):
        raise ValueError("The number of dataframes must match the number of aliases.")

    def _prefixed(frame, alias):
        # Key column first, then tag every remaining column with the frame's alias.
        keyed = frame.rename(columns={join_column: 'feature_name'})
        tagged_names = {
            name: f"{alias}_{name}" for name in keyed.columns if name != 'feature_name'
        }
        return keyed.rename(columns=tagged_names)

    prefixed_frames = [_prefixed(frame, alias) for frame, alias in zip(dataframes, aliases)]

    # Fold the frames together with successive inner joins on the feature name.
    merged_df = prefixed_frames[0]
    for other in prefixed_frames[1:]:
        merged_df = merged_df.merge(other, on='feature_name', how='inner')

    if not simplify:
        return merged_df

    # Keep features flagged as drifting by the 'pd' table, leaving out 'final_income'.
    keep_mask = (merged_df['pd_has_drifted'] == True) & (merged_df['feature_name'] != 'final_income')
    ordered = merged_df[keep_mask].sort_values(by=['pd_rank', 'alerts_rank'])

    # Position in that ordering becomes the 1-based drifting rank.
    ordered['drifting_rank'] = range(1, len(ordered) + 1)

    summary = ordered[['feature_name', 'alerts_number_of_alerts', 'drifting_rank']]
    summary = summary.rename(columns={'alerts_number_of_alerts': 'uni_drift_alerts'})
    return summary.reset_index(drop=True)

def plot_drift_and_distribution(uni_drift_results, column_names, period='all'):
    """
    Show, for each requested column, its drift plot and its distribution plot side by side.

    One subplot row is created per column: the traces of the 'drift' plot go in the left
    cell and the traces of the 'distribution' plot in the right cell. The combined figure
    is displayed with ``fig.show()``; nothing is returned.

    Args:
        uni_drift_results: Univariate drift results object; it must support
            ``.filter(period=..., column_names=[...]).plot(kind=..., width=...)``.
        column_names: Iterable of column names to plot (a list or a pandas Series).
        period (str): Period passed to ``filter``. Default is 'all'.

    Raises:
        ValueError: If ``column_names`` is empty.
    """
    column_names = list(column_names)
    num_columns = len(column_names)
    if num_columns == 0:
        raise ValueError("column_names must contain at least one column to plot.")

    # make_subplots assigns subplot_titles row by row, so the titles are interleaved
    # here (drift, distribution, drift, ...) instead of being patched afterwards.
    subplot_titles = [
        title
        for col in column_names
        for title in (f"{col} Drift", f"{col} Distribution")
    ]

    # plotly rejects vertical_spacing above 1 / (rows - 1), which a fixed 0.1 exceeds
    # from 12 rows on. 0.2 / rows keeps the gap at about 70 px for the 350 px-per-row
    # figure height used below and equals the previous 0.1 for one or two rows.
    vertical_spacing = min(0.1, 0.2 / num_columns)

    fig = make_subplots(rows=num_columns, cols=2,
                        subplot_titles=subplot_titles,
                        vertical_spacing=vertical_spacing)

    for row, col in enumerate(column_names, start=1):
        # Filter once per column and derive both plots from the same result
        column_results = uni_drift_results.filter(period=period, column_names=[col])
        plot_figure = column_results.plot(kind='drift', width=3)
        distrib_figure = column_results.plot(kind='distribution', width=3)

        # Drift traces go to the first column of the current row
        for trace in plot_figure['data']:
            fig.add_trace(trace, row=row, col=1)

        # Distribution traces go to the second column of the current row
        for trace in distrib_figure['data']:
            fig.add_trace(trace, row=row, col=2)

        # Update y-axis titles
        fig.update_yaxes(title_text='drift_metric_units', row=row, col=1)
        fig.update_yaxes(title_text='values', row=row, col=2)

    # Update layout for better spacing and titles
    fig.update_layout(height=350*num_columns, width=1000, title_text="Drift and Distribution by Chunk")

    # Show the combined figure
    fig.show()

In [326]:
# Locations resolved relative to the directory the notebook is run from.
cwd_path = Path.cwd()
data_path = cwd_path.joinpath('data/')
objects_path = cwd_path.joinpath('objects/')

# Put the parent of the working directory on the import path so that custom
# modules stored there can be imported.
sys.path.append(str(cwd_path.parent))

# 2- Setup

In [327]:
# --- Reference data: snapshot date and versions used to build the input file names ---
data_date = "20240810"
income_model_version = "v0.7"
pd_model_version = "v0.7"
rule_based_version = "v0.1"
calculation_center_version = "v0.2"

# --- Analysis data ---
# Always False until the complete data flow is up: the analysis set is then carved
# out of the reference data instead of being fetched separately.
is_seperate_analysis_df = False
# When True, realized performance is computed and compared with the estimates.
is_realized_performance = True

# Only read when is_seperate_analysis_df is False (ignored otherwise).
analysis_df_base_size = 15000   # number of most recent records that seed the analysis set
max_records_date = "2023Q3"     # last year-quarter kept in the data

# 3- Fetch

### Models Metadata

In [328]:
# Load the stored model bundles. Each bundle is a mapping holding the fitted model
# ('object'), its feature names and dtypes, and a SHAP explainer.
# NOTE(review): joblib.load unpickles the file - only load bundles from a trusted source.
_BUNDLE_KEYS = ('object', 'features_names', 'features_dtypes', 'explainer')

income_model_metadata = joblib.load(objects_path / f"income_model_{income_model_version}.pkl")
(
    income_model_obj,
    income_model_feats_names,
    income_model_feats_dtypes,
    income_model_explainer,
) = (income_model_metadata[key] for key in _BUNDLE_KEYS)

pd_model_metadata = joblib.load(objects_path / f"pd_model_{pd_model_version}.pkl")
(
    pd_model_obj,
    pd_model_feats_names,
    pd_model_feats_dtypes,
    pd_model_explainer,
) = (pd_model_metadata[key] for key in _BUNDLE_KEYS)

# Union of the features used by either model (duplicates removed, order not guaranteed)
all_models_feats_names = [*set(income_model_feats_names + pd_model_feats_names)]

### Reference Data

In [329]:
## Fetching Reference Data
# Every input is a parquet file under data_path whose name starts with data_date.

# unprocessed attributes
risk_cust_attrib_df = pd.read_parquet(data_path / "feed" / f"{data_date}_risk_cust_attributes.parquet")

# processed features (all of them)
df_to_all_models = pd.read_parquet(data_path / "features_store" / f"{data_date}_L1_processed_features.parquet")

# income groups and income predictions
income_groups = pd.read_parquet(
    data_path / "analytics" / f"{data_date}_income_{income_model_version}_groups.parquet"
)
income_predics = pd.read_parquet(
    data_path / "models_preds" / f"{data_date}_income_{income_model_version}_predics.parquet"
).rename(columns={'predics': 'income_predics'})

# pd predictions ('cali_predics' is exposed as 'pd_predics')
pd_predics = pd.read_parquet(
    data_path / "models_preds" / f"{data_date}_pd_{pd_model_version}_predics.parquet"
).rename(columns={'cali_predics': 'pd_predics'})

# rule-based predictions
rule_based_predics = pd.read_parquet(
    data_path / "models_preds" / f"{data_date}_rule_based_{rule_based_version}_predics.parquet"
).rename(columns={
    'predics_score': 'rule_based_predics_score',
    'predics_pd': 'rule_based_predics_pd',
})

# calculation center results
cc_results = pd.read_parquet(
    data_path / "models_preds" / f"{data_date}_calculation_center_{calculation_center_version}_results.parquet"
)

# print the columns of the loaded frames (rule_based_predics is not included)
print(
    *(
        frame.columns
        for frame in (
            risk_cust_attrib_df,
            df_to_all_models,
            income_groups,
            income_predics,
            pd_predics,
            cc_results,
        )
    ),
    sep=' \n ',
)

Index(['client_id', 'ssn', 'phone_number_1', 'phone_number_2',
       'flag_is_mc_customer', 'flag_is_prv_cash_trx', 'flag_is_rescore',
       'contract_date', 'job_name_map', 'job_type', 'net_income', 'net_burden',
       'first_income_up_date', 'net_income_first', 'net_burden_first',
       'last_income_up_date', 'net_income_last', 'net_burden_last',
       'ss_mc_program', 'ss_min_income', 'ss_max_income', 'ss_initial_cl',
       'job_map_min_salary', 'job_map_max_salary', 'insurance_type',
       'marital_status', 'children_count', 'address_governorate',
       'address_city', 'address_area', 'house_type', 'car_type_id',
       'car_model_year', 'club_level', 'mobile_os_type', 'is_iscore',
       'iscore_score', 'iscore_report', 'current_repayment_class',
       'first_ord_amount', 'first_ord_benefit', 'first_ord_tenor',
       'fo_par90_flag'],
      dtype='object') 
 Index(['client_id', 'ssn', 'phone_number_1', 'phone_number_2',
       'flag_is_mc_customer', 'flag_is_prv_cash_trx

In [330]:
# Select features and merge
# Left-join every prediction / result table onto the processed features by client_id,
# so each row of df_to_all_models is kept even when a table has no match for it.
def_df = (
    df_to_all_models
    .merge(income_predics, how='left', on='client_id')
    .merge(pd_predics, how='left', on='client_id')
    .merge(cc_results, how='left', on='client_id')
    .merge(income_groups, how='left', on='client_id')
    .merge(risk_cust_attrib_df[['client_id', 'contract_date']], how='left', on='client_id')
)

# selecting the test dataframe as reference
# .copy() makes reference_df_raw an independent frame: later cells assign new columns to
# it, and doing that on a boolean-indexed slice of def_df triggers SettingWithCopyWarning.
reference_df_raw = def_df[def_df['group'] == 'test'].copy()

### Analysis Data

In [331]:
# Only fetch a dedicated analysis set when it is not derived from the reference data.
if is_seperate_analysis_df:
    # NOTE(review): this looks like a placeholder query - confirm the real table name.
    query = '''
    SELECT * FROM table
    '''

    # No credentials are passed, so dataFetchingFunc opens a trusted connection.
    analysis_df = dataFetchingFunc(query, "BI-DR-DB", "master")

# 3- Datasets Configs

In [332]:
# Generic df preparation
def _with_contract_periods(frame):
    """Return a new frame with 'contract_date' parsed to datetime and the
    'year_quarter' / 'year_month' period columns derived from it."""
    return frame.assign(
        contract_date=lambda d: pd.to_datetime(d['contract_date']),
        year_quarter=lambda d: d['contract_date'].dt.to_period('Q'),
        year_month=lambda d: d['contract_date'].dt.to_period('M'),
    )


# assign() builds a new frame, so no column is written into a slice of another frame
# (the in-place version raised SettingWithCopyWarning).
reference_df_raw = _with_contract_periods(reference_df_raw)

# Truthiness is used here, like the guard on the cell that fetches analysis_df, so that
# exactly one branch always runs and reference_df / analysis_df are always defined.
if is_seperate_analysis_df:
    # analysis_df was fetched separately; the whole raw reference set is the reference.
    analysis_df = _with_contract_periods(analysis_df)
    reference_df = reference_df_raw.copy()
else:
    # TODO as we are working with delayed responses, limiting the analysis records gives more meaningful results
    reference_df_raw = reference_df_raw[reference_df_raw['year_quarter'] <= max_records_date]

    # Time-based split of the raw reference into reference and analysis:
    # take the most recent analysis_df_base_size records ...
    reference_df_raw = reference_df_raw.sort_values(by='contract_date', ascending=False)
    initial_analysis_df = reference_df_raw.head(analysis_df_base_size)

    # ... then move the cut back to the first day of the month of the oldest of them, so
    # no calendar month is shared between the two periods.
    # NOTE(review): replace(day=1) keeps any time-of-day component of the timestamp;
    # this is only an exact month start if contract_date holds pure dates - confirm.
    min_date_analysis_df = initial_analysis_df['contract_date'].min()
    start_of_month = min_date_analysis_df.replace(day=1)

    # Final reference and analysis dataframes. Rows whose contract_date is missing
    # satisfy neither comparison and therefore end up in neither frame.
    analysis_df = reference_df_raw[reference_df_raw['contract_date'] >= start_of_month]
    reference_df = reference_df_raw[reference_df_raw['contract_date'] < start_of_month]

In [333]:
# Placeholder for altering the analysis distribution when the analysis set is split
# off the reference data. Currently a no-op; the commented example adds Gaussian
# noise (mean 0, std 2) to 'age'.
if not is_seperate_analysis_df:
    # noise = np.random.normal(0, 2, analysis_df['age'].shape)
    # analysis_df['age'] = analysis_df['age'] + noise
    pass

In [334]:
# Preprocess reference and analysis dataframes
# Fresh 0..n-1 indexes for both frames
reference_df = reference_df.reset_index(drop=True)
analysis_df = analysis_df.reset_index(drop=True)

# Label each frame with the period it represents
reference_df['group'] = 'test_reference'
analysis_df['group'] = 'test_analysis'

# Binary prediction derived from pd_AR: 1 where pd_AR equals 0, otherwise 0
reference_df['default_predics'] = reference_df['pd_AR'].apply(lambda ar: 1 if ar == 0 else 0)
analysis_df['default_predics'] = analysis_df['pd_AR'].apply(lambda ar: 1 if ar == 0 else 0)

# Display Dataframe sizes and dates
for _heading, _frame in (
    ("Reference DataFrame:", reference_df),
    ("\nAnalysis DataFrame:", analysis_df),
):
    _dates = _frame['contract_date']
    print(_heading)
    print(f"Length: {len(_frame)}, Min Date: {_dates.min()}, Max Date: {_dates.max()}")

Reference DataFrame:
Length: 131451, Min Date: 2019-01-01 00:00:00, Max Date: 2022-11-30 00:00:00

Analysis DataFrame:
Length: 16112, Min Date: 2022-12-01 00:00:00, Max Date: 2023-09-30 00:00:00


# 4- Pre-Monitoring Analytics

In [335]:
# # Group by year and quarter
# df = pd.concat([reference_df, analysis_df], axis=0).reset_index(drop=True)
# grouped = df.groupby('year_quarter')

# # Calculate metrics
# records_count = grouped.size()
# average_actual_income = grouped['net_income_inflated'].mean()
# average_predicted_income = grouped['income_predics'].mean()
# average_defaults_percent = grouped['fo_par90_flag'].mean() * 100
# average_pd_percent = grouped['pd_predics'].mean() * 100

# def calculate_auc(group):
#     if len(group['fo_par90_flag'].unique()) == 2:
#         return roc_auc_score(group['fo_par90_flag'], group['pd_predics'])
#     return np.nan

# auc_per_group = grouped.apply(calculate_auc)
# gini_per_group = 2 * auc_per_group - 1

# def calculate_mape(group):
#     return mean_absolute_percentage_error(group['net_income_inflated'], group['income_predics'])*100

# mape_per_group = grouped.apply(calculate_mape)

# # Combine results into DataFrame
# result = pd.DataFrame({
#     'records_count': records_count,
#     'average_actual_income': average_actual_income,
#     'average_predicted_income': average_predicted_income,
#     'average_defaults_perc': average_defaults_percent,
#     'average_pd_perc': average_pd_percent,
#     'auc': auc_per_group,
#     'gini': gini_per_group,
#     'mape': mape_per_group
# })

# # Plot 1: Average Actual vs Predicted Income
# fig, ax1 = plt.subplots(figsize=(12, 6))
# bars = ax1.bar(result.index.astype(str), result['records_count'], color='navy', alpha=0.6, label='Records Count')
# ax1.set_xlabel('Year-Quarter')
# ax1.set_ylabel('Records Count', color='navy')
# ax1.tick_params(axis='y', labelcolor='navy')
# ax1.xaxis.set_major_locator(FixedLocator(range(len(result.index))))
# ax1.set_xticklabels(result.index.astype(str), rotation=45)
# for bar in bars:
#     yval = bar.get_height()
#     ax1.text(bar.get_x() + bar.get_width()/2, yval, int(yval), va='bottom', ha='center', color='navy')

# ax2 = ax1.twinx()
# ax2.plot(result.index.astype(str), result['average_actual_income'], color='darkred', marker='o', label='Average Actual Income')
# ax2.plot(result.index.astype(str), result['average_predicted_income'], color='darkred', linestyle='--', marker='x', label='Average Predicted Income')
# ax2.set_ylabel('Income', color='darkred')
# ax2.tick_params(axis='y', labelcolor='darkred')
# for i, txt in enumerate(result['average_actual_income']):
#     ax2.annotate(f'{txt:.0f}', (result.index.astype(str)[i], result['average_actual_income'][i]), textcoords="offset points", xytext=(0,10), ha='center', color='darkred')
# for i, txt in enumerate(result['average_predicted_income']):
#     ax2.annotate(f'{txt:.0f}', (result.index.astype(str)[i], result['average_predicted_income'][i]), textcoords="offset points", xytext=(0,10), ha='center', color='darkred')

# plt.title('Average Actual vs Predicted Income Over Time')
# fig.tight_layout()
# ax1.legend(loc='upper left')
# ax2.legend(loc='upper right')
# plt.show()

# # Plot 2: MAPE Over Time
# fig, ax4 = plt.subplots(figsize=(12, 6))
# ax4.plot(result.index.astype(str), result['mape'], color='teal', linestyle='-', marker='o', label='MAPE')
# ax4.set_xlabel('Year-Quarter')
# ax4.xaxis.set_major_locator(FixedLocator(range(len(result.index))))
# ax4.set_xticklabels(result.index.astype(str), rotation=45)
# for i, txt in enumerate(result['mape']):
#     ax4.annotate(f'{txt:.2f}', (result.index.astype(str)[i], result['mape'][i]), textcoords="offset points", xytext=(0,10), ha='center', color='teal')

# plt.title('Mean Absolute Percentage Error (MAPE) Over Time')
# fig.tight_layout()
# ax4.legend(loc='upper left')
# plt.show()

# # Plot 3: Average Defaults Percentage vs PD Percentage
# fig, ax1 = plt.subplots(figsize=(12, 6))
# bars = ax1.bar(result.index.astype(str), result['records_count'], color='navy', alpha=0.6, label='Records Count')
# ax1.set_xlabel('Year-Quarter')
# ax1.set_ylabel('Records Count', color='navy')
# ax1.tick_params(axis='y', labelcolor='navy')
# ax1.xaxis.set_major_locator(FixedLocator(range(len(result.index))))
# ax1.set_xticklabels(result.index.astype(str), rotation=45)
# for bar in bars:
#     yval = bar.get_height()
#     ax1.text(bar.get_x() + bar.get_width()/2, yval, int(yval), va='bottom', ha='center', color='navy')

# ax2 = ax1.twinx()
# ax2.plot(result.index.astype(str), result['average_defaults_perc'], color='darkred', marker='o', label='Average Defaults Percentage')
# ax2.plot(result.index.astype(str), result['average_pd_perc'], color='darkred', linestyle='--', marker='x', label='Average PD Percentage')
# ax2.set_ylabel('Percentage', color='darkred')
# ax2.tick_params(axis='y', labelcolor='darkred')
# for i, txt in enumerate(result['average_defaults_perc']):
#     ax2.annotate(f'{txt:.2f}', (result.index.astype(str)[i], result['average_defaults_perc'][i]), textcoords="offset points", xytext=(0,10), ha='center', color='darkred')
# for i, txt in enumerate(result['average_pd_perc']):
#     ax2.annotate(f'{txt:.2f}', (result.index.astype(str)[i], result['average_pd_perc'][i]), textcoords="offset points", xytext=(0,10), ha='center', color='darkred')

# plt.title('Average Defaults Percentage vs PD Percentage Over Time')
# fig.tight_layout()
# ax1.legend(loc='upper left')
# ax2.legend(loc='upper right')
# plt.show()

# # Plot 4: AUC and Gini Over Time
# fig, ax3 = plt.subplots(figsize=(12, 6))
# ax3.plot(result.index.astype(str), result['auc'], color='teal', linestyle='--', marker='s', label='AUC')
# ax3.plot(result.index.astype(str), result['gini'], color='purple', linestyle='-', marker='d', label='Gini')
# ax3.set_xlabel('Year-Quarter')
# ax3.set_ylabel('AUC / Gini')
# ax3.xaxis.set_major_locator(FixedLocator(range(len(result.index))))
# ax3.set_xticklabels(result.index.astype(str), rotation=45)
# for i, txt in enumerate(result['auc']):
#     ax3.annotate(f'{txt:.2f}', (result.index.astype(str)[i], result['auc'][i]), textcoords="offset points", xytext=(0,10), ha='center', color='forestgreen')
# for i, txt in enumerate(result['gini']):
#     ax3.annotate(f'{txt:.2f}', (result.index.astype(str)[i], result['gini'][i]), textcoords="offset points", xytext=(0,10), ha='center', color='purple')

# plt.title('AUC and Gini Coefficient Over Time')
# fig.tight_layout()
# ax3.legend(loc='upper left')
# plt.show()

# 5- ML Models Performance **estimated**

### Income Model Perf-Est

In [336]:
# Alert threshold for MAPE, expressed in standard deviations (lower multiplier 2, upper 1)
income_mape_threshold = nml.thresholds.StandardDeviationThreshold(
    std_lower_multiplier=2,
    std_upper_multiplier=1,
)

# DLE estimator for the income model: estimates MAPE per monthly chunk
income_estimator = nml.DLE(
    feature_column_names=income_model_feats_names,
    y_true='net_income_inflated',
    y_pred='income_predics',
    timestamp_column_name='contract_date',
    chunk_period='M',
    metrics=['mape'],
    thresholds={'mape': income_mape_threshold},
    # tune_hyperparameters=True,
)

# Fit on the reference period, then estimate on the analysis period
income_estimator.fit(reference_df)
income_estimated_results = income_estimator.estimate(analysis_df)

# Plot reference and analysis periods together
income_estimated_plot = income_estimated_results.filter(period='all').plot()
income_estimated_plot.show(width=3)

### PD Model Perf-Est

In [337]:
# Alert threshold for ROC AUC, expressed in standard deviations (lower multiplier 1, upper 2)
pd_auc_threshold = nml.thresholds.StandardDeviationThreshold(
    std_lower_multiplier=1,
    std_upper_multiplier=2,
)

# CBPE estimator for the PD model: estimates ROC AUC per monthly chunk
pd_estimator = nml.CBPE(
    problem_type='classification_binary',
    y_true='fo_par90_flag',
    y_pred='default_predics',
    y_pred_proba='pd_predics',
    timestamp_column_name='contract_date',
    chunk_period='M',
    metrics=['roc_auc'],
    thresholds={'roc_auc': pd_auc_threshold},
)

# Fit on the reference period, then estimate on the analysis period
pd_estimator.fit(reference_df)
pd_estimated_results = pd_estimator.estimate(analysis_df)

# Plot reference and analysis periods together
pd_estimated_plot = pd_estimated_results.filter(period='all').plot()
pd_estimated_plot.show(width=3)

# 6- ML Models Performance **Realized vs Estimated**

### Income Model Perf-Comp

In [338]:
if is_realized_performance:
    # Realized MAPE of the income model per monthly chunk, using the same alert
    # threshold as the estimator.
    income_calculator = nml.PerformanceCalculator(
        problem_type='regression',
        y_true='net_income_inflated',
        y_pred='income_predics',
        timestamp_column_name='contract_date',
        chunk_period='M',
        metrics=['mape'],
        thresholds={'mape': income_mape_threshold},
    )

    # Fit on the reference period, then calculate on the analysis period
    income_calculator.fit(reference_df)
    income_realized_results = income_calculator.calculate(analysis_df)

    # Compare realized against estimated performance for the analysis period
    income_comparison = income_realized_results.filter(period='analysis').compare(income_estimated_results)
    income_comparison.plot().show(width=1)

### PD Model Perf-Comp

In [350]:
if is_realized_performance:
    # Realized ROC AUC of the PD model per monthly chunk, computed from the targets
    # with the same alert threshold as the estimator.
    pd_calculator = nml.PerformanceCalculator(
        problem_type='classification_binary',
        y_true='fo_par90_flag',
        y_pred='default_predics',
        y_pred_proba='pd_predics',
        timestamp_column_name='contract_date',
        chunk_period='M',
        metrics=['roc_auc'],
        thresholds={'roc_auc': pd_auc_threshold},
    )

    # Fit on the reference period, then calculate on the analysis period
    pd_calculator.fit(reference_df)
    pd_realized_results = pd_calculator.calculate(analysis_df)

    # Compare realized against estimated performance for the analysis period
    pd_comparison = pd_realized_results.filter(period='analysis').compare(pd_estimated_results)
    pd_comparison.plot().show(width=1)

# 7- Data Drift

### Setup

In [340]:
# Choose the performance results that the drift correlation ranking is measured against.
# Truthiness is used (instead of `== True` / `== False`) to match the
# `if is_realized_performance:` guards on the cells that build the realized results:
# exactly one branch always runs, so income_results and pd_results are always defined.
if is_realized_performance:
    income_results = income_realized_results
    pd_results = pd_realized_results
else:
    income_results = income_estimated_results
    pd_results = pd_estimated_results

### Univariate Analysis with Correlation

In [341]:
# Optional custom threshold (disabled)
# uni_drift_threshold = nml.thresholds.StandardDeviationThreshold(std_lower_multiplier=3, std_upper_multiplier=3)

# Univariate drift calculator over the PD model features, one chunk per month
uni_drift_calculator = nml.UnivariateDriftCalculator(
    timestamp_column_name='contract_date',
    chunk_period='M',
    column_names=pd_model_feats_names,
    # thresholds={
    # 'jensen_shannon': uni_drift_threshold
    # }
)

# Fit on the reference period
uni_drift_calculator.fit(reference_df)

# Calculate drift on the analysis period
uni_drift_results = uni_drift_calculator.calculate(analysis_df)

In [342]:
# --- Ranking by number of alerts (Jensen-Shannon results only) ---
alerts_ranker = nml.AlertCountRanker()
alerts_ranked_features = alerts_ranker.rank(
    uni_drift_results.filter(methods=['jensen_shannon']),
    only_drifting=False,
)
column_order = ['column_name', 'number_of_alerts', 'rank']
alerts_ranked_features = alerts_ranked_features[column_order]

# --- Ranking by correlation between drift and model performance ---
# Each ranker is fitted on a single metric over the reference period only ...
income_drift_ranker = nml.CorrelationRanker()
income_drift_ranker.fit(income_results.filter(period='reference', metrics=['mape']))

pd_drift_ranker = nml.CorrelationRanker()
pd_drift_ranker.fit(pd_results.filter(period='reference', metrics=['roc_auc']))

# ... and ranks one drift method against one performance metric.
income_correlation_ranked_features = income_drift_ranker.rank(
    uni_drift_results.filter(methods=['jensen_shannon']),
    income_results.filter(metrics=['mape']),
    only_drifting=False,
)
pd_correlation_ranked_features = pd_drift_ranker.rank(
    uni_drift_results.filter(methods=['jensen_shannon']),
    pd_results.filter(metrics=['roc_auc']),
    only_drifting=False,
)

In [343]:
# Merge the three rankings into one table of drifting features.
# Each alias becomes the column prefix of the frame at the same position.
join_column = 'column_name'
aliases = ['alerts', 'income', 'pd']
dataframes = [
    alerts_ranked_features,
    income_correlation_ranked_features,
    pd_correlation_ranked_features,
]
ranked_features = generate_combined_drift_rank(
    dataframes=dataframes,
    aliases=aliases,
    join_column=join_column,
    simplify=True,
)
ranked_features

Unnamed: 0,feature_name,uni_drift_alerts,drifting_rank
0,have_club_id,2,1
1,have_car,1,2
2,address_area,5,3
3,address_city,7,4
4,ssn_governorate,1,5
5,address_governorate,8,6
6,ClosedAccounts_count,1,7
7,ssn_is_male,1,8
8,car_model_category,2,9
9,ClosedAccounts_InstallmentAmount_sum,2,10


In [358]:
# Drift and distribution plots for the selected columns over the analysis period.
# To plot every ranked feature instead, use ranked_features['feature_name'].
check_columns = ['ssn_is_male']
plot_drift_and_distribution(
    uni_drift_results=uni_drift_results,
    column_names=check_columns,
    period='analysis',
)