In [None]:
!pip install nannyml


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nannyml
  Downloading nannyml-0.8.1-py3-none-any.whl (15.5 MB)
[K     |████████████████████████████████| 15.5 MB 10.7 MB/s 
[?25hCollecting category-encoders<3.0.0,>=2.3.0
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 923 kB/s 
Collecting FLAML<2.0.0,>=1.0.11
  Downloading FLAML-1.0.14-py3-none-any.whl (208 kB)
[K     |████████████████████████████████| 208 kB 71.7 MB/s 
[?25hCollecting matplotlib<4.0.0,>=3.5.1
  Downloading matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 57.4 MB/s 
[?25hCollecting lightgbm<4.0.0,>=3.3.2
  Downloading lightgbm-3.3.3-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 67.8 MB/s 
Collecting plotly<6.0.0,>=5.6.0
  Downloading plotly-5.11.0-py2.py3-

In [None]:
import nannyml as nml
from IPython.display import display


In [None]:
# Fetch NannyML's bundled synthetic binary-classification data set; the loader
# returns three objects, unpacked here as reference, analysis and analysis_target.
reference, analysis, analysis_target = nml.load_synthetic_binary_classification_dataset()

# Preview the first rows of the reference and analysis frames.
display(reference.head(), analysis.head())

# Number of observations per chunk, shared by the estimator and drift calculators.
chunk_size = 5000

Unnamed: 0,distance_from_office,salary_range,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,identifier,work_home_actual,timestamp,y_pred_proba,period,y_pred
0,5.962247,40K - 60K €,2.119485,8.568058,False,Friday,0.212653,0,1,2014-05-09 22:27:20,0.99,reference,1
1,0.535872,40K - 60K €,2.357199,5.425382,True,Tuesday,4.927549,1,0,2014-05-09 22:59:32,0.07,reference,0
2,1.969519,40K - 60K €,2.366849,8.247158,False,Monday,0.520817,2,1,2014-05-09 23:48:25,1.0,reference,1
3,2.53041,20K - 40K €,2.318722,7.944251,False,Tuesday,0.453649,3,1,2014-05-10 01:12:09,0.98,reference,1
4,2.253635,60K+ €,2.221265,8.884478,True,Thursday,5.695263,4,1,2014-05-10 02:21:34,0.99,reference,1


Unnamed: 0,distance_from_office,salary_range,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,identifier,timestamp,y_pred_proba,period,y_pred
0,0.527691,0 - 20K €,1.800003,8.960724,False,Tuesday,4.224628,50000,2017-08-31 04:20:00,0.99,analysis,1
1,8.485134,20K - 40K €,2.222074,8.768792,False,Friday,4.963103,50001,2017-08-31 05:16:16,0.98,analysis,1
2,2.073876,40K - 60K €,2.310077,8.649979,True,Friday,4.588951,50002,2017-08-31 05:56:44,0.98,analysis,1
3,0.118456,20K - 40K €,2.171441,8.855418,False,Tuesday,4.711015,50003,2017-08-31 06:10:17,0.97,analysis,1
4,4.786705,0 - 20K €,2.368541,8.394966,False,Monday,0.906738,50004,2017-08-31 06:29:38,0.92,analysis,1


In [None]:
# Configure CBPE with the relevant column names, fit it on the reference data
# (which contains the targets), then estimate performance on the analysis data.
estimator = nml.CBPE(
    problem_type='classification_binary',
    metrics=['roc_auc'],
    y_true='work_home_actual',
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    chunk_size=chunk_size,
).fit(reference)
estimated_performance = estimator.estimate(analysis)


# CBPE provides unbiased estimation of performance of the monitored model based on the monitored model’s outputs only (i.e. without access to targets).


# y_pred_proba (ModelOutputsType) – Name(s) of the column(s) containing your model output.
# y_pred (str) – The name of the column containing your model predictions.
# y_true (str) – The name of the column containing target values (which are provided in the reference data during fitting).
# metrics – A list of metrics to calculate.
# chunk_size – Splits the data into chunks containing chunk_size observations.
# fit() – Fits the estimator on the reference data.
# estimate() – Estimates performance metrics for the data passed to it. That data must contain either the predicted labels or the prediction scores/probabilities (depending on the metric to be calculated).

In [None]:
# Plot the estimated ROC AUC; plot_reference=True requests the reference
# period in the same figure.
figure = estimated_performance.plot(
    kind='performance',
    metric='roc_auc',
    plot_reference=True,
)
figure.show()

# Here, alerts are shown in the analysis period where the performance has gone down.

In [None]:
# Feature columns: every column of the reference frame except the timestamp,
# period, target and identifier columns. The result feeds the drift calculators.
# NOTE(review): 'y_pred' and 'y_pred_proba' are not excluded, so the model
# outputs are analysed alongside the input features — confirm this is intended.
feature_column_names = [
    name
    for name in reference.columns
    if name not in {'timestamp', 'period', 'work_home_actual', 'identifier'}
]

In [None]:
# Univariate drift detection: configure the calculator with the methods to use
# for continuous and categorical columns, fit it on the reference data, then
# compute drift results for the analysis data.
univariate_calculator = nml.UnivariateDriftCalculator(
    column_names=feature_column_names,
    continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
    categorical_methods=['chi2'],
    chunk_size=chunk_size,
).fit(reference)
univariate_results = univariate_calculator.calculate(analysis)


#The UnivariateDriftCalculator class implements the functionality needed for univariate drift detection.
#Next, the fit() method needs to be called on the reference data, which provides the baseline that the analysis data will be compared with. 
#Then the calculate() method will calculate the drift results on the data provided to it.

In [None]:
# One Jensen-Shannon drift figure per continuous column, reference period included.
# (Kolmogorov-Smirnov results are also computed by the calculator but not plotted here.)
for column_name in univariate_calculator.continuous_column_names:
    figure = univariate_results.plot(
        column_name=column_name, method='jensen_shannon', kind='drift', plot_reference=True
    )
    figure.show()

In [None]:
# One chi-squared drift figure per categorical column, reference period included.
for column_name in univariate_calculator.categorical_column_names:
    figure = univariate_results.plot(
        column_name=column_name, method='chi2', kind='drift', plot_reference=True
    )
    figure.show()

In [None]:
# Multivariate drift via data reconstruction with PCA: build the calculator,
# then fit it on the reference data.
rcerror_calculator = nml.DataReconstructionDriftCalculator(
    chunk_size=chunk_size,
    column_names=feature_column_names,
)
rcerror_calculator = rcerror_calculator.fit(reference_data=reference)

# Compute the reconstruction-error statistics for the analysis data and plot
# them together with the reference period.
rcerror_results = rcerror_calculator.calculate(analysis)
figure = rcerror_results.plot(plot_reference=True, kind='drift')
figure.show()