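• The evidently library is used to evaluate and monitor data and ML model quality; in this notebook it is used to detect data drift between two datasets and to render the results as a report.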

• The Report class from the evidently.report module is used to generate a report on the data analysis.

• The DataDriftTable class from the evidently.metrics module is used to calculate data drift column by column between a reference and a current dataset and to present the per-column results as a table.

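• The DatasetDriftMetric class from the evidently.metrics module is used to summarize drift at the dataset level: the number and share of drifted columns, and an overall flag saying whether the dataset as a whole has drifted.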

In [2]:

# import libraries

import pandas as pd

import numpy as np

from sklearn import datasets

import evidently
from evidently.report import Report

from evidently.metrics import DataDriftTable

from evidently.metrics import DatasetDriftMetric


In [5]:

# Create the reference and current datasets for drift detection.
# Fetch version 2 of the 'adult' dataset from the OpenML repository.
adult_data = datasets.fetch_openml(name='adult', version=2, as_frame='auto')

adult = adult_data.frame

# Education levels that define the "current" population; every other level
# goes to the reference population, so the two sets differ by construction.
CURRENT_EDUCATION_LEVELS = ['Some-college', 'HS-grad', 'Bachelors']
is_current = adult.education.isin(CURRENT_EDUCATION_LEVELS)

# Take explicit copies: adult_cur is modified below, and writing into a
# filtered selection of `adult` is what triggers pandas' SettingWithCopyWarning.
# Reference set: rows whose education is NOT one of the levels above.
adult_ref = adult[~is_current].copy()
# Current set: rows whose education IS one of the levels above.
adult_cur = adult[is_current].copy()

# Blank out the columns at positions 3 and 4 (0-based: 'education' and
# 'education-num') in the first 2000 rows of adult_cur.
# This simulates missing data to test how the drift detection handles missing values.
adult_cur.iloc[:2000, 3:5] = np.nan

In [4]:
# Preview the first rows of the full adult DataFrame as plain text
adult_preview = adult.head()
print(adult_preview)

   age  workclass  fnlwgt     education  education-num      marital-status  \
0   25    Private  226802          11th              7       Never-married   
1   38    Private   89814       HS-grad              9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm             12  Married-civ-spouse   
3   44    Private  160323  Some-college             10  Married-civ-spouse   
4   18        NaN  103497  Some-college             10       Never-married   

          occupation relationship   race     sex  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                NaN    Own-child  White  Female             0             0   

   hours-per-week native-country  class  
0       

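• The DatasetDriftMetric compares the feature distributions of the reference and current datasets and reports the number and share of drifted columns; the dataset as a whole is flagged as drifted only when that share reaches the drift_share threshold (0.5 in the output below).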

• The DataDriftTable creates a table that shows the drift between the reference and current datasets for each feature.

In [6]:

# Dataset-level drift report: an overall drift summary (DatasetDriftMetric)
# followed by the per-column drift table (DataDriftTable).
drift_metrics = [DatasetDriftMetric(), DataDriftTable()]
data_drift_dataset_report = Report(metrics=drift_metrics)

# Compare the current population against the reference population.
data_drift_dataset_report.run(
    reference_data=adult_ref,
    current_data=adult_cur,
)

# Save the rendered report as a standalone HTML file.
data_drift_dataset_report.save_html("report.html")



In [22]:
# Export the report in JSON format
import json

# Report.json() returns the report already serialized as a JSON *string*
# (see the quoted output of this cell), so it is called once and reused.
json_data = data_drift_dataset_report.json()

# Write the JSON data to a file. The string is parsed first: dumping the raw
# string would store one escaped, double-encoded string literal instead of a
# JSON object, and indent=4 would have no effect.
with open('data_drift_report.json', 'w') as f:
    json.dump(json.loads(json_data), f, indent=4)

# Show the report in JSON format
json_data


'{"version": "0.2.8", "timestamp": "2024-02-12 21:46:51.060986", "metrics": [{"metric": "DatasetDriftMetric", "result": {"drift_share": 0.5, "number_of_columns": 15, "number_of_drifted_columns": 5, "share_of_drifted_columns": 0.3333333333333333, "dataset_drift": false}}, {"metric": "DataDriftTable", "result": {"number_of_columns": 15, "number_of_drifted_columns": 5, "share_of_drifted_columns": 0.3333333333333333, "dataset_drift": false, "drift_by_columns": {"age": {"column_name": "age", "column_type": "num", "stattest_name": "Wasserstein distance (normed)", "stattest_threshold": 0.1, "drift_score": 0.18534692319042428, "drift_detected": true, "current": {"small_distribution": {"x": [17.0, 24.3, 31.6, 38.9, 46.2, 53.5, 60.8, 68.1, 75.4, 82.7, 90.0], "y": [0.02471021672878118, 0.025839691234843417, 0.0262859521410848, 0.025211766596857754, 0.015942967066340047, 0.010173168977679455, 0.0061528716099474344, 0.0018640278561586543, 0.000568686464590777, 0.0002369526935794904]}}, "reference":

2024-02-12 21:47:07.298 
  command:

    streamlit run /Users/axu/Library/Python/3.11/lib/python/site-packages/ipykernel_launcher.py [ARGUMENTS]
