### Install the alibi and alibi_detect libraries

In [2]:
!pip install alibi alibi_detect

Collecting alibi
  Downloading alibi-0.9.6-py3-none-any.whl.metadata (22 kB)
Collecting alibi_detect
  Downloading alibi_detect-0.12.0-py3-none-any.whl.metadata (28 kB)
Collecting scikit-image<0.23,>=0.17.2 (from alibi)
  Downloading scikit_image-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting Pillow<11.0,>=5.4.1 (from alibi)
  Downloading pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting attrs<24.0.0,>=19.2.0 (from alibi)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting dill<0.4.0,>=0.3.0 (from alibi)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting numba!=0.54.0,<0.60.0,>=0.50.0 (from alibi_detect)
  Downloading numba-0.59.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba!=0.54.0,<0.60.0,>=0.50.0->alibi_detect)
  Downloading llvmlite-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux201

In [2]:
import alibi
from alibi_detect.cd import ChiSquareDrift, TabularDrift
from alibi_detect.saving import save_detector, load_detector
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn


In [5]:
# Load the songs dataset straight from the course repository (needs network access).
songs_data_url = "https://raw.githubusercontent.com/anmol-master/isb-the-overfitters/main/MLCT1/Data/song_data/song_data.parquet"
songs_data = pd.read_parquet(songs_data_url)

In [21]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
songs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18835 entries, 0 to 18834
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   song_name         18835 non-null  object 
 1   song_popularity   18835 non-null  int16  
 2   song_duration_ms  18835 non-null  int16  
 3   acousticness      18835 non-null  float64
 4   danceability      18835 non-null  float64
 5   energy            18835 non-null  float64
 6   instrumentalness  18835 non-null  float64
 7   key               18835 non-null  int16  
 8   liveness          18835 non-null  float64
 9   loudness          18835 non-null  float64
 10  audio_mode        18835 non-null  int16  
 11  speechiness       18835 non-null  float64
 12  tempo             18835 non-null  float64
 13  time_signature    18835 non-null  int16  
 14  audio_valence     18835 non-null  float64
dtypes: float64(9), int16(5), object(1)
memory usage: 1.6+ MB


In [7]:
# Every column of the frame is monitored for drift, in its original order;
# positions in this list are the column indices used by the detector.
x_features = songs_data.columns.tolist()

In [8]:
# Show the monitored columns; list position == column index.
x_features

['song_name',
 'song_popularity',
 'song_duration_ms',
 'acousticness',
 'danceability',
 'energy',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'audio_mode',
 'speechiness',
 'tempo',
 'time_signature',
 'audio_valence']

#### Specify the indices of the columns that are categorical features

In [9]:
# Column indices to treat as categorical. Looking the index up by name keeps
# the intent visible instead of relying on a bare 0.
# NOTE(review): key, audio_mode and time_signature are int16 columns that look
# discrete but are not listed here, so they are tested as numerical features —
# confirm this is intended.
cat_vars = [x_features.index('song_name')]

In [10]:
# X keeps every column in x_features (which still includes song_popularity);
# y is the song_popularity column on its own.
X = songs_data.loc[:, x_features]
y = songs_data['song_popularity']

### Split the dataset into two sets

**Note**: In this example, the data is split to create train and production datasets. This is done only for the lab session. In the real world, the production data will come from the inference system.

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Keep 80% of the rows as the reference (training) set and hold out the rest
# as a stand-in for production data; the fixed random_state makes the split
# reproducible.
X_train, X_prod, y_train, y_prod = train_test_split(
    X, y, train_size=0.8, random_state=45
)

In [13]:
# Map each categorical column index to None, i.e. no explicit category list
# is supplied for it.
categories_per_feature = dict.fromkeys(cat_vars)

In [14]:
# Show the index -> categories mapping that is handed to the drift detector.
categories_per_feature

{0: None}

### Measure the drift

In [15]:
# Build the drift detector with the training split as reference data.
# Columns named in categories_per_feature are handled as categorical; 0.05 is
# the significance threshold passed to the detector.
cd = TabularDrift(
    X_train.to_numpy(),
    p_val=0.05,
    categories_per_feature=categories_per_feature,
)

In [16]:
# Directory the detector is written to; change it to wherever the detector
# should be stored.
# legacy=True is forwarded to alibi_detect's save_detector — presumably it
# selects the library's older on-disk format; see the alibi_detect saving docs.
filepath = "songsdrift"
save_detector(cd, filepath, legacy=True)



In [17]:
# Reload the detector from disk and rebind cd, so the prediction that follows
# runs against the persisted copy rather than the in-memory object.
cd = load_detector(filepath)



In [18]:
# Test the held-out "production" split against the reference data. The result
# is read later through preds['data']['distance'] and preds['data']['p_val'].
preds = cd.predict(X_prod.to_numpy())

### Printing the test results

- KS test for the numerical features
- chi-squared test for the categorical features

In [19]:
# One line per feature: which test was applied, its statistic and its p-value.
# Features whose index is a key of categories_per_feature are labelled Chi2,
# all others K-S.
distances = preds['data']['distance']
p_values = preds['data']['p_val']
for idx in range(cd.n_features):
    test_name = 'Chi2' if idx in categories_per_feature else 'K-S'
    print(f'{x_features[idx]} -- {test_name} {distances[idx]:.3f} -- p-value {p_values[idx]:.3f}')

song_name -- Chi2 13053.816 -- p-value 0.536
song_popularity -- K-S 0.011 -- p-value 0.882
song_duration_ms -- K-S 0.009 -- p-value 0.958
acousticness -- K-S 0.012 -- p-value 0.743
danceability -- K-S 0.016 -- p-value 0.389
energy -- K-S 0.013 -- p-value 0.640
instrumentalness -- K-S 0.010 -- p-value 0.943
key -- K-S 0.010 -- p-value 0.923
liveness -- K-S 0.010 -- p-value 0.896
loudness -- K-S 0.017 -- p-value 0.379
audio_mode -- K-S 0.012 -- p-value 0.801
speechiness -- K-S 0.013 -- p-value 0.701
tempo -- K-S 0.036 -- p-value 0.001
time_signature -- K-S 0.005 -- p-value 1.000
audio_valence -- K-S 0.014 -- p-value 0.603
