In [2]:
import sys
sys.path.append('/data/joramvandriel/ai-monitoring-streamlit-dashboard/')
from drift_dataset import DriftDataset
from drift_detector import MeanStdDetector
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

## Inlezen van DriftDataset

In [10]:
# postprocessing functie
def mark_datasets(df: pd.DataFrame):
    """Post processing function to add markings for labelled and unlabelled samples in each dataset"""
    
    def mark_labels(df: pd.DataFrame):
        """To be applied over groups, marking the datasets"""
        df = df.copy()
        [idx] = df.dataset_index.unique()
        if idx == 0:
            train_label = 'train'
            oos_label = 'oos'
        else:
            train_label = f'labelled_{idx}'
            oos_label = f'unlabelled_{idx}'

        begindatum = max(df.Begindatum)
        mask = df.Begindatum < begindatum
        df['dataset_markering'] = None
        df.loc[mask, 'dataset_markering'] = train_label
        df.loc[~mask, 'dataset_markering'] = oos_label
        
        return df
    
    df = df.copy()
    df = df.groupby(['dataset_index']).apply(mark_labels)
    df = df.reset_index(drop=True)
    
    return df


In [11]:
binnenvaart_pad = Path.home()/'share'/'Binnenvaart'/'Monitoring'
gini = pd.read_csv(binnenvaart_pad/'meta'/'meanDecreaseGini.csv')
features = gini.Variable[:10].tolist() + ['Begindatum', 'dataset_markering'] # laatste wordt toegevoegd door mark_datasets()

dd = DriftDataset(binnenvaart_pad/'data', post_process=mark_datasets)
dd.keep_columns(features) 

In [12]:
dd.keep_columns(features) 

# AI Binnenvaart specifiek, voorspellingsdata vergelijken alleen met de training data, niet met de out of sample op moment van de training.
# Daarom hierboven de postprocessing functie aangemaakt, daar kan ik nu dus op filteren:

data = dd.dataset
data = data[data.dataset_markering != 'oos']  # alleen wegfilteren bij uitlezen, deze data zit dus wel nog steeds in de DriftDataset. 

labelled_data = dd.dataset  
train_mask = labelled_data.dataset_markering == 'train'
labelled_mask = labelled_data.dataset_markering.str.contains('^labelled')
mask = train_mask | labelled_mask
labelled_data = labelled_data[mask]

In [16]:
labelled_data.dataset_markering

0               train
1               train
2               train
3               train
4               train
             ...     
397514    labelled_10
397578    labelled_10
397643    labelled_10
397644    labelled_10
397655    labelled_10
Name: dataset_markering, Length: 56824, dtype: object

In [17]:
avg = MeanStdDetector(labelled_data)

Python 3.9.7 (default, Sep 16 2021, 13:09:58) 
Type 'copyright', 'credits' or 'license' for more information
IPython 7.29.0 -- An enhanced Interactive Python. Type '?' for help.



In [1]:  quit





In [19]:
from scipy.stats import ks_2samp

In [65]:
features = dd.numeric_columns()
data = labelled_data

In [94]:
refset = data.loc[data['dataset_markering']=='train']
refset

Unnamed: 0,Lat_insploc,Bouwjaar,laatstAlgemeen,dataset_markering,Begindatum,soortBeroepsvaartuig,Tonnage,Lengte_vaartuig,Breedte_vaartuig,dataset_date,dataset_index,insp_weekdag,Diepgang_vaartuig,Lon_insploc
0,52.191315,1965,3680,train,2015-05-22,onbekend,1037.0,49.0,7.0,2020-03-16,0,Fri,2.0,4.528120
1,52.175522,1965,3680,train,2015-05-22,onbekend,1037.0,49.0,7.0,2020-03-16,0,Fri,2.0,4.517810
2,51.636654,1965,3680,train,2017-09-27,onbekend,1037.0,49.0,7.0,2020-03-16,0,Wed,2.0,4.247420
3,0.000000,1965,3680,train,2018-01-11,onbekend,1037.0,49.0,7.0,2020-03-16,0,Thu,2.0,0.000000
4,51.707363,1965,3680,train,2016-09-07,onbekend,1037.0,49.0,7.0,2020-03-16,0,Wed,2.0,4.589214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37858,0.000000,1965,3680,train,2015-07-28,onbekend,1037.0,49.0,7.0,2020-03-16,0,Tue,2.0,0.000000
37860,52.195004,1965,3680,train,2015-08-07,onbekend,1037.0,49.0,7.0,2020-03-16,0,Fri,2.0,6.538424
37875,0.000000,1965,3680,train,2015-05-28,onbekend,1037.0,49.0,7.0,2020-03-16,0,Thu,2.0,0.000000
37876,52.060761,2013,3680,train,2015-04-21,onbekend,1037.0,11.0,7.0,2020-03-16,0,Tue,2.0,4.661255


In [92]:
def kstest(x,y):
    features = list(x.select_dtypes('number').columns)
    result = { fi : ks_2samp(x[fi],y[fi]).statistic for fi in features }
    return pd.Series(result)   

In [100]:
ks = data.groupby(['dataset_index', 'dataset_date']).apply(kstest, y=refset)

In [101]:
ks

Unnamed: 0_level_0,Unnamed: 1_level_0,Lat_insploc,Bouwjaar,laatstAlgemeen,Tonnage,Lengte_vaartuig,Breedte_vaartuig,dataset_index,Diepgang_vaartuig,Lon_insploc
dataset_index,dataset_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2020-03-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-02-01,0.066496,0.035978,0.318698,0.030782,0.033209,0.033232,1.0,0.026302,0.062052
2,2021-09-28,0.115403,0.056107,0.282146,0.067065,0.064261,0.059185,1.0,0.075982,0.106442
3,2021-10-06,0.116597,0.055523,0.284064,0.066709,0.06413,0.058355,1.0,0.075607,0.107625
4,2021-10-28,0.123092,0.056629,0.287166,0.06902,0.067018,0.06007,1.0,0.077271,0.114759
5,2021-11-10,0.124187,0.057235,0.365751,0.070185,0.067946,0.06144,1.0,0.077758,0.116855
6,2021-11-10,0.124187,0.057235,0.365751,0.070185,0.067946,0.06144,1.0,0.077758,0.116855
7,2021-11-15,0.123674,0.058359,0.365427,0.071094,0.069255,0.062026,1.0,0.07859,0.11639
8,2021-12-01,0.126366,0.058286,0.363396,0.071156,0.06999,0.062028,1.0,0.079195,0.119018
9,2021-12-15,0.128192,0.057677,0.362892,0.069676,0.068786,0.061226,1.0,0.078142,0.120731


In [96]:
drift = data.groupby(['dataset_index', 'dataset_date']).agg(['mean']) # use as filler

In [98]:
drift.columns = ['_'.join(pair) for pair in drift.columns] # resetting the columns

In [99]:
drift

Unnamed: 0_level_0,Unnamed: 1_level_0,Lat_insploc_mean,Bouwjaar_mean,laatstAlgemeen_mean,Tonnage_mean,Lengte_vaartuig_mean,Breedte_vaartuig_mean,Diepgang_vaartuig_mean,Lon_insploc_mean
dataset_index,dataset_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2020-03-16,42.169242,1979.251445,1371.423943,1734.507514,77.66911,9.271678,2.713059,3.993804
1,2021-02-01,38.975015,1980.93786,1312.792,1812.505512,79.382618,9.412327,2.744605,3.697811
2,2021-09-28,36.654288,1982.199415,1179.385756,1941.944029,82.246094,9.588423,2.835298,3.472688
3,2021-10-06,36.59338,1982.154268,1186.382733,1940.483403,82.191521,9.585732,2.83416,3.468035
4,2021-10-28,36.225854,1982.203866,1195.562375,1946.743058,82.323499,9.588776,2.835783,3.432665
5,2021-11-10,36.11876,1982.247844,1507.814518,1947.435029,82.405071,9.596449,2.835947,3.422339
6,2021-11-10,36.11876,1982.247844,1507.814518,1947.435029,82.405071,9.596449,2.835947,3.422339
7,2021-11-15,36.143282,1982.292864,1506.225327,1951.684985,82.437429,9.599552,2.83843,3.423794
8,2021-12-01,36.007546,1982.285395,1497.176315,1955.285209,82.472799,9.601227,2.837465,3.410873
9,2021-12-15,35.920496,1982.254147,1494.12779,1950.502285,82.346172,9.596756,2.835259,3.404352


In [40]:
for ti in list(data.dataset_markering.unique()):
    for fi in features:
        # Kolmogorov-Smirnov test
        KS = ks_2samp(data.loc[data['dataset_markering']=='train'][fi].to_numpy(), data.loc[data['dataset_markering']==ti][fi].to_numpy())
        drift.append(
            {
                'dataset_markering': ti,
                'kstest': KS.statistic,
                'feature': fi
            }
        )

## Kullback-Leibler divergence

In [116]:
from scipy.stats import entropy
import numpy as np
from scipy.spatial import distance

In [105]:
p = data.loc[data['dataset_index']==0]['Bouwjaar'].to_numpy()
q = data.loc[data['dataset_index']==1]['Bouwjaar'].to_numpy()

In [109]:
def jsdist(df1,df2):
    if len(df1)>len(df2):
        np.random.shuffle(df1)
        df1 = df1[0:len(df2)]
    elif len(df1)<len(df2):
        np.random.shuffle(df2)
        df2 = df2[0:len(df1)]
    return distance.jensenshannon(df1, df2)


In [110]:
def kldist(df1,df2):
    if len(df1)>len(df2):
        np.random.shuffle(df1)
        df1 = df1[0:len(df2)]
    elif len(df1)<len(df2):
        np.random.shuffle(df2)
        df2 = df2[0:len(df1)]
    return entropy(df1, df2)

In [113]:
kl = kldist(p,q)

In [114]:
kl

0.00020436009850348423

In [117]:
js = jsdist(p,q)
js

5.029229207909709e-05