In [38]:

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from IPython.core.display_functions import display
import sklearn.metrics as metrics
import multiprocessing as mp

random_seed = 1
np.random.seed(random_seed)


In [39]:
plt.style.use('seaborn-colorblind')

# from https://jwalton.info/Embed-Publication-Matplotlib-Latex/
tex_fonts = {
    # Use LaTeX to write all text
    "text.usetex": True,
    "font.family": "serif",
    # Use 11pt font in plots, to match 11pt font in document
    "axes.labelsize": 11,
    "font.size": 11
}
plt.rcParams.update(tex_fonts)
# tex_plots_path = f'../bachelor-thesis/plots/pdfs/{common_id}/'


In [40]:
import glob

file_list = glob.glob('./data/predictions/raw/**/*.parquet', recursive=True)

predictions = []

for fp in file_list:
    file_name = fp.split('/')[-1]
    metadata = file_name.split('_')
    # remove .parquet
    metadata = [m.split('.')[0] for m in metadata]
    # df = pd.read_parquet(fp)
    metadata_dict = {
        'file_path': fp,
        'normalized': fp.split('/')[-3] == 'normalized',
        'window_size': None if metadata[0] == 'None' else int(metadata[0]),
        'center_window': metadata[1] == 'cw',
        'model_type': metadata[2],
        'common_id': fp.split('/')[-2]
    }
    predictions.append(metadata_dict)
    # print(fp.split('/'))
    # break

In [41]:
len(predictions)


5880

In [42]:
def get_prediction_summary(metadata_dict):
    predictions_summary = []
    pred_df = pd.read_parquet(metadata_dict['file_path'])
    threshold_min = 1
    threshold_max = 100
    threshold_steps = 300
    thresholds = np.linspace(threshold_min, threshold_max, threshold_steps)
    y_true = pred_df['is_outlier'].astype(int).to_numpy()
    m = pred_df['m'].to_numpy()
    for threshold in thresholds:
        y_pred = np.where(m > threshold, 1, 0)
        tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
        f1_score = metrics.f1_score(y_true, y_pred, zero_division=0)

        predictions_summary.append({
            'common_id': metadata_dict['common_id'],
            'window_size': metadata_dict['window_size'],
            'center_window': metadata_dict['center_window'],
            'model_type': metadata_dict['model_type'],
            'normalized': metadata_dict['normalized'],
            'threshold': threshold,
            'f1_score': f1_score,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
        })
    return predictions_summary
# predictions_summary_df = pd.DataFrame(predictions_summary)
# predictions_summary_df.info()

In [43]:
with mp.Pool(processes=10) as executor:
    results = executor.map(get_prediction_summary, predictions)
    result_lst = [item for sublist in results for item in sublist]


In [44]:
predictions_summary_df = pd.DataFrame(result_lst)

In [45]:
predictions_summary_df

Unnamed: 0,common_id,window_size,center_window,model_type,normalized,threshold,f1_score,tn,fp,fn,tp
0,2386-ch,2.0,False,mad-z-score,False,1.0,0.023202,42390,8080,3,96
1,2386-ch,27.0,True,delta-z-score,False,1.0,0.009620,30292,20178,1,98
2,2386-ch,16.0,False,mad-z-score,False,1.0,0.003908,0,50470,0,99
3,2386-ch,9.0,False,mean,False,1.0,0.018554,40210,10260,2,97
4,2386-ch,6.0,True,z-score,False,1.0,0.015164,43108,7362,42,57
...,...,...,...,...,...,...,...,...,...,...,...
5875,2720050000-de,9.0,False,median,True,1.0,0.428571,49445,5,43,18
5876,2720050000-de,13.0,True,mad-z-score,True,1.0,0.039978,47697,1753,24,37
5877,2720050000-de,11.0,False,z-score,True,1.0,0.006270,46331,3119,51,10
5878,2720050000-de,40.0,True,mean,True,1.0,0.038147,48431,1019,40,21


In [46]:
predictions_summary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5880 entries, 0 to 5879
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   common_id      5880 non-null   object 
 1   window_size    5760 non-null   float64
 2   center_window  5880 non-null   bool   
 3   model_type     5880 non-null   object 
 4   normalized     5880 non-null   bool   
 5   threshold      5880 non-null   float64
 6   f1_score       5880 non-null   float64
 7   tn             5880 non-null   int64  
 8   fp             5880 non-null   int64  
 9   fn             5880 non-null   int64  
 10  tp             5880 non-null   int64  
dtypes: bool(2), float64(3), int64(4), object(2)
memory usage: 425.0+ KB


In [47]:
predictions_summary_df.to_parquet(f'./data/predictions/predictions_summary.parquet')