In [1]:

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from IPython.core.display_functions import display
import sklearn.metrics as metrics
import multiprocessing as mp

random_seed = 1
np.random.seed(random_seed)


In [2]:
plt.style.use('seaborn-colorblind')

# from https://jwalton.info/Embed-Publication-Matplotlib-Latex/
tex_fonts = {
    # Use LaTeX to write all text
    "text.usetex": True,
    "font.family": "serif",
    # Use 11pt font in plots, to match 11pt font in document
    "axes.labelsize": 11,
    "font.size": 11
}
plt.rcParams.update(tex_fonts)
# tex_plots_path = f'../bachelor-thesis/plots/pdfs/{common_id}/'


In [3]:
import glob

file_list = glob.glob('./data/predictions/raw_preprocessed/**/*.parquet', recursive=True)

predictions = []

for fp in file_list:
    file_name = fp.split('/')[-1]
    metadata = file_name.split('_')
    # remove .parquet
    metadata = [m.split('.')[0] for m in metadata]
    # df = pd.read_parquet(fp)
    metadata_dict = {
        'file_path': fp,
        'normalized': fp.split('/')[-3] == 'normalized',
        'window_size': None if metadata[0] == 'None' else int(metadata[0]),
        'center_window': metadata[1] == 'cw',
        'model_type': metadata[2],
        'common_id': fp.split('/')[-2]
    }
    predictions.append(metadata_dict)

In [4]:
len(predictions)


5000

In [5]:
def get_prediction_summary(metadata_dict):
    predictions_summary = []
    pred_df = pd.read_parquet(metadata_dict['file_path'])
    threshold_min = 1
    threshold_max = 100
    threshold_steps = 300
    thresholds = np.linspace(threshold_min, threshold_max, threshold_steps)
    y_true = pred_df['is_outlier'].astype(int).to_numpy()
    m = pred_df['result'].to_numpy()
    for threshold in thresholds:
        y_pred = np.where(m > threshold, 1, 0)
        tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
        f1_score = metrics.f1_score(y_true, y_pred, zero_division=0)

        predictions_summary.append({
            'common_id': metadata_dict['common_id'],
            'window_size': metadata_dict['window_size'],
            'center_window': metadata_dict['center_window'],
            'model_type': metadata_dict['model_type'],
            'normalized': metadata_dict['normalized'],
            'threshold': threshold,
            'f1_score': f1_score,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
        })
    return predictions_summary
# predictions_summary_df = pd.DataFrame(predictions_summary)
# predictions_summary_df.info()

In [6]:
with mp.Pool(processes=12) as executor:
    results = executor.map(get_prediction_summary, predictions)
    result_lst = [item for sublist in results for item in sublist]


In [7]:
predictions_summary_df = pd.DataFrame(result_lst)

In [8]:
predictions_summary_df

Unnamed: 0,common_id,window_size,center_window,model_type,normalized,threshold,f1_score,tn,fp,fn,tp
0,2386-ch,2.0,False,mad-z-score,False,1.000000,0.0,50467,0,102,0
1,2386-ch,2.0,False,mad-z-score,False,1.331104,0.0,50467,0,102,0
2,2386-ch,2.0,False,mad-z-score,False,1.662207,0.0,50467,0,102,0
3,2386-ch,2.0,False,mad-z-score,False,1.993311,0.0,50467,0,102,0
4,2386-ch,2.0,False,mad-z-score,False,2.324415,0.0,50467,0,102,0
...,...,...,...,...,...,...,...,...,...,...,...
1499995,2720050000-de,31.0,False,z-score,True,98.675585,0.0,49445,7,59,0
1499996,2720050000-de,31.0,False,z-score,True,99.006689,0.0,49445,7,59,0
1499997,2720050000-de,31.0,False,z-score,True,99.337793,0.0,49445,7,59,0
1499998,2720050000-de,31.0,False,z-score,True,99.668896,0.0,49445,7,59,0


In [9]:
predictions_summary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500000 entries, 0 to 1499999
Data columns (total 11 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   common_id      1500000 non-null  object 
 1   window_size    1470000 non-null  float64
 2   center_window  1500000 non-null  bool   
 3   model_type     1500000 non-null  object 
 4   normalized     1500000 non-null  bool   
 5   threshold      1500000 non-null  float64
 6   f1_score       1500000 non-null  float64
 7   tn             1500000 non-null  int64  
 8   fp             1500000 non-null  int64  
 9   fn             1500000 non-null  int64  
 10  tp             1500000 non-null  int64  
dtypes: bool(2), float64(3), int64(4), object(2)
memory usage: 105.9+ MB


In [10]:
predictions_summary_df.to_parquet(f'./data/predictions/predictions_preprocessed_summary.parquet')

In [12]:
for id in predictions_summary_df['common_id'].unique():
    print(id)
    df = predictions_summary_df[predictions_summary_df['common_id'] == id]
    df.to_csv(f'./data/predictions/predictions_preprocessed_summary/{id}.csv', index=False)

2386-ch
39003-ie
36022-ie
42960105-de
2720050000-de
