In [1]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src/features')))

import pandas as pd
import numpy as np
import warnings

from tqdm import tqdm
from datetime import datetime

from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split

from process_features import preprocess_data
from forests import ForestKind, TaskType
from naf_model import NeuralAttentionForest, NAFParams

from IPython.core.debugger import set_trace

warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

path = '../../gsoc_incidents_raw3.parquet'
df = pd.read_parquet(path)

df['target'] = df['Вердикт'].apply( lambda x: True if x == 'False Positive' else (pd.NA if x == 'Не указан' else False))
df = df[df['target'].notnull()]
df['target'] = df['target'].astype(float)
df = df[::10]

y = df['target'].astype(float).to_numpy()
X = df.drop(columns=['target'])

used_columns = pd.read_csv('../src/features/used_columns.csv')

X = X[used_columns['column'].to_numpy()]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train = preprocess_data(X_train, '../data/transform_data_pipeline.pkl')

params = NAFParams(
    kind=ForestKind.RANDOM,
    task=TaskType.CLASSIFICATION,
    mode='end_to_end',
    loss='cross_entropy',
    weights_init_type='general_rule_normal',
    n_epochs=30,
    lr=0.001,
    lam=0.0,
    target_loss_weight=1.0,
    hidden_size=128,
    gpu=True,
    gpu_device = 3,
    n_layers=4,
    forest=dict(
        n_estimators=200,
        min_samples_leaf=10,
        n_jobs=-1,
        max_depth=20,
        min_samples_split = 4,
    ),
    random_state=67890
)
model = NeuralAttentionForest(params)

In [2]:
model.fit(X_train, y_train)

min          = 2
max          = 884
std          = 222.5
median       = 386.0
unique count = 879


In [6]:
model.optimize_weights(X_train, y_train, batch_size=1024, background_batch_size=512, n_parts=4)

Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

leafs: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
leafs: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
leafs: 100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
leafs: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]
leafs: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
leafs: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]
leafs: 100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
leafs: 100%|██████████| 1/1 [00:00<00:00,  1.46it/s]
leafs: 100%|██████████| 1/1 [00:00<00:00,  1.61it/s]
leafs:   0%|          | 0/1 [00:00<?, ?it/s]
Epochs:   0%|          | 0/30 [00:10<?, ?it/s]


KeyboardInterrupt: 

In [8]:
X_test_proc = preprocess_data(X_test, '../data/transform_data_pipeline.pkl')

In [9]:
y_proba = model.predict_batch(X_test_proc, batch_size=1024, background_batch_size=512, n_parts=1)[:, 1]

leafs: 100%|██████████| 13/13 [00:12<00:00,  1.03it/s]
Predict batches: 100%|██████████| 4/4 [00:00<00:00, 14.10it/s]


In [10]:
#y_proba = model.predict(X_test_proc)

thresholds = np.linspace(0, 1, 100)

beta = 2

max_f1_vals = []
max_f1_args = []

f1_scores = []

y_true = np.array([1 if label == 0 else 0 for label in y_test])
for thr in thresholds:
    y_pred = (np.array([1 - score for score in y_proba]) >= 1 - thr).astype(int)
    f1 = fbeta_score(y_true, y_pred, beta=beta)
    f1_scores.append(f1)

f1_scores = np.array(f1_scores)
max_f1 = f1_scores.max()
arg_f1 = f1_scores.argmax()

print(f'max F2 = {max_f1:.3f}, threshold = {arg_f1 / 100}')

#max F2 = 0.637, threshold = 0.98


max F2 = 0.578, threshold = 0.98
