In [1]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src/features')))

import re
import pytz
import torch
import joblib
import pandas as pd
import numpy as np
import warnings

from tqdm import tqdm
from datetime import datetime

from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split

from process_features import preprocess_data
from forests import ForestKind, TaskType
from naf_model import NeuralAttentionForest, NAFParams

from IPython.core.debugger import set_trace

warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

path = '../../gsoc_incidents_raw3.parquet'
df = pd.read_parquet(path)

df['target'] = df['Вердикт'].apply( lambda x: True if x == 'False Positive' else (pd.NA if x == 'Не указан' else False))
df = df[df['target'].notnull()]
df['target'] = df['target'].astype(float)
df = df[::1000]

y = df['target'].astype(float).to_numpy()
X = df.drop(columns=['target'])

used_columns = pd.read_csv('../src/features/used_columns.csv')

X = X[used_columns['column'].to_numpy()]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train = preprocess_data(X_train, '../data/transform_data_pipeline.pkl')

params = NAFParams(
    kind=ForestKind.RANDOM,
    task=TaskType.CLASSIFICATION,
    mode='end_to_end',
    loss='cross_entropy',
    n_epochs=80,
    lr=0.01,
    lam=0.0,
    target_loss_weight=1.0,
    hidden_size=16,
    gpu=True,
    gpu_device = 3,
    n_layers=1,
    forest=dict(
        n_estimators=100,
        min_samples_leaf=1,
        n_jobs=-1
    ),
    random_state=67890
)
model = NeuralAttentionForest(params)

In [2]:
model.fit(X_train, y_train)

In [3]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = str(params.gpu_device)

In [3]:
model.optimize_weights(X_train, y_train, batch_size=2048, background_batch_size=1024)

matplotlib data path: /home/agubarev/miniconda3/envs/naf_work/lib/python3.11/site-packages/matplotlib/mpl-data
CONFIGDIR=/raid/agubarev/.config/matplotlib
interactive is False
platform is linux
CACHEDIR=/raid/agubarev/.cache/matplotlib
Using fontManager instance from /raid/agubarev/.cache/matplotlib/fontlist-v390.json
Train E2E:   0%|          | 0/80 [00:00<?, ?it/s]


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/agubarev/miniconda3/envs/naf_work/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/agubarev/miniconda3/envs/naf_work/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/agubarev/miniconda3/envs/naf_work/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/home/agubarev/miniconda3/envs/naf_work/lib/python3.11/site-packages/torch/utils/data/dataset.py", line 211, in __getitem__
    return tuple(tensor[index] for tensor in self.tensors)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/agubarev/miniconda3/envs/naf_work/lib/python3.11/site-packages/torch/utils/data/dataset.py", line 211, in <genexpr>
    return tuple(tensor[index] for tensor in self.tensors)
                 ~~~~~~^^^^^^^
RuntimeError: CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



In [None]:
model.load(X_train.shape[1], '../models/')

In [3]:
X_test_proc = preprocess_data(X_test, '../data/transform_data_pipeline.pkl')

In [7]:
y_proba = model.predict(X_test_proc)[:, 1]
#y_proba = model.predict(X_test_proc)

thresholds = np.linspace(0, 1, 100)

beta = 2

max_f1_vals = []
max_f1_args = []

f1_scores = []

y_true = np.array([1 if label == 0 else 0 for label in y_test])
for thr in thresholds:
    y_pred = (np.array([1 - score for score in y_proba]) >= 1 - thr).astype(int)
    f1 = fbeta_score(y_true, y_pred, beta=beta)
    f1_scores.append(f1)

f1_scores = np.array(f1_scores)
max_f1 = f1_scores.max()
arg_f1 = f1_scores.argmax()

print(f'max F2 = {max_f1:.3f}, threshold = {arg_f1 / 100}')


predict: 100%|██████████| 2/2 [00:00<00:00, 243.93it/s]




max F2 = 0.550, threshold = 0.99


In [None]:
model.save('../models/')