# Построение предсказания на основе агрегирации ответов моделей - best models

In [1]:
import os
import re
import pickle
import json
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from collections import defaultdict
from hack_lap.utils.evaluate import precision_recall, estimate_prediction, calculate_metrics_one_vs_rest_

In [2]:
DIR_DATA = os.path.join('..', 'data')
DIR_PREDICT = os.path.join(DIR_DATA, 'predict') 
DIR_MODEL = os.path.join(DIR_DATA, 'model')
MIN_F1 = 0.30

ps = re.compile(f'seed-(\d+)', re.I)
pr = re.compile(f'r-(\d+)', re.I)

files_json = [n for n in os.listdir(DIR_MODEL) if n.endswith('.json') and n.startswith('b') and 'best' in n]

files_by_try = defaultdict(list)
for f in files_json:
    seed = [int(g.group(1)) for g in ps.finditer(f)]
    r = [int(g.group(1)) for g in pr.finditer(f)]
    assert len(seed) == 1
    assert len(r) == 1
    seed = seed[0]
    r = r[0]
    files_by_try[(seed, r)].append(f)
    
files_json = []
for _, v in files_by_try.items():
    best_v = None
    best_f1 = 0.0
    for vi in v:
        with open(os.path.join(DIR_MODEL, vi)) as fp:
            res = json.load(fp)
        if best_f1 < res['best_f1']:
            best_f1 = res['best_f1']
            best_v = vi
    if best_f1 < MIN_F1:
        continue
    files_json.append((best_v, best_f1))
len(files_json)

38

In [8]:
dump_factor = np.linspace(0.9, 1.0, 11)
predict = []
for f_json, f1 in tqdm(files_json):
    f = f_json.replace('json', 'pkl')
    with open(os.path.join(DIR_MODEL, f), 'rb') as fp:
        data = pickle.load(fp)
    yt_dev, yp_dev, yp_test = data['yt_dev'], data['yp_dev'], data['yp_test']
    
    score = []
    for f in dump_factor:
        _, (_, _, f11) = calculate_metrics_one_vs_rest_(yt_dev, yp_dev, dump_factor=f)
        score.append(f11)
    ss = np.nanargmax(score)
    

    yp_dev = np.mean(yp_dev, axis=1).ravel()
    rp0, rp1, th = precision_recall(yt_dev, yp_dev)
    f1 = 2 * rp1[0] * rp1[1] / (rp1[0] + rp1[1] + 1e-6)
    ii = np.argmax(f1)
    f1 = f1[ii]
    th = th[ii]

    yp_test = np.mean(yp_test, axis=1)
    yp_test = (yp_test > th * dump_factor[ss]).astype(int).reshape(-1, 1)
    predict.append(yp_test)

  0%|          | 0/38 [00:00<?, ?it/s]

In [12]:
submission = np.concatenate(predict, axis=1)
print(submission.shape[1])
submission = np.mean(submission, axis=1)
th = 0.5
cls_pred = (submission > th).astype(int)
print(f'# active: {np.sum(cls_pred)}')

38
# active: 25


In [19]:
df = pd.read_csv(os.path.join(DIR_DATA, 'test.csv'))
df['Active'] = submission

seeds = tuple(sorted(files_by_try.keys()))
seed_hash = hash(''.join(map(str, seeds)))
name = f'b-L4-H32-Patt-NF-BF-#seeds-{len(seeds)}_hash-{seed_hash}_mean_th-{th}'

print(name)
df.to_csv(os.path.join(DIR_PREDICT, name + '.csv'), index=False)

with open(os.path.join(DIR_PREDICT, name + '.json'), 'w') as fp:
    json.dump({'seeds': seeds}, fp, indent=2)

b-L4-H32-Patt-NF-BF-#seeds-44_hash--1757033375511565524_mean_th-0.5
