* optunaを用いてBCEWithLogitsLossが最小になるように比率を求める。  <-
* LinearRegression, Ridge, scipyを用いてMSEが最小になるように比率を求める

In [None]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0


In [None]:
import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from collections import OrderedDict

import torch
import torch.nn as nn

import optuna

In [None]:
def fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    """
    Mean per-row F-beta score between space-separated id strings.

    Args:
        y_true_ids: true labels, one space-separated string of ids per row
        y_pred_ids: predictions, one space-separated string of ids per row
        beta: weight of recall relative to precision (2 gives the F2 score)
        eps: small non-negative constant added to the denominator

    It is assumed that the above two are in the same topic order.

    Returns:
        The average of the per-row scores, or 0 when there are no rows.
        A row scores 0 when either id list is empty or the two lists share no id.
    """
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids, pred_ids):
        score_list.append(_fbeta_for_row(true, pred, beta, eps))

    # No rows at all: report 0 instead of dividing by zero.
    if not score_list:
        return 0
    return sum(score_list) / len(score_list)


def _fbeta_for_row(true, pred, beta, eps):
    """F-beta of one row given its true and predicted id lists."""
    TP = set(true) & set(pred)
    # An empty intersection covers every zero-division case (empty `true`,
    # empty `pred`, or no overlap), so no exception handling is needed.
    if not TP:
        return 0
    # Precision is relative to len(pred), i.e. duplicated predicted ids count.
    precision = len(TP) / len(pred)
    recall = len(TP) / len(true)
    return (1 + beta**2) * (precision * recall) / ((beta**2) * precision + recall + eps)


def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with NumPy."""
    exp_neg_x = np.exp(-x)
    return 1 / (exp_neg_x + 1)


def get_f2_score(predictions, valid_folds, TOP, apply_sigmoid):
    """
    Search the decision threshold that maximizes the mean F2 score per topic.

    Args:
        predictions: per-row scores for `valid_folds`; stored in the "sigmoid"
            column, after passing through `sigmoid` when `apply_sigmoid` is true.
        valid_folds: DataFrame with at least "topic_id" and "predictions" columns.
            NOTE: it is modified in place ("sigmoid" and "pred" columns are written).
        TOP: number of highest-scoring rows kept per topic as a fallback for
            topics that have no row above the threshold.
        apply_sigmoid: whether `predictions` must be converted with `sigmoid` first.

    The ground truth is read from correlations.csv on every call and joined on
    "topic_id"; its "content_ids" column is compared with the predicted ids.

    Thresholds np.arange(0.001, 0.5, 0.001) are tried in increasing order and the
    search stops early after 30 consecutive thresholds without improvement.

    Returns:
        (best_score, best_threshold)
    """
    if apply_sigmoid:
        valid_folds["sigmoid"] = sigmoid(predictions)
    else:
        valid_folds["sigmoid"] = predictions
    best_score = -np.inf
    best_threshold = 0
    df_corr = pd.read_csv("/content/drive/MyDrive/KAGGLE-LECR/correlations.csv")

    # For each topic, keep the TOP rows with the largest "sigmoid" value and
    # join their "predictions" ids into one space-separated string.
    top_sigmoids = valid_folds.groupby('topic_id').apply(lambda x: x.sort_values(by='sigmoid', ascending=False).head(TOP)).reset_index(drop=True)
    top_sigmoids = pd.DataFrame(top_sigmoids.groupby("topic_id")["predictions"].agg(list)).reset_index()
    top_sigmoids["predictions"] = top_sigmoids["predictions"].apply(lambda x: " ".join(x))
    # Number of thresholds evaluated since the last improvement.
    count = 0
    with tqdm(np.arange(0.001, 0.5, 0.001), desc="Search best threshold") as pbar:
        for thre in pbar:
            # Early stopping: 30 thresholds in a row did not beat the best score.
            if count == 30:
                break
            valid_folds["pred"] = np.where(valid_folds["sigmoid"] > thre, 1, 0)        
            pred_1 = valid_folds[valid_folds["pred"] == 1].reset_index(drop=True)        

            # NOTE: despite its name, topic_true holds the predicted ids per topic.
            topic_true = pd.DataFrame(pred_1.groupby("topic_id")["predictions"].agg(list)).reset_index()
            topic_true["predictions"] = topic_true["predictions"].apply(lambda x: " ".join(x))

            # Also account for topics with no prediction above the threshold:
            # they fall back to their TOP highest-scoring rows.
            least_df = top_sigmoids[~top_sigmoids["topic_id"].isin(topic_true["topic_id"].values)].reset_index(drop=True)
            topic_true = pd.concat([topic_true, least_df], ignore_index=True)
            topic_true = pd.merge(topic_true, df_corr, on="topic_id", how="left")
            score = fbeta_score(topic_true["content_ids"], topic_true["predictions"])        
            count += 1
            if score > best_score:
                count = 0
                best_score = score
                best_threshold = thre
                pbar.set_postfix(OrderedDict(best_score=score, best_threshold = thre))
    return best_score, best_threshold



def get_max_score(y_true_ids, y_pred_ids, beta=2, eps=1e-15):
    """
    Mean F-beta score obtained when precision is fixed to 1 and recall is
    left as measured.

    precision -> 1
    recall -> No change
    """
    truths = y_true_ids.str.split()
    candidates = y_pred_ids.str.split()
    beta_sq = beta**2
    per_row = []
    for true, pred in zip(tqdm(truths), candidates):
        recall = len(set(true) & set(pred)) / len(true)
        # With precision == 1 the F-beta formula only depends on recall.
        per_row.append((1 + beta_sq) * recall / (beta_sq + recall + eps))
    return sum(per_row) / len(per_row)


def get_score(y_true_ids, y_pred_ids, beta=2, eps=1e-15):
    """
    Mean per-row F-beta score between space-separated id strings.

    Args:
        y_true_ids: pandas Series of space-separated true ids, one row per topic
        y_pred_ids: pandas Series of space-separated predicted ids, same row order
        beta: weight of recall relative to precision (2 gives the F2 score)
        eps: small constant added to the denominator

    Returns:
        The average of the per-row scores, or 0 when there are no rows.
        A row whose true or predicted id list is empty scores 0.
    """
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids, pred_ids):
        TP = (set(true) & set(pred))
        try:
            precision = len(TP) / len(pred)
            recall = len(TP) / len(true)
            f2 = (1+beta**2) * (precision*recall) / ((beta**2)*precision+recall+eps)
        except ZeroDivisionError:
            # Empty id list (or a zero denominator when eps == 0): the row scores 0.
            # Only this error is expected; anything else should surface.
            f2 = 0
        score_list.append(f2)
    # No rows at all: report 0 instead of dividing by zero.
    if not score_list:
        return 0
    return sum(score_list) / len(score_list)


def get_pos_score(y_true_ids, y_pred_ids):
    """
    Mean recall over rows, rounded to 5 decimals.

    Each row's recall is the share of its true ids that also appear among
    its predicted ids (both given as space-separated strings).
    ref: https://www.kaggle.com/code/ragnar123/lecr-unsupervised-train-set-public
    """
    truths = y_true_ids.str.split()
    candidates = y_pred_ids.str.split()
    recalls = [
        len(set(true) & set(pred)) / len(true)
        for true, pred in zip(truths, candidates)
    ]
    return round(np.mean(recalls), 5)

In [None]:
# Mount Google Drive at /content/drive; the CSV files read in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Per-experiment oof_df_fold0_epoch3.csv files (exp004, exp006, exp007, exp008) to be blended.
oof_paths = (
    "/content/drive/MyDrive/KAGGLE-LECR/last_data/2nd/exp004/fold0/oof_df_fold0_epoch3.csv",
    "/content/drive/MyDrive/KAGGLE-LECR/last_data/2nd/exp006/fold0/oof_df_fold0_epoch3.csv",
    "/content/drive/MyDrive/KAGGLE-LECR/last_data/2nd/exp007/fold0/oof_df_fold0_epoch3.csv",
    "/content/drive/MyDrive/KAGGLE-LECR/last_data/2nd/exp008/fold0/oof_df_fold0_epoch3.csv",
)
df_1, df_2, df_3, df_4 = (pd.read_csv(path) for path in oof_paths)

In [None]:
# Display df_1 to inspect its columns (topic_id, predictions, target, valid_pred, ...).
df_1

Unnamed: 0,topic_id,predictions,target,topic_sentence,topic_language,content_sentence,content_language,tokenize_length,valid_pred
0,t_30dd476279c8,c_bd164013582b,0,Medicine[SEP][SEP]Medicine,en,Cancer[SEP],en,9,-2.507750
1,t_30dd476279c8,c_e7e9bf83b9ea,0,Medicine[SEP][SEP]Medicine,en,Cáncer[SEP],es,9,-2.507750
2,t_30dd476279c8,c_d7446dc9061f,0,Medicine[SEP][SEP]Medicine,en,Diabetes[SEP],en,9,-3.415649
3,t_30dd476279c8,c_ee401e1ae04b,0,Medicine[SEP][SEP]Medicine,en,Education[SEP],en,9,-4.518500
4,t_30dd476279c8,c_a7926808742b,0,Medicine[SEP][SEP]Medicine,en,Medicine[SEP],en,9,1.307234
...,...,...,...,...,...,...,...,...,...
359495,t_39c32304ce48,c_db4154fba355,0,যোগ ও বিয়োগ সম্পর্কিত সমস্যা[SEP][SEP]খান একাড...,bn,অজানা সংখ্যাটিকে নির্ণয় (১০০ এর চেয়ে ছোট সংখ্য...,bn,256,-3.802538
359496,t_b6a07c7e7250,c_0114ea7c86bb,0,Protozoa and Humans[SEP]Students should be abl...,en,"Taenia solium[SEP]Taenia solium, also called t...",en,256,-5.537945
359497,t_59eee9f625ef,c_30719c89f57e,0,Application of elastic properties (Bonus) [SEP...,en,English: Elasticity: Studying How Solids Chang...,en,256,-6.857806
359498,t_709934dd18de,c_621085d694ee,0,Factoring polynomials with quadratic forms (Hi...,en,Factoring difference of squares: two variables...,en,256,-8.203346


In [None]:
# Start from df_1 (its valid_pred column becomes valid_pred_ver1) and left-join the
# valid_pred column of the other three frames on the (topic_id, predictions) pair.
join_keys = ["topic_id", "predictions"]
df = df_1.rename(columns={"valid_pred": "valid_pred_ver1"})
for renamed, other in (
    ("valid_pred_ver2", df_2),
    ("valid_pred_ver3", df_3),
    ("valid_pred_ver4", df_4),
):
    other_pred = other[join_keys + ["valid_pred"]].rename(columns={"valid_pred": renamed})
    df = pd.merge(df, other_pred, on=join_keys, how="left")

In [None]:
# Display the merged frame; it now carries valid_pred_ver1 .. valid_pred_ver4 side by side.
df

Unnamed: 0,topic_id,predictions,target,topic_sentence,topic_language,content_sentence,content_language,tokenize_length,valid_pred_ver1,valid_pred_ver2,valid_pred_ver3,valid_pred_ver4
0,t_30dd476279c8,c_bd164013582b,0,Medicine[SEP][SEP]Medicine,en,Cancer[SEP],en,9,-2.507750,-6.396333,-3.069698,-4.736598
1,t_30dd476279c8,c_e7e9bf83b9ea,0,Medicine[SEP][SEP]Medicine,en,Cáncer[SEP],es,9,-2.507750,-6.396333,-10.459763,-3.716154
2,t_30dd476279c8,c_d7446dc9061f,0,Medicine[SEP][SEP]Medicine,en,Diabetes[SEP],en,9,-3.415649,-6.252289,0.979025,-5.983439
3,t_30dd476279c8,c_ee401e1ae04b,0,Medicine[SEP][SEP]Medicine,en,Education[SEP],en,9,-4.518500,-8.135379,-8.154634,-8.281442
4,t_30dd476279c8,c_a7926808742b,0,Medicine[SEP][SEP]Medicine,en,Medicine[SEP],en,9,1.307234,6.970675,-5.653092,-3.832255
...,...,...,...,...,...,...,...,...,...,...,...,...
359495,t_39c32304ce48,c_db4154fba355,0,যোগ ও বিয়োগ সম্পর্কিত সমস্যা[SEP][SEP]খান একাড...,bn,অজানা সংখ্যাটিকে নির্ণয় (১০০ এর চেয়ে ছোট সংখ্য...,bn,256,-3.802538,-8.776278,-7.507927,-8.620546
359496,t_b6a07c7e7250,c_0114ea7c86bb,0,Protozoa and Humans[SEP]Students should be abl...,en,"Taenia solium[SEP]Taenia solium, also called t...",en,256,-5.537945,-9.081018,-9.828444,-9.149844
359497,t_59eee9f625ef,c_30719c89f57e,0,Application of elastic properties (Bonus) [SEP...,en,English: Elasticity: Studying How Solids Chang...,en,256,-6.857806,-9.108803,-12.234751,-10.364032
359498,t_709934dd18de,c_621085d694ee,0,Factoring polynomials with quadratic forms (Hi...,en,Factoring difference of squares: two variables...,en,256,-8.203346,-9.114568,-11.376850,-10.480355


In [None]:
criterion = nn.BCEWithLogitsLoss()

def objective(trial):
    """
    Optuna objective: BCE-with-logits loss of a weighted sum of the four
    `valid_pred_ver*` columns of the global `df` against `df["target"]`.

    Each weight ("a".."d") is suggested independently in [0, 1]; the weights
    are not normalized to sum to 1.

    Returns:
        The loss as a plain Python float.
    """
    # (Optuna parameter name, column it weights). The suggestion order a, b, c, d
    # is kept so that a seeded sampler produces the same sequence of trials.
    weighted_columns = (
        ("a", "valid_pred_ver1"),
        ("b", "valid_pred_ver2"),
        ("c", "valid_pred_ver3"),
        ("d", "valid_pred_ver4"),
    )

    # Build the blend in a local array instead of writing a "pred" column into
    # the shared frame on every trial.
    blended = 0
    for param_name, column in weighted_columns:
        weight = trial.suggest_float(param_name, 0, 1)
        blended = blended + df[column].to_numpy() * weight

    # Hand torch NumPy arrays rather than pandas objects.
    y_pred = torch.tensor(blended, dtype=torch.float)
    target = torch.tensor(df["target"].to_numpy(), dtype=torch.float)

    loss = criterion(y_pred, target)
    # Return a float, not a 0-dim tensor.
    return loss.item()

In [None]:
# Set Optuna's logging verbosity to WARNING.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Minimize `objective` over 1000 trials with a TPE sampler seeded for reproducibility.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=tpe_sampler, direction="minimize")
study.optimize(objective, n_trials=1000, show_progress_bar=True)

  self._init_valid()


  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Parameters of the best trial: a dict with the blend weights "a", "b", "c", "d".
best_params = study.best_params
best_params

{'a': 0.0886588204083501,
 'b': 0.14489460948545882,
 'c': 0.10778672130553446,
 'd': 0.07987491625275309}

In [None]:
# Blend the four prediction columns with the best weights found by the study.
blend_terms = [
    df["valid_pred_ver1"] * best_params["a"],
    df["valid_pred_ver2"] * best_params["b"],
    df["valid_pred_ver3"] * best_params["c"],
    df["valid_pred_ver4"] * best_params["d"],
]
df["valid_pred"] = blend_terms[0] + blend_terms[1] + blend_terms[2] + blend_terms[3]

In [None]:
# Sweep the per-topic fallback size TOP from 1 to 19 and print the best score / threshold for each.
for top in range(1, 20):
    search_result = get_f2_score(df["valid_pred"].values, df, TOP=top, apply_sigmoid=True)
    best_score, best_threshold = search_result
    print(top, best_score, best_threshold)

Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

1 0.5900639000494646 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

2 0.5920085380591308 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

3 0.5928270473893416 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

4 0.5938261155045604 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

5 0.5945514910467365 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

6 0.5951261477718053 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

7 0.5952919657110424 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

8 0.5952511077359379 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

9 0.5951763523047697 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

10 0.5954664453353677 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

11 0.5955119521840888 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

12 0.595579685785456 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

13 0.5954684367492565 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

14 0.5953805662355822 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

15 0.5954598500462008 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

16 0.5954788060862318 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

17 0.5953549305589904 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

18 0.5953974938233273 0.063


Search best threshold:   0%|          | 0/499 [00:00<?, ?it/s]

19 0.5953177612847337 0.063
