In [1]:
# Mount Google Drive (Colab only) so the files under /content/drive read later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from collections import OrderedDict

In [5]:
def fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    """
    Mean per-topic F-beta score between whitespace-separated id strings.

    Args:
        y_true_ids: true labels, one string of whitespace-separated ids per topic
        y_pred_ids: predictions, in the same format
        beta: weight of recall relative to precision (the default 2 gives F2)
        eps: small constant added to the denominator

    Returns:
        The average of the per-topic scores, or 0 when there are no topics.
        A topic scores 0 when its true or predicted entry is missing or holds
        no ids, or when the F-beta denominator is zero.

    It is assumed that the above two are in the same topic order.
    """
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids, pred_ids):
        # .str.split() leaves missing entries (NaN/None) as non-list scalars and
        # turns blank strings into []; neither can be scored, so they count as 0.
        # These explicit checks replace a bare `except:` that hid every error.
        if not isinstance(true, list) or not isinstance(pred, list) or not true or not pred:
            score_list.append(0)
            continue
        TP = set(true) & set(pred)
        precision = len(TP) / len(pred)
        recall = len(TP) / len(true)
        denominator = (beta**2) * precision + recall + eps
        # Only reachable with eps == 0 and no overlap; treated as a score of 0.
        f_beta = (1 + beta**2) * (precision * recall) / denominator if denominator else 0
        score_list.append(f_beta)

    if not score_list:
        return 0
    return sum(score_list) / len(score_list)


def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied through numpy's exp."""
    decay = np.exp(-x)
    return 1 / (1 + decay)


def _join_ids_by_topic(frame):
    """Collapse `frame` to one row per topic_id with its `predictions` values joined by spaces."""
    grouped = pd.DataFrame(frame.groupby("topic_id")["predictions"].agg(list)).reset_index()
    grouped["predictions"] = grouped["predictions"].apply(" ".join)
    return grouped


def get_f2_score(predictions, valid_folds, TOP, df_corr=None, patience=10, thresholds=None):
    """
    Search the sigmoid threshold that maximises the mean per-topic F2 score.

    For each candidate threshold, rows of `valid_folds` whose sigmoid exceeds
    the threshold are kept as that topic's prediction. Topics left with no row
    fall back to their `TOP` highest-sigmoid rows. The result is scored with
    `fbeta_score` against the `content_ids` column of `df_corr`.

    Args:
        predictions: raw scores, one per row of `valid_folds`; passed through `sigmoid`.
        valid_folds: DataFrame with `topic_id` and string-valued `predictions`
            columns (presumably candidate content ids - confirm against the
            code that writes the OOF file).
        TOP: number of highest-sigmoid rows used as the fallback prediction.
        df_corr: DataFrame with `topic_id` and `content_ids` columns. When None
            (default) it is read from the project's correlations.csv on Drive,
            as before; pass it in to avoid re-reading the file on every call.
        patience: stop after this many consecutive thresholds without a new
            best score (default 10, the original hard-coded value). None
            disables early stopping.
        thresholds: iterable of thresholds to try, in order. Defaults to
            np.arange(0.0001, 0.5, 0.0001).

    Returns:
        (best_score, best_threshold). If no threshold produces a score greater
        than -inf, this is (-inf, 0).

    Side effects:
        Writes the `sigmoid` and `pred` columns onto `valid_folds` (kept for
        backward compatibility). `pred` reflects the last threshold tried,
        not the best one.
    """
    if df_corr is None:
        df_corr = pd.read_csv("/content/drive/MyDrive/KAGGLE-LECR/correlations.csv")
    if thresholds is None:
        thresholds = np.arange(0.0001, 0.5, 0.0001)

    valid_folds["sigmoid"] = sigmoid(predictions)
    best_score = -np.inf
    # The original initialised a misspelled name here, leaving this one unbound
    # whenever no threshold improved on -inf.
    best_threshold = 0

    # Pick the TOP rows with the largest sigmoid for every topic. A global sort
    # followed by groupby.head gives the same rows as sorting inside
    # groupby.apply, without relying on apply receiving the grouping column
    # (deprecated in recent pandas).
    top_rows = (
        valid_folds.sort_values("sigmoid", ascending=False)
        .groupby("topic_id", sort=False)
        .head(TOP)
    )
    fallback = _join_ids_by_topic(top_rows)

    stale = 0  # consecutive thresholds without improvement
    with tqdm(thresholds, desc="Search best threshold") as pbar:
        for thre in pbar:
            if patience is not None and stale >= patience:
                break
            valid_folds["pred"] = np.where(valid_folds["sigmoid"] > thre, 1, 0)
            pred_1 = valid_folds[valid_folds["pred"] == 1].reset_index(drop=True)

            topic_pred = _join_ids_by_topic(pred_1)

            # Also account for topics that have no prediction above the threshold.
            least_df = fallback[~fallback["topic_id"].isin(topic_pred["topic_id"].values)].reset_index(drop=True)
            topic_pred = pd.concat([topic_pred, least_df], ignore_index=True)
            topic_pred = pd.merge(topic_pred, df_corr, on="topic_id", how="left")
            score = fbeta_score(topic_pred["content_ids"], topic_pred["predictions"])
            stale += 1
            if score > best_score:
                stale = 0
                best_score = score
                best_threshold = thre
                pbar.set_postfix(OrderedDict(best_score=score, best_threshold=thre))
    return best_score, best_threshold



def get_max_score(y_true_ids, y_pred_ids, beta=2, eps=1e-15):
    """
    Mean per-topic F-beta score with precision pinned to 1.

    precision -> 1
    recall -> No change

    Both inputs hold one string of whitespace-separated ids per topic, in the
    same topic order. A tqdm progress bar is shown over the topics.
    """
    truth_lists = y_true_ids.str.split()
    guess_lists = y_pred_ids.str.split()
    beta_sq = beta**2
    per_topic = []
    for truth, guess in zip(tqdm(truth_lists), guess_lists):
        hits = set(truth) & set(guess)
        recall = len(hits) / len(truth)
        # With precision fixed at 1 the precision factors drop out of the formula.
        per_topic.append((1 + beta_sq) * recall / (beta_sq + recall + eps))
    return sum(per_topic) / len(per_topic)


def get_score(y_true_ids, y_pred_ids, beta=2, eps=1e-15):
    """
    Mean per-topic F-beta score between whitespace-separated id strings.

    Args:
        y_true_ids: Series with one string of whitespace-separated true ids per topic
        y_pred_ids: Series of predicted ids in the same format and topic order
        beta: weight of recall relative to precision (the default 2 gives F2)
        eps: small constant added to the denominator

    Returns:
        The average of the per-topic scores. A topic scores 0 when its true or
        predicted entry is missing or holds no ids, or when the F-beta
        denominator is zero.

    Raises:
        ZeroDivisionError: if the inputs contain no topics at all.
    """
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids, pred_ids):
        # .str.split() leaves missing entries (NaN/None) as non-list scalars and
        # turns blank strings into []; neither can be scored, so they count as 0.
        # These explicit checks replace a bare `except:` that hid every error.
        if not isinstance(true, list) or not isinstance(pred, list) or not true or not pred:
            score_list.append(0)
            continue
        TP = set(true) & set(pred)
        precision = len(TP) / len(pred)
        recall = len(TP) / len(true)
        denominator = (beta**2) * precision + recall + eps
        # Only reachable with eps == 0 and no overlap; treated as a score of 0.
        f_beta = (1 + beta**2) * (precision * recall) / denominator if denominator else 0
        score_list.append(f_beta)
    score = sum(score_list) / len(score_list)
    return score


def get_pos_score(y_true_ids, y_pred_ids):
    """
    Mean per-topic recall, rounded to 5 decimal places.

    Both inputs hold one string of whitespace-separated ids per topic, in the
    same topic order.
    ref: https://www.kaggle.com/code/ragnar123/lecr-unsupervised-train-set-public
    """
    truth_lists = y_true_ids.str.split()
    guess_lists = y_pred_ids.str.split()
    # One single-element row per topic: the share of true ids that were predicted.
    recalls = [
        [len(set(truth).intersection(guess)) / len(truth)]
        for truth, guess in zip(truth_lists, guess_lists)
    ]
    return round(np.mean(recalls), 5)

In [6]:
# Sweep the fallback size TOP over 1..19 on the fold-0 out-of-fold predictions and
# print the best F2 score and threshold found for each value.
df_valid = pd.read_csv(
    "/content/drive/MyDrive/KAGGLE-LECR/last_data/2nd/exp006/fold0/oof_df_fold0_epoch3.csv"
)
for top in range(1, 20):
    best_score, best_threshold = get_f2_score(
        predictions=df_valid["valid_pred"].values,
        valid_folds=df_valid,
        TOP=top,
    )
    print(top, best_score, best_threshold)
# Drop the frame once the sweep is done.
del df_valid

Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

1 0.5512639799767193 0.0007000000000000001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

2 0.5535339318284449 0.0007000000000000001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

3 0.5554607429935107 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

4 0.5568218873716986 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

5 0.5573536751826885 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

6 0.5580052426286475 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

7 0.55828606649912 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

8 0.5585640570659712 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

9 0.5586004157973775 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

10 0.5586947697126098 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

11 0.558697796927239 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

12 0.5589707565479161 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

13 0.5588617746565581 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

14 0.5591040105156814 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

15 0.5589891136701118 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

16 0.558690218271333 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

17 0.558701672010265 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

18 0.5585902095021554 0.001


Search best threshold:   0%|          | 0/4999 [00:00<?, ?it/s]

19 0.5583740213543594 0.001


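Best F2 score per epoch from the TOP / threshold sweep (row 4 matches the `oof_df_fold0_epoch3.csv` run above, so the epoch column appears to be 1-indexed):

| epoch | best F2 score | best TOP | best threshold |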
| - | - | - | - |
| 1 | 0.5091 | 12 | 0.074 |
| 2 | 0.5399 | 8 | 0.043 |
| 3 | 0.5526 | 14 | 0.013 |
| 4 | 0.5591 | 14 | 0.001 |