In [None]:
# Add the gcsfuse repository to the apt source list
# (the Ubuntu release codename is taken from `lsb_release -c -s`)
!echo "deb http://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
# Download the repository signing key and register it with apt
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -

# Refresh the package index and install gcsfuse
!apt-get update
!apt-get install gcsfuse

deb http://packages.cloud.google.com/apt gcsfuse-focal main
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1210  100  1210    0     0  24200      0 --:--:-- --:--:-- --:--:-- 24200
OK
Get:1 http://packages.cloud.google.com/apt gcsfuse-focal InRelease [5002 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Get:3 https://packages.cloud.google.com/apt cloud-sdk InRelease [6361 B]       
Get:4 https://packages.cloud.google.com/apt google-fast-socket InRelease [5015 B]
Err:3 https://packages.cloud.google.com/apt cloud-sdk InRelease               
  The following signatures couldn't be verified because the public key is not available: NO_PUBKEY B53DC80D13EDEF05
Get:5 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:6 https://packages.cloud.google.com/apt google-fast-socket/main amd64 Packages [415 B]
Hit:7 http

In [None]:
# Prepare the directory that will serve as the mount point
!mkdir -p /content/gcs

# Mount the bucket onto that directory
!gcsfuse cal10000-kaggle-lecr /content/gcs

2023/03/11 11:26:29.865280 Start gcsfuse/0.42.1 (Go version go1.19.5) for app "" using mount point: /content/gcs
2023/03/11 11:26:29.883604 Opening GCS connection...
2023/03/11 11:26:30.166009 Mounting file system "cal10000-kaggle-lecr"...
2023/03/11 11:26:30.166368 File system has been successfully mounted.


In [None]:
# Run once, then comment it out again.
# NOTE(review): the source path below is relative (`content/gcs/lecr`) while the bucket is
# mounted at the absolute path `/content/gcs` -- confirm the working directory this is run from.
#!cp -r content/gcs/lecr .

In [None]:
# Run once, then comment these out again (pinned package versions for this experiment)
#!pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
#!pip install transformers==4.20.1
#!pip install sentencepiece==0.1.97
#!pip install tokenizers==0.12.1
#!pip -qqq install sentence-transformers

In [None]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import sys
import gc
import time
from datetime import datetime
import math
import random
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from sklearn.model_selection import KFold

# Set the TOKENIZERS_PARALLELISM environment variable to 'True' for this process
os.environ['TOKENIZERS_PARALLELISM'] = 'True'
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#torch.autograd.set_detect_anomaly(True) # for debugging

# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Experiment configuration: model location, hyper-parameters, input and output paths.

    All values are plain class attributes. Note that ``tokenizer`` is loaded when the
    class body executes, i.e. as soon as this cell runs.
    """
    exp_no = "Reranker078" # Change and double-check on every run!
    #num_workers = 4
    model = f"lecr/models/Retriever054/model_trained" # Change and double-check on every run!
    model_name = Path(model).name
    # Loaded from the same path as `model`, at class-definition time
    tokenizer = AutoTokenizer.from_pretrained(model)
    epochs = 5
    lr = 2e-05
    batch_size = 224
    batch_size_eval = 128
    seed = 42

    # Load paths: train (change and double-check on every run!):
    # train.pkl path
    train_pkl_path = "lecr/input/Reranker072/lecr-retriever-054-reranker-dataset-100/train.pkl" 
    reranker_text_dataset = "lecr/input/Retriever037-dataset-with-reranker/topics_with_fold.pkl"

    # Load paths: common (normally no need to change)
    # competition_data
    compdata_topics = "lecr/input/00_competition/topics.csv"
    compdata_content = "lecr/input/00_competition/content.csv"
    compdata_correlations = "lecr/input/00_competition/correlations.csv"
    compdata_sample_sub = "lecr/input/00_competition/sample_submission.csv"

    # Save paths: normally no need to change
    model_save_base = "/content/gcs/lecr/models"
    # model save path (all derived from model_save_base and exp_no)
    model_save_path = f"{model_save_base}/{exp_no}/model_trained"
    model_chk_path = f"{model_save_base}/{exp_no}/model_chk"
    model_final_save_path = f"{model_save_base}/{exp_no}/model_trained_final"
    # predictions save path
    predictions_save_path = f"{model_save_base}/{exp_no}/predictions"

    # Logging path (changes with exp_no):
    logging_path = f"{model_save_base}/{exp_no}/log"

# Create every output directory up front; directories that already exist are left as they are.
for _output_dir in (
    CFG.model_save_path,
    CFG.model_chk_path,
    CFG.model_final_save_path,
    CFG.predictions_save_path,
    CFG.logging_path,
):
    os.makedirs(_output_dir, exist_ok=True)

# =========================================================================================
# Set Logger
# =========================================================================================
import logging
from logging import StreamHandler, FileHandler, Formatter
from logging import INFO, DEBUG, NOTSET

# Root logger: the module-level logging.debug()/logging.info() calls used throughout this
# notebook are dispatched through it.
logger = logging.getLogger()
# A freshly created root logger has level WARNING, which discards INFO/DEBUG records before
# any handler sees them. Set the level explicitly so this cell does not depend on whatever
# configured logging earlier in the kernel; each handler below still applies its own level.
logger.setLevel(DEBUG)

# Remove the handlers a previous execution of this cell installed, so re-running the cell
# does not print every message twice or keep old log files open.
for _stale_handler in list(logger.handlers):
    if _stale_handler.get_name() in ("lecr_stream", "lecr_file"):
        logger.removeHandler(_stale_handler)
        _stale_handler.close()

# Stream handler: bare message text, INFO and above
stream_handler = StreamHandler()
stream_handler.set_name("lecr_stream")
stream_handler.setLevel(INFO)
stream_handler.setFormatter(Formatter("%(message)s"))

# File handler: a new timestamped file under CFG.logging_path per execution, DEBUG and above
file_handler = FileHandler(
    f"{CFG.logging_path}/log{datetime.now():%Y%m%d%H%M%S}.log"
)
file_handler.set_name("lecr_file")
file_handler.setLevel(DEBUG)
file_handler.setFormatter(
    Formatter("%(asctime)s@ %(name)s [%(levelname)s] %(funcName)s: %(message)s")
)

# Attach both handlers to the root logger
logger.addHandler(stream_handler)
logger.addHandler(file_handler)

logging.debug('Set logger')
logging.info(f'exp_no: {CFG.exp_no}')
logging.info(f'device: {device}')

# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed the Python, NumPy and PyTorch random number generators from ``cfg.seed``.

    Also writes the seed to the ``PYTHONHASHSEED`` environment variable and switches
    cuDNN to deterministic mode.

    Args:
        cfg: any object exposing an integer ``seed`` attribute.
    """
    seed_value = cfg.seed
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    # Same seed for every generator: stdlib random, NumPy, torch (CPU) and torch (CUDA)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed_value)
    torch.backends.cudnn.deterministic = True

logging.debug('Seed everything for deterministic results')


# =========================================================================================
# Utils
# =========================================================================================
import pickle

def show_df(df:pd.DataFrame, rows_=5):
    """Log the shape of ``df`` and render its first ``rows_`` rows with ``display``.

    Returns whatever ``display`` returns.
    """
    logging.info(df.shape)
    preview = df.head(rows_)
    return display(preview)

def pickle_dump(obj_, filename):
    """Pickle ``obj_`` and write it to ``filename`` (opened in binary write mode)."""
    target = f'{filename}'
    with open(target, 'wb') as handle:
        pickle.dump(obj_, handle)
        
def pickle_load(path_):
    """Read the file at ``path_`` and return the object unpickled from it."""
    # NOTE: unpickling can execute arbitrary code -- only use this on files you trust.
    with open(f'{path_}', 'rb') as handle:
        return pickle.load(handle)

# =========================================================================================
# F2 score metric
# =========================================================================================
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two Series of whitespace-separated id strings.

    Each row of ``y_true`` / ``y_pred`` is split on whitespace into a set of ids. For a row,
    ``F2 = tp / (tp + 0.2 * fp + 0.8 * fn)``, where tp/fp/fn are the sizes of the
    intersection, prediction-only and truth-only sets. Rows are paired positionally.

    Args:
        y_true: pandas Series of strings, the ground-truth ids of each row.
        y_pred: pandas Series of strings, the predicted ids of each row.

    Returns:
        The mean row score rounded to 4 decimals. A row where both sets are empty has a
        zero denominator and is scored 0.0 instead of NaN, so it cannot poison the mean.
    """
    true_sets = y_true.apply(lambda x: set(x.split()))
    pred_sets = y_pred.apply(lambda x: set(x.split()))
    pairs = list(zip(true_sets, pred_sets))
    tp = np.array([len(truth & pred) for truth, pred in pairs], dtype=float)
    fp = np.array([len(pred - truth) for truth, pred in pairs], dtype=float)
    fn = np.array([len(truth - pred) for truth, pred in pairs], dtype=float)
    # The unused precision/recall intermediates were dropped: they divided by zero
    # whenever a prediction or truth set was empty and contributed nothing to the result.
    denominator = tp + 0.2 * fp + 0.8 * fn
    # Divide only where the denominator is positive; the remaining rows keep the 0.0 default.
    f2 = np.divide(tp, denominator, out=np.zeros_like(denominator), where=denominator > 0)
    return round(f2.mean(), 4)

def val_f2score_with_same_length(x_val, correlations):
    """F2 score when each topic predicts exactly as many contents as it truly has.

    For every topic, the candidate rows of ``x_val`` are ranked by ``predictions``
    (highest first) and the top ``k`` ``content_ids`` are kept, where ``k`` is the number
    of space-separated ids in that topic's ground-truth ``content_ids``. The truncated
    predictions are then scored against the ground truth with ``f2_score``.

    Args:
        x_val: DataFrame with columns ``topics_ids``, ``content_ids`` and ``predictions``
            (one row per topic/content candidate pair).
        correlations: DataFrame with columns ``topic_id`` and ``content_ids``
            (space-separated ground-truth ids). It is not modified.

    Returns:
        The value returned by ``f2_score`` over the topics present in both inputs.
    """
    # `assign` returns a new frame, so the caller's `correlations` does not gain a column.
    truth = correlations.assign(
        content_ids_cnt=correlations['content_ids'].map(lambda x: len(x.split(' ')))
    )
    # Descending sort on both keys; within each topic this puts the highest prediction first.
    ranked = x_val.sort_values(['topics_ids', 'predictions'], ascending=False)
    ranked_ids = ranked.groupby('topics_ids')['content_ids'].apply(list)
    ranked_ids.name = 'content_ids_pred'
    # Inner join: only topics that have both ground truth and candidates are scored.
    merged = truth.merge(ranked_ids, how='inner', left_on='topic_id', right_index=True)
    # Keep the top-k candidates per topic; a plain loop also works when `merged` is empty.
    merged['content_ids_pred'] = [
        ' '.join(candidates[:count])
        for candidates, count in zip(merged['content_ids_pred'], merged['content_ids_cnt'])
    ]
    return f2_score(merged['content_ids'], merged['content_ids_pred'])

# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load the training pairs and the competition correlations table.

    Args:
        cfg: configuration object providing ``train_pkl_path`` (pickled DataFrame with at
            least ``title1`` and ``title2`` columns) and ``compdata_correlations`` (CSV path).

    Returns:
        ``(train, correlations)``: ``train`` has missing titles replaced by a placeholder
        and a new ``text`` column ``title1 + '[SEP]' + title2``; ``correlations`` is the
        CSV as read by pandas.
    """
    train = pickle_load(cfg.train_pkl_path)
    # Assign the filled column back instead of `train[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary under pandas copy-on-write and would
    # silently leave the NaNs in place.
    for title_col in ('title1', 'title2'):
        train[title_col] = train[title_col].fillna("Title does not exist")
    correlations = pd.read_csv(cfg.compdata_correlations)
    # Create feature column
    train['text'] = train['title1'] + '[SEP]' + train['title2']

    logging.info(f'train.shape: {train.shape}')
    logging.info(f"correlations.shape: {correlations.shape}")
    return train, correlations

# =========================================================================================
# Dataset
# =========================================================================================
class InputExampleDatasetTrain(Dataset):
    """Dataset that turns each row of a pair DataFrame into an ``InputExample``.

    The DataFrame must provide the columns ``topic_all_text``, ``content_all_text``
    and ``target``.
    """

    def __init__(self, df):
        # Extract the three columns as arrays once; rows are then read by position.
        self.topics_text, self.content_text, self.label = (
            df[column].values
            for column in ('topic_all_text', 'content_all_text', 'target')
        )

    def __len__(self):
        return len(self.topics_text)

    def __getitem__(self, item):
        # texts = [topic text, content text]; label = the row's target value
        text_pair = [self.topics_text[item], self.content_text[item]]
        return InputExample(texts=text_pair, label=self.label[item])

exp_no: Reranker078
device: cuda


In [None]:
# Display the version string of the installed PyTorch
torch.__version__

'1.10.0+cu113'

In [None]:
# Display the CUDA version reported by this PyTorch build
torch.version.cuda

'11.3'

In [None]:
# =========================================================================================
# Run
# =========================================================================================
# Seed everything
seed_everything(CFG)

# Read data
train, correlations = read_data(CFG)
topics_reranker = pickle_load(CFG.reranker_text_dataset)
# Replace title1 with the reranker text looked up by topic id.
# Note: the 'text' column built inside read_data() still holds the previous title1.
train['title1'] = train['topics_ids'].map(topics_reranker.set_index('id')['all_text_reranker'])
# Topic ids without a match in topics_reranker would show up as nulls here
logging.info(f"check: title1 null count={train['title1'].isnull().sum()}")
#train = train.sample(5000)
logging.info(f"{train['target'].value_counts()}")

# Hold out one fold for validation; every other fold is used for training
val_fold = 2
train = train.rename(columns={'title1':'topic_all_text', 'title2':'content_all_text'})
train_tr = train[train['fold'] != val_fold]
train_va = train[train['fold'] == val_fold]

train_dataset = InputExampleDatasetTrain(train_tr)

# Evaluator built from the held-out fold: (topic text, content text) pairs with their target
from sentence_transformers import evaluation
evaluator = evaluation.BinaryClassificationEvaluator(sentences1=train_va['topic_all_text'].tolist(),
                                                   sentences2=train_va['content_all_text'].tolist(),
                                                   labels=train_va['target'].tolist(),
                                                   name='train_va_eval',
                                                   batch_size=CFG.batch_size_eval,
                                                   show_progress_bar=True)

# Start from the model stored at CFG.model and fine-tune it with OnlineContrastiveLoss
model = SentenceTransformer(CFG.model)
train_loss = losses.OnlineContrastiveLoss(model=model)
num_epochs = CFG.epochs
# drop_last=True discards the final incomplete batch, so len(train_dataloader) counts full batches only
train_dataloader = DataLoader(
    train_dataset, 
    batch_size = CFG.batch_size, 
    shuffle = True, 
    #num_workers = CFG.num_workers, 
    pin_memory = True, 
    drop_last = True
)
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) # 10% of the total number of training steps

# checkpoint_save_steps equals the number of batches per epoch, i.e. one checkpoint per epoch;
# checkpoint_save_total_limit=num_epochs is large enough to keep all of them.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          save_best_model = True,
          output_path=CFG.model_save_path,
          evaluator=evaluator,
          #evaluation_steps=len(train_dataloader),
          use_amp=True,
          checkpoint_path=CFG.model_chk_path,
          checkpoint_save_steps=len(train_dataloader),
          checkpoint_save_total_limit=num_epochs,
          optimizer_params={'lr': CFG.lr},
          warmup_steps=warmup_steps)

# Save the model as it is after the last epoch, separately from output_path above
model.save(f"{CFG.model_final_save_path}")

logging.info(f"{CFG.model_save_path}")
logging.info(f"{CFG.model_final_save_path}")

train.shape: (6151700, 7)
correlations.shape: (61517, 2)
check: title1 null count=0
0    5879233
1     272467
Name: target, dtype: int64
Load pretrained SentenceTransformer: lecr/models/Retriever054/model_trained
Use pytorch device: cuda


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/24903 [00:00<?, ?it/s]

Save model to /content/gcs/lecr/models/Reranker078/model_chk/24903
Binary Accuracy Evaluation of the model on train_va_eval dataset after epoch 0:


Batches:   0%|          | 0/749 [00:00<?, ?it/s]

Accuracy with Cosine-Similarity:           97.18	(Threshold: 0.7024)
F1 with Cosine-Similarity:                 49.62	(Threshold: 0.6516)
Precision with Cosine-Similarity:          53.38
Recall with Cosine-Similarity:             46.35
Average Precision with Cosine-Similarity:  48.31

Accuracy with Manhattan-Distance:           97.04	(Threshold: 137.8848)
F1 with Manhattan-Distance:                 45.07	(Threshold: 152.8870)
Precision with Manhattan-Distance:          47.98
Recall with Manhattan-Distance:             42.49
Average Precision with Manhattan-Distance:  42.92

Accuracy with Euclidean-Distance:           97.04	(Threshold: 6.5298)
F1 with Euclidean-Distance:                 45.50	(Threshold: 7.2080)
Precision with Euclidean-Distance:          45.76
Recall with Euclidean-Distance:             45.25
Average Precision with Euclidean-Distance:  43.55

Accuracy with Dot-Product:           96.88	(Threshold: 56.1802)
F1 with Dot-Product:                 42.92	(Threshold: 49.8260)


Iteration:   0%|          | 0/24903 [00:00<?, ?it/s]

Save model to /content/gcs/lecr/models/Reranker078/model_chk/49806
Binary Accuracy Evaluation of the model on train_va_eval dataset after epoch 1:


Batches:   0%|          | 0/749 [00:00<?, ?it/s]

Accuracy with Cosine-Similarity:           97.37	(Threshold: 0.7003)
F1 with Cosine-Similarity:                 53.53	(Threshold: 0.6567)
Precision with Cosine-Similarity:          57.04
Recall with Cosine-Similarity:             50.42
Average Precision with Cosine-Similarity:  53.76

Accuracy with Manhattan-Distance:           97.13	(Threshold: 138.6186)
F1 with Manhattan-Distance:                 47.08	(Threshold: 153.7906)
Precision with Manhattan-Distance:          48.00
Recall with Manhattan-Distance:             46.19
Average Precision with Manhattan-Distance:  46.98

Accuracy with Euclidean-Distance:           97.13	(Threshold: 6.4575)
F1 with Euclidean-Distance:                 47.67	(Threshold: 7.0828)
Precision with Euclidean-Distance:          50.62
Recall with Euclidean-Distance:             45.05
Average Precision with Euclidean-Distance:  47.59

Accuracy with Dot-Product:           97.07	(Threshold: 56.5311)
F1 with Dot-Product:                 47.53	(Threshold: 50.8577)


Iteration:   0%|          | 0/24903 [00:00<?, ?it/s]

Save model to /content/gcs/lecr/models/Reranker078/model_chk/74709
Binary Accuracy Evaluation of the model on train_va_eval dataset after epoch 2:


Batches:   0%|          | 0/749 [00:00<?, ?it/s]

Accuracy with Cosine-Similarity:           97.49	(Threshold: 0.7139)
F1 with Cosine-Similarity:                 56.56	(Threshold: 0.6777)
Precision with Cosine-Similarity:          63.05
Recall with Cosine-Similarity:             51.28
Average Precision with Cosine-Similarity:  57.30

Accuracy with Manhattan-Distance:           97.27	(Threshold: 141.9218)
F1 with Manhattan-Distance:                 50.84	(Threshold: 153.2797)
Precision with Manhattan-Distance:          53.31
Recall with Manhattan-Distance:             48.58
Average Precision with Manhattan-Distance:  51.01

Accuracy with Euclidean-Distance:           97.28	(Threshold: 6.5415)
F1 with Euclidean-Distance:                 51.15	(Threshold: 7.0858)
Precision with Euclidean-Distance:          54.30
Recall with Euclidean-Distance:             48.34
Average Precision with Euclidean-Distance:  51.62

Accuracy with Dot-Product:           97.14	(Threshold: 59.5090)
F1 with Dot-Product:                 49.18	(Threshold: 55.0361)


Iteration:   0%|          | 0/24903 [00:00<?, ?it/s]

Save model to /content/gcs/lecr/models/Reranker078/model_chk/99612
Binary Accuracy Evaluation of the model on train_va_eval dataset after epoch 3:


Batches:   0%|          | 0/749 [00:00<?, ?it/s]

Accuracy with Cosine-Similarity:           97.54	(Threshold: 0.7239)
F1 with Cosine-Similarity:                 58.11	(Threshold: 0.6886)
Precision with Cosine-Similarity:          62.81
Recall with Cosine-Similarity:             54.06
Average Precision with Cosine-Similarity:  59.47

Accuracy with Manhattan-Distance:           97.33	(Threshold: 141.0785)
F1 with Manhattan-Distance:                 52.43	(Threshold: 153.4688)
Precision with Manhattan-Distance:          54.85
Recall with Manhattan-Distance:             50.22
Average Precision with Manhattan-Distance:  53.37

Accuracy with Euclidean-Distance:           97.35	(Threshold: 6.5708)
F1 with Euclidean-Distance:                 52.83	(Threshold: 7.0478)
Precision with Euclidean-Distance:          57.56
Recall with Euclidean-Distance:             48.82
Average Precision with Euclidean-Distance:  54.05

Accuracy with Dot-Product:           97.18	(Threshold: 63.1875)
F1 with Dot-Product:                 49.59	(Threshold: 58.6940)


Iteration:   0%|          | 0/24903 [00:00<?, ?it/s]

Save model to /content/gcs/lecr/models/Reranker078/model_chk/124515
Binary Accuracy Evaluation of the model on train_va_eval dataset after epoch 4:


Batches:   0%|          | 0/749 [00:00<?, ?it/s]

Accuracy with Cosine-Similarity:           97.56	(Threshold: 0.7337)
F1 with Cosine-Similarity:                 57.82	(Threshold: 0.7010)
Precision with Cosine-Similarity:          61.16
Recall with Cosine-Similarity:             54.83
Average Precision with Cosine-Similarity:  59.72

Accuracy with Manhattan-Distance:           97.24	(Threshold: 136.3507)
F1 with Manhattan-Distance:                 51.31	(Threshold: 149.1303)
Precision with Manhattan-Distance:          53.60
Recall with Manhattan-Distance:             49.21
Average Precision with Manhattan-Distance:  52.08

Accuracy with Euclidean-Distance:           97.26	(Threshold: 6.3411)
F1 with Euclidean-Distance:                 51.75	(Threshold: 6.9249)
Precision with Euclidean-Distance:          53.45
Recall with Euclidean-Distance:             50.16
Average Precision with Euclidean-Distance:  52.77

Accuracy with Dot-Product:           97.16	(Threshold: 64.1162)
F1 with Dot-Product:                 49.25	(Threshold: 59.4674)
