<a href="https://colab.research.google.com/github/YujiK-github/kaggle_LECR/blob/main/1st_stage_model/1st_stage_model_inference_exp006_fold0_top50_ipynb_%E3%81%AE%E3%82%B3%E3%83%94%E3%83%BC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Show which GPU (if any) is attached to this runtime before running inference.
!nvidia-smi

Sat Mar  4 17:42:42 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install the libraries the inference script imports.
# NOTE(review): these installs are unpinned, so a fresh runtime may resolve different
# versions. The recorded run below resolved sentence-transformers 2.2.2 and
# transformers 4.26.1 — consider pinning to those for reproducibility.
%pip install -U sentence_transformers
%pip install datasets transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m597.6 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190

In [None]:
# Mount Google Drive at /content/drive; the script's default --input_dir and
# --output_dir both point below /content/drive/MyDrive/KAGGLE-LECR/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%writefile 1st_stage_model_inference.py

# ===============================================================
#  Library
# ===============================================================
import os
import re
import gc
import sys
import math
import json
import heapq
import pickle
import random
import requests
import argparse
import scipy as sp
import numpy as np
import pandas as pd
from typing import Union
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import GroupKFold

import torch 
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, DataCollatorWithPadding
transformers.logging.set_verbosity_error()

from sentence_transformers import models, SentenceTransformer

# Module-level device selection: use CUDA when torch reports it as available,
# otherwise fall back to the CPU. main() moves the model to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

# ===============================================================
#  args
# ===============================================================
def parse_args():
    """Parse the command-line options of the inference script.

    All options are optional and fall back to the defaults listed below.
    ``--mode`` only accepts ``for_validation`` or ``for_training``.

    Returns:
        argparse.Namespace: the parsed options, one attribute per flag.
    """
    # (flag, add_argument keyword arguments) in the order they appear in --help.
    option_table = (
        ("--seed", {"type": int, "default": 42}),
        ("--input_dir", {"type": str, "default": "/content/drive/MyDrive/KAGGLE-LECR/"}),
        ("--output_dir", {"type": str, "default": "/content/drive/MyDrive/KAGGLE-LECR/last_data/"}),
        ("--filename", {"type": str, "default": "exp006"}),
        ("--model", {"type": str, "default": "sentence-transformers/all-mpnet-base-v2"}),
        ("--trn_fold", {"type": int, "default": 0}),
        ("--n_splits", {"type": int, "default": 3}),
        ("--max_len", {"type": int, "default": 256}),
        ("--n_neighbors", {"type": int, "default": 50}),
        ("--corpus_chunk_size", {"type": int, "default": 10_000}),
        ("--mode", {"type": str, "default": "for_validation",
                    "choices": ["for_validation", "for_training"]}),
        ("--batch_size", {"type": int, "default": 96}),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, required=False, **options)
    return cli.parse_args()


# ===============================================================
#  Utils
# ===============================================================

# Send an arbitrary message as a Slack notification.
def send_slack_message_notification(message):
    """Post ``message`` to a Slack incoming webhook on a best-effort basis.

    The webhook URL is read from the ``SLACK_WEBHOOK_URL`` environment variable
    so the secret does not have to live in the source; when the variable is not
    set, the literal placeholder below is used as before.

    This is called after the real work has finished, so a failed notification
    (bad URL, network error, timeout, non-2xx response) is reported on stderr
    instead of raising.

    Args:
        message: text to send; it becomes the ``text`` field of the JSON payload.

    Returns:
        None
    """
    webhook_url = os.environ.get("SLACK_WEBHOOK_URL", ' [URL] ')
    data = json.dumps({'text': message})
    headers = {'content-type': 'application/json'}
    try:
        # A timeout stops an unreachable endpoint from hanging the script forever.
        response = requests.post(webhook_url, data=data, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"Slack notification failed: {err}", file=sys.stderr)


def seed_everything(cfg):
    """Seed every random number generator the script uses.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        cfg: object with an integer ``seed`` attribute.

    Returns:
        None
    """
    seed = cfg.seed
    random.seed(seed)
    # Exported for child processes; the running interpreter reads PYTHONHASHSEED
    # only at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run; switch it off.
    torch.backends.cudnn.benchmark = False
    

def cos_sim(a: Tensor, b: Tensor):
    """
    Pairwise cosine similarity between the rows of ``a`` and the rows of ``b``.

    Inputs that are not tensors are converted with ``torch.tensor``; a 1-D input
    is treated as a single row.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    cited: https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/util.py
    """
    def _as_matrix(x):
        # Accept anything torch.tensor() can convert.
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x)
        # Promote a single vector to a one-row matrix.
        return x.unsqueeze(0) if len(x.shape) == 1 else x

    lhs = _as_matrix(a)
    rhs = _as_matrix(b)

    # With unit-length rows, the matrix product is the cosine similarity.
    lhs_unit = torch.nn.functional.normalize(lhs, p=2, dim=1)
    rhs_unit = torch.nn.functional.normalize(rhs, p=2, dim=1)
    return torch.mm(lhs_unit, rhs_unit.transpose(0, 1))

# ===============================================================
#  Data Loading
# ===============================================================
def data_load(cfg):
    """Load the competition CSVs, build the input sentences and pick one split.

    Reads ``content.csv``, ``topics.csv`` and ``correlations.csv`` from
    ``cfg.input_dir``, builds a ``sentence`` column for contents and topics,
    joins topics with their correlated content ids (one row per
    topic/content pair) and splits the topics into a training and a
    validation frame.

    Side effects: stores the loaded tokenizer on ``cfg.tokenizer`` and prints
    shapes and example sentences.

    Args:
        cfg: options object; reads ``input_dir``, ``model``, ``n_splits``,
            ``trn_fold`` and ``mode``.

    Returns:
        tuple: ``(df_content, df)`` where ``df`` is the validation frame when
        ``cfg.mode == "for_validation"`` and the training frame when
        ``cfg.mode == "for_training"``. Any other mode leaves ``df`` unassigned
        (the CLI restricts ``--mode`` to these two values).
    """
    print("========== Data Loading ==========")
    # Missing text fields become empty strings so string concatenation below works.
    df_content = pd.read_csv(cfg.input_dir+"content.csv").fillna({"title": "", "description": "", "text":""})
    df_topics = pd.read_csv(cfg.input_dir+"topics.csv").fillna({"title": "", "description": ""})
    df_corr = pd.read_csv(cfg.input_dir+"correlations.csv")   
    
    # Give the id columns unambiguous names before merging.
    df_content.rename(columns={"id":"content_id"}, inplace=True)
    df_topics.rename(columns={"id":"topic_id"}, inplace=True)
    df_corr.rename(columns={"content_ids":"content_id"}, inplace=True)
    
    # NOTE(review): the documented keyword of from_pretrained is `use_fast`;
    # `is_fast` probably does not select the fast tokenizer — confirm.
    cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model, is_fast=True)
    
    # Sentences are the text fields joined by the tokenizer's separator token.
    df_content["sentence"] = df_content["title"] + cfg.tokenizer.sep_token + df_content["description"]
    # assumes topics.csv carries a `context` column without missing values
    # (it is not covered by the fillna above) — TODO confirm
    df_topics["sentence"] = df_topics["title"] + cfg.tokenizer.sep_token +  df_topics["description"] +\
    cfg.tokenizer.sep_token + df_topics["context"]
    df_topics["sentence"] = df_topics["sentence"].str.replace(" >> ",  " ")
    df_topics = pd.merge(df_topics, df_corr, on="topic_id", how="left")
    
    df_content["content_sentence"] = df_content["sentence"]
    df_topics["topic_sentence"] = df_topics["sentence"]
    
    # Split the whitespace-separated id string and emit one row per (topic, content) pair.
    df_topics["content_id"] = df_topics["content_id"].str.split()
    df_topics = df_topics.explode("content_id", ignore_index=True)
    
    #df_topics = pd.merge(df_topics, df_content[["content_id", "content_sentence"]], on="content_id", how="left")
    
    print(df_topics.shape)
    # Topics with category == "source" that have content: always used for training.
    df_train = df_topics[(df_topics["category"] == "source")&(df_topics["has_content"] == True)].reset_index(drop=True)
    
    # Topics with category != "source" that have content: these are split into folds.
    df_valid = df_topics[(df_topics["category"] != "source")&(df_topics["has_content"] == True)].reset_index(drop=True)    
    
    """
    split cv using GroupKFold
    - split based on number of target's content_id or topic_id
    """
    # NOTE(review): the groups actually passed to GroupKFold are the `channel` values.
    print("========== CV split ==========")
    gkf = GroupKFold(cfg.n_splits)
    for i, (tr, val) in enumerate(gkf.split(X=df_valid, groups=df_valid["channel"])):
        df_valid.loc[val, "fold"] = i
    print(df_valid.groupby("fold").size())
    print(df_valid[["topic_id", "fold"]].drop_duplicates().groupby("fold").size())
    
    # Fold `cfg.trn_fold` is held out for validation; the other folds join the training set.
    _df_train = df_valid[df_valid["fold"] != cfg.trn_fold].reset_index(drop=True)
    df_valid = df_valid[df_valid["fold"] == cfg.trn_fold].reset_index(drop=True)
    
    df_train = pd.concat([df_train, _df_train], ignore_index=True)

    print("df_content", df_content.shape)
    print("df_corr", df_corr.shape)
    
    print("Input Sentence Example")
    print("========== Topics ==========")
    print(df_topics["sentence"].values.tolist()[:2])
    print("========== Content ==========")
    print(df_content["sentence"].values.tolist()[:2])

    # Select which split the caller gets back.
    if cfg.mode == "for_validation":
        df = df_valid
    elif cfg.mode == "for_training":
        df = df_train

    print("df.shape: ", df.shape)

    return df_content, df

def prepare_valid(df_content, df_valid):
    """
    Build the query / corpus / relevance mappings used for retrieval.

    The layout follows the inputs of
    ``evaluation.InformationRetrievalEvaluator(queries, corpus, relevant_docs)``:

    ex)
    queries = {'q1': 'What is machine learning?',
               'q2': 'How does deep learning work?'}
    corpus = {'d1': 'Machine learning is a method of data analysis.',
              'd2': 'Deep learning is a subfield of machine learning.',
              'd3': 'Neural networks are used in deep learning.'}
    relevant_docs = {'q1': {'d1'},
                     'q2': {'d2', 'd3'}}

    :param df_content: frame with ``content_id`` and ``content_sentence`` columns
    :param df_valid: frame with ``topic_id``, ``topic_sentence`` and ``content_id`` columns
    :return: (queries, corpus, relevant_docs)
    """
    # topic_id -> topic sentence (a repeated topic_id keeps its last sentence)
    topic_sentences = df_valid[["topic_id", "topic_sentence"]].set_index('topic_id')
    queries = topic_sentences['topic_sentence'].to_dict()

    # content_id -> content sentence
    content_sentences = df_content[["content_id", "content_sentence"]].set_index('content_id')
    corpus = content_sentences['content_sentence'].to_dict()

    # topic_id -> set of content ids correlated with that topic
    content_ids_by_topic = df_valid.groupby('topic_id')['content_id']
    relevant_docs = content_ids_by_topic.apply(set).to_dict()

    return queries, corpus, relevant_docs


# ===============================================================
#  Convert sentence to embeddings
# ===============================================================
def FeatureExtractor(cfg):
    """Build the sentence encoder: a transformer followed by mean pooling.

    Args:
        cfg: options object; ``cfg.model`` is the model name or path handed to
            ``models.Transformer`` and ``cfg.max_len`` its ``max_seq_length``.

    Returns:
        SentenceTransformer: the two modules chained together.
    """
    encoder = models.Transformer(cfg.model, max_seq_length=cfg.max_len)
    embedding_dim = encoder.get_word_embedding_dimension()
    mean_pooler = models.Pooling(embedding_dim, pooling_mode='mean')
    return SentenceTransformer(modules=[encoder, mean_pooler])


def get_pair(cfg, queries: dict, corpus: dict, model, device):
    """
    Retrieve, for every query, its ``cfg.n_neighbors`` most similar corpus entries.

    Queries are encoded once; the corpus is encoded in chunks of
    ``cfg.corpus_chunk_size`` texts. For each chunk, the per-query top-k
    cosine similarities are merged into a size-bounded min-heap.

    Adapted from
    https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/evaluation/InformationRetrievalEvaluator.py

    :param cfg: options; reads ``batch_size``, ``corpus_chunk_size``, ``n_neighbors``
    :param queries: query id -> query text
    :param corpus: corpus id -> corpus text
    :param model: encoder exposing ``eval()`` and ``encode(...)``
    :param device: not used inside this function
    :return: ``(queries_ids, queries_result_list)`` where ``queries_result_list[i]``
             is a list of ``{'corpus_id', 'score'}`` dicts for ``queries_ids[i]``,
             left in heap order (not sorted by score)
    """
    model.eval()

    queries_ids = list(queries.keys())
    query_texts = [queries[qid] for qid in queries_ids]

    corpus_ids = list(corpus.keys())
    corpus_texts = [corpus[cid] for cid in corpus_ids]
    n_docs = len(corpus_texts)

    query_embeddings = model.encode(query_texts,
                                    batch_size=cfg.batch_size,
                                    convert_to_tensor=True)

    # One heap per query; each holds at most cfg.n_neighbors (score, corpus_id) tuples.
    queries_result_list = [[] for _ in range(len(query_embeddings))]

    for chunk_start in tqdm(range(0, n_docs, cfg.corpus_chunk_size)):
        chunk_end = min(chunk_start + cfg.corpus_chunk_size, n_docs)

        chunk_embeddings = model.encode(corpus_texts[chunk_start:chunk_end],
                                        show_progress_bar=False,
                                        batch_size=cfg.batch_size,
                                        convert_to_tensor=True)

        # Similarity of every query against every text in this chunk.
        chunk_scores = cos_sim(query_embeddings, chunk_embeddings)

        # The chunk may be smaller than the requested number of neighbours.
        k = min(cfg.n_neighbors, len(chunk_scores[0]))
        top_values, top_indices = torch.topk(chunk_scores, k,
                                             dim=1, largest=True, sorted=False)
        top_values = top_values.cpu().tolist()
        top_indices = top_indices.cpu().tolist()

        for heap, row_indices, row_values in zip(queries_result_list, top_indices, top_values):
            for local_idx, score in zip(row_indices, row_values):
                # Translate the position inside the chunk into the global corpus id.
                entry = (score, corpus_ids[chunk_start + local_idx])
                if len(heap) < cfg.n_neighbors:
                    heapq.heappush(heap, entry)  # heap is ordered by the first tuple element
                else:
                    # Heap is full: push, then drop the smallest entry.
                    heapq.heappushpop(heap, entry)

    # Replace the (score, corpus_id) tuples with dicts, in place.
    for heap in queries_result_list:
        heap[:] = [{'corpus_id': corpus_id, 'score': score} for score, corpus_id in heap]
    return queries_ids, queries_result_list


def save_pair(cfg, queries_ids, queries_result_list):
    """Pickle the retrieved candidates of every query under ``cfg.output_dir``.

    The pickled object maps each query id to a list of ``(corpus_id, score)``
    tuples sorted by descending score and cut to ``cfg.n_neighbors`` entries.
    The file name depends on ``cfg.mode`` and ``cfg.n_neighbors``.

    Args:
        cfg: options; reads ``n_neighbors``, ``mode`` and ``output_dir``.
        queries_ids: query ids, aligned with ``queries_result_list``.
        queries_result_list: per query, a list of ``{'corpus_id', 'score'}`` dicts.
    """
    pair = {}
    for position, hits in enumerate(queries_result_list):
        query_id = queries_ids[position]
        # Best-scoring hits first.
        ranked = sorted(hits, key=lambda hit: hit['score'], reverse=True)
        pair[query_id] = [(hit['corpus_id'], hit['score']) for hit in ranked[0:cfg.n_neighbors]]

    # One output file per mode.
    if cfg.mode == "for_validation":
        filename = f"validation_top_{cfg.n_neighbors}_ver2.pkl"
    elif cfg.mode == "for_training":
        filename = f"train_top_{cfg.n_neighbors}_ver3.pkl"
    path = cfg.output_dir+filename

    with open(path, "wb") as f:
        pickle.dump(pair, f)
    print(f"{path} saved!")


def get_recall(cfg, queries_ids, queries_result_list, relevant_docs):
    """Mean recall@``cfg.n_neighbors`` over all queries.

    For each query, the hits are sorted by descending score, the first
    ``cfg.n_neighbors`` are kept, and the number of those that are relevant is
    divided by the number of relevant documents for that query.

    Args:
        cfg: options; reads ``n_neighbors``.
        queries_ids: query ids, aligned with ``queries_result_list``.
        queries_result_list: per query, a list of ``{'corpus_id', 'score'}`` dicts.
        relevant_docs: query id -> collection of relevant corpus ids.

    Returns:
        The mean of the per-query recalls (``np.mean``).
    """
    per_query_recall = []

    for position, hits in enumerate(queries_result_list):
        query_id = queries_ids[position]
        # Best-scoring hits first.
        ranked = sorted(hits, key=lambda hit: hit['score'], reverse=True)
        gold = relevant_docs[query_id]

        found = sum(1 for hit in ranked[0:cfg.n_neighbors] if hit['corpus_id'] in gold)
        per_query_recall.append(found / len(gold))

    return np.mean(per_query_recall)


# ===============================================================
#  main
# ===============================================================
def main(cfg):
    """Run the whole first-stage retrieval pipeline for one fold and mode.

    Steps: seed the RNGs, load the data, build the encoder and move it to the
    module-level ``device``, retrieve the top candidates per topic, pickle them
    and print the recall of the retrieved candidates.

    Args:
        cfg: parsed command-line options (see ``parse_args``).
    """
    seed_everything(cfg)

    # Data: all contents plus the topic split selected by cfg.mode.
    df_content, df_selected = data_load(cfg)

    # Encoder.
    model = FeatureExtractor(cfg)
    model.to(device)

    # Retrieval.
    queries, corpus, relevant_docs = prepare_valid(df_content, df_selected)
    queries_ids, queries_result_list = get_pair(cfg, queries, corpus, model, device)

    # Persist the candidates, then report how many relevant contents they cover.
    save_pair(cfg, queries_ids, queries_result_list)
    recall = get_recall(cfg, queries_ids, queries_result_list, relevant_docs)
    print(recall)


# ===============================================================
#  Execute
# ===============================================================
if __name__ == "__main__":
    args = parse_args()

    # Artefacts of one experiment/fold live in <output_dir>1st/<filename>/fold<k>/.
    args.output_dir = args.output_dir + "1st/" + args.filename + f"/fold{args.trn_fold}/"
    # exist_ok avoids the check-then-create race and is a no-op when the directory exists.
    os.makedirs(args.output_dir, exist_ok=True)

    # Point --model at the fine-tuned checkpoint directory of this experiment/fold.
    # NOTE(review): this path is hard-coded to the default Drive location and does
    # not follow a custom --output_dir — confirm that this is intended.
    args.model = "/content/drive/MyDrive/KAGGLE-LECR/last_data/1st/" + args.filename + f"/fold{args.trn_fold}/" \
                    + args.model.replace("/", "-") + "_fine-tuned/"

    # Echo the effective configuration into the run log.
    for k, v in vars(args).items():
        print(f"{k}: {v}")
    main(args)
    send_slack_message_notification(f"[{args.filename} : {args.trn_fold} : {args.mode}] finished!")

Writing 1st_stage_model_inference.py


In [None]:
# Retrieve the top-50 candidates for fold 0 in the script's default mode (for_validation).
!python 1st_stage_model_inference.py\
--trn_fold 0\
--n_neighbors 50

2023-03-04 17:43:33.761537: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 17:43:33.916826: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-04 17:43:34.675028: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-03-04 17:43:34.675137: W tensorflow/compiler/xla/stream_executor

In [None]:
# Same retrieval for fold 0, this time over the training split (--mode for_training).
!python 1st_stage_model_inference.py\
--mode for_training\
--trn_fold 0\
--n_neighbors 50

2023-03-04 17:47:15.237672: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 17:47:15.394007: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-04 17:47:16.160830: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-03-04 17:47:16.160956: W tensorflow/compiler/xla/stream_executor

In [None]:
# Presumably releases the Colab runtime once the runs above have finished, so the
# GPU is not kept allocated — confirm against the google.colab documentation.
from google.colab import runtime
runtime.unassign()