In [1]:
# Report the attached NVIDIA GPU, if any. The captured output below shows that no
# NVIDIA driver was reachable in this session.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [2]:
# Mount Google Drive at /content/drive; every path in CFG points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install transformers==4.20.1
!pip install sentencepiece==0.1.97
!pip install tokenizers==0.12.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Check
- exp_noを変えたか？: ok
- common_processed_data: topics_contextを変えたか？: ok

In [4]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
#import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding#
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GroupKFold

# Set TOKENIZERS_PARALLELISM for the kernel process through the IPython %env magic.
%env TOKENIZERS_PARALLELISM=true
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# =========================================================================================
# Config
# =========================================================================================
class CFG:
    """Notebook-wide configuration: experiment id, random seed and Google Drive paths."""
    # Experiment identifier; it is also the last component of topics_save_path.
    exp_no = "Retriever037"
    # Seed read by seed_everything() and by the validation-fold sampling.
    seed = 42

    # Raw competition files (CSV).
    compdata_topics = "/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/00_competition/topics.csv"
    compdata_content = "/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/00_competition/content.csv"
    compdata_correlations = "/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/00_competition/correlations.csv"
    compdata_sample_sub = "/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/00_competition/sample_submission.csv"
    # Shared preprocessed data: pickled per-topic context, merged onto topics by 'id'.
    topics_context = "/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/01_for_retriever/topics-context-for-retriever_v3/topics_with_context.pkl"
    # Shared preprocessed data: content-side pickles for the retriever.
    # Only content_txt is read in the cells visible here (merged onto topics by 'id').
    contents_processed_for_ret = "/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/01_for_retriever/contents-for-retriever_v4/content_preprocessed_for_retriever.pkl"
    content_txt = "/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/01_for_retriever/contents-for-retriever_v4/topics_with_content_pos_neg.pkl"
    content_token_str_dics = "/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/01_for_retriever/contents-for-retriever_v4/content_token_str_dics.pkl"

    # Model save path (currently disabled).
    #model_save_path = f"/content/drive/MyDrive/Colab_Files/kaggle/lecr/models/{exp_no}/model_add_token"
    # Output directory for the topics frame and token dictionaries produced by this notebook.
    topics_save_path = f"/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/01_for_retriever/{exp_no}"

# Create the output directory up front so the later pickle_dump calls have somewhere to write.
# os.makedirs replaces the `! mkdir -p {...}` shell-out: the path is passed as a single
# argument and is never re-parsed by a shell, so spaces or metacharacters cannot split it.
os.makedirs(CFG.topics_save_path, exist_ok=True)
# When True, the data-loading cell works on a 5000-row sample of topics.
DEBUG = False
# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed every random number generator this notebook uses.

    Args:
        cfg: Any object exposing an integer ``seed`` attribute (e.g. ``CFG``).

    Returns:
        None. The Python, NumPy and torch generators are re-seeded, the
        ``PYTHONHASHSEED`` environment variable is set, and cuDNN is switched
        to deterministic, non-autotuned mode.
    """
    seed = cfg.seed
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every CUDA device, not just the current one; it is
    # safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmark mode on, cuDNN may select a different algorithm on each run,
    # which defeats the determinism requested on the line above.
    torch.backends.cudnn.benchmark = False

    
# =========================================================================================
# Utils
# =========================================================================================
import pickle

def show_df(df:pd.DataFrame, rows_=5):
    """Print the frame's shape, then render its first ``rows_`` rows.

    Args:
        df: Frame to preview.
        rows_: Number of leading rows to render.

    Returns:
        Whatever the notebook's ``display`` call returns.
    """
    shape = df.shape
    print(shape)
    preview = df.head(rows_)
    return display(preview)

def pickle_dump(obj_, filename):
    """Pickle ``obj_`` into the file named by ``filename``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    target = str(filename)
    with open(target, mode='wb') as sink:
        pickle.dump(obj_, file=sink)
        
def pickle_load(path_):
    """Read and return the object pickled in the file at ``path_``.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at files you trust.
    """
    source_path = str(path_)
    with open(source_path, mode='rb') as source:
        return pickle.load(source)

def create_list_batch(list_:list, batch_size:int):
    """Split ``list_`` into consecutive chunks of at most ``batch_size`` items.

    Args:
        list_: Sequence to split; any object supporting ``len`` and slicing works.
        batch_size: Maximum number of items per chunk. Must be >= 1.

    Returns:
        A list of slices of ``list_`` in original order. Only the last chunk
        can be shorter than ``batch_size``. An empty input yields ``[]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Such values previously
            dropped items silently or produced meaningless chunks.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    # Step by batch_size so that exactly one slice is taken per chunk, instead
    # of probing len(list_) start offsets and discarding the empty results.
    return [list_[start:start + batch_size] for start in range(0, len(list_), batch_size)]


# =========================================================================================
# Helpers for adding category tokens
# =========================================================================================
# Build the token strings for one categorical column (e.g. channel).
def create_token_str_dic(df: pd.DataFrame, column: str):
    """Map each distinct value of ``df[column]`` to a ``"[<column>=<value>]"`` token string.

    Args:
        df: Frame containing ``column``.
        column: Name of the categorical column. The previous signature wrote
            ``column=str``, which made the ``str`` type object a default value
            rather than an annotation; that default could not name a real
            column, so the parameter is now required and annotated.

    Returns:
        dict whose keys are the unique values of ``df[column]`` and whose
        values are the matching token strings.
    """
    return {value: f"[{column}={value}]" for value in df[column].unique()}

# Register the token strings with the tokenizer.
def add_token_str_to_tokenizer(token_str_dic, null_token, cfg):
    """Add every token string, plus ``null_token``, to ``cfg.tokenizer``.

    The tokenizer length is printed before and after the addition.

    Args:
        token_str_dic: Mapping whose values are the token strings to add.
        null_token: Extra fallback token string, added after the mapped ones.
        cfg: Object exposing a ``tokenizer`` attribute.

    Returns:
        The largest number of input ids the tokenizer produces for any single
        added token when called with ``add_special_tokens=False``.
    """
    tokenizer = cfg.tokenizer
    print(len(tokenizer))
    new_tokens = list(token_str_dic.values())
    new_tokens.append(null_token)
    tokenizer.add_tokens(new_tokens)
    print(len(tokenizer))
    id_counts = [
        len(tokenizer(token, add_special_tokens = False)['input_ids'])
        for token in tqdm(new_tokens, total = len(new_tokens))
    ]
    return max(id_counts)

# Apply the token strings to a column; values missing from the dictionary become the null token.
def convert_texts_to_token_str(sr: pd.Series, token_str_dic, null_token):
    """Translate each value of ``sr`` into its token string.

    Args:
        sr: Series of raw category values. It is annotated with the
            ``pd.Series`` type; the previous annotation ``pd.Series()``
            constructed an empty Series every time the ``def`` was executed.
        token_str_dic: Mapping from raw value to token string.
        null_token: Replacement for values that have no entry in the mapping.

    Returns:
        A new Series, aligned with ``sr``, holding token strings.
    """
    return sr.map(token_str_dic).fillna(null_token)

# Given the topics frame and a column name, return the processed frame and its token dictionary.
def category_cols_to_token_str_col(topics, column):
    """Add a ``<column>_token`` column to ``topics`` and build the token dictionary.

    The dictionary is derived from the values present in ``topics[column]``.
    The new column is assigned directly onto the ``topics`` object passed in,
    which is then returned.

    Returns:
        Tuple ``(topics, token_str_dic)``.
    """
    mapping = create_token_str_dic(topics, column)
    fallback = f"[{column}==others]"
    token_col = f'{column}_token'
    topics[token_col] = convert_texts_to_token_str(topics[column], mapping, fallback)
    return topics, mapping

env: TOKENIZERS_PARALLELISM=true


In [5]:
# Seed all RNGs first; the unseeded DEBUG sample below is presumably drawn from
# NumPy's global state and so would depend on this call - TODO confirm.
seed_everything(CFG)

# Load the raw competition tables.
print('Load data')
topics = pd.read_csv(CFG.compdata_topics)
correlations = pd.read_csv(CFG.compdata_correlations)
if DEBUG:
    # Work on a 5000-row sample for quick iteration.
    topics = topics.sample(5000)

# Attach the correlated content ids to each topic.
print('Add correlations')
# 'content_ids' is a whitespace-separated string; also keep it as a list.
correlations['content_ids_list'] = correlations['content_ids'].map(lambda x: x.split())
# Align the key name with topics so the merge can use on='id'.
correlations = correlations.rename(columns={'topic_id':'id'}) 
# The inner join drops topics that have no row in correlations.
topics = topics.merge(correlations, how='inner', on='id')
    
# Create folds. Topics with category == 'source' are kept out of the K-fold split
# and get the sentinel fold 99; all other topics start at -1 and are assigned below.
print('Create Fold')
topics_src = topics[topics['category']=='source'].reset_index(drop=True)
topics_not_src = topics[topics['category']!='source'].reset_index(drop=True)
topics_src['fold'] = 99
topics_not_src['fold'] = -1
# channel_known starts True everywhere; it is flipped to False further down for
# the rows that GroupKFold placed in the validation fold.
topics_src["channel_known"] = True
topics_not_src["channel_known"] = True

# Group by channel so that all topics of a channel receive the same fold number.
kf = GroupKFold(n_splits=8)
kf_folds = kf.split(topics_not_src, groups=topics_not_src['channel'])
for num_, (tr_idx, val_idx) in enumerate(kf_folds):
    # val_idx is positional, which matches the labels because of reset_index above.
    topics_not_src.loc[val_idx, 'fold'] = num_    
    
#print(topics_not_src['fold'].value_counts())

# Randomly sample rows from the other folds and move them into the validation fold,
# so that validation also covers topics whose channel was seen in training.
# For now, fold 2 is used as the validation fold.
val_fold = 2
# Rows that GroupKFold itself put in the validation fold belong to held-out channels.
topics_not_src.loc[topics_not_src["fold"]==val_fold, "channel_known"] = False
# Number of rows to move: the same count as the held-out rows (multiplier 1).
len_fold = len(topics_not_src[topics_not_src['fold']==val_fold])*1
sample_idx = topics_not_src[topics_not_src['fold']!=val_fold].sample(len_fold, random_state=CFG.seed).index
# The moved rows keep channel_known == True.
topics_not_src.loc[sample_idx, 'fold'] = val_fold

topics = pd.concat([topics_src, topics_not_src], axis='rows').reset_index(drop=True)

# Sanity check: share of the validation fold that comes from held-out channels.
print(f"check: channel_unknown/val_fold = {topics['channel_known'].value_counts()[False] / len(topics[topics['fold']==val_fold])}")

# Replace missing titles/descriptions with empty strings so the string operations below work.
print('fillna')
for text_col in ('title', 'description'):
    topics[text_col] = topics[text_col].fillna("")

# Attach the precomputed per-topic context; the left join keeps every topic row.
print('load context')
topics_context = pickle_load(CFG.topics_context)
topics = pd.merge(topics, topics_context, how='left', on='id')

# Keep only the leading space-separated words of each text field.
print('cut length')
for text_col, max_words in (('title', 45), ('description', 115)):
    topics[text_col] = topics[text_col].map(
        lambda text, limit=max_words: ' '.join(text.split(' ')[:limit])
    )

# Convert categorical columns to token strings and collect the token dictionaries.
print('convert categories to token str')
token_str_dics = {}

topics, token_str_dics['level'] = category_cols_to_token_str_col(topics, 'level')
topics, token_str_dics['language'] = category_cols_to_token_str_col(topics, 'language')

# The validation fold is designed to contain unseen channels, so the channel
# dictionary is built from the training rows only and any validation channel
# missing from it falls back to the "others" token.
channel_null_token = "[channel==others]"
# .copy() makes both subsets independent frames, so the column assignments below
# do not act on a slice of `topics` (chained assignment).
tr_topics = topics[topics["fold"] != val_fold].copy()
va_topics = topics[topics["fold"] == val_fold].copy()
tr_topics, token_str_dics['channel'] = category_cols_to_token_str_col(tr_topics, 'channel')
# Reuse the shared helper rather than repeating its map + fillna logic inline.
va_topics['channel_token'] = convert_texts_to_token_str(
    va_topics['channel'], token_str_dics['channel'], channel_null_token
)
topics = pd.concat([tr_topics, va_topics], axis='rows').reset_index(drop=True)

# Share of validation rows that fell back to the "others" token. A boolean mean
# gives the same ratio as value_counts()[token] / len but does not raise KeyError
# when no row fell back.
others_ratio = (va_topics['channel_token'] == channel_null_token).mean()
print(f"check: channel_others/all_valid_data = {others_ratio}")
pickle_dump(token_str_dics, f'{CFG.topics_save_path}/token_str_dics.pkl')

# Build the model input text by joining the category tokens and the text fields
# with a literal '[SEP]' string between consecutive parts.
# NOTE(review): `+` on string columns propagates NaN, so a topic with any missing
# part would presumably end up with a NaN all_text. Only title and description
# are visibly filled with "" in this notebook; the context_* and children_title
# columns look like they come from the left-merged context table - confirm they
# are never missing.
print('concat text')
sep_ = '[SEP]'
topics['all_text'] = topics['channel_token'] + sep_ \
                     + topics['language_token'] + sep_ \
                     + topics['level_token'] + sep_ \
                     + topics['title'] + sep_ \
                     + topics['description'] + sep_ \
                     + topics['context_title_str'] + sep_ \
                     + topics['context_desc_str'] + sep_ \
                     + topics['children_title'] 

# Attach the pickled content-text table to every topic (the left join keeps all topics).
print('load content texts')
content_txt = pickle_load(CFG.content_txt)
topics = topics.merge(content_txt, how='left', on='id')

# Drop every column that is not part of the saved output.
topics = topics[['id',
                'all_text',
                'content_texts',
                'hard_negative_texts',
                'channel_known',
                'fold']]

print(topics['fold'].value_counts())

# Make sure the output directory exists before writing. os.makedirs replaces the
# `!mkdir -p {...}` shell-out so the path is never re-parsed by a shell (spaces or
# metacharacters in it cannot split the command).
os.makedirs(CFG.topics_save_path, exist_ok=True)
pickle_dump(topics, f'{CFG.topics_save_path}/topics_with_fold.pkl')
# Drop the intermediate frames from the namespace and run a garbage-collection pass.
del topics_src, topics_not_src
gc.collect()

Load data
Add correlations
Create Fold
check: channel_unknown/val_fold = 0.5
fillna
load context
cut length
convert categories to token str
check: channel_others/all_valid_data = 0.5052631578947369
concat text
load content texts
99    2361
2      380
0      247
1      203
6      173
7      167
5      164
4      162
3      155
Name: fold, dtype: int64


0

In [6]:
# Show where the fold-annotated topics frame was written.
print(f"{CFG.topics_save_path}/topics_with_fold.pkl")

/content/drive/MyDrive/Colab_Files/kaggle/lecr/input/01_for_retriever/Retriever037/topics_with_fold.pkl


In [7]:
# Quick look at the final frame: its shape, then the first rows.
print(topics.shape)
topics.head()

(4012, 6)


Unnamed: 0,id,all_text,content_texts,hard_negative_texts,channel_known,fold
0,t_39ea124896a1,[channel=0ec697][SEP][language=en][SEP][level=...,[[kind=video][SEP][language=en][SEP]Adding & s...,[[kind=document][SEP][language=en][SEP]Chapter...,True,99
1,t_639930e1909b,[channel=7b47c5][SEP][language=ar][SEP][level=...,[[kind=exercise][SEP][language=ar][SEP]المفردا...,[[kind=exercise][SEP][language=ar][SEP]الاستيع...,True,99
2,t_d9e41c8a030d,[channel=0c929f][SEP][language=sw][SEP][level=...,[[kind=exercise][SEP][language=sw][SEP]Subtrac...,[[kind=exercise][SEP][language=sw][SEP]Badili ...,True,99
3,t_180d26af637e,[channel=5139e9][SEP][language=ar][SEP][level=...,[[kind=video][SEP][language=en][SEP]Verbs Part...,[[kind=video][SEP][language=ar][SEP]Arabic: Th...,True,99
4,t_9b4516541cac,[channel=000cf7][SEP][language=bg][SEP][level=...,[[kind=video][SEP][language=bg][SEP]Въведение ...,[[kind=video][SEP][language=bg][SEP]Разработен...,True,99
