In [13]:
# ----------
# ライブラリ
# ----------
import os
import random
import numpy as np
import torch
from psutil import virtual_memory

import polars as pl
import pandas as pd
from sklearn.model_selection import StratifiedKFold

from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm

import xgboost as xgb
import scipy.stats as stats
import lightgbm as lgbm
from sklearn.metrics import accuracy_score
import fontstyle
import warnings
warnings.simplefilter('ignore')

In [3]:
import os

# DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
DIR = '/workspace'
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
# ----------
# 設定
# ----------
num_fold = 5
seed = 0

DEVICE = "cuda" # "cpu" or "cuda"
tokenizer = None
BATCH_SIZE = 16
MAX_LEN = 768

# テキスト特徴として連結するカラム
txt_columns = ['title', 'keywords', 'abstract']

In [5]:
# ----------
# 関数
# ----------
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        current_device = torch.cuda.current_device()
        print("Device:", torch.cuda.get_device_name(current_device))
        ram_gb = virtual_memory().total / 1e9
        print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

def get_stratifiedkfold(train, target_col, n_splits, seed):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train.to_numpy(), train.get_column(target_col).to_numpy())
    fold_array = np.zeros(len(train))
    for fold, (_, idx_valid) in enumerate(generator):
        fold_array[idx_valid] = fold
    return fold_array

## 学習用にtitle, abstract, keywordsの要素数を特徴として追加しています。

In [7]:
# ----------
# データ
# ----------
train = pl.read_csv(os.path.join(INPUT_DIR,'train_data.csv'))
test = pl.read_csv(os.path.join(INPUT_DIR,'test_data.csv'))
sub = pl.read_csv(os.path.join(INPUT_DIR,'submission.csv'))

# ----------
# 前処理・特徴生成
# ----------
set_seed(seed)

# テキスト特徴の作成
# グループごとにFold数を設定
train =\
train.with_columns(
    pl.concat_str(txt_columns, separator='_').alias('txt_feat'),
    # title
    pl.when(pl.col('title').str.to_lowercase().is_in(['', 'nan', '0', 'blank']))
    .then(0)
    .otherwise(pl.col('title').str.to_lowercase().str.count_match(' ') + 1)
    .alias('num_title'),
    # abstract
    pl.when(pl.col('abstract').str.to_lowercase().is_in(['', 'nan', '0', 'blank']))
    .then(0)
    .otherwise(pl.col('abstract').str.to_lowercase().str.count_match(' ') + 1)
    .alias('num_abstract'),
    # keywords
    pl.when(pl.col('keywords').str.to_lowercase().is_in(['', 'nan', '0', 'blank']))
    .then(0)
    .otherwise(pl.col('keywords').str.to_lowercase().str.count_match(' ') + 1)
    .alias('num_keywords'),
    # group
    pl.concat_str(['year', 'y'], separator='-').alias('group'),
    )

test = \
test.with_columns(
    pl.concat_str(txt_columns, separator='. ').alias('txt_feat'),
    # title
    pl.when(pl.col('title').str.to_lowercase().is_in(['', 'nan', '0', 'blank']))
    .then(0)
    .otherwise(pl.col('title').str.to_lowercase().str.count_match(' ') + 1)
    .alias('num_title'),
    # abstract
    pl.when(pl.col('abstract').str.to_lowercase().is_in(['', 'nan', '0', 'blank']))
    .then(0)
    .otherwise(pl.col('abstract').str.to_lowercase().str.count_match(' ') + 1)
    .alias('num_abstract'),
    # keywords
    pl.when(pl.col('keywords').str.to_lowercase().is_in(['', 'nan', '0', 'blank']))
    .then(0)
    .otherwise(pl.col('keywords').str.to_lowercase().str.count_match(' ') + 1)
    .alias('num_keywords'),
    )

display(train.head(3))
display(test.head(3))

Device: NVIDIA RTX A4000
Your runtime has 16.6 gigabytes of available RAM



id,title,year,abstract,keywords,y,txt_feat,num_title,num_abstract,num_keywords,group
i64,str,i64,str,str,i64,str,u32,u32,u32,str
1,"""Hierarchical A…",2018,"""We propose a n…","""generative, hi…",0,"""Hierarchical A…",4,155,7,"""2018-0"""
2,"""Learning to Co…",2018,"""Words in natur…","""NLU, word embe…",0,"""Learning to Co…",8,130,5,"""2018-0"""
3,"""Graph2Seq: Sca…",2018,"""Neural network…","""""",0,"""Graph2Seq: Sca…",6,143,0,"""2018-0"""


id,title,year,abstract,keywords,txt_feat,num_title,num_abstract,num_keywords
i64,str,i64,str,str,str,u32,u32,u32
1,"""StyleAlign: An…",2022,"""In this paper,…","""StyleGAN, tran…","""StyleAlign: An…",8,209,11
2,"""Embedding a ra…",2021,"""We develop a t…","""Graph neural n…","""Embedding a ra…",16,272,11
3,"""BBRefinement: …",2021,"""We present a c…","""object detecti…","""BBRefinement: …",11,152,6


In [8]:
# ----------
# BERT
# ----------

# Dataset
class EmbedDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        text = self.df[idx, 'txt_feat']
        tokens = tokenizer(
                text,
                None,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=MAX_LEN,
                return_tensors="pt")
        tokens = {k:v.squeeze(0) for k,v in tokens.items()}
        return tokens

# Pooling
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state.detach().cpu()
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

def get_embeddings(MODEL_NM='', MAX_LEN=512, BATCH_SIZE=4, verbose=True):
    global tokenizer, DEVICE

    model = AutoModel.from_pretrained(MODEL_NM)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NM)
    
    model = model.to(DEVICE)
    model.eval()
        
    # train
    all_train_text_feats = []
    for batch in tqdm(embed_dataloader_tr,total=len(embed_dataloader_tr)):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        with torch.no_grad():
            model_output = model(input_ids=input_ids,attention_mask=attention_mask)
        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        # Normalize the embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
        all_train_text_feats.extend(sentence_embeddings)
    all_train_text_feats = np.array(all_train_text_feats)
    if verbose:
        print('Train embeddings shape',all_train_text_feats.shape)
    
    # test
    te_text_feats = []
    for batch in tqdm(embed_dataloader_te,total=len(embed_dataloader_te)):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        with torch.no_grad():
            model_output = model(input_ids=input_ids,attention_mask=attention_mask)
        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        # Normalize the embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
        te_text_feats.extend(sentence_embeddings)
    te_text_feats = np.array(te_text_feats)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)
        
    # save feat
    np.save(f"{MODEL_NM.split('/')[-1]}_train", all_train_text_feats)
    np.save(f"{MODEL_NM.split('/')[-1]}_test", te_text_feats)

    return all_train_text_feats, te_text_feats

In [9]:
ds_tr = EmbedDataset(train)
embed_dataloader_tr = torch.utils.data.DataLoader(ds_tr,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)
ds_te = EmbedDataset(test)
embed_dataloader_te = torch.utils.data.DataLoader(ds_te,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)

In [10]:
%%time
MODEL_NM = 'microsoft/deberta-v3-base'
train_emb, test_emb = get_embeddings(MODEL_NM, MAX_LEN, BATCH_SIZE)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have b

  0%|          | 0/311 [00:00<?, ?it/s]

Train embeddings shape (4974, 768)


  0%|          | 0/400 [00:00<?, ?it/s]

Test embeddings shape (6393, 768)
CPU times: user 9min 56s, sys: 30.9 s, total: 10min 26s
Wall time: 9min 25s


In [11]:
emb_col = [f'emb{i}' for i in range(train_emb.shape[1])]
train_emb_df = pl.DataFrame(train_emb, schema=emb_col)
train = pl.concat([train, train_emb_df], how='horizontal')
test_emb_df = pl.DataFrame(test_emb, schema=emb_col)
test = pl.concat([test, test_emb_df], how='horizontal')

In [53]:
# Run XGBoost
use_col = ['num_title', 'num_abstract', 'num_keywords'] + emb_col
test_x = test.select(use_col).to_numpy()

whole_va_preds = []
whole_test_preds = []
for seed in range(3):
    print(fontstyle.apply(f'< Seed : {seed} >', 'BLACK/BOLD'))
    set_seed(seed)
    train = train.with_columns(
        pl.Series(get_stratifiedkfold(train, 'group', num_fold, seed))
        .alias('folds')
        )
    
    oof_preds = np.zeros((len(train), ), dtype=np.float32)
    preds = []
    for fold in range(num_fold):
        tr_x = train.filter(pl.col('folds')!=fold).select(use_col).to_numpy()
        tr_y = train.filter(pl.col('folds')!=fold).select('y').to_numpy()
        va_x = train.filter(pl.col('folds')==fold).select(use_col).to_numpy()
        va_y = train.filter(pl.col('folds')==fold).select('y').to_numpy()

        params = {
        'objective': 'binary:logistic',
        'n_estimators': 10000,
        'random_state': 0, 
        'learning_rate': 0.01,
        'max_depth': 8,
        'colsample_bytree': 1.0,
        'colsample_bylevel': 0.5,
        'subsample': 0.9,
        'gamma': 0,
        'lambda': 1,
        'alpha': 0,
        'min_child_weight': 1,
        'tree_method': 'gpu_hist',
        }

        clf = xgb.XGBClassifier(**params)
        clf.fit(
            tr_x, tr_y,
            eval_set=[(va_x, va_y)],
            early_stopping_rounds=100,
            verbose=100)

        va_preds_p = clf.predict_proba(va_x)[:, 1]
        oof_preds[
            train.select(
                pl.when(pl.col('folds')==fold).then(True).otherwise(False)
                ).to_numpy().reshape(-1).astype(bool)
                ] = va_preds_p
        va_preds = (va_preds_p > 0.5).astype(int)
        score = accuracy_score(va_y, va_preds)
        print(f'Fold : {fold+1} Accuracy score: {score}')
        print()
        test_preds_p = clf.predict_proba(test_x)[:, 1]
        preds.append(test_preds_p)

    score_s = accuracy_score(train.select('y').to_numpy(), oof_preds > 0.5)
    print(fontstyle.apply(f'Seed{seed} Accuracy score : {score_s}', 'BLACK/BOLD'))
    print()
    whole_va_preds.append(oof_preds)
    whole_test_preds.append(preds)

# preds_va_p = np.mean(whole_va_preds, axis=0)
# whole_score = accuracy_score(train.select('y').to_numpy(), preds_va_p > 0.5)
# preds_test = (np.mean(np.mean(whole_test_preds, axis=0), axis=0) > 0.5).astype(int)
preds_va = np.array([np.where(preds > 0.5, 1, 0) for preds in whole_va_preds])
whole_score = accuracy_score(train.select('y').to_numpy(), stats.mode(preds_va, axis=0).mode.flatten())
test_preds_array = np.array(whole_test_preds)
test_preds_array = test_preds_array.reshape(test_preds_array.shape[0]*test_preds_array.shape[1], -1)
preds_test = np.array([np.where(preds > 0.5, 1, 0) for preds in test_preds_array])
preds_test = stats.mode(preds_test, axis=0).mode.flatten()
print()  
print(fontstyle.apply(f'whole Accuracy score: {whole_score}', 'BLACK/BOLD'))
print()

display(pl.Series(preds_test).value_counts())

[30m[1m< Seed : 0 >[0m
Device: NVIDIA RTX A4000
Your runtime has 16.6 gigabytes of available RAM

[0]	validation_0-logloss:0.69119
[100]	validation_0-logloss:0.59761
[200]	validation_0-logloss:0.57961
[300]	validation_0-logloss:0.57701
[389]	validation_0-logloss:0.57869
Fold : 1 Accuracy score: 0.7045226130653266

[0]	validation_0-logloss:0.69144
[100]	validation_0-logloss:0.60074
[200]	validation_0-logloss:0.58018
[300]	validation_0-logloss:0.57685
[400]	validation_0-logloss:0.57729
[473]	validation_0-logloss:0.57966
Fold : 2 Accuracy score: 0.6924623115577889

[0]	validation_0-logloss:0.69109
[100]	validation_0-logloss:0.59339
[200]	validation_0-logloss:0.57009
[300]	validation_0-logloss:0.56556
[400]	validation_0-logloss:0.56567
[421]	validation_0-logloss:0.56658
Fold : 3 Accuracy score: 0.7175879396984924

[0]	validation_0-logloss:0.69121
[100]	validation_0-logloss:0.59920
[200]	validation_0-logloss:0.58108
[300]	validation_0-logloss:0.57978
[369]	validation_0-logloss:0.58167
Fo

Unnamed: 0_level_0,counts
i64,u32
0,6082
1,311
