In [1]:
import pickle as pickle
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch import nn
import sklearn
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer
from load_data import *
from train import *
import warnings
warnings.filterwarnings('ignore')
from GPUtil import showUtilization
from tqdm.notebook import tqdm

In [116]:
class FeatureExtractionBert(nn.Module):
    """Expose the encoder of a fine-tuned sequence-classification checkpoint.

    The checkpoint is loaded through ``AutoModelForSequenceClassification`` and
    only its base encoder is kept, so ``forward`` returns the encoder outputs
    instead of classification logits.
    """

    def __init__(self, MODEL_NAME):
        """Load the config and the encoder stored under ``MODEL_NAME``.

        Args:
            MODEL_NAME: Model identifier or checkpoint directory handed to
                ``from_pretrained``.
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(MODEL_NAME)
        classifier = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=self.config)
        # `base_model` resolves the encoder through the model's own
        # `base_model_prefix` (the `.bert` attribute for BERT checkpoints), so a
        # non-BERT checkpoint no longer fails with AttributeError here.
        self.Bert = classifier.base_model

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Forward the inputs to the wrapped encoder and return its outputs unchanged.

        ``labels`` is accepted so the signature matches a classification model,
        but it is not used: no loss is computed. When ``return_dict`` is None the
        value of ``self.config.use_return_dict`` is used instead.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        return self.Bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

In [117]:
# Load the best-scoring baseline checkpoint onto the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
features = FeatureExtractionBert('./results/checkpoint-2500')
features = features.to(device)

In [118]:
# Load the data whose features will be extracted.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")


def _prepare_split(csv_path):
    """Run one CSV split through load_data -> label_to_num -> tokenized_dataset -> RE_Dataset.

    Returns the loaded frame, the converted labels, the tokenized inputs and the
    RE_Dataset built from the last two, in that order.
    """
    frame = load_data(csv_path)
    converted_labels = label_to_num(frame['label'].values)
    encoded = tokenized_dataset(frame, tokenizer)
    return frame, converted_labels, encoded, RE_Dataset(encoded, converted_labels)


train_dataset, train_label, tokenized_train, RE_train_dataset = _prepare_split("../dataset/train/train.csv")
# The validation data (dev.csv) has to be created separately.
dev_dataset, dev_label, tokenized_dev, RE_dev_dataset = _prepare_split("../dataset/train/dev.csv")

In [119]:
# Print the current GPU load / memory table (GPUtil) before starting feature extraction.
showUtilization()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
| ID | GPU | MEM |
------------------
|  0 |  0% | 35% |


In [120]:
# shuffle=False keeps the batches in dataset order, so the rows of the extracted
# feature matrices stay aligned with train_label / dev_label.
train_dataloader, dev_dataloader = (
    DataLoader(split, batch_size=16, shuffle=False)
    for split in (RE_train_dataset, RE_dev_dataset)
)

def FeatureExtraction(feature_extractor, dataloader):
    """Run a frozen encoder over a dataloader and collect its pooled outputs.

    Args:
        feature_extractor: ``nn.Module`` whose forward accepts ``input_ids`` and
            ``attention_mask`` (optionally ``token_type_ids``) as keyword
            arguments and returns an object with a ``pooler_output`` tensor.
        dataloader: Iterable of mapping-like batches holding tensors under
            ``'input_ids'``, ``'attention_mask'`` and, optionally,
            ``'token_type_ids'``.

    Returns:
        ``numpy.ndarray`` with the pooled outputs of all batches concatenated
        along the first axis, in iteration order.

    Side effects:
        Puts ``feature_extractor`` in eval mode and sets ``requires_grad = False``
        on all of its parameters.
    """
    for param in feature_extractor.parameters():
        param.requires_grad = False
    feature_extractor.eval()

    # Send the inputs to the device the model actually lives on rather than relying
    # on a notebook-level `device` global being defined and in sync with the model.
    first_param = next(feature_extractor.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    pooled_batches = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            model_inputs = {
                'input_ids': batch['input_ids'].to(target_device),
                'attention_mask': batch['attention_mask'].to(target_device),
            }
            # Not every tokenizer emits token_type_ids; pass them only when present.
            if 'token_type_ids' in batch:
                model_inputs['token_type_ids'] = batch['token_type_ids'].to(target_device)
            # Move each batch to host memory immediately so device memory use stays
            # at one batch instead of growing with the dataset.
            pooled_batches.append(feature_extractor(**model_inputs).pooler_output.detach().cpu())
    return torch.cat(pooled_batches).numpy()

In [11]:
# NOTE(review): `outputs` is not assigned at notebook scope in any visible cell (it only
# exists as a local inside FeatureExtraction), so this cell presumably relied on state
# from an earlier session and would likely raise NameError after Restart & Run All.
# Confirm, then drop it or inspect `ml_train.shape` once the features are extracted.
torch.cat(outputs).size()

torch.Size([32470, 768])

In [122]:
# Pooled encoder features for the train and validation splits, one row per example.
ml_train = FeatureExtraction(feature_extractor=features, dataloader=train_dataloader)
ml_valid = FeatureExtraction(feature_extractor=features, dataloader=dev_dataloader)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2030.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=486.0), HTML(value='')))




In [123]:
# Shapes of the extracted train / validation feature matrices.
ml_train.shape, ml_valid.shape

((32470, 768), (7765, 768))

In [22]:
# Number of training labels; it should equal the number of rows in ml_train.
len(train_label)

32470

# PCA

In [29]:
from sklearn.decomposition import PCA

# A float n_components asks scikit-learn to keep as many principal components as are
# needed to explain that fraction (here 95%) of the training variance.
pca = PCA(n_components=0.95)
# Fit on the training features only; other splits are projected with this fitted PCA.
ml_train_reduced = pca.fit_transform(ml_train)

In [30]:
# Second dimension = number of principal components PCA kept.
ml_train_reduced.shape

(32470, 41)

In [31]:
# Project the validation features with the PCA fitted on the training set (transform only, no refit).
ml_valid_reduced = pca.transform(ml_valid)

In [32]:
# Must have the same number of columns as ml_train_reduced.
ml_valid_reduced.shape

(7765, 41)

# Catboost

In [34]:
# One-time setup if CatBoost is missing (the logged run installed catboost 1.0.0):
#   %pip install catboost==1.0.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting catboost
  Downloading catboost-1.0.0-cp38-none-manylinux1_x86_64.whl (76.4 MB)
[K     |████████████████████████████████| 76.4 MB 269 kB/s  eta 0:00:01
Collecting plotly
  Downloading plotly-5.3.1-py2.py3-none-any.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 54.6 MB/s eta 0:00:01    |███▉                            | 2.9 MB 54.6 MB/s eta 0:00:01
[?25hCollecting matplotlib
  Downloading matplotlib-3.4.3-cp38-cp38-manylinux1_x86_64.whl (10.3 MB)
[K     |████████████████████████████████| 10.3 MB 62.6 MB/s eta 0:00:01
[?25hCollecting graphviz
  Downloading graphviz-0.17-py3-none-any.whl (18 kB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Col

In [35]:
from catboost import CatBoostClassifier, Pool

In [42]:
# Wrap the PCA-reduced features and their labels in CatBoost pools.
train_data = Pool(data=ml_train_reduced, label=train_label)
valid_data = Pool(data=ml_valid_reduced, label=dev_label)

# Mirror the CUDA check used for the encoder: train on the GPU when one is available,
# otherwise use CatBoost's CPU trainer instead of failing on a CPU-only host.
if torch.cuda.is_available():
    catboost_device_params = {"task_type": "GPU", "devices": 'cuda:0'}
else:
    catboost_device_params = {"task_type": "CPU"}

model_cat_reduced = CatBoostClassifier(**catboost_device_params)
# NOTE: the dev pool drives early stopping and best-iteration selection, so metrics
# computed later on the same dev set are optimistic.
model_cat_reduced.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)

Learning rate set to 0.149448
0:	learn: 1.5059796	test: 1.7172364	best: 1.7172364 (0)	total: 13.7ms	remaining: 13.6s




100:	learn: 0.3218465	test: 0.8715834	best: 0.8715834 (100)	total: 1.14s	remaining: 10.2s
200:	learn: 0.2631672	test: 0.8578229	best: 0.8532011 (167)	total: 2.18s	remaining: 8.68s
bestTest = 0.8532010876
bestIteration = 167
Shrink model to first 168 iterations.


<catboost.core.CatBoostClassifier at 0x7fec3231e940>

In [43]:
from sklearn.metrics import accuracy_score, f1_score

# Score the PCA-feature model on the validation split.
y_true_valid = dev_label
y_pred_valid_reduced = model_cat_reduced.predict(ml_valid_reduced)

reduced_accuracy = accuracy_score(y_true_valid, y_pred_valid_reduced)
# With one predicted label per example, micro-averaged F1 over all classes coincides
# with accuracy, so the two printed numbers are expected to match.
reduced_micro_f1 = f1_score(y_true_valid, y_pred_valid_reduced, average="micro")

print(f'Accuracy: {reduced_accuracy}, f1_score: {reduced_micro_f1}')

Accuracy: 0.7426915647134579, f1_score: 0.742691564713458


In [44]:
# Same experiment on the full (non-reduced) encoder features.
train_data = Pool(data=ml_train, label=train_label)
valid_data = Pool(data=ml_valid, label=dev_label)

# Mirror the CUDA check used for the encoder: train on the GPU when one is available,
# otherwise use CatBoost's CPU trainer instead of failing on a CPU-only host.
if torch.cuda.is_available():
    catboost_device_params = {"task_type": "GPU", "devices": 'cuda:0'}
else:
    catboost_device_params = {"task_type": "CPU"}

model_cat = CatBoostClassifier(**catboost_device_params)
# NOTE: the dev pool drives early stopping and best-iteration selection, so the
# validation metrics printed below are optimistic.
model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)

y_pred_valid = model_cat.predict(ml_valid)
y_true_valid = dev_label

print(f'Accuracy: {accuracy_score(y_true_valid, y_pred_valid)}, f1_score: {f1_score(y_true_valid, y_pred_valid, average="micro")}')

Learning rate set to 0.149448




0:	learn: 1.7300992	test: 1.7983392	best: 1.7983392 (0)	total: 41.1ms	remaining: 41s
100:	learn: 0.3280669	test: 0.8607522	best: 0.8591111 (98)	total: 3.65s	remaining: 32.5s
200:	learn: 0.2734966	test: 0.8402805	best: 0.8386717 (194)	total: 7.28s	remaining: 28.9s
300:	learn: 0.2353541	test: 0.8417031	best: 0.8375543 (261)	total: 10.8s	remaining: 25s
bestTest = 0.8375542674
bestIteration = 261
Shrink model to first 262 iterations.
Accuracy: 0.7451384417256922, f1_score: 0.7451384417256922


# Submission

In [99]:
from inference import num_to_label, load_test_dataset

In [77]:
# Load and tokenize the test CSV through the project's inference helper. It returns three
# values, used below as the example ids, the tokenized inputs and the labels
# (presumably placeholders for the unlabeled test set — verify in inference.py).
test_dataset_dir = "../dataset/test/test_data.csv"
test_id, test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
# Wrap in the same RE_Dataset type as the train/dev splits so it can be batched the same way.
Re_test_dataset = RE_Dataset(test_dataset ,test_label)

In [82]:
# shuffle=False keeps the extracted rows in the same order as test_id.
test_dataloader = DataLoader(Re_test_dataset, batch_size=16, shuffle=False)

# Pooled encoder features for the test split.
ml_test = FeatureExtraction(features, test_dataloader)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=486.0), HTML(value='')))




In [88]:
# Score the test features once and derive both the predicted labels and the
# per-class probability lists from the same matrix.
test_probs = model_cat.predict_proba(ml_test)
output_pred = num_to_label(np.argmax(test_probs, axis=-1))
output_prob = test_probs.tolist()

In [89]:
# Submission table: one row per test example with its id, predicted label and class probabilities.
output = pd.DataFrame({'id':test_id,'pred_label':output_pred,'probs':output_prob,})

In [91]:
# to_csv does not create missing directories, so make sure the output folder exists first.
os.makedirs('./prediction', exist_ok=True)
output.to_csv('./prediction/bert_base_catboost.csv', index=False)

In [111]:
# Project the test features with the PCA fitted on the training set (transform only, no refit).
ml_test_reduced = pca.transform(ml_test)

In [112]:
# Score the PCA-reduced test features once and reuse the probability matrix for both
# the predicted labels and the per-class probability lists.
test_probs_reduced = model_cat_reduced.predict_proba(ml_test_reduced)
output_pred = num_to_label(np.argmax(test_probs_reduced, axis=-1))
output_prob = test_probs_reduced.tolist()

In [113]:
# Submission table for the PCA-feature model: id, predicted label and class probabilities.
output = pd.DataFrame({'id':test_id,'pred_label':output_pred,'probs':output_prob,})
# to_csv does not create missing directories, so make sure the output folder exists first.
os.makedirs('./prediction', exist_ok=True)
output.to_csv('./prediction/bert_base_catboost_pca.csv', index=False)