In [17]:
import os
import datetime

import numpy as np
import pandas as pd
import wandb
import yaml
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from utils.tokenizer import get_tokenizer
from data_loader.data_loaders import TextDataLoader
from utils.util import set_seed
from model.model import STSModel
from utils.clean import clean_texts

# Configuration / Random Seeding

In [18]:
## parameters
config = {
    'BATCH_SIZE': 32,
    'MAX_LEN': 128,
    'LEARNING_RATE': 0.0007,
    'EPOCHS': 20,
    'MODEL_NAME': 'FacebookAI/xlm-roberta-large',
    'LORA_RANK': 16,
    'MODULE_NAMES': ['query', 'key', 'value'],
    'SEED': 12345
}

## seed setting
set_seed(config["SEED"])

# Data Reading

In [19]:
data_dir = './data'
train_dir = os.path.join(data_dir, 'train.csv')
dev_dir = os.path.join(data_dir, 'dev.csv')

train = pd.read_csv(train_dir, dtype={'label': np.float32})
dev = pd.read_csv(dev_dir, dtype={'label': np.float32})

train.head()

Unnamed: 0,id,source,sentence_1,sentence_2,label,binary-label
0,boostcamp-sts-v1-train-000,nsmc-sampled,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~,"반전도 있고,사랑도 있고재미도있네요.",2.2,0.0
1,boostcamp-sts-v1-train-001,slack-rtt,앗 제가 접근권한이 없다고 뜹니다;;,"오, 액세스 권한이 없다고 합니다.",4.2,1.0
2,boostcamp-sts-v1-train-002,petition-sampled,주택청약조건 변경해주세요.,주택청약 무주택기준 변경해주세요.,2.4,0.0
3,boostcamp-sts-v1-train-003,slack-sampled,입사후 처음 대면으로 만나 반가웠습니다.,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다.,3.0,1.0
4,boostcamp-sts-v1-train-004,slack-sampled,뿌듯뿌듯 하네요!!,꼬옥 실제로 한번 뵈어요 뿌뿌뿌~!~!,0.0,0.0


# Preprocessing

In [14]:
print('***Train dataset information***')
print(train.info())
print()
print('***Text length summary***')
print(train['sentence_1'].apply(lambda x: len(x)).describe())

***Train dataset information***
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9324 entries, 0 to 9323
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            9324 non-null   object 
 1   source        9324 non-null   object 
 2   sentence_1    9324 non-null   object 
 3   sentence_2    9324 non-null   object 
 4   label         9324 non-null   float32
 5   binary-label  9324 non-null   float64
dtypes: float32(1), float64(1), object(4)
memory usage: 400.8+ KB
None

***Text length summary***
count    9324.000000
mean       23.258151
std        14.395189
min         9.000000
25%        14.000000
50%        19.000000
75%        28.000000
max        98.000000
Name: sentence_1, dtype: float64


In [20]:
train['sentence_1'] = clean_texts(train['sentence_1'])
train['sentence_2'] = clean_texts(train['sentence_2'])
# train = preprocessing(train)

dev['sentence_1'] = clean_texts(dev['sentence_1'])
dev['sentence_2'] = clean_texts(dev['sentence_2'])
# dev = preprocessing(dev)

train.head()

Unnamed: 0,id,source,sentence_1,sentence_2,label,binary-label
0,boostcamp-sts-v1-train-000,nsmc-sampled,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요,반전도 있고사랑도 있고재미도있네요 .,2.2,0.0
1,boostcamp-sts-v1-train-001,slack-rtt,앗 제가 접근권한이 없다고 뜹니다,오 액세스 권한이 없다고 합니다 .,4.2,1.0
2,boostcamp-sts-v1-train-002,petition-sampled,주택청약조건 변경해주세요 .,주택청약 무주택기준 변경해주세요 .,2.4,0.0
3,boostcamp-sts-v1-train-003,slack-sampled,입사후 처음 대면으로 만나 반가웠습니다 .,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다 .,3.0,1.0
4,boostcamp-sts-v1-train-004,slack-sampled,뿌듯뿌듯 하네요 ! !,꼬옥 실제로 한번 뵈어요 뿌뿌뿌 ! !,0.0,0.0


## Data Augmentataion

### Random Deletion

In [21]:
from konlpy.tag import Mecab
import random

def random_deletion(text, p=0.2):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    target_tags = ['IC', 'J', 'E', 'XP', 'XS']

    if len(tokens) == 1:
        return text
    
    remaining = [(token, pos) for token, pos in tokens_with_pos if random.random() > p or not any([pos.startswith(tag) for tag in target_tags])]
    
    if len(remaining) == 0:
        return random.choice(tokens)
    
    result = []
    for token, pos in remaining:
        if pos.startswith('J') or pos.startswith('E'):
            if len(result) != 0:
                result[-1] += token
            else:
                result.append(token)
        else:
            result.append(token)
    
    return ' '.join(result)

train_deleted = []
for idx, row in train.iterrows():
    deleted_text = random_deletion(row['sentence_1'])
    diff = 0.2 * (len(row['sentence_1'])-len(deleted_text))
    row['sentence_1'] = deleted_text
    row['label'] = max(row['label']-diff, 0)
    train_deleted.append(row)

train_deleted = pd.DataFrame(train_deleted)
train = pd.concat([train, train_deleted])
train.reset_index(drop=True, inplace=True)

train.to_csv('data/train_augmented.csv', index=False)

### Augmentation With Masked LM

In [114]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import random

model_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForMaskedLM.from_pretrained(model_name)

def augment_text(text, num_augmentations=1, mask_prob=0.4):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    target_tags = ['N', 'V']

    remaining = [(tokenizer.mask_token, pos) if random.random() > mask_prob and any([pos.startswith(tag) for tag in target_tags]) else (token, pos) for token, pos in tokens_with_pos]

    result = []
    for token, pos in remaining:
        if pos.startswith('J') or pos.startswith('E'):
            if len(result) != 0:
                result[-1] += token
            else:
                result.append(token)
        else:
            result.append(token)
    masked_text = ' '.join(result)

    masked_tokens = tokenizer.tokenize(masked_text)
    augmented_texts = []

    for _ in range(num_augmentations):
        inputs = tokenizer.encode(masked_tokens, return_tensors="pt")
        with torch.no_grad():
            outputs = model(inputs)
        
        predictions = outputs.logits.argmax(dim=-1)
        
        for i, token in enumerate(masked_tokens):
            if token == tokenizer.mask_token:
                masked_tokens[i] = tokenizer.convert_ids_to_tokens(predictions[0, i].item())
        
        augmented_text = tokenizer.convert_tokens_to_string(masked_tokens)
        augmented_texts.append(augmented_text)
    
    return augmented_texts

def augment_dataframe(df, col, num_augmentations=1):
    augmented_data = []
    
    for _, row in df.iterrows():
        print(f"Index: {_+1}/{len(df)}....({100*(_+1)/len(df):.2f}% done)")
        original_text = row[col]
        augmented_texts = augment_text(original_text, num_augmentations)
        
        for aug_text in augmented_texts:
            new_row = row.copy()
            new_row[col] = aug_text
            augmented_data.append(new_row)
    
    augmented_df = pd.DataFrame(augmented_data)
    return pd.concat([df, augmented_df], ignore_index=True)

train = augment_dataframe(train, 'sentence_1')
train.to_csv('data/train_augmented.csv', index=False)

Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['generator_lm_head.bias', 'generator_predictions.LayerNorm.bias', 'generator_predictions.LayerNorm.weight', 'generator_predictions.dense.bias', 'generator_predictions.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Index: 1/100....(1.00% done)


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

# Modeling

In [6]:
model_name = config['MODEL_NAME']
process = 'raw_text'

wandb.login(key='dfae2bfa701b636a0c1d84ddd928a19f5e17c2f5')
wandb.init(project="Level1_STS", name=f'{model_name}_{process}', config = config)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mkangjun205[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /data/ephemeral/home/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
tokenizer = get_tokenizer(config['MODEL_NAME'])
dataloader = TextDataLoader(
    tokenizer=tokenizer,
    max_len=config['MAX_LEN'],
    train_data=train,
    dev_data=dev,
    truncation=True,
    batch_size=config['BATCH_SIZE']
)
model = STSModel(config)

early_stop_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min'
)

checkpoint_callback = ModelCheckpoint(
    dirpath='saved',
    filename='{epoch:02d}_{val_pearson_corr:.4f}',
    save_top_k=3,
    monitor='val_pearson_corr',
    mode='min'
)

model_name = config['MODEL_NAME']
run_name = f'{model_name}_{process}'
wandb_logger = WandbLogger(name = run_name, project="Level1-STS")

trainer = Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=config['EPOCHS'],
    val_check_interval=1.0,
    callbacks=[early_stop_callback, checkpoint_callback],
    logger = wandb_logger
    )

trainer.fit(model, datamodule=dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1)` was configured so validation will run after every batch.
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /data/ephemeral/level1-semantictextsimilarity-nlp-16/saved exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params | Mode 
----------------------------------------------
0 | mod     | PeftModel | 562 M  | train
1 | dense   | Linear    | 1.0 K  | train
2 | sigmoid | Sigmoid   | 0      | train
3 | loss    | MSELoss   | 0      | train
---------------

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0:   1%|▏         | 4/292 [00:14<16:50,  0.29it/s, v_num=4xr9]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

# Evaluating

In [17]:
wandb.login(key='dfae2bfa701b636a0c1d84ddd928a19f5e17c2f5')

api = wandb.Api()
run = api.run("kangjun205/Level1_STS/dlyeghmc")

model_file = run.file("model.pth").download()
config = run.config



CommError: Permission denied to access kangjun205/Level1_STS/g3tra6sp

In [4]:
checkpoint_path = "saved/best-model-20175906-v2.ckpt"
model = STSModel.load_from_checkpoint(checkpoint_path)

In [5]:
test = pd.read_csv('data/test.csv')

tokenizer = get_tokenizer(config['MODEL_NAME'])
dataloader = TextDataLoader(
    tokenizer=tokenizer,
    max_len=config['MAX_LEN'],
    test_data=test,
    truncation=True
)
    
trainer = Trainer(
    accelerator="gpu",
    devices=1
)

preds = trainer.predict(model, dataloader)
all_pred = [val for pred in preds for val in pred]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 35/35 [00:08<00:00,  4.06it/s]


In [6]:
submission = pd.read_csv('data/sample_submission.csv')
submission['target'] = all_pred
display(submission.head())

submission.to_csv('data/submission.csv', index=False)

Unnamed: 0,id,target
0,boostcamp-sts-v1-test-000,3.148151
1,boostcamp-sts-v1-test-001,4.090247
2,boostcamp-sts-v1-test-002,1.774596
3,boostcamp-sts-v1-test-003,0.192109
4,boostcamp-sts-v1-test-004,3.875429


In [9]:
datetime.datetime.now().strftime('%d%H%M%S')

'21124716'