In [1]:
import datetime

import numpy as np
import pandas as pd
import wandb
import yaml
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from utils.tokenizer import get_tokenizer
from data_loader.data_loaders import TextDataLoader
from utils.util import set_seed
from model.model import STSModel
from utils.clean import clean_texts

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
set_seed(1234)

# Data Reading

In [3]:
train = pd.read_csv('data/train.csv')
dev = pd.read_csv('data/dev.csv')
test = pd.read_csv('data/test.csv')

train.drop(columns=['id', 'source', 'binary-label'], inplace = True)
dev.drop(columns=['id', 'source', 'binary-label'], inplace = True)

train.head()

Unnamed: 0,sentence_1,sentence_2,label
0,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~,"반전도 있고,사랑도 있고재미도있네요.",2.2
1,앗 제가 접근권한이 없다고 뜹니다;;,"오, 액세스 권한이 없다고 합니다.",4.2
2,주택청약조건 변경해주세요.,주택청약 무주택기준 변경해주세요.,2.4
3,입사후 처음 대면으로 만나 반가웠습니다.,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다.,3.0
4,뿌듯뿌듯 하네요!!,꼬옥 실제로 한번 뵈어요 뿌뿌뿌~!~!,0.0


In [4]:
train[['label']] = train[['label']].astype('float32')
dev[['label']] = dev[['label']].astype('float32')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9324 entries, 0 to 9323
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sentence_1  9324 non-null   object 
 1   sentence_2  9324 non-null   object 
 2   label       9324 non-null   float64
dtypes: float64(1), object(2)
memory usage: 218.7+ KB


In [4]:
train['sentence_1'].apply(lambda x: len(x)).describe()

count    9324.000000
mean       23.258151
std        14.395189
min         9.000000
25%        14.000000
50%        19.000000
75%        28.000000
max        98.000000
Name: sentence_1, dtype: float64

In [9]:
train[['label']] = train[['label']].astype('float32')
train['sentence_1'] = clean_texts(train['sentence_1'])
train['sentence_2'] = clean_texts(train['sentence_2'])
# train = preprocessing(train)

dev[['label']] = dev[['label']].astype('float32')
dev['sentence_1'] = clean_texts(dev['sentence_1'])
dev['sentence_2'] = clean_texts(dev['sentence_2'])
# dev = preprocessing(dev)

train.head()

Unnamed: 0,sentence_1,sentence_2,label
0,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요,반전도 있고사랑도 있고재미도있네요,2.2
1,앗 제가 접근권한이 없다고 뜹니다,오 액세스 권한이 없다고 합니다,4.2
2,주택청약조건 변경해주세요,주택청약 무주택기준 변경해주세요,2.4
3,입사후 처음 대면으로 만나 반가웠습니다,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다,3.0
4,뿌듯뿌듯 하네요,꼬옥 실제로 한번 뵈어요 뿌뿌뿌,0.0


# Data Augmentataion

## RS, RD, RI

In [5]:
from konlpy.tag import Mecab
import random

def random_deletion(text, p=0.2):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    target_tags = ['IC', 'J', 'E', 'XP', 'XS']

    if len(tokens) == 1:
        return text
    
    remaining = [(token, pos) for token, pos in tokens_with_pos if random.random() > p and any([pos.startswith(tag) for tag in target_tags])]
    
    if len(remaining) == 0:
        return random.choice(tokens)
    
    result = []
    for token, pos in remaining:
        if pos.startswith('J') or pos.startswith('E'):
            if len(result) != 0:
                result[-1] += token
            else:
                result.append(token)
        else:
            result.append(token)
    
    return ' '.join(result)

def random_swap(text, n=1):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    for _ in range(n):
        if len(tokens) >= 2:
            idx1, idx2 = random.sample(range(len(tokens)), 2)
            tokens[idx1], tokens[idx2] = tokens[idx2], tokens[idx1]
    return ' '.join(tokens)

def random_insertion(text, n=1):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    for _ in range(n):
        insert_pos = random.randint(0, len(tokens))
        insert_word = random.choice(tokens)
        tokens.insert(insert_pos, insert_word)
    return ' '.join(tokens)

# train_swapped = train.copy(deep=True)
# train_swapped['sentence_1'] = train_swapped['sentence_1'].apply(lambda x : random_swap(x))
train_deleted = train.copy(deep=True)
train_deleted['sentence_1'] = train_deleted['sentence_1'].apply(lambda x : random_deletion(x))
# train_inserted = train.copy(deep=True)
# train_inserted['sentence_1'] = train_inserted['sentence_1'].apply(lambda x : random_insertion(x))

# train = pd.concat([train, train_swapped, train_deleted, train_inserted])
train = pd.concat([train, train_deleted])
train.reset_index(drop=True, inplace=True)

train.to_csv('data/train_augmented.csv', index=False)

## Augmentation With Language Model

In [95]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "skt/kogpt2-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

def augment_text(text, num_augmentations=1, max_length=20):
    augmented_texts = []
    input_ids = tokenizer.encode(text, return_tensors='pt')
    
    for _ in range(num_augmentations):
        output = model.generate(
            input_ids[:,:10],
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_p=0.5
        )
        
        augmented_text = tokenizer.decode(output[0], skip_special_tokens=True)
        augmented_texts.append(augmented_text)
    
    return augmented_texts

def augment_dataframe(df, col, num_augmentations=1):
    augmented_data = []
    
    for _, row in df.iterrows():
        print(f"Index: {_+1}/{len(df)}....({100*(_+1)/len(df):.2f}% done)")
        original_text = row[col]
        augmented_texts = augment_text(original_text, num_augmentations)
        
        for aug_text in augmented_texts:
            new_row = row.copy()
            new_row[col] = aug_text
            augmented_data.append(new_row)
    
    augmented_df = pd.DataFrame(augmented_data)
    return pd.concat([df, augmented_df], ignore_index=True)

train = augment_dataframe(train, 'sentence_1')
train.tail()

train.to_csv('data/train_augmented.csv', index=False)



Index: 1/100....(1.00% done)
Index: 2/100....(2.00% done)
Index: 3/100....(3.00% done)
Index: 4/100....(4.00% done)
Index: 5/100....(5.00% done)
Index: 6/100....(6.00% done)
Index: 7/100....(7.00% done)
Index: 8/100....(8.00% done)
Index: 9/100....(9.00% done)
Index: 10/100....(10.00% done)
Index: 11/100....(11.00% done)
Index: 12/100....(12.00% done)
Index: 13/100....(13.00% done)
Index: 14/100....(14.00% done)
Index: 15/100....(15.00% done)
Index: 16/100....(16.00% done)
Index: 17/100....(17.00% done)
Index: 18/100....(18.00% done)
Index: 19/100....(19.00% done)
Index: 20/100....(20.00% done)
Index: 21/100....(21.00% done)
Index: 22/100....(22.00% done)
Index: 23/100....(23.00% done)
Index: 24/100....(24.00% done)
Index: 25/100....(25.00% done)
Index: 26/100....(26.00% done)
Index: 27/100....(27.00% done)
Index: 28/100....(28.00% done)
Index: 29/100....(29.00% done)
Index: 30/100....(30.00% done)
Index: 31/100....(31.00% done)
Index: 32/100....(32.00% done)
Index: 33/100....(33.00% d

In [103]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "gogamza/kobart-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def augment_text(text, num_augmentations=1):
    inputs = tokenizer(text, return_tensors="pt", max_length=50, truncation=True)
    
    augmented_texts = []
    for _ in range(num_augmentations):
        outputs = model.generate(
            inputs.input_ids,
            max_length=20,
            num_return_sequences=1,
            do_sample=True,
            top_p=0.5
        )
        
        augmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        augmented_texts.append(augmented_text)
    
    return augmented_texts

def augment_dataframe(df, col, num_augmentations=1):
    augmented_data = []
    
    for _, row in df.iterrows():
        print(f"Index: {_+1}/{len(df)}....({100*(_+1)/len(df):.2f}% done)")
        original_text = row[col]
        augmented_texts = augment_text(original_text, num_augmentations)
        
        for aug_text in augmented_texts:
            new_row = row.copy()
            new_row[col] = aug_text
            augmented_data.append(new_row)
    
    augmented_df = pd.DataFrame(augmented_data)
    return pd.concat([df, augmented_df], ignore_index=True)

train = augment_dataframe(train, 'sentence_1')
train.to_csv('data/train_augmented.csv', index=False)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Index: 1/10....(10.00% done)
Index: 2/10....(20.00% done)
Index: 3/10....(30.00% done)
Index: 4/10....(40.00% done)
Index: 5/10....(50.00% done)
Index: 6/10....(60.00% done)
Index: 7/10....(70.00% done)
Index: 8/10....(80.00% done)
Index: 9/10....(90.00% done)
Index: 10/10....(100.00% done)


In [114]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import random

model_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForMaskedLM.from_pretrained(model_name)

def augment_text(text, num_augmentations=1, mask_prob=0.15):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    target_tags = ['N', 'V']

    remaining = [(tokenizer.mask_token, pos) if random.random() > mask_prob and any([pos.startswith(tag) for tag in target_tags]) else (token, pos) for token, pos in tokens_with_pos]

    result = []
    for token, pos in remaining:
        if pos.startswith('J') or pos.startswith('E'):
            if len(result) != 0:
                result[-1] += token
            else:
                result.append(token)
        else:
            result.append(token)
    masked_text = ' '.join(result)

    masked_tokens = tokenizer.tokenize(masked_text)
    augmented_texts = []

    for _ in range(num_augmentations):
        inputs = tokenizer.encode(masked_tokens, return_tensors="pt")
        with torch.no_grad():
            outputs = model(inputs)
        
        predictions = outputs.logits.argmax(dim=-1)
        
        for i, token in enumerate(masked_tokens):
            if token == tokenizer.mask_token:
                masked_tokens[i] = tokenizer.convert_ids_to_tokens(predictions[0, i].item())
        
        augmented_text = tokenizer.convert_tokens_to_string(masked_tokens)
        augmented_texts.append(augmented_text)
    
    return augmented_texts

def augment_dataframe(df, col, num_augmentations=1):
    augmented_data = []
    
    for _, row in df.iterrows():
        print(f"Index: {_+1}/{len(df)}....({100*(_+1)/len(df):.2f}% done)")
        original_text = row[col]
        augmented_texts = augment_text(original_text, num_augmentations)
        
        for aug_text in augmented_texts:
            new_row = row.copy()
            new_row[col] = aug_text
            augmented_data.append(new_row)
    
    augmented_df = pd.DataFrame(augmented_data)
    return pd.concat([df, augmented_df], ignore_index=True)

train = augment_dataframe(train, 'sentence_1')
train.to_csv('data/train_augmented.csv', index=False)

Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['generator_lm_head.bias', 'generator_predictions.LayerNorm.bias', 'generator_predictions.LayerNorm.weight', 'generator_predictions.dense.bias', 'generator_predictions.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Index: 1/100....(1.00% done)


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

# Modeling

In [5]:
config = {
    'BATCH_SIZE': 32,
    'MAX_LEN': 128,
    'LEARNING_RATE': 0.0007,
    'EPOCHS': 20,
    'MODEL_NAME': 'FacebookAI/xlm-roberta-large',
    'LORA_RANK': 16,
    'MODULE_NAMES': ['query', 'key', 'value'],
    'SEED': 12345
}

In [6]:
wandb.login(key='dfae2bfa701b636a0c1d84ddd928a19f5e17c2f5')
wandb.init(project="Level1_STS", name='xlm-roberta-large_deletion_rawtext', config = config)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mkangjun205[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /data/ephemeral/home/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
now_min = datetime.datetime.now().strftime('%d%H%M')
now_sec = datetime.datetime.now().strftime('%d%H%M%S')

tokenizer = get_tokenizer(config['MODEL_NAME'])
dataloader = TextDataLoader(
    tokenizer=tokenizer,
    max_len=config['MAX_LEN'],
    train_data=train,
    dev_data=dev,
    truncation=True,
    batch_size=config['BATCH_SIZE']
)
model = STSModel(config)

early_stop_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min'
)

checkpoint_callback = ModelCheckpoint(
    dirpath='saved',
    filename='xlm-roberta-large_deletion_rawtext',
    save_top_k=3,
    monitor='val_pearson_corr',
    mode='min'
)

model_name = config['MODEL_NAME']
run_name = f'{model_name}-{now_min}'
wandb_logger = WandbLogger(name = run_name, project="Level1-STS")

trainer = Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=config['EPOCHS'],
    val_check_interval=1.0,
    callbacks=[early_stop_callback, checkpoint_callback],
    logger = wandb_logger
    )

trainer.fit(model, datamodule=dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1)` was configured so validation will run after every batch.
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /data/ephemeral/level1-semantictextsimilarity-nlp-16/saved exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params | Mode 
----------------------------------------------
0 | mod     | PeftModel | 562 M  | train
1 | dense   | Linear    | 1.0 K  | train
2 | sigmoid | Sigmoid   | 0      | train
3 | loss    | MSELoss   | 0      | train
---------------

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0:   1%|▏         | 4/292 [00:14<16:50,  0.29it/s, v_num=4xr9]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

# Evaluating

In [17]:
wandb.login(key='dfae2bfa701b636a0c1d84ddd928a19f5e17c2f5')

api = wandb.Api()
run = api.run("kangjun205/Level1_STS/dlyeghmc")

model_file = run.file("model.pth").download()
config = run.config



CommError: Permission denied to access kangjun205/Level1_STS/g3tra6sp

In [4]:
checkpoint_path = "saved/best-model-20175906-v2.ckpt"
model = STSModel.load_from_checkpoint(checkpoint_path)

In [5]:
test = pd.read_csv('data/test.csv')

tokenizer = get_tokenizer(config['MODEL_NAME'])
dataloader = TextDataLoader(
    tokenizer=tokenizer,
    max_len=config['MAX_LEN'],
    test_data=test,
    truncation=True
)
    
trainer = Trainer(
    accelerator="gpu",
    devices=1
)

preds = trainer.predict(model, dataloader)
all_pred = [val for pred in preds for val in pred]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 35/35 [00:08<00:00,  4.06it/s]


In [6]:
submission = pd.read_csv('data/sample_submission.csv')
submission['target'] = all_pred
display(submission.head())

submission.to_csv('data/submission.csv', index=False)

Unnamed: 0,id,target
0,boostcamp-sts-v1-test-000,3.148151
1,boostcamp-sts-v1-test-001,4.090247
2,boostcamp-sts-v1-test-002,1.774596
3,boostcamp-sts-v1-test-003,0.192109
4,boostcamp-sts-v1-test-004,3.875429


In [9]:
datetime.datetime.now().strftime('%d%H%M%S')

'21124716'