In [1]:
import datetime

import numpy as np
import pandas as pd
import wandb
import yaml
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from utils.tokenizer import get_tokenizer
from data_loader.data_loaders import TextDataLoader
from utils.util import set_seed
from model.model import STSModel
from utils.util import WandbCheckpointCallback
from utils.clean import clean_texts

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Reading

In [25]:
train = pd.read_csv('data/train.csv')
dev = pd.read_csv('data/dev.csv')

train.drop(columns=['id', 'source', 'binary-label'], inplace = True)
dev.drop(columns=['id', 'source', 'binary-label'], inplace = True)

train.head()

Unnamed: 0,sentence_1,sentence_2,label
0,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~,"반전도 있고,사랑도 있고재미도있네요.",2.2
1,앗 제가 접근권한이 없다고 뜹니다;;,"오, 액세스 권한이 없다고 합니다.",4.2
2,주택청약조건 변경해주세요.,주택청약 무주택기준 변경해주세요.,2.4
3,입사후 처음 대면으로 만나 반가웠습니다.,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다.,3.0
4,뿌듯뿌듯 하네요!!,꼬옥 실제로 한번 뵈어요 뿌뿌뿌~!~!,0.0


In [26]:
train[['label']] = train[['label']].astype('float32')
train['sentence_1'] = clean_texts(train['sentence_1'])
train['sentence_2'] = clean_texts(train['sentence_2'])

dev[['label']] = dev[['label']].astype('float32')
dev['sentence_1'] = clean_texts(dev['sentence_1'])
dev['sentence_2'] = clean_texts(dev['sentence_2'])

train.head()

Unnamed: 0,sentence_1,sentence_2,label
0,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요,반전도 있고사랑도 있고재미도있네요,2.2
1,앗 제가 접근권한이 없다고 뜹니다,오 액세스 권한이 없다고 합니다,4.2
2,주택청약조건 변경해주세요,주택청약 무주택기준 변경해주세요,2.4
3,입사후 처음 대면으로 만나 반가웠습니다,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다,3.0
4,뿌듯뿌듯 하네요,꼬옥 실제로 한번 뵈어요 뿌뿌뿌,0.0


# Data Augmentataion

## RS, RD, RI

In [21]:
from konlpy.tag import Mecab
import random

def random_deletion(text, p=0.1):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    if len(tokens) == 1:
        return text
    remaining = [t for t in tokens if random.random() > p]
    if len(remaining) == 0:
        return random.choice(tokens)
    return ' '.join(remaining)

def random_swap(text, n=1):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    for _ in range(n):
        if len(tokens) >= 2:
            idx1, idx2 = random.sample(range(len(tokens)), 2)
            tokens[idx1], tokens[idx2] = tokens[idx2], tokens[idx1]
    return ' '.join(tokens)

def random_insertion(text, n=1):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    for _ in range(n):
        insert_pos = random.randint(0, len(tokens))
        insert_word = random.choice(tokens)
        tokens.insert(insert_pos, insert_word)
    return ' '.join(tokens)

train_swapped = train.copy(deep=True)
train_swapped['sentence_1'] = train_swapped['sentence_1'].apply(lambda x : random_swap(x))
train_deleted = train.copy(deep=True)
train_deleted['sentence_1'] = train_deleted['sentence_1'].apply(lambda x : random_deletion(x))
train_inserted = train.copy(deep=True)
train_inserted['sentence_1'] = train_inserted['sentence_1'].apply(lambda x : random_insertion(x))

# train = pd.concat([train, train_swapped, train_deleted, train_inserted])
train = pd.concat([train, train_inserted])
train.reset_index(drop=True, inplace=True)

## Augmentation With Language Model

In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

model_name = "skt/kogpt2-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

def augment_text(text, num_augmentations=1, max_length=128):
    augmented_texts = []
    input_ids = tokenizer.encode(text, return_tensors='pt')
    
    for _ in range(num_augmentations):
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_p=0.8
        )
        
        augmented_text = tokenizer.decode(output[0], skip_special_tokens=True)
        augmented_texts.append(augmented_text)
    
    return augmented_texts

def augment_dataframe(df, col, num_augmentations=1):
    augmented_data = []
    
    for _, row in tqdm(df.iterrows()):
        original_text = row[col]
        augmented_texts = augment_text(original_text, num_augmentations)
        
        for aug_text in augmented_texts:
            new_row = row.copy()
            new_row[col] = aug_text
            augmented_data.append(new_row)
    
    augmented_df = pd.DataFrame(augmented_data)
    return pd.concat([df, augmented_df], ignore_index=True)

train = augment_dataframe(train, 'sentence_1')
train.tail()

train.to_csv('data/train_augmented.csv', index=False)



KeyboardInterrupt: 

# Modeling

In [22]:
config = {
    'BATCH_SIZE': 32,
    'MAX_LEN': 128,
    'LEARNING_RATE': 0.00001,
    'EPOCHS': 10,
    'MODEL_NAME': 'intfloat/multilingual-e5-small'
}

In [23]:
wandb.login(key='dfae2bfa701b636a0c1d84ddd928a19f5e17c2f5')
wandb.init(project="Level1_STS", name = 'Random_Insert_3', config = config)



0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇█████
train_loss,█▄▃▂▂▂▁▁▁▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇█
val_loss,█▇▄▃▄▁▂▂▁▁▂▃▂▄▄▄▃▄▄▄▃▄▄▆▄▅▅▄▃▄▄▃▃▄▄▃▄▃▄▂
val_pearson_corr,▁▇██████████████████████████████████████

0,1
epoch,9.0
train_loss,0.05482
trainer/global_step,5829.0
val_loss,0.7627
val_pearson_corr,0.81295


In [24]:
now_min = datetime.datetime.now().strftime('%d%H%M')
now_sec = datetime.datetime.now().strftime('%d%H%M%S')

tokenizer = get_tokenizer(config['MODEL_NAME'])
dataloader = TextDataLoader(
    tokenizer=tokenizer,
    max_len=config['MAX_LEN'],
    train_data=train,
    dev_data=dev,
    truncation=True,
    batch_size=config['BATCH_SIZE']
)
model = STSModel(config)

early_stop_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min'
)

checkpoint_callback = ModelCheckpoint(
    dirpath='saved',
    filename=f'best-model-{now_sec}',
    save_top_k=3,
    monitor='val_loss',
    mode='min'
)

wandb_checkpoint_callback = WandbCheckpointCallback(top_k=3)

model_name = config['MODEL_NAME']
run_name = f'{model_name}-{now_min}'
wandb_logger = WandbLogger(name = run_name, project="Level1-STS")

trainer = Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=config['EPOCHS'],
    val_check_interval=1,
    logger = wandb_logger
    )

trainer.fit(model, datamodule=dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1)` was configured so validation will run after every batch.
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params | Mode 
----------------------------------------------
0 | mod     | BertModel | 117 M  | eval 
1 | dense   | Linear    | 385    | train
2 | sigmoid | Sigmoid   | 0      | train
----------------------------------------------
117 M     Trainable params
0         Non-trainable params
117 M     Total params
470.617   Total estimated model params size (MB)
2         Modules in train mode
228       Modules in eval mode


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00, 39.92it/s]

/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 9:  46%|████▌     | 267/583 [02:53<03:25,  1.54it/s, v_num=9og8]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

# Evaluating