In [1]:
import datetime

import numpy as np
import pandas as pd
import wandb
import yaml
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from utils.tokenizer import get_tokenizer
from data_loader.data_loaders import TextDataLoader
from utils.util import set_seed
from model.model import STSModel
from utils.util import WandbCheckpointCallback
from utils.clean import clean_texts

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Reading

In [28]:
train = pd.read_csv('data/train.csv')
dev = pd.read_csv('data/dev.csv')
test = pd.read_csv('data/test.csv')

train.drop(columns=['id', 'source', 'binary-label'], inplace = True)
dev.drop(columns=['id', 'source', 'binary-label'], inplace = True)

train.head()

Unnamed: 0,sentence_1,sentence_2,label
0,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~,"반전도 있고,사랑도 있고재미도있네요.",2.2
1,앗 제가 접근권한이 없다고 뜹니다;;,"오, 액세스 권한이 없다고 합니다.",4.2
2,주택청약조건 변경해주세요.,주택청약 무주택기준 변경해주세요.,2.4
3,입사후 처음 대면으로 만나 반가웠습니다.,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다.,3.0
4,뿌듯뿌듯 하네요!!,꼬옥 실제로 한번 뵈어요 뿌뿌뿌~!~!,0.0


In [29]:
train[['label']] = train[['label']].astype('float32')
train['sentence_1'] = clean_texts(train['sentence_1'])
train['sentence_2'] = clean_texts(train['sentence_2'])

dev[['label']] = dev[['label']].astype('float32')
dev['sentence_1'] = clean_texts(dev['sentence_1'])
dev['sentence_2'] = clean_texts(dev['sentence_2'])

train.head()

Unnamed: 0,sentence_1,sentence_2,label
0,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요,반전도 있고사랑도 있고재미도있네요,2.2
1,앗 제가 접근권한이 없다고 뜹니다,오 액세스 권한이 없다고 합니다,4.2
2,주택청약조건 변경해주세요,주택청약 무주택기준 변경해주세요,2.4
3,입사후 처음 대면으로 만나 반가웠습니다,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다,3.0
4,뿌듯뿌듯 하네요,꼬옥 실제로 한번 뵈어요 뿌뿌뿌,0.0


# Data Augmentataion

## RS, RD, RI

In [30]:
from konlpy.tag import Mecab
import random

def random_deletion(text, p=0.1):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    if len(tokens) == 1:
        return text
    remaining = [t for t in tokens if random.random() > p]
    if len(remaining) == 0:
        return random.choice(tokens)
    return ' '.join(remaining)

def random_swap(text, n=1):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    for _ in range(n):
        if len(tokens) >= 2:
            idx1, idx2 = random.sample(range(len(tokens)), 2)
            tokens[idx1], tokens[idx2] = tokens[idx2], tokens[idx1]
    return ' '.join(tokens)

def random_insertion(text, n=1):
    mecab = Mecab()
    tokens_with_pos = mecab.pos(text)
    tokens = [token for token, pos in tokens_with_pos]
    for _ in range(n):
        insert_pos = random.randint(0, len(tokens))
        insert_word = random.choice(tokens)
        tokens.insert(insert_pos, insert_word)
    return ' '.join(tokens)

train_swapped = train.copy(deep=True)
train_swapped['sentence_1'] = train_swapped['sentence_1'].apply(lambda x : random_swap(x))
train_deleted = train.copy(deep=True)
train_deleted['sentence_1'] = train_deleted['sentence_1'].apply(lambda x : random_deletion(x))
train_inserted = train.copy(deep=True)
train_inserted['sentence_1'] = train_inserted['sentence_1'].apply(lambda x : random_insertion(x))

# train = pd.concat([train, train_swapped, train_deleted, train_inserted])
train = pd.concat([train, train_inserted])
train.reset_index(drop=True, inplace=True)

## Augmentation With Language Model

In [30]:
train = train.iloc[:1000]

In [31]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "skt/kogpt2-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

def augment_text(text, num_augmentations=1, max_length=50):
    augmented_texts = []
    input_ids = tokenizer.encode(text, return_tensors='pt')
    
    for _ in range(num_augmentations):
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_p=0.8
        )
        
        augmented_text = tokenizer.decode(output[0], skip_special_tokens=True)
        augmented_texts.append(augmented_text)
    
    return augmented_texts

def augment_dataframe(df, col, num_augmentations=1):
    augmented_data = []
    
    for _, row in df.iterrows():
        print(f"Index: {_+1}/{len(df)}....({100*(_+1)/len(df):.2f}% done)")
        original_text = row[col]
        augmented_texts = augment_text(original_text, num_augmentations)
        
        for aug_text in augmented_texts:
            new_row = row.copy()
            new_row[col] = aug_text
            augmented_data.append(new_row)
    
    augmented_df = pd.DataFrame(augmented_data)
    return pd.concat([df, augmented_df], ignore_index=True)

train = augment_dataframe(train, 'sentence_1')
train.tail()

train.to_csv('data/train_augmented.csv', index=False)



Index: 0/1000....(0.10% done)
Index: 1/1000....(0.20% done)
Index: 2/1000....(0.30% done)
Index: 3/1000....(0.40% done)
Index: 4/1000....(0.50% done)
Index: 5/1000....(0.60% done)
Index: 6/1000....(0.70% done)
Index: 7/1000....(0.80% done)
Index: 8/1000....(0.90% done)
Index: 9/1000....(1.00% done)
Index: 10/1000....(1.10% done)
Index: 11/1000....(1.20% done)
Index: 12/1000....(1.30% done)
Index: 13/1000....(1.40% done)
Index: 14/1000....(1.50% done)
Index: 15/1000....(1.60% done)
Index: 16/1000....(1.70% done)
Index: 17/1000....(1.80% done)
Index: 18/1000....(1.90% done)
Index: 19/1000....(2.00% done)
Index: 20/1000....(2.10% done)
Index: 21/1000....(2.20% done)
Index: 22/1000....(2.30% done)
Index: 23/1000....(2.40% done)
Index: 24/1000....(2.50% done)
Index: 25/1000....(2.60% done)
Index: 26/1000....(2.70% done)
Index: 27/1000....(2.80% done)
Index: 28/1000....(2.90% done)
Index: 29/1000....(3.00% done)
Index: 30/1000....(3.10% done)
Index: 31/1000....(3.20% done)
Index: 32/1000....

In [32]:
train.loc[len(train)-1, 'sentence_1']

'조여옥을 파면 징계해주세요!!!\n그리고 박근혜대통령님도 파면조치 해주세요.! 국민들이 청원하는거 다 공감하고 있습니다.\n이런일이 반복되고 있어 정말 가슴이 아프고 답답합니다.\n제가 청원합니다.\n'

# Modeling

In [17]:
config = {
    'BATCH_SIZE': 32,
    'MAX_LEN': 128,
    'LEARNING_RATE': 0.00001,
    'EPOCHS': 10,
    'MODEL_NAME': 'intfloat/multilingual-e5-small'
}

In [22]:
wandb.login(key='dfae2bfa701b636a0c1d84ddd928a19f5e17c2f5')
wandb.init(project="Level1_STS", name = 'LM_AUG', config = config)



In [23]:
now_min = datetime.datetime.now().strftime('%d%H%M')
now_sec = datetime.datetime.now().strftime('%d%H%M%S')

tokenizer = get_tokenizer(config['MODEL_NAME'])
dataloader = TextDataLoader(
    tokenizer=tokenizer,
    max_len=config['MAX_LEN'],
    train_data=train,
    dev_data=dev,
    truncation=True,
    batch_size=config['BATCH_SIZE']
)
model = STSModel(config)

early_stop_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min'
)

checkpoint_callback = ModelCheckpoint(
    dirpath='saved',
    filename=f'best-model-{now_sec}',
    save_top_k=3,
    monitor='val_loss',
    mode='min'
)

wandb_checkpoint_callback = WandbCheckpointCallback(top_k=3)

model_name = config['MODEL_NAME']
run_name = f'{model_name}-{now_min}'
wandb_logger = WandbLogger(name = run_name, project="Level1-STS")

trainer = Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=config['EPOCHS'],
    val_check_interval=1,
    callbacks=[early_stop_callback, checkpoint_callback],
    logger = wandb_logger
    )

trainer.fit(model, datamodule=dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1)` was configured so validation will run after every batch.
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /data/ephemeral/level1-semantictextsimilarity-nlp-16/saved exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params | Mode 
----------------------------------------------
0 | mod     | BertModel | 117 M  | eval 
1 | dense   | Linear    | 385    | train
2 | sigmoid | Sigmoid   | 0      | train
----------------------------------------------
117 M    

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 2/583 [00:00<03:50,  2.52it/s, v_num=1k1y]



Epoch 4: 100%|██████████| 583/583 [06:52<00:00,  1.41it/s, v_num=1k1y]


# Evaluating

In [7]:
test = pd.read_csv('data/test.csv')
dataloader = TextDataLoader(
    tokenizer=tokenizer,
    max_len=config['MAX_LEN'],
    test_data=test,
    truncation=True,
    batch_size=config['BATCH_SIZE']
)
    
trainer = Trainer(
    accelerator="gpu",
    devices=1
)

trainer.predict(model, dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/envs/STS/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

AttributeError: 'TestDataset' object has no attribute 'labels'