In [None]:
!pip install transformers



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
from tqdm.auto import tqdm
import pandas as pd
import re
import matplotlib.pyplot as plt

In [None]:
# Colab-only: mount Google Drive at /content/drive so the CSV read from
# /content/drive/MyDrive/... in the data-loading cell is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# The CSV is read with header=None because the column names are supplied here
# by hand; without it pandas would consume the first tweet as a header row and
# silently drop it from the data.
data = pd.read_csv(
    '/content/drive/MyDrive/training.1600000.processed.noemoticon.csv',
    encoding="latin-1",
    engine='python',
    header=None,
    names=["label", "time", "date", "query", "username", "text"],
)

# Keep only the model inputs; .copy() makes this an independent frame so the
# label rewrite below cannot hit pandas' chained-assignment pitfall.
data = data[['text', 'label']].copy()

# Remap the label value 4 to 1 so the classes used below are {0, 1}.
data.loc[data['label'] == 4, 'label'] = 1

# Balanced subset: the first 50,000 rows of each class, label-1 rows first.
data_pos = data[data['label'] == 1].iloc[:50000]
data_neg = data[data['label'] == 0].iloc[:50000]
data = pd.concat([data_pos, data_neg])

def cleaning_URLs(data):
    """Replace every URL-like token in ``data`` with a single space.

    A token counts as a URL when it starts with ``www.`` or with
    ``http://`` / ``https://``; it extends up to the next whitespace character.

    Args:
        data: The input string.

    Returns:
        The string with each matched token replaced by ``' '``.
    """
    # Raw string: `\.` and `\s` are regex escapes, not Python string escapes.
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', data)
def cleaning_repeating_char(text):
    """Collapse each run of one repeated character to a single occurrence.

    The rule applies to any character except a newline, so ordinary double
    letters are shortened too (``"good"`` becomes ``"god"``).
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)
def cleaning_numbers(data):
    """Delete every run of ASCII digits from ``data`` (nothing is inserted in its place)."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', data)
def removing_html_tags(text):
    """Remove every ``<...>`` span from ``text``.

    The match is non-greedy, so each span ends at the first ``>`` that follows
    its opening ``<``; text between tags is kept.
    """
    return re.sub('<.*?>', r'', text)
def reduce_repeated_letters(text):
    """Shorten any run of three or more identical ASCII letters to exactly two."""
    # Regex: a letter followed by at least two more copies of itself is
    # rewritten as that letter doubled; non-letter runs are left alone.
    return re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', text)
def removing_email(data):
    """Replace each ``@`` plus the non-whitespace characters after it with one space.

    Despite the name, this targets anything that begins with ``@`` (such as
    ``@user`` handles); for ``name@host`` only the part from ``@`` onward is
    replaced.
    """
    at_token = re.compile(r'@[^\s]+')
    return at_token.sub(' ', data)
def removing_special_characters(text):
    """Replace each of ``~!@#$%^&*()-=+_`` in ``text`` with a single space.

    Args:
        text: The input string.

    Returns:
        The string with every listed symbol replaced by ``' '``; all other
        characters (including digits and ``. , / : ;``) are left untouched.
    """
    # The hyphen sits last in the class so it is a literal '-'. Written as
    # `)-=` it formed a character range that also matched digits and
    # punctuation such as '.', ',', '/', ':' and ';'.
    special_characters = r'[~!@#$%^&*()=+_-]'
    return re.sub(special_characters, ' ', text)
def removing_punctuation(text):
    """Turn ``. , ! ? ; - = :`` into spaces, then squeeze whitespace.

    The result has single spaces between words and no leading or trailing
    whitespace.
    """
    spaced = re.sub("[.,!?;\\-=:]", " ", text)
    tokens = spaced.split()
    return ' '.join(tokens)
def removing_etc(text):
    """Replace every non-ASCII-letter character inside each word with a space.

    ``text`` is split on whitespace, each word is cleaned separately, and the
    cleaned words are joined back with single spaces. Spaces introduced inside
    a word are kept, so the result may contain runs of several spaces.
    """
    cleaned_words = []
    for word in text.split():
        cleaned_words.append(re.sub(r"[^a-zA-Z]", " ", word))
    return ' '.join(cleaned_words)

def clean_text(text):
    """Run the full tweet-cleaning pipeline on one string.

    Structure-aware cleaners run first, while the markers they look for
    ('<', '>', '://', '.', '@', '#', digits) are still present. The blanket
    "letters only" pass runs last. Running it first, as before, stripped those
    markers and left the later cleaners with nothing to match.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    text = removing_html_tags(text)
    text = cleaning_URLs(text)
    text = removing_email(text)  # drops every token that starts with '@'
    text = re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", text)  # drops #hashtags
    text = cleaning_numbers(text)
    text = removing_punctuation(text)
    text = removing_special_characters(text)
    # Cap elongated words ("sooo" -> "soo"). cleaning_repeating_char is
    # deliberately NOT applied afterwards: it would collapse every double letter
    # ("good" -> "god") and make this step a no-op.
    text = reduce_repeated_letters(text)
    text = removing_etc(text)
    # Normalise whitespace left behind by the substitutions above.
    return ' '.join(text.split())


# One pass over the column instead of ten separate .apply calls.
data['text'] = data['text'].apply(clean_text)

In [None]:
# Model inputs and targets as two aligned columns of the cleaned frame.
texts, labels = data['text'], data['label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical across runs.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

In [None]:
!pip3 install emoji==0.6.0

Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.6.0-py3-none-any.whl size=49721 sha256=3097adb9020cdec8161fd89a1ebb9023168d5ad4a7a57c735a53767f30c3579d
  Stored in directory: /root/.cache/pip/wheels/1b/bd/d9/310c33c45a553798a714e27e3b8395d37128425442b8c78e07
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0


In [None]:
checkpoint = "finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The dataset prepared above is binary (labels 0 and 1), so request a 2-way
# classification head explicitly instead of inheriting the label set of the
# checkpoint. NOTE(review): this checkpoint is presumably a 3-class
# (NEG/NEU/POS) sentiment model - confirm in its config.json. If so,
# ignore_mismatched_sizes=True lets the encoder weights load while the
# classifier layer is re-initialised at the new size; if the head is already
# 2-way, the flag has no effect.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

tokenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

In [None]:
def preprocess_dataset(texts, labels):
    """Tokenize ``texts`` with the notebook-level ``tokenizer`` and attach ``labels``.

    Args:
        texts: Object exposing ``.tolist()`` that yields the raw strings
            (presumably a pandas Series - confirm against the caller).
        labels: Object exposing ``.values`` with one label per text
            (presumably a pandas Series - confirm against the caller).

    Returns:
        The tokenizer output, requested as PyTorch tensors with truncation at
        128 tokens and padding enabled, plus a ``'labels'`` tensor built from
        ``labels.values``.
    """
    encoded = tokenizer(
        texts.tolist(),
        return_tensors='pt',
        max_length=128,
        padding=True,
        truncation=True,
    )
    encoded['labels'] = torch.tensor(labels.values)
    return encoded

class CustomDataset(Dataset):
    """Dataset view over a mapping of equally indexed columns.

    ``data`` must support ``data[key][index]`` and ``data.keys()``, and must
    contain a ``'labels'`` entry whose length defines the dataset size.
    """

    def __init__(self, data):
        # Stored as-is; items are looked up lazily in __getitem__.
        self.data = data

    def __len__(self):
        label_column = self.data['labels']
        return len(label_column)

    def __getitem__(self, index):
        # Build one sample by taking the same position from every column.
        sample = {}
        for key in self.data.keys():
            sample[key] = self.data[key][index]
        return sample

In [None]:
# Tokenize each split once and wrap it so DataLoader can index individual samples.
train_data = preprocess_dataset(train_texts, train_labels)
valid_data = preprocess_dataset(valid_texts, valid_labels)

train_dataset = CustomDataset(train_data)
valid_dataset = CustomDataset(valid_data)

# Single place to tune the batch size used by both loaders.
BATCH_SIZE = 8

# Built once (the collator and loaders used to be constructed twice by a
# copy-pasted block). Only the training loader shuffles; validation order stays fixed.
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator)


In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values the transformers implementation used by
# default, because torch's own defaults differ (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3
# One scheduler step per optimizer step, so the total is epochs x batches per epoch.
num_training_steps = num_epochs * len(train_dataloader)
# Linear decay of the learning rate to zero over the whole run, with no warm-up.
lr_scheduler = get_scheduler('linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



In [None]:
# Print the GPU model, driver/CUDA version and current memory use of this runtime.
!nvidia-smi

Thu Nov 16 15:06:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Per-batch loss history across ALL epochs (kept for later inspection/plotting).
train_losses = []
valid_losses = []

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    # ---- training ----
    # Re-enable training mode every epoch: the validation pass below switches the
    # model to eval mode, which would otherwise leave dropout disabled from
    # epoch 2 onward.
    model.train()
    epoch_train_losses = []
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        epoch_train_losses.append(loss.item())

    train_losses.extend(epoch_train_losses)
    # Average over this epoch's batches only, not over the cumulative history.
    avg_train_loss = sum(epoch_train_losses) / len(epoch_train_losses)
    print(f"Epoch {epoch + 1}, Average training loss: {avg_train_loss}")

    # ---- validation ----
    model.eval()
    epoch_valid_losses = []
    all_labels = []
    all_predictions = []

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in valid_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            # Predicted class = index of the largest logit.
            predictions = torch.argmax(outputs.logits, dim=-1)

            epoch_valid_losses.append(outputs.loss.item())
            all_labels.extend(batch['labels'].cpu().tolist())
            all_predictions.extend(predictions.cpu().tolist())

    valid_losses.extend(epoch_valid_losses)
    avg_valid_loss = sum(epoch_valid_losses) / len(epoch_valid_losses)
    print(f"Epoch {epoch + 1}, Average val loss: {avg_valid_loss}")

    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions, average='weighted')

    print("Epoch {}, Accuracy: {:.4f}, F1_score: {:.4f}".format(epoch + 1, accuracy, f1))


  0%|          | 0/30000 [00:00<?, ?it/s]

Epoch 1, Average training loss: 0.40984469325132666
Epoch 1, Average val loss: 0.38331529450565577
Epoch 1, Accuracy: 0.8269, F1_score: 0.8257
Epoch 2, Average training loss: 0.34863372413362376
Epoch 2, Average val loss: 0.3769217461865395
Epoch 2, Accuracy: 0.8418, F1_score: 0.8412
Epoch 3, Average training loss: 0.276057517743716
Epoch 3, Average val loss: 0.3979697866620496
Epoch 3, Accuracy: 0.8475, F1_score: 0.8474
