In [1]:
import re
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

import plotly.graph_objs as go
import plotly.colors as colors

2024-06-17 05:20:37.917785: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-17 05:20:37.917915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-17 05:20:38.086033: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras import layers
from keras.models import Model
from keras.layers import SimpleRNN, LSTM, Bidirectional, GRU
from keras.layers import Input, MultiHeadAttention, Attention, AdditiveAttention

from keras.layers import Embedding, Dense, Dropout, BatchNormalization
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

## Data load

In [4]:
fields = ['text', 'sentiment']
csv_train = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', 
                        encoding='ISO-8859-1',
#                       on_bad_lines='skip',
                        usecols=fields)

In [5]:
csv_train.tail(10)

Unnamed: 0,text,sentiment
27471,"i`m defying gravity. and nobody in alll of oz,...",neutral
27472,http://twitpic.com/663vr - Wanted to visit the...,negative
27473,in spoke to you yesterday and u didnt respond...,neutral
27474,So I get up early and I feel good about the da...,positive
27475,enjoy ur night,positive
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive
27480,All this flirting going on - The ATG smiles...,neutral


In [6]:
print(csv_train.shape)
print(csv_train.isnull().sum())
print(csv_train.info)

(27481, 2)
text         1
sentiment    0
dtype: int64
<bound method DataFrame.info of                                                     text sentiment
0                    I`d have responded, if I were going   neutral
1          Sooo SAD I will miss you here in San Diego!!!  negative
2                              my boss is bullying me...  negative
3                         what interview! leave me alone  negative
4       Sons of ****, why couldn`t they put them on t...  negative
...                                                  ...       ...
27476   wish we could come see u on Denver  husband l...  negative
27477   I`ve wondered about rake to.  The client has ...  negative
27478   Yay good for both of you. Enjoy the break - y...  positive
27479                         But it was worth it  ****.  positive
27480     All this flirting going on - The ATG smiles...   neutral

[27481 rows x 2 columns]>


In [7]:
csv_train.dropna(subset=["text"], inplace=True)
print(csv_train.isnull().sum())

text         0
sentiment    0
dtype: int64


In [8]:
def text_clean(text):
    pattern1 = r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+' # URL제거
    pattern2 = r'<[^>]*>'         # HTML 태그 제거
    pattern3 = r'[^\w\s]'         # 특수기호제거
    pattern = f'{pattern1}|{pattern2}|{pattern3}'
    text = re.sub(pattern, '', text)
    text = text.lower()
    return text

In [9]:
csv_train['text'] = csv_train['text'].apply(text_clean)

In [10]:
# 'sentiment' 값을 숫자로 매핑하는 함수
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
csv_train['sentiment'] = csv_train['sentiment'].map(sentiment_map)

In [11]:
csv_train.tail(5)

Unnamed: 0,text,sentiment
27476,wish we could come see u on denver husband l...,0
27477,ive wondered about rake to the client has ma...,0
27478,yay good for both of you enjoy the break you...,2
27479,but it was worth it,2
27480,all this flirting going on the atg smiles ...,1


In [12]:
print('Train data shape: ', csv_train.shape)
n_lebel = len(csv_train[csv_train.sentiment == 0])
print('negative in Train data: {} ({:.1f}%)'.format(n_lebel, n_lebel*100/len(csv_train)))
n_lebel = len(csv_train[csv_train.sentiment == 1])
print('neutral in Train data: {} ({:.1f}%)'.format(n_lebel, n_lebel*100/len(csv_train)))
n_lebel = len(csv_train[csv_train.sentiment == 2])
print('positive in Train data: {} ({:.1f}%)'.format(n_lebel, n_lebel*100/len(csv_train)))

Train data shape:  (27480, 2)
negative in Train data: 7781 (28.3%)
neutral in Train data: 11117 (40.5%)
positive in Train data: 8582 (31.2%)


In [13]:
pip install transformers

  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.


In [14]:
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
import torch

from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
def tokenize_text(text):
    tokens = tokenizer.encode_plus(text, 
                                   add_special_tokens=True,
                                   max_length=128,
                                   padding='max_length',
                                   truncation=True,
                                   return_token_type_ids=False,
                                   return_attention_mask=True,
                                   return_tensors='pt')
    return tokens

In [17]:
# 데이터셋 클래스 정의
class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = tokenize_text(text)
        return {
            'input_ids': tokenized_text['input_ids'].flatten(),
            'attention_mask': tokenized_text['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

### Train Data Set

In [18]:
# 데이터셋 분할
train_texts, val_texts, train_labels, val_labels = train_test_split(csv_train['text'].values, 
                                                                    csv_train['sentiment'].values, 
                                                                    test_size=0.2, 
                                                                    random_state=42)

# 데이터로더 생성
train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [19]:
# test데이터셋 만들기
fields = ['text', 'sentiment']
csv_test = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', 
                        encoding='ISO-8859-1',
                        usecols=fields)

### Test Data Set

In [20]:
csv_test.dropna(subset=["text"], inplace=True)
csv_test['text'] = csv_test['text'].apply(text_clean)
print(csv_test.isnull().sum())
print(csv_test.shape)

text         0
sentiment    0
dtype: int64
(3534, 2)


In [21]:
csv_test.head(5)

Unnamed: 0,text,sentiment
0,last session of the day 67ezh,neutral
1,shanghai is also really exciting precisely s...,positive
2,recession hit veronique branquinho she has to ...,negative
3,happy bday,positive
4,4w75p i like it,positive


In [22]:
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
csv_test['sentiment'] = csv_test['sentiment'].map(sentiment_map)

test_texts = csv_test['text'].values
test_labels = csv_test['sentiment'].values
test_dataset = SentimentDataset(test_texts, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

## BERT Model Train

In [22]:
# BERT 모델 로드
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# 옵티마이저 및 손실 함수 정의
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 모델 학습
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [23]:
epochs = 5
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    
    # 검증 데이터셋으로 평가
    model.eval()
    val_preds = []
    val_true = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
        
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        val_preds.extend(preds)
        val_true.extend(labels.cpu().numpy())
    
    val_acc = accuracy_score(val_true, val_preds)
    print(f'Epoch {epoch + 1}/{epochs}, Validation Accuracy: {val_acc:.4f}')

Epoch 1/5, Validation Accuracy: 0.7786
Epoch 2/5, Validation Accuracy: 0.7862
Epoch 3/5, Validation Accuracy: 0.7809
Epoch 4/5, Validation Accuracy: 0.7806
Epoch 5/5, Validation Accuracy: 0.7769


## Model Test

In [24]:
model.eval()
test_preds = []
test_true = []
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    preds = torch.argmax(logits, dim=-1).cpu().numpy()
    test_preds.extend(preds)
    test_true.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_true, test_preds)
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.7651


In [25]:
print(classification_report(test_true, test_preds, target_names=['negative', 'neutral', 'positive']))

              precision    recall  f1-score   support

    negative       0.73      0.82      0.77      1001
     neutral       0.73      0.72      0.72      1430
    positive       0.85      0.78      0.81      1103

    accuracy                           0.77      3534
   macro avg       0.77      0.77      0.77      3534
weighted avg       0.77      0.77      0.77      3534



### GPT Model Train

In [27]:
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [28]:
# 감정 분석 모델 정의
class SentimentClassifier(nn.Module):
    def __init__(self, model_name, num_classes):
        super(SentimentClassifier, self).__init__()
        self.model = OpenAIGPTModel.from_pretrained(model_name)
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name)
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.padding_side = 'right'
        self.model.resize_token_embeddings(len(self.tokenizer))
        self.linear = nn.Linear(self.model.config.hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        logits = self.linear(outputs.last_hidden_state[:, -1, :])
        return self.softmax(logits)

In [29]:
# 모델 인스턴스 생성
num_classes = 3
model = SentimentClassifier("openai-gpt", num_classes)

# 옵티마이저 및 손실 함수 정의
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 모델 학습
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/479M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/816k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/458k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


SentimentClassifier(
  (model): OpenAIGPTModel(
    (tokens_embed): Embedding(40479, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (linear): Linear(in_features=768, out_features=3, bias=True)
  (softmax): Softmax(dim=-1)
)

In [30]:
epochs = 5
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
    
    # 검증 데이터셋으로 평가
    model.eval()
    val_preds = []
    val_true = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        
        preds = torch.argmax(outputs, dim=-1).cpu().numpy()
        val_preds.extend(preds)
        val_true.extend(labels.cpu().numpy())
    
    val_acc = accuracy_score(val_true, val_preds)
    print(f'Epoch {epoch + 1}/{epochs}, Validation Accuracy: {val_acc:.4f}')


Epoch 1/5, Validation Accuracy: 0.5890
Epoch 2/5, Validation Accuracy: 0.6541
Epoch 3/5, Validation Accuracy: 0.6743
Epoch 4/5, Validation Accuracy: 0.6834
Epoch 5/5, Validation Accuracy: 0.6827


In [31]:
# 테스트 데이터셋으로 평가
model.eval()
test_preds = []
test_true = []
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    preds = torch.argmax(outputs, dim=-1).cpu().numpy()
    test_preds.extend(preds)
    test_true.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_true, test_preds)
print(f'Test Accuracy: {test_acc:.4f}')
print(classification_report(test_true, test_preds, target_names=['negative', 'neutral', 'positive']))

Test Accuracy: 0.6822
              precision    recall  f1-score   support

    negative       0.69      0.61      0.64      1001
     neutral       0.64      0.69      0.66      1430
    positive       0.74      0.75      0.74      1103

    accuracy                           0.68      3534
   macro avg       0.69      0.68      0.68      3534
weighted avg       0.68      0.68      0.68      3534

