In [1]:
!pip3 install transformers torch



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Library

In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
# IMDB review
train_data = pd.read_csv("/content/drive/MyDrive/sentiment_analysis_dataset/IMDB/Train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/sentiment_analysis_dataset/IMDB/Test.csv")
valid_data = pd.read_csv("/content/drive/MyDrive/sentiment_analysis_dataset/IMDB/Valid.csv")

In [5]:
train_data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [6]:
train_data['label'].value_counts()
# balanced data

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,20019
1,19981


In [7]:
def get_max_len(text):
  """
  get max length of text
  """
  max_len = 0
  for sentence in text:
    max_len = max(max_len,len(sentence))
  return max_len

print(get_max_len(train_data['text'].tolist()))

13704


# Create Dataset

- Dataset is used to generate batch data during training

In [8]:
class MyDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [9]:
def get_tokenizer_and_model(model_name):
  tokenizer = BertTokenizer.from_pretrained(model_name)
  model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
  return tokenizer, model

In [10]:
def evaluate_model(model, valid_dataloader, device):
  model.eval()
  predictions, true_labels = [],[]
  with torch.no_grad():
    for batch in valid_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)

      pred_output = model(input_ids, attention_mask)
      logits = pred_output.logits
      logits = torch.argmax(logits, dim=1).cpu().numpy()
      labels = labels.cpu().numpy()
      predictions.extend(logits)
      true_labels.extend(labels)
  return accuracy_score(true_labels, predictions)

In [11]:
def train_model(model, train_dataloader, valid_dataloader, epochs, inital_lr, device, step=20):
  optimizer = AdamW(model.parameters(), lr=inital_lr, correct_bias=False)
  model = model.to(device)
  trained_data = 0
  step_ = 0
  for epoch in range(epochs):
    # forward
    model.train()
    total_loss = 0
    total_loss_list = []
    for batch in train_dataloader:
      optimizer.zero_grad()
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)

      model_output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
      loss = model_output.loss
      total_loss += loss.item()
      total_loss_list.append(loss.item())

      loss.backward()
      optimizer.step()
      trained_data += batch_size
      step_ += 1
      if step_ % step == 0:
        print(f'Epoch: {epoch + 1}/{epochs}, trained_data:  {trained_data} / {epochs*batch_size*len(train_dataloader)}, Loss(avg): {total_loss / step*batch_size}')
        total_loss = 0
    val_accuracy = evaluate_model(model, valid_dataloader, device)
    print(f'Validation Accuracy: {val_accuracy:.4f}')


In [12]:
X_train = train_data['text'].tolist()
y_train = train_data['label'].tolist()
X_valid = valid_data['text'].tolist()
y_valid = valid_data['label'].tolist()
X_test = test_data['text'].tolist()
y_test = test_data['label'].tolist()

In [13]:
model_name = 'bert-base-uncased'
tokenizer, model = get_tokenizer_and_model(model_name)

# max_train_len = get_max_len(X_train)
# max_valid_len = get_max_len(X_valid)
# max_test_len = get_max_len(X_test)

# bert max_len is 512
max_len = 128
epoch = 3
lr = 2e-5
batch_size = 16

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dataset = MyDataset(X_train, y_train, tokenizer, max_len)
val_dataset = MyDataset(X_valid, y_valid, tokenizer, max_len)

# 定义数据加载器
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

train_model(model, train_loader, val_loader, epochs=epoch, inital_lr = lr, device = device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1/3, trained_data:  320 / 120000, Loss(avg): 11.398277139663696
Epoch: 1/3, trained_data:  640 / 120000, Loss(avg): 11.334536075592041
Epoch: 1/3, trained_data:  960 / 120000, Loss(avg): 10.904706335067749
Epoch: 1/3, trained_data:  1280 / 120000, Loss(avg): 9.309746384620667
Epoch: 1/3, trained_data:  1600 / 120000, Loss(avg): 7.7105267405509945
Epoch: 1/3, trained_data:  1920 / 120000, Loss(avg): 8.736096453666686
Epoch: 1/3, trained_data:  2240 / 120000, Loss(avg): 6.8486563205719
Epoch: 1/3, trained_data:  2560 / 120000, Loss(avg): 7.605550789833069
Epoch: 1/3, trained_data:  2880 / 120000, Loss(avg): 6.564855015277862
Epoch: 1/3, trained_data:  3200 / 120000, Loss(avg): 7.296554946899414
Epoch: 1/3, trained_data:  3520 / 120000, Loss(avg): 6.934622478485108
Epoch: 1/3, trained_data:  3840 / 120000, Loss(avg): 6.3076066970825195
Epoch: 1/3, trained_data:  4160 / 120000, Loss(avg): 6.477697348594665
Epoch: 1/3, trained_data:  4480 / 120000, Loss(avg): 5.743533277511597
Epoch:

In [14]:

torch.save(model.state_dict(), '/content/drive/MyDrive/sentiment_analysis_dataset/model/bert_model.pth')

In [None]:
import torch
from transformers import BertForSequenceClassification

# 创建一个新的模型实例
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# 加载保存的权重
model.load_state_dict(torch.load('/content/drive/MyDrive/sentiment_analysis_dataset/model/bert_model.pth'))

In [15]:
test_dataset = MyDataset(X_test, y_test, tokenizer, max_len)
test_loader = DataLoader(test_dataset, batch_size=1)
evaluate_model(model, test_loader, device)

0.895