In [11]:
# if you do not have transformers, please !pip install transformers
import transformers
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW

# if you do not have torch, please refer to https://pytorch.org/ [INSTALL PYTORCH]
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split

import pandas as pd
import re
import string
import operator
import numpy as np
import random

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score

In [12]:
print(transformers.__version__)
seed = 38
device = torch.device('cuda')
print('\n')
print(device)

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

4.40.2


cuda


In [13]:
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop = stopwords.words('english')

df_train = pd.read_csv('./dataset/Train.csv')
df_test = pd.read_csv('./dataset/Test.csv')
df_val = pd.read_csv('./dataset/Valid.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

"""
df_train = df_train[:4000]
df_test = df_test[:500]
df_val = df_val[:500]
"""

print(df_train.shape, df_test.shape, df_val.shape)
print('\n')
# check the model max len = 512
# print(tokenizer)
# get the list of {content, token, ids}

df_val['pre_text'] = df_val['text'].str.lower()
df_val['pre_text'] = df_val['text'].str.replace(r'[^\w\s]+', '')
df_val['pre_text'] = df_val['text'].str.replace('<br />','')
df_val['pre_text'] = df_val['text'].str.replace('<br />','')
df_val['pre_text'] = df_val['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

df_train['pre_text'] = df_train['text'].str.lower()
df_train['pre_text'] = df_train['text'].str.replace(r'[^\w\s]+', '')
df_train['pre_text'] = df_train['text'].str.replace('<br />','')
df_train['pre_text'] = df_train['text'].str.replace('<br />','')
df_train['pre_text'] = df_train['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

print('Text Pre-Processing Finish!')

# To simplify the process, I make all df_val['text'] = df_val['pre_text']; df_train['text'] = df_train['pre_text']
df_val['text'] = df_val['pre_text']
df_train['text'] = df_train['pre_text']

print(df_train.shape, df_test.shape, df_val.shape)
content = df_train['text'].values
labels = df_train['label'].values

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guoya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(4000, 2) (500, 2) (500, 2)


Text Pre-Processing Finish!
(4000, 3) (500, 2) (500, 3)


In [14]:
def encoding_process(_content):
    get_ids = []    
    for text in _content:
        input_ids = tokenizer.encode(
                        text,                      
                        add_special_tokens = True,
                        max_length = 256,
                        pad_to_max_length = True, 
                        return_tensors = 'pt')
        get_ids.append(input_ids)    
    
    get_ids = torch.cat(get_ids, dim=0)
    return get_ids

In [15]:
# make sure return_dict is not default
return_dict = False

# Training dataset 
content = df_train['text'].values
labels = df_train['label'].values
get_ids = encoding_process(content)
labels = torch.tensor(labels)

# Validation dataset
val_content = df_val['text'].values
val_labels = df_val['label'].values
val_get_ids = encoding_process(val_content)
val_labels = torch.tensor(val_labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:

# 定义批量大小
batch_size = 32  # 你可以根据实际情况调整批量大小

train_dataset = TensorDataset(torch.tensor(get_ids), torch.tensor(labels))

# 创建 DataLoader2
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(torch.tensor(val_get_ids), torch.tensor(val_labels))

# 创建 DataLoader
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

  train_dataset = TensorDataset(torch.tensor(get_ids), torch.tensor(labels))
  val_dataset = TensorDataset(torch.tensor(val_get_ids), torch.tensor(val_labels))


In [17]:
epochs = 1

In [18]:
from sklearn.metrics import accuracy_score
# Bert-based-model
# reference
# https://huggingface.co/transformers/model_doc/bert.html
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_attentions=False, output_hidden_states=False)



model.cuda()
optimizer = AdamW(model.parameters(), lr=2e-5)
output_model = './model/imdb_bert.pth'
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# save
def save(model, optimizer):
    # save
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)

# reference
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

def accuracy_calc(preds, labels):
    
    pre = np.argmax(preds, axis=1).flatten()
    real = labels.flatten()
    return accuracy_score(real, pre)

def f1_accuracy(preds, labels):
    
    pre = np.argmax(preds, axis=1).flatten()
    real = labels.flatten()
    return f1_score(real, pre)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
input_text = ["I love this movie!", "This movie is horrible.","No comment for the movie."]
#labels = [1, 0]  # 1表示正面情感，0表示负面情感

# 使用tokenizer对输入文本进行编码：将文本转换为模型可以理解的向量（input_ids和attention_mask）
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoded_inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")

# 将编码结果输入到模型中，得到分类结果：
with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    # 对logits进行argmax操作，得到预测的类别
    predictions = torch.argmax(logits, dim=-1)

print(predictions)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([1, 1, 1])


In [19]:
# 💥 IMPORTANT: Please create the directory in your environment, 
# such like './content/model/', in order to save your model in your local!
for epoch in range(epochs):
    model.train()
    total_loss, total_val_loss = 0, 0
    total_eval_accuracy = 0
    _f1 = 0
    _train_f1 = 0
    for step, batch in enumerate(train_dataloader):
        model.zero_grad()
        loss, tval_ = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device),return_dict = False)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step() 
        scheduler.step()
        tval_ = tval_.detach().cpu().numpy()
        label_ids = batch[1].to('cpu').numpy()
        _train_f1 += f1_accuracy(tval_, label_ids)
        
    model.eval()
    for i, batch in enumerate(val_dataloader):
        with torch.no_grad():
            loss, val_ = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device),return_dict = False)
                
            total_val_loss += loss.item()
            
            val_ = val_.detach().cpu().numpy()
            label_ids = batch[1].to('cpu').numpy()
            total_eval_accuracy += accuracy_calc(val_, label_ids)
            _f1 += f1_accuracy(val_, label_ids)
    
    training_loss = total_loss / len(train_dataloader)
    valid_loss = total_val_loss / len(val_dataloader)
    _accuracy = total_eval_accuracy / len(val_dataloader)
    _f1_score = _f1 / len(val_dataloader)
    train_f1_score = _train_f1/ len(train_dataloader)
    
    print('Training loss is', training_loss)
    print('Valid loss is:', valid_loss)
    print('Acc score is:', _accuracy)
    print('F1_score is:', _f1_score)
    print('train_F1_score is:', train_f1_score)
    print('\n')

save(model, optimizer)

Training loss is 0.46563229882717133
Valid loss is: 0.3060372481122613
Acc score is: 0.881640625
F1_score is: 0.8748315596970645
train_F1_score is: 0.7154248853232741




In [25]:
model.load_state_dict(torch.load('./model/imdb_bert.pth'),strict=False)

# 将模型移动到适当的设备（CPU或GPU）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [27]:
input_text = ["I love this movie!", "This movie is horrible.","No comment for the movie."]
#labels = [1, 0]  # 1表示正面情感，0表示负面情感

# 使用tokenizer对输入文本进行编码：将文本转换为模型可以理解的向量（input_ids和attention_mask）
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoded_inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
encoded_inputs= encoded_inputs.to(device)

# 将编码结果输入到模型中，得到分类结果：
with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    # 对logits进行argmax操作，得到预测的类别
    predictions = torch.argmax(logits, dim=-1)

print(predictions)



tensor([1, 1, 1], device='cuda:0')
