In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='1'

In [2]:
import pandas as pd

train_df = pd.read_csv('train.csv', header=None)
train_df.columns = ['label', 'text']
train_df.head()

Unnamed: 0,label,text
0,2,非常喜欢这个德国超市，他家各类东东都品牌齐全，不像沃尔玛、家乐福之类只有便宜货。经常能找到一...
1,2,从这家超市刚在湖南开的时候就开始消费，眼看着这里周边由荒凉变得热闹。人生真是好短。\n奇怪的...
2,1,麦德隆多是大包装的东西。适合三代同堂的家庭采购。\n这里有卡才能进入，结帐时，同样要出示会员...
3,2,仓储式购物环境，看着货架上一堆堆的物品，人的购物欲望就被激发起来了！\n进口食品很多，特别是...
4,2,这里大部分都是自家开车来大采购的，而且必须是会员制，但是商品价格很不错，质量也还不错，来这里...


In [3]:
test_df = pd.read_csv('test.csv', header=None)
test_df.columns = ['label', 'text']
test_df.head()

Unnamed: 0,label,text
0,2,特地网上预约的，满方便的。去了就能拿票了。\n去的那天人满少的，还有画展。\n里面有专门的讲...
1,2,拿身份证领票，早上8点45开始放票，可是那个时候已经又很多很多人排队了，所以还是更早去吧，看...
2,2,记得当时博物馆刚开始免费的时候，每次路过都看到那排队排到烈士公园。过了一年就好多了。我们去那...
3,2,之前已经上网订好票，所以好方便就入左去啦。\n里面有好多辛追夫人一家嘅陪葬品，仲有辛追夫人嘅...
4,1,票是免费的，好像是每天有3000张免费票，拿着身份证就可以领。\n但是入馆要排队，一批一批的...


In [4]:
from transformers import BertTokenizer
import torch


# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            max_length=200, 
            pad_to_max_length=True, 
            return_attention_mask=True
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [5]:
input_ids, attention_masks = preprocessing_for_bert(train_df[:1000].text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
input_ids

tensor([[ 101, 7478, 2382,  ...,    0,    0,    0],
        [ 101,  794, 6821,  ..., 4633, 6381,  102],
        [ 101, 7931, 2548,  ...,    0,    0,    0],
        ...,
        [ 101, 1377, 5543,  ...,    0,    0,    0],
        [ 101, 2207, 3198,  ...,    0,    0,    0],
        [ 101, 5050, 3221,  ...,    0,    0,    0]])

In [7]:
attention_masks

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [8]:
train_inputs, train_masks = preprocessing_for_bert(train_df[:10000].text)
valid_inputs, valid_masks = preprocessing_for_bert(train_df[10000:20000].text)

test_inputs, test_masks = preprocessing_for_bert(test_df.text)

In [9]:
y_train = train_df.label[:10000] - 1
y_valid = train_df.label[10000:20000] - 1
y_test = test_df.label - 1

In [10]:
y_train

0       1
1       1
2       0
3       1
4       1
       ..
9995    1
9996    0
9997    1
9998    1
9999    1
Name: label, Length: 10000, dtype: int64

In [11]:
y_valid

10000    0
10001    1
10002    0
10003    0
10004    1
        ..
19995    0
19996    1
19997    1
19998    0
19999    1
Name: label, Length: 10000, dtype: int64

In [12]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train.values)
valid_labels = torch.tensor(y_valid.values)
test_labels = torch.tensor(y_test.values)

# For fine-tuning BERT, a batch size of 16 or 32 is recommended.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
valid_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

# Create the DataLoader for our testing set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [13]:
train_labels.shape

torch.Size([10000])

In [14]:
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    def __init__(self, freeze_bert=False):
        super(BertClassifier, self).__init__()

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-chinese', return_dict=False)

        # add your additional layers, for example, a dropout layer followed by a linear classification
        self.dropout = nn.Dropout(0.3)
        self.hidden = nn.Linear(768, 200)
        self.out = nn.Linear(200, 1)
        
        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        sequence_output, pooled_output = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # apply dropout to the BERT output
        pooled_output = self.dropout(pooled_output)
        hidden = self.hidden(pooled_output)

        logits = self.out(hidden)
        
        return logits

In [15]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [16]:
import random
import time
import numpy as np

# Specify loss function
loss_fn = nn.BCEWithLogitsLoss()# for binary classification

def set_seed(seed_value=0):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        # model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch_counts +=1
            
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values

            b_labels = b_labels.reshape(b_labels.shape[0],1)
            loss = loss_fn(logits.float(), b_labels.float())
            batch_loss += loss.item()
            total_loss += loss.item()
            

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                val_loss, val_accuracy = evaluate(model, val_dataloader)
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        b_labels = b_labels.reshape(b_labels.shape[0],1)
        loss = loss_fn(logits.float(), b_labels.float())
        val_loss.append(loss.item())

        # Get the predictions
        preds = logits.sigmoid()#torch.argmax(logits, dim=1).flatten()
        
        # Calculate the accuracy rate
        accuracy = ((preds>0.5)==b_labels.byte()).float().cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [17]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
    
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA RTX A5000


In [18]:
set_seed(10)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=1)
train(bert_classifier, train_dataloader, valid_dataloader, epochs=1, evaluation=True)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.665897   |     -      |     -     |   3.79   
   1    |   40    |   0.603880   |     -      |     -     |   3.65   
   1    |   60    |   0.634088   |     -      |     -     |   3.69   
   1    |   80    |   0.556140   |     -      |     -     |   3.73   
   1    |   100   |   0.546487   |     -      |     -     |   3.75   
   1    |   120   |   0.538677   |     -      |     -     |   3.76   
   1    |   140   |   0.580665   |     -      |     -     |   3.77   
   1    |   160   |   0.585484   |     -      |     -     |   3.76   
   1    |   180   |   0.556848   |     -      |     -     |   3.77   
   1    |   200   |   0.527068   |     -      |     -     |   3.78   
   1    |   220   |   0.564014   |     -      |     -     |   3.78   
   1    |   240   |   0.520711   |     -      |     -     |   3.78   


In [19]:
torch.save(bert_classifier, 'BERT_model')

In [20]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = torch.sigmoid(all_logits).cpu().numpy()

    return probs

In [21]:
probs = bert_predict(bert_classifier, test_dataloader)
probs

array([[0.76127046],
       [0.55820394],
       [0.3778523 ],
       ...,
       [0.34089267],
       [0.8266014 ],
       [0.06928495]], dtype=float32)

In [22]:
from sklearn.metrics import accuracy_score

#y_pred = np.argmax(probs,axis=1)

y_pred = np.array((probs>0.5))
y_pred = y_pred + 0
accuracy_score(y_test, y_pred.reshape(y_pred.shape[0],))

0.750762

In [23]:
# load fine-tune model

saved_model = torch.load('BERT_model')
probs = bert_predict(saved_model, test_dataloader)
probs

array([[0.76127046],
       [0.55820394],
       [0.3778523 ],
       ...,
       [0.34089267],
       [0.8266014 ],
       [0.06928495]], dtype=float32)

In [26]:
_y_pred = np.array((probs>0.5))
_y_pred = _y_pred + 0
accuracy_score(y_test, _y_pred.reshape(_y_pred.shape[0],))

0.750762