In [18]:
import torch
from datasets import load_dataset


#Get the data from hugginface
class Dataset(torch.utils.data.Dataset):
    # set the parameter split "train" for train dataset "validation" for validation dataset
    def __init__(self, split):
        self.dataset = load_dataset(path='lansinuote/ChnSentiCorp', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']

        return text, label


dataset = Dataset('train')

# show the dataset len and content of 1st element
len(dataset), dataset[0]

Found cached dataset parquet (C:/Users/35391/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--ChnSentiCorp-4d058ef86e3db8d5/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


(9600,
 ('选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  1))

In [19]:
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# use to(device) set the data and model to use cuda
device

device(type='cuda', index=0)

In [22]:
from transformers import BertTokenizer

# get the chinese version tokenizer
token = BertTokenizer.from_pretrained('bert-base-chinese')



In [23]:
def collate_fn(data):
    # sentence
    sents = [i[0] for i in data]
    # labels
    labels = [i[1] for i in data]

    # change sentence to tensors
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents, # text
                                   truncation=True,
                                   padding='max_length',
                                   max_length=500,
                                   return_tensors='pt', # pt for PyTorch, or tf for TensorFlow
                                   return_length=True)
    
    # print(data.keys())

    #input_ids:map the id to different words
    input_ids = data['input_ids']
    #attention_mask: words == 1, padding  == 0 [1,1,1,1,1,0,0]
    attention_mask = data['attention_mask']
    # use for 1st sentence and 2nd sentence [0,0,0,0,0,1,1,1,1,1,1,1]
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    #print(data['length'], data['length'].max())

    return input_ids, attention_mask, token_type_ids, labels


#load the data to torch
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)


# for i, (input_ids, attention_mask, token_type_ids,
#         labels) in enumerate(loader):
#     break

# get the first element in loader
for input_ids, attention_mask, token_type_ids,labels in loader:
    break
    
# check the loader: loader length is 600, batch is 16, 600*16=9600 rows total
print(len(loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

600


(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1]))

In [25]:
from transformers import BertModel

# get the pretrained model of bert, need the same version of the tokenizer
pretrained = BertModel.from_pretrained('bert-base-chinese').to(device)

# don't need train the model and update the params
for param in pretrained.parameters():
    # print(param.shape)
    param.requires_grad_(False)

# try the first batch, last_hidden_state will give the model's output
out = pretrained(input_ids=input_ids.to(device),
           attention_mask=attention_mask.to(device),
           token_type_ids=token_type_ids.to(device))

# [batch_size, sequence_length, hidden_size]
out.last_hidden_state.shape

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([16, 500, 768])

In [26]:
out.last_hidden_state[:, 0].shape

torch.Size([16, 768])

In [29]:
# define forward
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # set the Fully Connected Layer(fc layer)
        self.fc = torch.nn.Linear(768, 2).to(device)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # do not need update the weight
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)

        # the 2nd dimension 1st position used to calculate the cls [CLS] 16*768
        out = out.last_hidden_state[:, 0]
#         print(out.shape)
        # 16 768 * 768 * 2 => 16 * 2
        out = self.fc(out)
#         print(out.shape)

        out = out.softmax(dim=1)

        return out


model = Model().to(device)

model(input_ids=input_ids.to(device),
      attention_mask=attention_mask.to(device),
      token_type_ids=token_type_ids.to(device)).shape

torch.Size([16, 2])

In [34]:
from transformers import AdamW

# train optimizer = AdamW
optimizer = AdamW(model.parameters(), lr=5e-4)
# define the loss
criterion = torch.nn.CrossEntropyLoss().to(device)

model.train()
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    out = model(input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                token_type_ids=token_type_ids.to(device))

    loss = criterion(out, labels.to(device)).to(device)
    
    # backforward
    loss.backward()
    
    # use the optimizer
    optimizer.step()
    
    # reset the optimizer step to zero, can be put in front of backward
    optimizer.zero_grad()

    # every 5 batch show the loss and accuracy
    if i % 5 == 0:
        # softmax => get the max value index
        out = out.argmax(dim=1)
        # item() used to get the value from tensor   tensor(5) => 5
        accuracy = (out == labels.to(device)).sum().item() / len(labels.to(device))

        print(i, loss.item(), accuracy)

    if i == 300:
        break

0 0.6310632824897766 0.6875
5 0.6872082948684692 0.5625
10 0.6411182284355164 0.625
15 0.5849995613098145 0.8125
20 0.6454864144325256 0.6875
25 0.5975177884101868 0.8125
30 0.5190809965133667 0.875
35 0.5718966722488403 0.75
40 0.5118439793586731 0.9375
45 0.6213827729225159 0.75
50 0.5946326851844788 0.6875
55 0.5772092342376709 0.8125
60 0.5248995423316956 0.8125
65 0.5566399693489075 0.75
70 0.5524975657463074 0.875
75 0.4945199191570282 0.875
80 0.4463903605937958 0.9375
85 0.5500490069389343 0.6875
90 0.41821232438087463 1.0
95 0.5004081726074219 0.8125
100 0.48282232880592346 0.875
105 0.5214930772781372 0.6875
110 0.4178345799446106 1.0
115 0.4676635265350342 0.875
120 0.45681968331336975 0.875
125 0.5440527200698853 0.8125
130 0.4704640507698059 0.875
135 0.39664188027381897 1.0
140 0.444157212972641 0.9375
145 0.4321722984313965 1.0
150 0.5079193711280823 0.75
155 0.4318912625312805 0.875
160 0.43611329793930054 0.9375
165 0.4803919196128845 0.875
170 0.4719085991382599 0.875

In [12]:
a = torch.tensor([1,2,3,4,5])
b = torch.tensor([1,1,1,3,5])

print(a==b)
print((a == b).sum())
print((a == b).sum().item())
accuracy = (a == b).sum().item() / len(labels)

tensor([ True, False, False, False,  True])
tensor(2)
2


In [40]:
# test the model
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        if i%10==0:
            print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids.to(device),
                        attention_mask=attention_mask.to(device),
                        token_type_ids=token_type_ids.to(device))

        out = out.argmax(dim=1)
        correct += (out == labels.to(device)).sum().item()
        total += len(labels)

    print(correct / total)


test()

Found cached dataset parquet (C:/Users/35391/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--ChnSentiCorp-4d058ef86e3db8d5/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


0
10
20
30
0.870777027027027
