In [14]:
#huggingface
from datasets import load_dataset
#torch data
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
#huggingface 
from transformers import BertTokenizer, BertModel
from transformers import AutoModel,AutoTokenizer
#torch nn
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import one_hot

#lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer

#metrics
from torchmetrics.functional import accuracy,recall,precision,f1_score




In [15]:
# Hyperparameters and paths shared by the cells below
batch_size=128 # samples per batch; collate_fn pads/truncates every sentence to 200 tokens
num_epochs=10 # passed to Trainer(max_epochs=...)
dropout=0.4 # passed to the model as `drop` (LSTM dropout)
rnn_hidden=312 # LSTM hidden size per direction (passed as `hidden_dim`)
rnn_layer=1 # NOTE(review): not referenced by the visible cells; the models below build 2-layer LSTMs regardless
class_num=2 # number of output classes (passed as `output_dim`)
lr=0.001 # learning rate used by AdamW in configure_optimizers
bert_path="/root/albert_chinese_tiny" # local directory the pretrained encoder and tokenizer are loaded from

In [4]:
# Dataset wrapper around a CSV file of labelled reviews
class MydataSet(Dataset):
    """Map-style dataset yielding ``(review_text, label)`` pairs.

    The rows come from ``datasets.load_dataset("csv", ...)``; each row is
    expected to expose a ``review`` field and a ``label`` field.
    """

    def __init__(self,path,split):
        """Load the given ``split`` of the CSV file(s) at ``path``."""
        self.dataset=load_dataset("csv",data_files=path,split=split)

    def __getitem__(self, index):
        """Return the ``(text, label)`` tuple stored at ``index``."""
        # Look the row up once instead of once per column: every index
        # operation on the underlying dataset fetches the whole row.
        row=self.dataset[index]
        return row["review"],row["label"]

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return len(self.dataset)
    


In [5]:
# data=MydataSet("./data/train.csv",'train')
# data[2]

In [6]:
# Collate function used by the DataLoaders to turn raw samples into model inputs
def collate_fn(data, max_length=200):
    """Tokenize a batch of ``(text, label)`` samples into fixed-length tensors.

    Relies on the module-level tokenizer ``token``, which therefore has to be
    created before any DataLoader using this function is iterated.

    Args:
        data: list of ``(text, label)`` tuples, one per sample in the batch.
        max_length: every sentence is padded or truncated to this many tokens.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, label)``. The first
        three are the tokenizer's ``'pt'`` tensors; ``label`` is a
        ``torch.LongTensor`` holding one class index per sample.
    """
    sentences=[sample[0] for sample in data]
    labels=[sample[1] for sample in data]

    # padding='max_length' already pads to a fixed length; the deprecated
    # pad_to_max_length flag was redundant and has been dropped.
    encoded=token.batch_encode_plus(batch_text_or_text_pairs=sentences,
                                    max_length=max_length,
                                    padding='max_length',
                                    truncation=True,
                                    return_tensors='pt')
    return (encoded['input_ids'],
            encoded['attention_mask'],
            encoded['token_type_ids'],
            torch.LongTensor(labels))

In [7]:
# Minimal model used to sanity-check the data pipeline and intermediate tensors
class EasyModule(nn.Module):
    """Debug model that exposes the frozen encoder's token embeddings.

    An LSTM and a linear head are constructed as well, but ``forward`` does
    not use them; it only runs the pretrained encoder.

    Args:
        drop: LSTM dropout; defaults to the notebook-level ``dropout``.
        hidden_dim: LSTM hidden size per direction; defaults to ``rnn_hidden``.
        output_dim: size of the linear head; defaults to ``class_num``.
    """

    def __init__(self,drop=None,hidden_dim=None,output_dim=None):
        super().__init__()
        # The constructor used to read these names without receiving them,
        # which raised NameError. Fall back to the notebook hyperparameters
        # so that EasyModule() keeps working without arguments.
        self.drop=dropout if drop is None else drop
        self.hidden_dim=rnn_hidden if hidden_dim is None else hidden_dim
        self.output_dim=class_num if output_dim is None else output_dim

        self.embedding=AutoModel.from_pretrained(bert_path)
        # Freeze the pretrained encoder
        for param in self.embedding.parameters():
            param.requires_grad = False

        # Take the LSTM input width from the encoder config instead of
        # assuming 768, so it matches whatever checkpoint bert_path points to.
        self.lstm=nn.LSTM(input_size=self.embedding.config.hidden_size,
                          hidden_size=self.hidden_dim,
                          num_layers=2,
                          batch_first=True,
                          bidirectional=True,
                          dropout=self.drop)
        self.fc=nn.Linear(2*self.hidden_dim,self.output_dim)

    def forward(self, input_ids, attention_mask,token_type_ids):
        """Run the encoder, print its ``last_hidden_state`` and return it."""
        encoded=self.embedding(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
        embedding=encoded.last_hidden_state

        print(embedding)
        # Returned as well as printed so callers can inspect shape and values
        return embedding


In [8]:
# Model used for the experiments
class BiLSTMmodule(nn.Module):
    """Frozen pretrained encoder followed by a bidirectional LSTM classifier.

    Args:
        drop: dropout applied between stacked LSTM layers.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        num_layers: number of stacked LSTM layers (default 2, as before).
    """

    def __init__(self,drop,hidden_dim,output_dim,num_layers=2):
        super().__init__()
        self.drop=drop
        self.hidden_dim=hidden_dim
        self.output_dim=output_dim

        # local_files_only=True: the checkpoint must already be on disk
        # (e.g. fetched beforehand with git lfs).
        self.embedding=AutoModel.from_pretrained(local_files_only=True,pretrained_model_name_or_path=bert_path)
        # Freeze the pretrained encoder
        for param in self.embedding.parameters():
            param.requires_grad_(False)

        # Read the embedding width from the encoder config rather than
        # hard-coding 312, so other checkpoints work unchanged.
        self.lstm=nn.LSTM(input_size=self.embedding.config.hidden_size,
                          hidden_size=hidden_dim,
                          num_layers=num_layers,
                          batch_first=True,
                          bidirectional=True,
                          # nn.LSTM only applies dropout between layers
                          dropout=self.drop if num_layers>1 else 0.0)
        # Bidirectional: forward and backward final states are concatenated
        self.fc=nn.Linear(2*self.hidden_dim,self.output_dim)

    def forward(self,input_ids,attention_mask,token_type_ids):
        """Return logits of shape ``(batch, output_dim)``.

        ``attention_mask`` is expected to be 1 for real tokens and 0 for
        right-side padding; it is used to recover each sentence's true length.
        """
        encoded=self.embedding(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
        embedded=encoded.last_hidden_state

        # Pack the batch so the LSTM stops at each sentence's last real token.
        # Without packing, the forward direction's final state is taken after
        # running over the padding positions.
        lengths=attention_mask.sum(dim=1).clamp(min=1).cpu()
        packed=nn.utils.rnn.pack_padded_sequence(embedded,lengths,batch_first=True,enforce_sorted=False)
        _,(h_n,_)=self.lstm(packed)

        # h_n[-2] / h_n[-1] are the top layer's forward / backward final states
        sentence_repr=torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        return self.fc(sentence_repr)
    


In [11]:
# LightningModule that manages the whole training process
class BiLSTMLightening(pl.LightningModule):
    """Lightning wrapper that trains and validates a ``BiLSTMmodule``.

    Args:
        drop: dropout forwarded to ``BiLSTMmodule``.
        hidden_dim: LSTM hidden size forwarded to ``BiLSTMmodule``.
        output_dim: number of classes forwarded to ``BiLSTMmodule``.
    """

    def __init__(self, drop,hidden_dim,output_dim):
        super().__init__()
        self.model = BiLSTMmodule(drop,hidden_dim,output_dim)
        self.criterion = nn.CrossEntropyLoss()
        # Both files are requested with split 'train': presumably a lone CSV
        # passed via data_files is exposed under that split name - confirm
        # against the datasets documentation.
        self.train_dataset=MydataSet("./data/train.csv",'train')
        self.val_dataset=MydataSet("./data/val.csv",'train')

    def configure_optimizers(self):
        """Optimise with AdamW at the notebook-level learning rate ``lr``."""
        return optim.AdamW(self.parameters(), lr=lr)

    def forward(self,input_ids,attention_mask,token_type_ids):
        """Return the wrapped model's logits."""
        return self.model(input_ids,attention_mask,token_type_ids)

    def _shared_step(self,batch):
        """Compute the cross-entropy loss for one collated batch."""
        input_ids,attention_mask,token_type_ids,label = batch
        logits=self.model(input_ids,attention_mask,token_type_ids)
        # CrossEntropyLoss accepts integer class indices directly; the earlier
        # one-hot + float conversion produced the same loss with extra work.
        return self.criterion(logits,label)

    def training_step(self,batch,batch_idx):
        """Return the training loss for ``batch`` and log it."""
        loss=self._shared_step(batch)
        # NOTE: the metric key is misspelled; it is kept as-is so existing
        # logs and dashboards stay comparable.
        self.log('traing_loss',loss,prog_bar=True,logger=True,on_step=True,on_epoch=True)
        return loss

    def train_dataloader(self):
        """Shuffled DataLoader over the training set."""
        return DataLoader(dataset=self.train_dataset,batch_size=batch_size,shuffle=True,collate_fn=collate_fn,num_workers=4)

    # Misspelled duplicate of train_dataloader; kept as an alias so any
    # existing caller keeps working.
    train_datalader = train_dataloader

    def validation_step(self,batch,batch_idx):
        """Return the validation loss for ``batch`` and log it."""
        loss=self._shared_step(batch)
        self.log('val_loss',loss,prog_bar=True,logger=True,on_step=True,on_epoch=True)
        return loss

    def val_dataloader(self):
        """Unshuffled DataLoader over the validation set."""
        return DataLoader(dataset=self.val_dataset,batch_size=batch_size,shuffle=False,collate_fn=collate_fn,num_workers=4)
    

In [13]:
if __name__=="__main__":
    # collate_fn looks up this module-level tokenizer, so it has to exist
    # before the DataLoaders are iterated.
    token=BertTokenizer.from_pretrained(bert_path)
    # accelerator="auto" uses the GPU when one is available and falls back to
    # CPU otherwise, instead of failing on machines without CUDA.
    trainer=Trainer(max_epochs=num_epochs,log_every_n_steps=10,accelerator="auto",devices="auto")
    model=BiLSTMLightening(drop=dropout,hidden_dim=rnn_hidden,output_dim=class_num)
    trainer.fit(model=model)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | BiLSTMmodule     | 8.0 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
3.9 M     Trainable params
4.1 M     Non-trainable params
8.0 M     Total params
31.942    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [None]:
# Smoke test: push a single batch through the bare model
if __name__=="__main__":
    dataset=MydataSet("./data/train.csv",'train')
    # collate_fn reads this module-level tokenizer
    token=BertTokenizer.from_pretrained(bert_path)
    loader=DataLoader(dataset=dataset,batch_size=batch_size,shuffle=True,collate_fn=collate_fn,drop_last=True)
    model=BiLSTMmodule(drop=dropout,hidden_dim=rnn_hidden,output_dim=class_num)
    # Inference-only check: disable dropout and gradient tracking
    model.eval()
    with torch.no_grad():
        for input_ids,attention_mask,token_type_ids,label in loader:
            print(attention_mask.shape)
            # Call the module itself rather than .forward() so hooks run
            logits=model(input_ids,attention_mask,token_type_ids)
            print(logits.shape)
            break

        