In [16]:
from transformers import AutoModel
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import transformers
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score,mean_absolute_error,mean_squared_error,accuracy_score,explained_variance_score,r2_score,confusion_matrix,ConfusionMatrixDisplay,classification_report,f1_score, silhouette_score,adjusted_mutual_info_score

In [17]:
config = {
    'model': 'model/deberta-v3-large',
    'dropout': 0.5,
    'max_length': 512,
    'batch_size': 16,
    'epochs': 5,
    'freeze_lr': 1e-5,
    'device': 'cuda' ,
    'scheduler': 'StepLR'
}

In [28]:
data = pd.read_excel('data/asap-aes/training_set_rel3.xls')
data = data[data['essay_set']==1]

In [29]:
score2label =dict()
label = 0
for i in sorted(list(set((data[data['essay_set']==1]['domain1_score'])))):
    score2label[i] = label
    label += 1

In [30]:
label2score = {score2label[i]:i for i in score2label}

In [31]:
data['domain1_score'] = [score2label[i] for i in data['domain1_score']]

In [45]:
class EssayDataset:
    def __init__(self, df, config, tokenizer=None, is_test=False):
        # 将输入的DataFrame进行重置索引，并保存在对象的成员变量df中
        self.df = df
        self.classes = 'domain1_score'
        self.max_len = config['max_length']
        self.tokenizer = tokenizer
        self.is_test = is_test 
    def __getitem__(self, idx):
        # 获取指定索引对应的原始文本
        sample = self.df['essay'][idx]
        # 使用tokenizer对原始文本进行编码，得到包含input_ids, token_type_ids和attention_mask的tokenized字典
        tokenized = self.tokenizer.encode_plus(sample,
                                               None,
                                               add_special_tokens=True,
                                               max_length=self.max_len,
                                               truncation=True,
                                               padding='max_length'
                                              )
        
        # 构造inputs字典，将编码后的tokens转换为PyTorch张量
        inputs = {
            "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
            #"token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
        }
        
        # 如果是测试数据集，直接返回inputs字典
        if self.is_test == True:
            return inputs
        
        # 否则，获取对应样本的标签，并构造targets字典，将标签转换为PyTorch张量
        label = self.df.loc[idx, self.classes]
        targets = {
            "labels": torch.tensor(label,dtype=torch.long),
        }
        
        return inputs, targets
    def __len__(self):
        return len(self.df)

In [35]:
traind = data[data['essay_set'] == 1][:1400]
testd = data[data['essay_set'] == 1][1400:]

In [37]:
tokenizer = AutoTokenizer.from_pretrained('model/tokenizer/deberta-v3-large/')

In [38]:
trainds = EssayDataset(traind, config, tokenizer=tokenizer)

In [39]:
testds = EssayDataset(testd, config, tokenizer=tokenizer, is_test=True)

In [46]:
train_loader = torch.utils.data.DataLoader(trainds,
                                           batch_size=config['batch_size'],
                                           shuffle=False,
                                           num_workers=0,
                                           pin_memory=True
                                          )

In [47]:
class FrozenEssayModel(nn.Module):
    def __init__(self,config):
        super(FrozenEssayModel,self).__init__()
        self.model_name = config['model']
        self.encoder = AutoModel.from_pretrained(self.model_name)
        
        # this is how you freeze a model: the base_model is generic term for the transformer name
        for param in self.encoder.base_model.parameters():
            param.requires_grad = False
            
        self.dropout = nn.Dropout(config['dropout'])
        self.fc1 = nn.Linear(self.encoder.config.hidden_size,64)
        #self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64,11)
        
    def forward(self,inputs):
        outputs = self.encoder(**inputs, return_dict=False)
        outputs = self.dropout(outputs[0][:, -1])
        outputs = self.fc1(outputs)
        #outputs = torch.nn.ReLU()(outputs)
        #outputs = self.dropout(outputs)
        outputs = self.fc2(outputs)
        return outputs

In [57]:
class Trainer:
    def __init__(self, model, loaders, config,lr='unfreeze'):
        self.model = model
        self.train_loader = loaders
        self.config = config
        self.input_keys = ['input_ids','attention_mask']
        self.loss_fn = nn.CrossEntropyLoss()
        
        if lr == 'unfreeze':
            self.lr = self.config['unfreeze_lr']
        else:
            self.lr = self.config['freeze_lr']
            
        self.optim = self._get_optim()
        
        self.scheduler_options = {
            'CosineAnnealingWarmRestarts': torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(self.optim, T_0=5,eta_min=1e-7),
            'ReduceLROnPlateau': torch.optim.lr_scheduler.ReduceLROnPlateau(self.optim, 'min', min_lr=1e-7),
            'StepLR': torch.optim.lr_scheduler.StepLR(self.optim,step_size=2)
        }
        
        self.scheduler = self.scheduler_options[self.config['scheduler']]
        
        self.train_losses = []
        
    def _get_optim(self):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.lr)
        return optimizer

        

    def train_one_epoch(self,epoch):
        
        running_loss = 0.
        progress = tqdm(self.train_loader, total=len(self.train_loader))
        true = []
        pred = []
        for i,(inputs,targets) in enumerate(progress):
            self.optim.zero_grad()
            
            inputs = {k:inputs[k].to(device=config['device']) for k in inputs.keys()}
            true += [int(i) for i in targets['labels']]
            #print(true)
            targets = targets['labels'].to(device=config['device'])
            
            outputs = self.model(inputs)
            pred += [int(i) for i in torch.max(outputs.data.detach().cpu(),dim=1)[1]]
            loss = self.loss_fn(outputs, targets)
            running_loss += loss.item()
            #print(pred)
            loss.backward()
            self.optim.step()
            print("{:.2f}".format(accuracy_score(true,pred)))
            if self.config['scheduler'] == 'CosineAnnealingWarmRestarts':
                self.scheduler.step(epoch-1+i/len(self.train_loader)) # as per pytorch docs
            
            del inputs, targets, outputs, loss
            
        if self.config['scheduler'] == 'StepLR':
            self.scheduler.step()
        
        train_loss = running_loss/len(self.train_loader)
        self.train_losses.append(train_loss)
        print(accuracy_score(true,pred))
        print(cohen_kappa_score(true,pred,weights='quadratic'))
            
    def test(self, test_loader):
        
        preds = []
        for (inputs) in test_loader:
            inputs = {k:inputs[k].to(device=config['device']) for k in inputs.keys()}
            
            outputs = self.model(inputs)
            preds.append(outputs.detach().cpu())
            
        preds = torch.concat(preds)
        return preds
    
    def fit(self):
        
        fit_progress = tqdm(
            range(1, self.config['epochs']+1),
            leave = True,
            desc="Training..."
        )
        
        for epoch in fit_progress:
            #print('start')
            self.model.train()
            fit_progress.set_description(f"EPOCH {epoch} / {self.config['epochs']} | training...")
            self.train_one_epoch(epoch)
            self.clear()
    

            print(f"{'-'*30} EPOCH {epoch} / {self.config['epochs']} {'-'*30}")
            print(f"train loss: {self.train_losses[-1]}")

            
    
    def clear(self):

        torch.cuda.empty_cache()

In [58]:
torch.cuda.empty_cache()

In [59]:
model = FrozenEssayModel(config).to(device=config['device'])
trainer_freeze = Trainer(model, train_loader, config, lr='freeze')

In [61]:
trainer_freeze.fit()

Training...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

0.44
0.47
0.48
0.42
0.42
0.42
0.43
0.42
0.40
0.40
0.43
0.44
0.45
0.46
0.45
0.44
0.44
0.45
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.43
0.44
0.44
0.44
0.44
0.44
0.45
0.45
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.43
0.43
0.43
0.44
0.44
0.44
0.44
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.44
0.44
0.44
0.44
0.45
0.44
0.44
0.44
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.44
0.45
0.45
0.45
0.45
0.45
0.4471428571428571
0.44683038648339235
------------------------------ EPOCH 1 / 5 ------------------------------
train loss: 1.6274552291089839


  0%|          | 0/88 [00:00<?, ?it/s]

0.38
0.41
0.42
0.42
0.40
0.40
0.42
0.41
0.40
0.39
0.42
0.44
0.45
0.46
0.45
0.44
0.45
0.45
0.45
0.44
0.45
0.44
0.44
0.44
0.44
0.45
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.44
0.43
0.43
0.43
0.43
0.43
0.43
0.43
0.44
0.44
0.44
0.45
0.45
0.44
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.44
0.45
0.44
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.45
0.44
0.44
0.45
0.45
0.45
0.45
0.45
0.445
0.4603969936403931
------------------------------ EPOCH 2 / 5 ------------------------------
train loss: 1.5905964523553848


  0%|          | 0/88 [00:00<?, ?it/s]

0.44
0.38
0.40
0.39
0.38
0.38
0.38
0.39
0.36
0.36
0.39
0.40
0.41
0.41
0.41
0.40
0.41
0.42
0.42
0.42
0.42
0.42
0.42
0.42
0.42
0.43
0.43
0.43
0.43
0.43
0.43
0.44
0.44
0.44
0.46
0.46
0.45


KeyboardInterrupt: 

In [None]:
test_loader = torch.utils.data.DataLoader(testds,
                                           batch_size=config['batch_size'],
                                           shuffle=False,
                                           num_workers=0,
                                           pin_memory=True
                                          )

In [None]:
pred = trainer_freeze.test(test_loader)

In [None]:
pred = [list(i).index(max(i))+2 for i in pred]

In [None]:
from sklearn.metrics import cohen_kappa_score,mean_absolute_error,mean_squared_error,accuracy_score,explained_variance_score,r2_score,confusion_matrix,ConfusionMatrixDisplay,classification_report,f1_score, silhouette_score,adjusted_mutual_info_score

In [None]:
TestY = data[data['essay_set']==1]['domain1_score'][1400:]

In [None]:
print(accuracy_score(TestY,pred))
print(cohen_kappa_score(TestY,pred,weights='quadratic'))

0.46736292428198434
0.5069858621797463
