<hr/>
<center>新的分类标签</center>
<hr/>

In [1]:
DataFilePath = "./personality_dataset"
BertModelPath = "./bert-tiny-uncase"

In [2]:
import datasets


def data_load(path):
    train = datasets.load_from_disk(path + "/train")
    valid = datasets.load_from_disk(path + "/valid")
    return train, valid

In [3]:
train, valid = data_load(DataFilePath)

In [4]:
from transformers import BertModel, BertTokenizer


bert_model = BertModel.from_pretrained(BertModelPath)
tokenizer = BertTokenizer.from_pretrained(BertModelPath)

  return self.fget.__get__(instance, owner)()


In [5]:
from tqdm import tqdm


train_data = []
for person in tqdm(train):
    train_data.append(tokenizer(person["content"].replace("|||", "[SEP]"), padding="max_length", max_length=512, truncation=True, return_tensors="pt"))

valid_data = []
for person in tqdm(valid):
    valid_data.append(tokenizer(person["content"].replace("|||", "[SEP]"), padding="max_length", max_length=512, truncation=True, return_tensors="pt"))

100%|██████████| 58972/58972 [05:03<00:00, 194.22it/s]
100%|██████████| 3104/3104 [00:16<00:00, 192.19it/s]


In [6]:
import torch


class Dataset(torch.utils.data.Dataset):
    def __init__(self, train_data):
        self.texts = train_data
    
    def classes(self):
        return self.texts
    
    def __len__(self):
        return len(self.texts)
    
    def get_batch_texts(self, idx):
        return self.texts[idx]
    
    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        return batch_texts

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataload = torch.utils.data.DataLoader(Dataset(train_data), batch_size=1, worker_init_fn=4)
bert_model.to(device)
train_feature = []
with torch.no_grad():
    for data in tqdm(train_dataload):
        torch.cuda.empty_cache()
        train_feature.append(bert_model(data["input_ids"].squeeze(1).to(device), data["attention_mask"].to(device))["pooler_output"])
        
valid_dataload = torch.utils.data.DataLoader(Dataset(valid_data), batch_size=1, worker_init_fn=4)
valid_feature = []
with torch.no_grad():
    for data in tqdm(valid_data):
        torch.cuda.empty_cache()
        valid_feature.append(bert_model(data["input_ids"].squeeze(1).to(device), data["attention_mask"].to(device))["pooler_output"])

100%|██████████| 58972/58972 [02:32<00:00, 386.94it/s]
100%|██████████| 3104/3104 [00:07<00:00, 405.19it/s]


In [8]:
train_feature_np = [data.cpu().numpy().reshape(128) for data in tqdm(train_feature)]
valid_feature_np = [data.cpu().numpy().reshape(128) for data in tqdm(valid_feature)]

100%|██████████| 58972/58972 [00:02<00:00, 23491.77it/s]
100%|██████████| 3104/3104 [00:00<00:00, 19448.71it/s]


In [9]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=16, n_init="auto")
train_cluster_id = kmeans.fit_predict(train_feature_np)
valid_cluster_id = kmeans.predict(valid_feature_np)

<hr/>
<center>依照新的分类标准重新实验</center>
<hr/>

In [10]:
train_labels = torch.from_numpy(train_cluster_id)
valid_labels = torch.from_numpy(valid_cluster_id)

In [35]:
class MyModel(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.model = torch.nn.Sequential(torch.nn.Linear(128, 256),
                                         torch.nn.Dropout(0.5),
                                         torch.nn.ReLU(),
                                         torch.nn.Linear(256, 512),
                                         torch.nn.Dropout(0.5),
                                         torch.nn.ReLU(),
                                         torch.nn.Linear(512, 256),
                                         torch.nn.Dropout(0.5),
                                         torch.nn.ReLU(),
                                         torch.nn.Linear(256, 128),
                                         torch.nn.Dropout(0.5),
                                         torch.nn.ReLU(),
                                         torch.nn.Linear(128, 16),
                                         torch.nn.Softmax(dim=1))
        
    
    def forward(self, pooler_output):
        return self.model(pooler_output)

In [55]:
from torch.optim import Adam
import torch.utils
import torch.utils.data

try:
    model = torch.load("last_model.pt").to(device)
except FileNotFoundError:
    model = MyModel().to(device)

epochs = 300
learning_rate = 1e-5

criterion = torch.nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=learning_rate)

loader_train_texts = torch.utils.data.DataLoader(Dataset(train_feature_np), batch_size=128)
loader_train_labels = torch.utils.data.DataLoader(Dataset(train_labels), batch_size=128)
loader_valid_texts = torch.utils.data.DataLoader(Dataset(valid_feature_np), batch_size=128)
loader_valid_labels = torch.utils.data.DataLoader(Dataset(valid_labels), batch_size=128)

for epoch_num in range(epochs):
    
    total_acc_train = 0
    total_loss_train = 0
    
    for train_input, train_label, i in zip(loader_train_texts, loader_train_labels, range(len(loader_train_labels))):

        train_input = train_input.to(device)
        train_label = train_label.to(device)
        
        output = model(train_input)

        batch_loss = criterion(output, train_label.to(torch.int64))
        total_loss_train = batch_loss.item()
        
        acc = (output.argmax(dim=1) == train_label).sum().item()
        total_acc_train += acc
        
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        
        
        total_acc_val = 0
        total_loss_val = 0
        
        with torch.no_grad():
            
            for val_input, val_label in zip(loader_valid_texts, loader_valid_labels):
                
                val_input = val_input.to(device)
                val_label = val_label.to(device)
                output = model(val_input)
                
                batch_loss = criterion(output, val_label.to(torch.int64))
                total_loss_val = batch_loss.item()
                
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f"Epochs: {epoch_num + 1} | Process: {i} / {len(loader_train_labels)}",
            f"| Train Loss: {total_loss_train: .3f}",
            f"| Train Accuracy: {total_acc_train / len(train_cluster_id): .3f}",
            f"| Val Loss: {total_loss_val: .3f}",
            f"| Val Accuracy: {total_acc_val / len(valid_cluster_id): .3f}", end="\r")
        torch.save(model, "last_model.pt")



Epochs: 300 | Process: 460 / 461 | Train Loss:  2.139 | Train Accuracy:  0.690 | Val Loss:  2.158 | Val Accuracy:  0.692