<hr/>
<center>探究不同性格间文本内容的相似度</center>
<hr/>

In [1]:
import datasets

DataFilePath = "./personality_dataset"

In [2]:
def get_data(path):
    train = datasets.load_from_disk(path+"/train")
    return train

train = get_data(DataFilePath)

In [3]:
labels = ["ENTJ", "ENTP", "ENFJ", "ENFP", "ESFJ", "ESFP", "ESTJ", "ESTP", "INTP", "INTJ", "INFP", "INFJ", "ISFP", "ISFJ", "ISTP", "ISTJ"]

In [4]:
nums = [0 for i in range(16)]

for person in train["personality"]:
    nums[labels.index(person)] += 1

nums

[1934,
 4456,
 2803,
 5186,
 552,
 758,
 494,
 638,
 7213,
 6913,
 10862,
 9281,
 2128,
 1923,
 2164,
 1667]

In [10]:
import tqdm


# 选取两个数量相近的性格
personality = ["ENTJ", "ISFP"]
ENTJ = [train[i]["content"].replace("|||", ".") for i in tqdm.tqdm(range(len(train)), position=0) if train[i]["personality"] == "ENTJ"]
ISFP = [train[i]["content"].replace("|||", ".") for i in tqdm.tqdm(range(len(train)), position=0) if train[i]["personality"] == "ISFP"]

100%|██████████| 58972/58972 [00:02<00:00, 20166.80it/s]
100%|██████████| 58972/58972 [00:02<00:00, 20281.49it/s]


In [19]:
import spacy

# 做词数统计
nlp = spacy.load("en_core_web_sm", exclude=["tok2vec", "tagger", "senter", "attribute_ruler", "lemmatizer"])
words1 = {}
words2 = {}
words = {}
for i in tqdm.tqdm(range(len(ENTJ)), position=0):
    ENTJ[i] = nlp(ENTJ[i])
    for token in ENTJ[i]:
        if token.text in words1.keys():
            words1[token.text] += 1
            words[token.text] += 1
        else:
            words[token.text] = 1
            words1[token.text] = 1
            
for i in tqdm.tqdm(range(len(ISFP)), position=0):
    ISFP[i] = nlp(ISFP[i])
    for token in ISFP[i]:
        if token.text in words2.keys():
            words2[token.text] += 1
        else:
            words2[token.text] = 1
        if token.text in words.keys():
            words[token.text] += 1
        else:
            words[token.text] = 1

100%|██████████| 1934/1934 [00:51<00:00, 37.83it/s]
100%|██████████| 2128/2128 [00:56<00:00, 37.54it/s]


In [50]:
# 预处理后期统一数据维度
lst1 = [words1[i] if i in words1.keys() else 0 for i in words.keys()]
lst2 = [words2[i] if i in words2.keys() else 0 for i in words.keys()]

In [63]:
import numpy as np


# 得到词数的编码向量
fea1 = np.array(lst1)
fea2 = np.array(lst2)

In [64]:
# 计算每一种词汇所占比例
fea1 = fea1 / np.sum(fea1)
fea2 = fea2 / np.sum(fea2)

In [65]:
# 变为单位向量
fea1 = fea1 / np.linalg.norm(fea1)
fea2 = fea2 / np.linalg.norm(fea2)

In [66]:
cor = fea1.T.dot(fea2)

In [67]:
cor

0.9852139611568004

<hr/>
<center>探究只对人格进行二分类的分类情况</center>
<hr/>

In [1]:
import datasets

DataFilePath = "./personality_dataset"

In [2]:
def get_data(path):
    train = datasets.load_from_disk(path+"/train")
    return train

train = get_data(DataFilePath)

In [3]:
nums = [0 for i in range(8)]

labels = ["E", "I", "S", "N", "T", "F", "J", "P"]

for person in train["personality"]:
    if "E" in person:
        nums[labels.index("E")] += 1
    else:
        nums[labels.index("I")] += 1
    if "S" in person:
        nums[labels.index("S")] += 1
    else:
        nums[labels.index("N")] += 1
    if "N" in person:
        nums[labels.index("T")] += 1
    else:
        nums[labels.index("F")] += 1
    if "J" in person:
        nums[labels.index("J")] += 1
    else:
        nums[labels.index("P")] += 1

nums

[16821, 42151, 10324, 48648, 48648, 10324, 25567, 33405]

In [4]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('./bert-base-uncase')

In [5]:
import torch.utils
import numpy as np


class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, number):
        self.trian = datasets.load_from_disk(DataFilePath+"/"+data)
        self.labels = [1 if "J" in label else 0 for label in self.trian["personality"][0 : number if number < len(self.trian["personality"]) else -1]]
        self.texts = [tokenizer(person.replace("|||", "[SEP]"),
                                padding="max_length",
                                max_length=512,
                                truncation=True,
                                return_tensors="pt")
                      for person in self.trian["content"][0 : number if number < len(self.trian["content"]) else -1]]
    
    def classes(self):
        return self.labels
    
    def __len__(self):
        return len(self.labels)
    
    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])
    
    def get_batch_texts(self, idx):
        return self.texts[idx]
    
    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [6]:
from torch import nn

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('./bert-base-uncase')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [7]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs, number):
    
    train, val = Dataset(train_data, number), Dataset(val_data, number)
    
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()
    
    for epoch_num in range(epochs):
        
        total_acc_train = 0
        total_loss_train = 0
        
        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)
            
            output = model(input_id, mask)
            
            batch_loss = criterion(output, train_label.to(torch.int64))
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc
            
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            
            
            total_acc_val = 0
            total_loss_val = 0
            
            with torch.no_grad():
                
                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)
                    output = model(input_id, mask)
                    
                    batch_loss = criterion(output, val_label.to(torch.int64))
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'''Epochs: {epoch_num + 1} 
                | Train Loss: {total_loss_train / len(train): .3f} 
                | Train Accuracy: {total_acc_train / len(train): .3f} 
                | Val Loss: {total_loss_val / len(val): .3f} 
                | Val Accuracy: {total_acc_val / len(val): .3f}''')   

In [8]:
EPOCHS = 1
model = BertClassifier()
LR = 1e-6
number = 100

train(model, "train", "valid",  LR, EPOCHS, number)

  return self.fget.__get__(instance, owner)()
  2%|▏         | 1/50 [00:10<08:28, 10.39s/it]

Epochs: 1 
                | Train Loss:  0.006 
                | Train Accuracy:  0.020 
                | Val Loss:  0.353 
                | Val Accuracy:  0.450


  4%|▍         | 2/50 [00:24<10:01, 12.53s/it]

Epochs: 1 
                | Train Loss:  0.012 
                | Train Accuracy:  0.040 
                | Val Loss:  0.355 
                | Val Accuracy:  0.460


  6%|▌         | 3/50 [00:32<08:06, 10.34s/it]

Epochs: 1 
                | Train Loss:  0.017 
                | Train Accuracy:  0.060 
                | Val Loss:  0.355 
                | Val Accuracy:  0.470


  8%|▊         | 4/50 [00:42<07:55, 10.34s/it]

Epochs: 1 
                | Train Loss:  0.024 
                | Train Accuracy:  0.080 
                | Val Loss:  0.354 
                | Val Accuracy:  0.420


 10%|█         | 5/50 [00:56<08:47, 11.71s/it]

Epochs: 1 
                | Train Loss:  0.030 
                | Train Accuracy:  0.090 
                | Val Loss:  0.356 
                | Val Accuracy:  0.450


 12%|█▏        | 6/50 [01:10<09:04, 12.37s/it]

Epochs: 1 
                | Train Loss:  0.036 
                | Train Accuracy:  0.110 
                | Val Loss:  0.361 
                | Val Accuracy:  0.460


 14%|█▍        | 7/50 [01:39<12:49, 17.90s/it]

Epochs: 1 
                | Train Loss:  0.045 
                | Train Accuracy:  0.110 
                | Val Loss:  0.357 
                | Val Accuracy:  0.430


 16%|█▌        | 8/50 [01:50<10:58, 15.69s/it]

Epochs: 1 
                | Train Loss:  0.052 
                | Train Accuracy:  0.110 
                | Val Loss:  0.357 
                | Val Accuracy:  0.460


 18%|█▊        | 9/50 [01:58<08:58, 13.13s/it]

Epochs: 1 
                | Train Loss:  0.060 
                | Train Accuracy:  0.120 
                | Val Loss:  0.355 
                | Val Accuracy:  0.450


 20%|██        | 10/50 [02:06<07:50, 11.77s/it]

Epochs: 1 
                | Train Loss:  0.067 
                | Train Accuracy:  0.130 
                | Val Loss:  0.357 
                | Val Accuracy:  0.430


 22%|██▏       | 11/50 [02:14<06:48, 10.49s/it]

Epochs: 1 
                | Train Loss:  0.073 
                | Train Accuracy:  0.150 
                | Val Loss:  0.360 
                | Val Accuracy:  0.400


 24%|██▍       | 12/50 [02:24<06:36, 10.45s/it]

Epochs: 1 
                | Train Loss:  0.080 
                | Train Accuracy:  0.160 
                | Val Loss:  0.358 
                | Val Accuracy:  0.510


 26%|██▌       | 13/50 [02:32<05:52,  9.54s/it]

Epochs: 1 
                | Train Loss:  0.088 
                | Train Accuracy:  0.170 
                | Val Loss:  0.360 
                | Val Accuracy:  0.440


 28%|██▊       | 14/50 [02:39<05:21,  8.93s/it]

Epochs: 1 
                | Train Loss:  0.093 
                | Train Accuracy:  0.190 
                | Val Loss:  0.354 
                | Val Accuracy:  0.460


 30%|███       | 15/50 [02:47<04:57,  8.50s/it]

Epochs: 1 
                | Train Loss:  0.102 
                | Train Accuracy:  0.190 
                | Val Loss:  0.353 
                | Val Accuracy:  0.450


 32%|███▏      | 16/50 [02:54<04:38,  8.20s/it]

Epochs: 1 
                | Train Loss:  0.109 
                | Train Accuracy:  0.200 
                | Val Loss:  0.358 
                | Val Accuracy:  0.440


 34%|███▍      | 17/50 [03:02<04:24,  8.01s/it]

Epochs: 1 
                | Train Loss:  0.116 
                | Train Accuracy:  0.210 
                | Val Loss:  0.356 
                | Val Accuracy:  0.440


 36%|███▌      | 18/50 [03:09<04:11,  7.87s/it]

Epochs: 1 
                | Train Loss:  0.123 
                | Train Accuracy:  0.220 
                | Val Loss:  0.365 
                | Val Accuracy:  0.470


 38%|███▊      | 19/50 [03:20<04:27,  8.62s/it]

Epochs: 1 
                | Train Loss:  0.130 
                | Train Accuracy:  0.230 
                | Val Loss:  0.356 
                | Val Accuracy:  0.490


 40%|████      | 20/50 [03:31<04:45,  9.52s/it]

Epochs: 1 
                | Train Loss:  0.136 
                | Train Accuracy:  0.240 
                | Val Loss:  0.357 
                | Val Accuracy:  0.420


 42%|████▏     | 21/50 [03:39<04:18,  8.92s/it]

Epochs: 1 
                | Train Loss:  0.145 
                | Train Accuracy:  0.240 
                | Val Loss:  0.356 
                | Val Accuracy:  0.440


 44%|████▍     | 22/50 [03:46<03:58,  8.51s/it]

Epochs: 1 
                | Train Loss:  0.151 
                | Train Accuracy:  0.250 
                | Val Loss:  0.356 
                | Val Accuracy:  0.490


 46%|████▌     | 23/50 [03:54<03:41,  8.20s/it]

Epochs: 1 
                | Train Loss:  0.158 
                | Train Accuracy:  0.260 
                | Val Loss:  0.357 
                | Val Accuracy:  0.490


 48%|████▊     | 24/50 [04:02<03:31,  8.15s/it]

Epochs: 1 
                | Train Loss:  0.164 
                | Train Accuracy:  0.270 
                | Val Loss:  0.356 
                | Val Accuracy:  0.480


 50%|█████     | 25/50 [04:09<03:19,  7.96s/it]

Epochs: 1 
                | Train Loss:  0.170 
                | Train Accuracy:  0.290 
                | Val Loss:  0.356 
                | Val Accuracy:  0.460


 52%|█████▏    | 26/50 [04:17<03:11,  7.99s/it]

Epochs: 1 
                | Train Loss:  0.178 
                | Train Accuracy:  0.290 
                | Val Loss:  0.356 
                | Val Accuracy:  0.480


 54%|█████▍    | 27/50 [04:25<03:03,  7.98s/it]

Epochs: 1 
                | Train Loss:  0.186 
                | Train Accuracy:  0.290 
                | Val Loss:  0.358 
                | Val Accuracy:  0.470


 56%|█████▌    | 28/50 [04:33<02:55,  7.98s/it]

Epochs: 1 
                | Train Loss:  0.193 
                | Train Accuracy:  0.300 
                | Val Loss:  0.359 
                | Val Accuracy:  0.460


 58%|█████▊    | 29/50 [04:41<02:44,  7.83s/it]

Epochs: 1 
                | Train Loss:  0.200 
                | Train Accuracy:  0.310 
                | Val Loss:  0.350 
                | Val Accuracy:  0.530


 60%|██████    | 30/50 [04:48<02:35,  7.75s/it]

Epochs: 1 
                | Train Loss:  0.208 
                | Train Accuracy:  0.310 
                | Val Loss:  0.355 
                | Val Accuracy:  0.400


 62%|██████▏   | 31/50 [04:56<02:26,  7.69s/it]

Epochs: 1 
                | Train Loss:  0.217 
                | Train Accuracy:  0.310 
                | Val Loss:  0.360 
                | Val Accuracy:  0.430


 64%|██████▍   | 32/50 [05:08<02:42,  9.04s/it]

Epochs: 1 
                | Train Loss:  0.223 
                | Train Accuracy:  0.330 
                | Val Loss:  0.360 
                | Val Accuracy:  0.470


 66%|██████▌   | 33/50 [05:47<05:06, 18.02s/it]

Epochs: 1 
                | Train Loss:  0.229 
                | Train Accuracy:  0.350 
                | Val Loss:  0.360 
                | Val Accuracy:  0.450


 68%|██████▊   | 34/50 [05:55<03:57, 14.86s/it]

Epochs: 1 
                | Train Loss:  0.236 
                | Train Accuracy:  0.350 
                | Val Loss:  0.350 
                | Val Accuracy:  0.470


 70%|███████   | 35/50 [06:06<03:28, 13.90s/it]

Epochs: 1 
                | Train Loss:  0.244 
                | Train Accuracy:  0.350 
                | Val Loss:  0.358 
                | Val Accuracy:  0.430


 72%|███████▏  | 36/50 [06:15<02:51, 12.26s/it]

Epochs: 1 
                | Train Loss:  0.252 
                | Train Accuracy:  0.350 
                | Val Loss:  0.354 
                | Val Accuracy:  0.450


 74%|███████▍  | 37/50 [06:22<02:20, 10.82s/it]

Epochs: 1 
                | Train Loss:  0.259 
                | Train Accuracy:  0.350 
                | Val Loss:  0.355 
                | Val Accuracy:  0.460


 76%|███████▌  | 38/50 [06:30<01:57,  9.79s/it]

Epochs: 1 
                | Train Loss:  0.266 
                | Train Accuracy:  0.360 
                | Val Loss:  0.351 
                | Val Accuracy:  0.500


 78%|███████▊  | 39/50 [06:37<01:40,  9.12s/it]

Epochs: 1 
                | Train Loss:  0.272 
                | Train Accuracy:  0.380 
                | Val Loss:  0.357 
                | Val Accuracy:  0.480


 80%|████████  | 40/50 [06:45<01:26,  8.67s/it]

Epochs: 1 
                | Train Loss:  0.279 
                | Train Accuracy:  0.390 
                | Val Loss:  0.354 
                | Val Accuracy:  0.460


 82%|████████▏ | 41/50 [06:52<01:14,  8.32s/it]

Epochs: 1 
                | Train Loss:  0.286 
                | Train Accuracy:  0.390 
                | Val Loss:  0.361 
                | Val Accuracy:  0.480


 84%|████████▍ | 42/50 [07:00<01:04,  8.06s/it]

Epochs: 1 
                | Train Loss:  0.294 
                | Train Accuracy:  0.400 
                | Val Loss:  0.355 
                | Val Accuracy:  0.440


 86%|████████▌ | 43/50 [07:07<00:55,  7.91s/it]

Epochs: 1 
                | Train Loss:  0.302 
                | Train Accuracy:  0.400 
                | Val Loss:  0.355 
                | Val Accuracy:  0.460


 88%|████████▊ | 44/50 [07:15<00:46,  7.80s/it]

Epochs: 1 
                | Train Loss:  0.309 
                | Train Accuracy:  0.410 
                | Val Loss:  0.359 
                | Val Accuracy:  0.380


 90%|█████████ | 45/50 [07:22<00:38,  7.74s/it]

Epochs: 1 
                | Train Loss:  0.315 
                | Train Accuracy:  0.430 
                | Val Loss:  0.353 
                | Val Accuracy:  0.510


 92%|█████████▏| 46/50 [07:31<00:31,  7.89s/it]

Epochs: 1 
                | Train Loss:  0.323 
                | Train Accuracy:  0.430 
                | Val Loss:  0.361 
                | Val Accuracy:  0.460


 94%|█████████▍| 47/50 [07:38<00:23,  7.79s/it]

Epochs: 1 
                | Train Loss:  0.330 
                | Train Accuracy:  0.440 
                | Val Loss:  0.357 
                | Val Accuracy:  0.460


 96%|█████████▌| 48/50 [07:46<00:15,  7.69s/it]

Epochs: 1 
                | Train Loss:  0.337 
                | Train Accuracy:  0.450 
                | Val Loss:  0.361 
                | Val Accuracy:  0.450


 98%|█████████▊| 49/50 [07:53<00:07,  7.66s/it]

Epochs: 1 
                | Train Loss:  0.343 
                | Train Accuracy:  0.470 
                | Val Loss:  0.355 
                | Val Accuracy:  0.530


100%|██████████| 50/50 [08:01<00:00,  9.63s/it]

Epochs: 1 
                | Train Loss:  0.351 
                | Train Accuracy:  0.470 
                | Val Loss:  0.353 
                | Val Accuracy:  0.450



