<hr/>
<center>探究不同性格间文本内容的相似度</center>
<hr/>

In [1]:
import datasets

DataFilePath = "./personality_dataset"

In [2]:
def get_data(path):
    train = datasets.load_from_disk(path+"/train")
    return train

train = get_data(DataFilePath)

In [3]:
labels = ["ENTJ", "ENTP", "ENFJ", "ENFP", "ESFJ", "ESFP", "ESTJ", "ESTP", "INTP", "INTJ", "INFP", "INFJ", "ISFP", "ISFJ", "ISTP", "ISTJ"]

In [4]:
nums = [0 for i in range(16)]

for person in train["personality"]:
    nums[labels.index(person)] += 1

nums

[1934,
 4456,
 2803,
 5186,
 552,
 758,
 494,
 638,
 7213,
 6913,
 10862,
 9281,
 2128,
 1923,
 2164,
 1667]

In [10]:
import tqdm


# 选取两个数量相近的性格
personality = ["ENTJ", "ISFP"]
ENTJ = [train[i]["content"].replace("|||", ".") for i in tqdm.tqdm(range(len(train)), position=0) if train[i]["personality"] == "ENTJ"]
ISFP = [train[i]["content"].replace("|||", ".") for i in tqdm.tqdm(range(len(train)), position=0) if train[i]["personality"] == "ISFP"]

100%|██████████| 58972/58972 [00:02<00:00, 20166.80it/s]
100%|██████████| 58972/58972 [00:02<00:00, 20281.49it/s]


In [19]:
import spacy

# 做词数统计
nlp = spacy.load("en_core_web_sm", exclude=["tok2vec", "tagger", "senter", "attribute_ruler", "lemmatizer"])
words1 = {}
words2 = {}
words = {}
for i in tqdm.tqdm(range(len(ENTJ)), position=0):
    ENTJ[i] = nlp(ENTJ[i])
    for token in ENTJ[i]:
        if token.text in words1.keys():
            words1[token.text] += 1
            words[token.text] += 1
        else:
            words[token.text] = 1
            words1[token.text] = 1
            
for i in tqdm.tqdm(range(len(ISFP)), position=0):
    ISFP[i] = nlp(ISFP[i])
    for token in ISFP[i]:
        if token.text in words2.keys():
            words2[token.text] += 1
        else:
            words2[token.text] = 1
        if token.text in words.keys():
            words[token.text] += 1
        else:
            words[token.text] = 1

100%|██████████| 1934/1934 [00:51<00:00, 37.83it/s]
100%|██████████| 2128/2128 [00:56<00:00, 37.54it/s]


In [50]:
# 预处理后期统一数据维度
lst1 = [words1[i] if i in words1.keys() else 0 for i in words.keys()]
lst2 = [words2[i] if i in words2.keys() else 0 for i in words.keys()]

In [63]:
import numpy as np


# 得到词数的编码向量
fea1 = np.array(lst1)
fea2 = np.array(lst2)

In [64]:
# 计算每一种词汇所占比例
fea1 = fea1 / np.sum(fea1)
fea2 = fea2 / np.sum(fea2)

In [65]:
# 变为单位向量
fea1 = fea1 / np.linalg.norm(fea1)
fea2 = fea2 / np.linalg.norm(fea2)

In [66]:
cor = fea1.T.dot(fea2)

In [67]:
cor

0.9852139611568004

<hr/>
<center>探究只对人格进行二分类的分类情况</center>
<hr/>

In [1]:
import datasets

DataFilePath = "./personality_dataset"

In [2]:
def get_data(path):
    train = datasets.load_from_disk(path+"/train")
    return train

train = get_data(DataFilePath)

In [3]:
nums = [0 for i in range(8)]

labels = ["E", "I", "S", "N", "T", "F", "J", "P"]

for person in train["personality"]:
    if "E" in person:
        nums[labels.index("E")] += 1
    else:
        nums[labels.index("I")] += 1
    if "S" in person:
        nums[labels.index("S")] += 1
    else:
        nums[labels.index("N")] += 1
    if "N" in person:
        nums[labels.index("T")] += 1
    else:
        nums[labels.index("F")] += 1
    if "J" in person:
        nums[labels.index("J")] += 1
    else:
        nums[labels.index("P")] += 1

nums

[16821, 42151, 10324, 48648, 48648, 10324, 25567, 33405]

In [4]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('./bert-base-uncase')

In [5]:
import torch.utils
import numpy as np


class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, number, positive):
        self.trian = datasets.load_from_disk(DataFilePath+"/"+data)
        self.labels = [1 if positive in label else 0 for label in self.trian["personality"][0 : number if number < len(self.trian["personality"]) else -1]]
        self.texts = [tokenizer(person.replace("|||", "[SEP]"),
                                padding="max_length",
                                max_length=512,
                                truncation=True,
                                return_tensors="pt")
                      for person in self.trian["content"][0 : number if number < len(self.trian["content"]) else -1]]
    
    def classes(self):
        return self.labels
    
    def __len__(self):
        return len(self.labels)
    
    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])
    
    def get_batch_texts(self, idx):
        return self.texts[idx]
    
    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [6]:
from torch import nn

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('./bert-base-uncase')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [7]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs, number, positive):
    
    train, val = Dataset(train_data, number, positive), Dataset(val_data, number, positive)
    
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    max_acc = 0
    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()
    
    for epoch_num in range(epochs):
        
        total_acc_train = 0
        total_loss_train = 0
        
        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)
            
            output = model(input_id, mask)
            
            batch_loss = criterion(output, train_label.to(torch.int64))
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc
            
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            
            
            total_acc_val = 0
            total_loss_val = 0
            
            with torch.no_grad():
                
                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)
                    output = model(input_id, mask)
                    
                    batch_loss = criterion(output, val_label.to(torch.int64))
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            max_acc = max(max_acc, total_acc_val / len(val))
            # print(
            #     f'''Epochs: {epoch_num + 1} 
            #     | Train Loss: {total_loss_train / len(train): .3f} 
            #     | Train Accuracy: {total_acc_train / len(train): .3f} 
            #     | Val Loss: {total_loss_val / len(val): .3f} 
            #     | Val Accuracy: {total_acc_val / len(val): .3f}''')   
    print("最好的准确率为", max_acc)

In [8]:
EPOCHS = 1
model = BertClassifier()
LR = 1e-6
number = 250

for i in ["E", "N", "T", "J"]:
    print("正类为",i)
    train(model, "train", "valid",  LR, EPOCHS, number, i)

  return self.fget.__get__(instance, owner)()


正类为 E


100%|██████████| 125/125 [56:04<00:00, 26.92s/it] 


最好的准确率为 0.3466214623451233
正类为 N


  6%|▋         | 8/125 [07:38<1:51:40, 57.27s/it] 


KeyboardInterrupt: 