In [1]:
import numpy as np 
import pandas as pd 
import torch
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

In [12]:
# Path to the MBTI 500 CSV file (relative path, as given).
train_path = "MBTI 500.csv"

In [8]:
# Load the tokenizer and a sequence-classification model from the same
# "bert-base-uncased" checkpoint; the classification head has 4 outputs.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4,
)

Downloading: 100%|██████████| 440M/440M [00:10<00:00, 43.3MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenc

In [14]:
#the dataset class for the first dataset, tokenized, and labeled
class Ds1(Dataset):
    """Dataset that serves tokenized posts with a multi-hot personality label.

    The CSV at ``path`` is read once in ``__init__``; every item is built from
    the row's ``posts`` (text) and ``type`` (type string) columns.
    """

    def __init__(self, path, tokenizer, max_token_len=500):
        """Read the CSV and remember the tokenizer settings.

        Args:
            path: Location of the CSV file, passed to ``pd.read_csv``.
            tokenizer: Callable applied to each post; its result must expose
                ``input_ids`` and ``attention_mask`` tensors.
            max_token_len: Length every post is truncated/padded to.
        """
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        # Letter emitted at each position for a label value of 1 / 0.
        self.labelstrdicts = {1: "ESTJ", 0: "INFP"}

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row.

        ``labels`` is a float tensor with one 0/1 entry per letter of the
        row's type string.
        """
        row = self.df.iloc[index]
        # Avoid naming a local ``type``: that would shadow the builtin.
        labels = self.str2label(row["type"])
        tokens = self.tokenizer(
            row["posts"],
            return_tensors="pt",
            truncation=True,
            max_length=self.max_token_len,
            padding="max_length",
        )
        return {
            # Drop only the leading batch axis added by return_tensors="pt";
            # a bare squeeze() would also collapse a length-1 sequence axis.
            "input_ids": tokens.input_ids.squeeze(0),
            "attention_mask": tokens.attention_mask.squeeze(0),
            # Float targets: integer labels make the Hugging Face
            # classification head pick single-label cross-entropy, which does
            # not accept a multi-hot vector per sample.
            "labels": torch.tensor(labels, dtype=torch.float),
        }

    def str2label(self, string):
        """Encode a type string as a list of ints, one per letter.

        A letter found in ``"ESTJ"`` becomes 1, anything else becomes 0.
        Surrounding whitespace is ignored and matching is case-insensitive.
        """
        return [1 if letter in "ESTJ" else 0 for letter in string.strip().upper()]

    def label2str(self, label):
        """Decode a sequence of 0/1 values into a list of type letters.

        Each value is converted with ``int()`` before the lookup, so floats
        and tensor elements are accepted as well as plain ints.
        """
        return [
            self.labelstrdicts[int(number)][position]
            for position, number in enumerate(label)
        ]


In [15]:
# Build the dataset from the CSV at train_path using the BERT tokenizer.
dataset = Ds1(train_path, tokenizer)

In [31]:
# Collator shared by all three DataLoaders built in getdl.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
def getdl(ds, batch_size=10, train_frac=0.8, seed=None, collate_fn=None):
    """Split ``ds`` into train/validation/test subsets and wrap each in a DataLoader.

    The training subset gets ``train_frac`` of the items (rounded down); the
    remainder is halved between validation and test, with test receiving the
    extra item when the remainder is odd.

    Args:
        ds: Dataset to split; must support ``len()`` and indexing.
        batch_size: Batch size used by all three loaders.
        train_frac: Fraction of ``ds`` assigned to training, in ``[0, 1]``.
        seed: Optional integer seed for the random split. ``None`` keeps the
            split governed by torch's global random state.
        collate_fn: Batch collation callable. ``None`` uses the notebook-level
            ``data_collator``.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``; only the training
        loader shuffles.

    Raises:
        ValueError: If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be within [0, 1], got {train_frac}")
    if collate_fn is None:
        # Looked up lazily so an explicit collate_fn never needs the global.
        collate_fn = data_collator

    total_len = len(ds)
    train_len = int(total_len * train_frac)
    val_len = (total_len - train_len) // 2
    test_len = total_len - train_len - val_len

    # A dedicated generator makes the split repeatable without touching the
    # global random state.
    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_ds, val_ds, test_ds = torch.utils.data.random_split(
        ds, [train_len, val_len, test_len], **split_kwargs
    )

    def make_loader(subset, shuffle):
        return DataLoader(
            subset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn
        )

    return (
        make_loader(train_ds, True),
        make_loader(val_ds, False),
        make_loader(test_ds, False),
    )


In [32]:
# Unpack the (train, validation, test) loaders produced by getdl.
train_dl, val_dl, test_dl = getdl(dataset)