In [40]:
import numpy as np 
import pandas as pd 
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from torch import nn
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from transformers import BertTokenizer

In [9]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
train_path="./dataset2(sep).csv"

In [85]:
#the dataset class for the first dataset, tokenized, and labeled
class Ds1(Dataset):
    def __init__(self, path, tokenizer, max_token_len=500):
        self.df = pd.read_csv(path).dropna()
        self.tokenizer=tokenizer
        self.max_token_len=max_token_len
        self.labelstrdicts={1:"ESTJ", 0:"INFP"}
    def __len__(self):
        return (len(self.df))
    def __getitem__(self, index):
        item=self.df.iloc[index]
        text=item["post"]
        type=item["type"]
        labels=self.str2label(type)
        tokens=self.tokenizer(text,return_tensors="pt", truncation=True, max_length=self.max_token_len, padding="max_length")
        return {"input_ids": torch.squeeze(tokens.input_ids), "attention_mask":torch.squeeze(tokens.attention_mask), "labels":labels}
    def str2label(self, string):
        label=[]
        for letter in string:
            if letter in "ESTJ":
                label.append(1)
            else:
                label.append(0)
        return label
    def label2str(self, label):
        string=[]
        for index,number in enumerate(label):
            string.append(self.labelstrdicts[number][index])
        return string


In [86]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#torchtokenizer = get_tokenizer('basic_english')
dataset=Ds1(train_path, tokenizer)

#def yield_tokens(data_iter):
    #for data in data_iter:
        #yield torchtokenizer(data["text"])

#vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["[sep]"])
#vocab.set_default_index(vocab["[sep]"])
#print(dataset[0])

In [59]:
#print(len(vocab)) = 150549

150549


In [87]:
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
def getdl(ds, batch_size):
    total_len=len(ds)
    train_len=int(len(ds)*0.8)
    val_len=int((total_len-train_len)/2)
    test_len=total_len-train_len-val_len
    [train_ds, val_ds, test_ds]=torch.utils.data.random_split(ds, [train_len, val_len, test_len])
    #return (training dataloader, validation dataloader, test dataloader)
    return DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=data_collator), DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=data_collator), DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=data_collator)


In [88]:
train_dl, val_dl, test_dl=getdl(dataset, 64)

In [67]:
for e,i in enumerate(train_dl):
    print(e, i)
    if e > 50:
        break
    
    

0 {'input_ids': tensor([[  101,  1005,  5541,  ...,  3768,  1997,   102],
        [  101,  1005,  2043,  ...,   102,  1045,   102],
        [  101,  1005,  2941,  ...,  2057,  1005,   102],
        ...,
        [  101,  1998,  2016,  ..., 14499,  2389,   102],
        [  101,  1005,  2009,  ...,  3042,  2005,   102],
        [  101,  1005,  1045,  ..., 11265,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[1, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 1],
        [1, 0, 1, 1],
        [0, 0, 1, 1],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 1],
        [1, 0, 1, 1],
        [0, 1, 0, 1],
        [1, 0, 1, 1],
        [0, 0, 0, 1],
        [0, 0, 1, 1],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 0, 1],


5 {'input_ids': tensor([[  101,  1005,  7592,  ...,  1998, 20014,   102],
        [  101,  1005,  1045,  ...,  1012,   102,   102],
        [  101,  1005,  1012,  ..., 18158,  4779,   102],
        ...,
        [  101,  1005,  7929,  ...,  2299,  1012,   102],
        [  101,  1005,  2821,  ...,  2000,  2695,   102],
        [  101, 17710, 10649,  ...,  1998,  1037,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 1, 0],
        [0, 1, 1, 0],
        [0, 0, 1, 0],
        [1, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 1, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 0, 0],


10 {'input_ids': tensor([[  101,  1005,  1024,  ...,  9535,  3110,   102],
        [  101,  1005,  8840,  ...,  1006,  2687,   102],
        [  101,  1005,  2876,  ...,  1049, 10468,   102],
        ...,
        [  101,  1005,  1045,  ..., 10061,  1012,   102],
        [  101,  1005,  2502,  ...,  8962,  1997,   102],
        [  101,  1005,  2023,  ...,  1045,  1005,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 1, 1],
        [0, 0, 0, 1],
        [0, 1, 1, 0],
        [0, 1, 0, 1],
        [0, 0, 1, 1],
        [1, 0, 1, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 1],

15 {'input_ids': tensor([[  101,  1045,  2001,  ...,  2003,  4951,   102],
        [  101,  1005,  2026,  ...,  2014,  6270,   102],
        [  101,  1005,  2157,  ..., 22578,  2024,   102],
        ...,
        [  101,  1005,  2000,  ...,  2774,  1010,   102],
        [  101,  1005,  2024,  ...,  1010,  2009,   102],
        [  101,  1005,  1045,  ...,  4298,  1007,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0, 0, 1, 0],
        [1, 0, 1, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [1, 0, 0, 1],
        [0, 0, 1, 1],
        [0, 1, 1, 0],
        [0, 0, 1, 0],
        [1, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],

20 {'input_ids': tensor([[  101,  1005,  1045,  ...,  1037,  2711,   102],
        [  101,  1005,  2023,  ...,   102, 10905,   102],
        [  101,  1005,  5597,  ...,     0,     0,     0],
        ...,
        [  101,  1005,  2054,  ...,  1010,  1045,   102],
        [  101,  1005,  3227,  ...,   102,  2017,   102],
        [  101,  1005,  3963,  ...,  1012,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0, 0, 1, 1],
        [0, 1, 0, 0],
        [0, 1, 1, 1],
        [0, 0, 0, 1],
        [0, 1, 1, 1],
        [0, 1, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 1, 1],
        [0, 0, 0, 0],
        [0, 0, 1, 1],
        [1, 0, 0, 0],
        [0, 1, 1, 1],
        [0, 0, 0, 1],
        [0, 0, 1, 0],

25 {'input_ids': tensor([[ 101, 2009, 2003,  ..., 2111, 2507,  102],
        [ 101, 1005, 1045,  ..., 1005, 2310,  102],
        [ 101, 1005, 5292,  ..., 6721, 4490,  102],
        ...,
        [ 101, 1005, 3398,  ..., 1012, 1012,  102],
        [ 101, 1005, 7910,  ..., 1010, 3374,  102],
        [ 101, 1005, 2053,  ..., 2361,  999,  102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [1, 1, 0, 1],
        [1, 0, 1, 0],
        [0, 0, 1, 1],
        [0, 0, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 0],
        [0, 0

30 {'input_ids': tensor([[ 101, 1005, 1045,  ..., 2008, 2031,  102],
        [ 101, 1005, 1045,  ..., 2065, 2027,  102],
        [ 101, 1005, 1045,  ..., 1012,  102,  102],
        ...,
        [ 101, 1005, 1045,  ..., 3382, 2000,  102],
        [ 101, 6343, 3658,  ..., 3899,  102,  102],
        [ 101, 1005, 2026,  ..., 2008, 3071,  102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0, 0, 1, 0],
        [0, 0, 1, 0],
        [0, 0, 1, 0],
        [0, 1, 1, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0

35 {'input_ids': tensor([[  101,  1005,  3835,  ...,  2054,  2017,   102],
        [  101,  1005,  1045,  ...,  4730,  2651,   102],
        [  101,  2324,  1029,  ...,  2012, 25795,   102],
        ...,
        [  101,  1005,  1045,  ...,  2006,  2327,   102],
        [  101,  1005,  2061,  ...,  8699,  2000,   102],
        [  101,  1005, 13501,  ...,  1012,   102,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[1, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 1, 0, 1],
        [0, 0, 1, 0],
        [1, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 1, 0],
        [0, 1, 1, 1],
        [0, 1, 1, 0],

40 {'input_ids': tensor([[  101,  1005,  1045,  ...,  2204, 10768,   102],
        [  101,  3087,  2066,  ...,  8502,  1012,   102],
        [  101,  1005,  7167,  ...,   102,  1045,   102],
        ...,
        [  101,  1005,  2008,  ...,  2367,  2138,   102],
        [  101,  1005,  1045,  ...,  8633,  1012,   102],
        [  101,  1005,  2777,  ...,  2000,  1996,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 1, 1, 0],
        [0, 0, 1, 0],
        [0, 0, 1, 1],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [1, 0, 1, 0],
        [1, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 0, 0],
        [0, 0, 1, 1],
        [1, 0, 1, 1],

45 {'input_ids': tensor([[  101,  1045,  2123,  ...,  1045,  2031,   102],
        [  101,  1005, 16780,  ...,  2020,  1021,   102],
        [  101,  1005,  1045,  ..., 14083, 14776,   102],
        ...,
        [  101,  1005,  2435,  ...,  2004,  2019,   102],
        [  101,  1005, 15333,  ...,  7015,  1010,   102],
        [  101,  1005,  2788,  ...,   102,  1999,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0, 0, 0, 0],
        [0, 1, 1, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 0],
        [0, 0, 0, 0],
        [1, 1, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 1, 0, 0],
        [1, 1, 1, 0],
        [0, 0, 1, 0],
        [0, 1, 1, 0],
        [1, 0, 1, 1],
        [0, 0, 0, 1],
        [0, 0, 0, 0],

50 {'input_ids': tensor([[  101,  1005,  1045,  ..., 10201,  2105,   102],
        [  101,  1005,  1998,  ...,  6866,  2009,   102],
        [  101,  1005,  4642,  ...,  1029,   102,   102],
        ...,
        [  101,  1005,  2079,  ...,  2288,  5506,   102],
        [  101,  1005,  1045,  ...,  1055,  2980,   102],
        [  101,  1005,  1037,  ...,  1045,  2109,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0, 0, 0, 0],
        [1, 0, 0, 1],
        [0, 1, 1, 0],
        [0, 0, 0, 0],
        [0, 1, 0, 1],
        [1, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 1, 0],
        [0, 0, 1, 1],
        [1, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 0, 1],
        [0, 0, 0, 1],

In [89]:
# Construct the model
class BaselineClassifier(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(BaselineClassifier, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):#, offsets):
        embedded = self.embedding(text)#, offsets)
        return self.fc(embedded)

In [90]:
# Initiate instance

num_class = 2
vocab_size = 150549
emsize = 64 # size of embeddings
# 1:"ESTJ", 0:"INFP"
model_EI = BaselineClassifier(vocab_size, emsize, num_class).to("mps:0") # label_id = 0
model_SN = BaselineClassifier(vocab_size, emsize, num_class).to("mps:0") # label_id = 1
model_TF = BaselineClassifier(vocab_size, emsize, num_class).to("mps:0") # label_id = 2
model_JP = BaselineClassifier(vocab_size, emsize, num_class).to("mps:0") # label_id = 3

In [91]:
# Define training and evaluation
import time

def train(dataloader, model, label_id):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (text, mask, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label[label_id])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader, model, label_id):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, mask, label) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label[label_id])
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [84]:
# Training
# train_dl, val_dl, test_dl
# model_EI | label_id = 0
# model_SN | label_id = 1
# model_TF | label_id = 2
# model_JP | label_id = 3

# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_EI.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dl, model_EI, 0)
    accu_val = evaluate(val_dl, model_EI, 0)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

AttributeError: 'str' object has no attribute 'dim'