In [None]:
!pip install transformers
!pip install datasets

In [None]:
# Colab only: mount Google Drive under /content/drive so that files stored
# there (the dataset CSV path used later starts with this prefix) can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np 
import pandas as pd 
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from datasets import load_metric
import datetime
from torch import nn
from transformers import AutoConfig
from transformers import AutoModel
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

In [None]:
class CFG:
    """Namespace of notebook-wide settings, stored as plain class attributes.

    Some entries (for example ``model``, ``max_len``, ``batch_size``) are read
    by other cells; many others are not referenced in the cells shown in this
    notebook -- TODO confirm before removing any of them.
    """

    # --- run bookkeeping -------------------------------------------------
    # Timestamp taken once, when the class body is executed.
    str_now = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
    train = True
    debug = False
    offline = False
    save_all_models = False
    print_freq = 20
    num_workers = 4
    seed = 42
    cv_seed = 42

    # --- checkpoint ------------------------------------------------------
    models_path = model = "bert-base-uncased"

    # --- hardware --------------------------------------------------------
    apex = True
    gpu_id = 0
    device = "cuda:{}".format(gpu_id)

    # --- data / batching -------------------------------------------------
    max_len = 50
    batch_size = 50
    target_cols = ['EI', 'SN', 'TF', 'JP']
    n_targets = 4
    n_fold = 4
    trn_fold = [*range(n_fold)]

    # --- optimisation ----------------------------------------------------
    epochs = 50
    basic_lr = 1e-3
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    loss_func = 'SmoothL1'

    # --- learning-rate schedule ------------------------------------------
    scheduler = 'cosine'
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0

    # --- layer-wise learning-rate settings -------------------------------
    llrd = True
    layerwise_lr = 5e-5
    layerwise_lr_decay = 0.9
    layerwise_weight_decay = 0.01
    layerwise_adam_epsilon = 1e-6
    layerwise_use_bertadam = False

    # --- pooling ---------------------------------------------------------
    pooling = 'mean'  # mean, max, min, attention, weightedlayer
    layer_start = 4

    # --- init_weight -----------------------------------------------------
    # normal, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal
    init_weight = 'normal'

    # --- re-init ---------------------------------------------------------
    reinit = True
    reinit_n = 1

    # --- adversarial -----------------------------------------------------
    fgm = False
    awp = False
    adv_lr = 1
    adv_eps = 0.2
    unscale = False


# Shared instance used by the rest of the notebook.
cfg = CFG()

In [None]:

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
# Load the pretrained BERT tokenizer for the checkpoint named by cfg.model.
tokenizer=BertTokenizer.from_pretrained(cfg.model)
#the dataset class for the first dataset, tokenized, and labeled
class Ds1(Dataset):
    """Dataset over a CSV file with a text column ``post`` and a label column ``type``.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the text
    tokenized, truncated and padded to ``max_token_len``) plus ``labels``, a
    list of floats produced by :meth:`str2label`.

    Args:
        path: CSV file to read; rows containing any missing value are dropped.
        tokenizer: callable tokenizer returning an object with ``input_ids``
            and ``attention_mask`` tensors.
        max_token_len: fixed token length of every encoded item.
    """

    def __init__(self, path, tokenizer, max_token_len=cfg.max_len):
        self.df = pd.read_csv(path).dropna()
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        # Letter lookup used by label2str: position i of the chosen string is
        # the letter for label position i.
        self.labelstrdicts = {1: "ESTJ", 0: "INFP"}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded text and label vector for row ``index``.

        Raises:
            ValueError: if the tokenizer fails on this row; the offending text
                is included in the message and the original error is chained.
        """
        row = self.df.iloc[index]
        text = row["post"]
        labels = self.str2label(row["type"])
        try:
            tokens = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=self.max_token_len,
                padding="max_length",
            )
        except Exception as err:
            # Surface the failure instead of calling quit(), which would shut
            # down the kernel / worker process and hide the real error.
            raise ValueError(f"Failed to tokenize row {index}: {text!r}") from err
        # Drop only the leading batch axis added by return_tensors="pt".
        return {
            "input_ids": tokens.input_ids.squeeze(0),
            "attention_mask": tokens.attention_mask.squeeze(0),
            "labels": labels,
        }

    def str2label(self, string):
        """Map each character to 1.0 if it is one of E, S, T, J and to 0.0 otherwise."""
        return [1. if letter in "ESTJ" else 0. for letter in string]

    def label2str(self, label):
        """Map a sequence of 0/1 labels back to a list of single letters.

        Each entry is rounded to the nearest integer first, so Python numbers,
        0-dim tensors and probabilities in [0, 1] are all accepted.
        """
        return [
            self.labelstrdicts[int(round(float(number)))][index]
            for index, number in enumerate(label)
        ]


In [None]:
# CSV on the mounted Drive; Ds1 reads its "post" and "type" columns.
path="/content/drive/MyDrive/nlpproject/dataset2.csv"
dataset=Ds1(path, tokenizer)
#print(dataset[0])
# Batch collator built on the tokenizer; passed as collate_fn to the DataLoaders.
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
def getdl(ds, batch_size=None, collate_fn=None, seed=None):
    """Split ``ds`` 80/10/10 and wrap each part in a DataLoader.

    Args:
        ds: any dataset supporting ``len()`` and integer indexing.
        batch_size: batch size for all three loaders; defaults to ``cfg.batch_size``.
        collate_fn: batch collation function; defaults to the notebook-level
            ``data_collator``.
        seed: seed for the random split and for the training shuffle;
            defaults to ``cfg.seed`` so that the split is reproducible.

    Returns:
        (training dataloader, validation dataloader, test dataloader)
    """
    if batch_size is None:
        batch_size = cfg.batch_size
    if collate_fn is None:
        collate_fn = data_collator
    if seed is None:
        seed = cfg.seed

    total_len = len(ds)
    train_len = int(total_len * 0.8)
    val_len = (total_len - train_len) // 2
    # The test part absorbs the rounding remainder so the sizes sum to total_len.
    test_len = total_len - train_len - val_len

    # A dedicated generator keeps the split identical across runs.
    generator = torch.Generator().manual_seed(seed)
    train_ds, val_ds, test_ds = torch.utils.data.random_split(
        ds, [train_len, val_len, test_len], generator=generator
    )

    # Only the training loader reshuffles every epoch; evaluation order is fixed.
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_fn, generator=generator)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False,
                            collate_fn=collate_fn)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False,
                             collate_fn=collate_fn)
    return train_loader, val_loader, test_loader
# Build the training / validation / test DataLoaders from the full dataset.
train_dl, val_dl, test_dl=getdl(dataset)

In [None]:
class myModel(nn.Module):
    """Transformer encoder followed by a linear + sigmoid head.

    The encoder named by ``CFG.model`` is loaded with all dropout
    probabilities set to zero. The hidden state of the first token of the
    last layer is projected to ``CFG.n_targets`` values and squashed with a
    sigmoid, so the output is one probability per target.

    Args:
        CFG: configuration object providing ``model``, ``max_len``,
            ``n_targets`` and ``init_weight``.
        pretrained: load pretrained weights when True; otherwise build the
            architecture from the config with freshly initialised weights.
    """

    # init_weight names (other than 'normal') mapped to in-place initialisers
    # that take only the weight tensor.
    _WEIGHT_INITS = {
        'xavier_uniform': nn.init.xavier_uniform_,
        'xavier_normal': nn.init.xavier_normal_,
        'kaiming_uniform': nn.init.kaiming_uniform_,
        'kaiming_normal': nn.init.kaiming_normal_,
        'orthogonal': nn.init.orthogonal_,
    }

    def __init__(self, CFG, pretrained = True):
        super().__init__()
        self.CFG = CFG
        # NOTE: the keyword was previously misspelled ("ouput_hidden_states"),
        # so the option was silently ignored. forward() only reads
        # last_hidden_state; the per-layer states are simply made available.
        self.config = AutoConfig.from_pretrained(CFG.model, output_hidden_states = True)
        # Both spellings are set because config attribute names differ
        # between architectures.
        self.config.hidden_dropout = 0.
        self.config.hidden_dropout_prob = 0.
        self.config.attention_dropout = 0.
        self.config.attention_probs_dropout_prob = 0.
        self.config.max_length = self.CFG.max_len

        if pretrained:
            self.model = AutoModel.from_pretrained(CFG.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config builds
            # the architecture without loading weights.
            self.model = AutoModel.from_config(self.config)
        self.fc = nn.Linear(self.config.hidden_size, self.CFG.n_targets)
        self.sig = nn.Sigmoid()
        self._init_weights(self.fc)

        if 'bert-base' in self.CFG.model:
            # Freeze the embeddings and the first 12 encoder layers so that
            # they receive no gradient updates.
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:12].requires_grad_(False)

    def _init_weights(self, module):
        """Initialise ``module`` in place according to ``self.CFG.init_weight``.

        Linear and Embedding weights use the configured scheme (an unknown
        scheme name leaves the weights untouched); Linear biases and the
        Embedding padding row are zeroed. LayerNorm is reset to weight 1 and
        bias 0. Any other module type is left as is.
        """
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        scheme = self.CFG.init_weight
        if scheme == 'normal':
            module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
        elif scheme in self._WEIGHT_INITS:
            self._WEIGHT_INITS[scheme](module.weight.data)

        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

    def forward(self, inputs):
        """Return per-target probabilities for a batch.

        Args:
            inputs: mapping with ``input_ids`` and ``attention_mask``.

        Returns:
            Tensor with ``n_targets`` values in (0, 1) per example.
        """
        encoded = self.model(input_ids=inputs["input_ids"],
                             attention_mask=inputs["attention_mask"])
        # Hidden state of the first token of every sequence.
        feature = encoded.last_hidden_state[:, 0]
        fourlogits = self.fc(feature)
        return self.sig(fourlogits)

In [None]:
from torch.optim import lr_scheduler
from torch import nn
from torch.optim import Adam
from tqdm.notebook import tqdm
def train(train_ds, eval_ds, model, epochs, cfg):
    """Train ``model`` and report validation metrics after every epoch.

    Args:
        train_ds: iterable of training batches (a DataLoader in this
            notebook); each batch must support ``.to(device)`` and contain
            ``input_ids``, ``attention_mask`` and ``labels``.
        eval_ds: iterable of validation batches with the same structure.
        model: module called as ``model(inputs=batch)`` that returns one
            probability in [0, 1] per target.
        epochs: number of passes over ``train_ds``.
        cfg: configuration object; ``cfg.basic_lr`` is the Adam learning rate.

    Prints the per-epoch accuracies returned by ``evaluate`` together with
    the mean training and validation loss. Returns nothing.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # The model already applies a sigmoid and every target is an independent
    # 0/1 label, so binary cross-entropy on probabilities is the matching
    # loss. CrossEntropyLoss would apply a softmax across the targets and
    # treat them as mutually exclusive classes.
    criterion = nn.BCELoss().to(device)
    optimizer = Adam(model.parameters(), lr=cfg.basic_lr)
    # Stepped once per batch, so T_0 is measured in optimizer steps.
    scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=500,
                                                         eta_min=1e-6)
    model = model.to(device)

    # Guard the averages below against empty loaders.
    n_train = max(len(train_ds), 1)
    n_eval = max(len(eval_ds), 1)

    for e in range(epochs):
        model.train()
        totaltrainloss = 0
        for batch in train_ds:
            batch = batch.to(device)
            labels = batch["labels"].float()
            outputs = model(inputs=batch)
            bloss = criterion(outputs, labels)
            totaltrainloss += bloss.item()
            optimizer.zero_grad()
            bloss.backward()
            optimizer.step()
            scheduler.step()

        model.eval()
        totalevalloss = 0
        totalcorrectrate = torch.tensor([0., 0., 0., 0., 0.])
        with torch.no_grad():
            for batch in eval_ds:
                batch = batch.to(device)
                labels = batch["labels"].float()
                outputs = model(inputs=batch)
                totalevalloss += criterion(outputs, labels).item()
                totalcorrectrate += evaluate(labels, outputs)
        # Mean of the per-batch accuracies.
        totalcorrectrate = (totalcorrectrate / n_eval).tolist()
        print("probability that our prediction is correct: ", totalcorrectrate[0])
        print("probability that our prediction of EI is correct: ", totalcorrectrate[1])
        print("probability that our prediction of SN is correct: ", totalcorrectrate[2])
        print("probability that our prediction of TF is correct: ", totalcorrectrate[3])
        print("probability that our prediction of JP is correct: ", totalcorrectrate[4])
        print(f'Epoch: {e+ 1} | Train Loss: {totaltrainloss / n_train: .3f} | Val Loss: {totalevalloss / n_eval: .3f}')

In [None]:
def evaluate(labels, outputs):
    """Compute accuracy of rounded predictions against the labels.

    Args:
        labels: 2-D tensor of 0/1 targets, one row per example.
        outputs: tensor of the same shape holding predicted probabilities.

    Returns:
        Tensor of five values: the fraction of correct entries over the whole
        batch (assuming four columns), followed by the fraction correct in
        each of the first four columns (EI, SN, TF, JP).
    """
    n_rows = len(labels)
    hits = torch.round(outputs) == labels
    overall = torch.sum(hits) / (n_rows * 4)
    per_column = (torch.sum(hits, dim=0) / n_rows).tolist()
    return torch.tensor([overall] + [per_column[i] for i in range(4)])

In [None]:
# Build the classifier on top of the pretrained checkpoint named by cfg.model.
model=myModel(cfg, pretrained = True)

In [None]:
# Smoke test: score the model on the first training batch only, before any training.
totalcorrectrate = torch.zeros(5)
for batch in train_dl:
    outputs = model(inputs=batch)
    labels = batch["labels"]
    totalcorrectrate += evaluate(labels, outputs)
    break  # one batch is enough to check that the pipeline runs end to end
print(totalcorrectrate)

In [None]:
# Train for cfg.epochs epochs; validation loss and accuracies are printed after each epoch.
train(train_dl, val_dl, model, epochs=cfg.epochs, cfg=cfg)

probability that our prediction is correct:  0.6122736930847168
probability that our prediction of EI is correct:  0.6811087727546692
probability that our prediction of SN is correct:  0.865970253944397
probability that our prediction of TF is correct:  0.4745592474937439
probability that our prediction of JP is correct:  0.4274559020996094
Epoch: 1 | Train Loss:  1.577 | Val Loss:  1.575
probability that our prediction is correct:  0.6150123476982117
probability that our prediction of EI is correct:  0.6716381907463074
probability that our prediction of SN is correct:  0.865970253944397
probability that our prediction of TF is correct:  0.4750882089138031
probability that our prediction of JP is correct:  0.44735535979270935
Epoch: 2 | Train Loss:  1.573 | Val Loss:  1.575
probability that our prediction is correct:  0.6177704334259033
probability that our prediction of EI is correct:  0.683929443359375
probability that our prediction of SN is correct:  0.8658946752548218
probability 