In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive (requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Hugging Face packages imported by the next cells (no versions are pinned).
# NOTE(review): `load_metric` is imported from `datasets` below; recent `datasets` releases
# may no longer ship it - confirm whether a version pin is needed for a fresh runtime.
!pip install transformers
!pip install datasets

In [None]:
# Drive folder that holds the project files; the dataset CSV path is built from it.
#foldername= "/content/drive/My Drive/nlpproject/bertsequenceclassification/"
foldername= "/content/drive/My Drive/Project/extractedfeaturewithbert/"

In [None]:
import numpy as np 
import pandas as pd 
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from datasets import load_metric
import datetime
from torch import nn
from transformers import AutoConfig
from transformers import AutoModel
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

In [None]:
class CFG:
    """Hyper-parameters and checkpoint selection shared by the notebook cells."""

    # Default truncation / padding length handed to the dataset class.
    max_len = 512
    # Hugging Face checkpoint name ("bert-base-uncased" was the earlier choice).
    model = "distilbert-base-uncased"
    lr = 1e-4
    min_lr = 1e-10
    batch_size = 16
    epoch = 1000
    embedding_dim = 512
    hidden_dim = 256
    drop = 0.15


cfg = CFG()

In [None]:
class Ds(Dataset):
    """Binary-label dataset over one letter position of a four-letter type string.

    Every CSV row supplies a ``post`` column (text) and a ``type`` column (label
    string). The character at index ``i`` of ``type`` becomes label 1 when it is
    one of E/S/T/J and label 0 otherwise.

    Args:
        path: CSV file to read; rows containing any missing value are dropped.
        tokenizer: Callable tokenizer, invoked with ``return_tensors="pt"``.
        i: Index into the ``type`` string selecting the letter to classify.
        max_token_len: Length every sample is truncated / padded to.
    """

    def __init__(self, path, tokenizer, i, max_token_len=cfg.max_len):
        self.df = pd.read_csv(path).dropna()
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        # Reference strings used by label2str to map 0/1 back to a letter.
        self.labelstrdicts = {1: "ESTJ", 0: "INFP"}
        self.loc = i  # EI at index 0 in mbti

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` (1-D tensors) and the int label of one row.

        Raises:
            IndexError: if ``index`` is outside the frame (this also ends plain iteration).
            ValueError: if the tokenizer rejects the row's text.
        """
        item = self.df.iloc[index]
        text = item["post"]
        labels = self.str2label(item["type"])
        try:
            tokens = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=self.max_token_len,
                padding="max_length",
            )
        except Exception as exc:
            # Surface the offending row instead of printing it and exiting the
            # interpreter, which also swallowed KeyboardInterrupt.
            raise ValueError(f"could not tokenize row {index}: {text!r}") from exc
        # Drop only the leading batch axis added by return_tensors="pt"; an
        # axis-less squeeze would also remove a length-1 sequence axis.
        return {
            "input_ids": tokens.input_ids.squeeze(0),
            "attention_mask": tokens.attention_mask.squeeze(0),
            "labels": labels,
        }

    def str2label(self, string):
        """Map the selected letter of ``string`` to 1 (E/S/T/J) or 0 (anything else)."""
        letter = string[self.loc]
        return 1 if letter in "ESTJ" else 0

    def label2str(self, label):
        """Map a 0/1 label back to the letter at this dataset's position."""
        return self.labelstrdicts[label][self.loc]



In [None]:
# BERT variant, kept for reference:
#from transformers import BertTokenizer, BertForSequenceClassification
#tokenizer=BertTokenizer.from_pretrained(cfg.model)
#bertmodel=AutoModel.from_pretrained(cfg.model)

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# Use the tokenizer class that is actually imported; BertTokenizer is only
# imported in the commented-out line above, so calling it raised NameError.
tokenizer=DistilBertTokenizer.from_pretrained(cfg.model)
# Expose the encoder under both names so cells written against either
# `distilbertmodel` or the older `bertmodel` keep working.
distilbertmodel=AutoModel.from_pretrained(cfg.model)
bertmodel=distilbertmodel

In [None]:
# Location of the CSV that Ds reads.
path=foldername+"dataset2.csv"
#print(dataset[0])
# Collate function handed to every DataLoader built by getdl.
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
def getdl(ds, seed=None):
    """Randomly split ``ds`` into train / validation / test parts and wrap each in a DataLoader.

    90% of the samples go to training; the remainder is halved between
    validation and test (test receives the extra sample when it is odd).

    Args:
        ds: Sized, indexable dataset.
        seed: Optional integer seed for the split. ``None`` (default) draws from
            torch's global RNG, exactly as before this parameter existed.

    Returns:
        ``(n_train, n_val, n_test, train_loader, val_loader, test_loader)``.
    """
    total_len = len(ds)
    train_len = int(total_len * 0.9)
    val_len = (total_len - train_len) // 2
    test_len = total_len - train_len - val_len

    # Only pass a generator when a seed was requested, so the default RNG path is untouched.
    split_kwargs = {} if seed is None else {"generator": torch.Generator().manual_seed(seed)}
    train_ds, val_ds, test_ds = torch.utils.data.random_split(
        ds, [train_len, val_len, test_len], **split_kwargs
    )

    def _loader(subset, shuffle):
        return DataLoader(subset, batch_size=cfg.batch_size, shuffle=shuffle, collate_fn=data_collator)

    # Reshuffle the training batches every epoch; evaluation order stays fixed.
    return (
        len(train_ds),
        len(val_ds),
        len(test_ds),
        _loader(train_ds, True),
        _loader(val_ds, False),
        _loader(test_ds, False),
    )

In [None]:
# Dataset and loaders for the letter at index 0 of the type string (E vs. I).
dataset_EI=Ds(path, tokenizer, i=0)
len_train, len_val, len_test, train_dl_EI,val_dl_EI, test_dl_EI=getdl(dataset_EI)

#dataset_SN=Ds_SN(path, tokenizer)
#dl_SN=getdl(dataset_SN)

#dataset_TF=Ds_TF(path, tokenizer)
#dl_TF=getdl(dataset_TF)

#dataset_JP=Ds_JP(path, tokenizer)
#dl_JP=getdl(dataset_JP)

In [None]:
# Labels depend only on the "type" column; indexing dataset_EI would tokenize
# every post just to throw the tokens away.
l=[dataset_EI.str2label(t) for t in dataset_EI.df["type"]]

In [None]:
num_one=torch.count_nonzero(torch.tensor(l), dim=0)
num_zero=len(l)-num_one

print(num_zero)
print(num_one)
if num_one.item() == 0:
    # Dividing by zero below would silently produce inf/nan class weights.
    raise ValueError("no samples with label 1; cannot derive class weights")
# Per-class weights for the loss: class 0 -> 1, class 1 -> num_zero / num_one.
# Fall back to the CPU when no GPU is present instead of hard-coding "cuda".
LABEL_RATIO=torch.tensor([num_one/num_one, num_zero/num_one]).to("cuda" if torch.cuda.is_available() else "cpu")
print(LABEL_RATIO)

In [None]:
def evaluate(labels, outputs):
  """Count the rows of ``outputs`` whose arg-max along dim 1 equals the matching label.

  Returns a 0-dim tensor holding the number of matches.
  """
  predicted = outputs.argmax(dim=1)
  hits = predicted.eq(labels)
  return hits.sum()

In [None]:
class myModel(nn.Module):
    """Encoder followed by a small feed-forward head that emits two class scores.

    Args:
        CFG: Configuration object; stored on the instance but not read by this class.
        dropoutrate: Drop probability of the single ``nn.Dropout`` reused between layers.
        model: Encoder module whose output exposes ``last_hidden_state``.
    """

    def __init__(self, CFG, dropoutrate, model):
        super().__init__()
        self.CFG = CFG
        self.relu = nn.ReLU()
        self.t = nn.Tanh()  # registered but not used in forward
        self.distilbert = model
        # Head: 768 -> 64 -> 16 -> 2
        self.fc1 = nn.Linear(768, 64)
        self.fc2 = nn.Linear(64, 16)
        self.fc_final = nn.Linear(16, 2)

        self.dropout = nn.Dropout(p=dropoutrate, inplace=False)

    def forward(self, batch):
        """Map a batch mapping with ``input_ids`` / ``attention_mask`` to a ``(batch, 2)`` tensor."""
        encoded = self.distilbert(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        # Keep only the hidden state at sequence position 0 of every sample.
        features = self.dropout(encoded.last_hidden_state[:, 0])

        for layer in (self.fc1, self.fc2):
            features = self.dropout(self.relu(layer(features)))

        # Raw, unnormalised scores: no sigmoid / softmax is applied here.
        return self.fc_final(features)

In [None]:
from torch.optim import lr_scheduler
from torch.nn import CrossEntropyLoss
from torch import optim
from torch.optim import Adam
from tqdm.notebook import tqdm
def _run_validation(model, loader, criterion, device):
    """Run one gradient-free pass over ``loader``; return (summed batch loss, correct count, sample count)."""
    total_loss = 0
    total_correct = 0
    total_seen = 0
    model.eval()
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            batch_labels = batch["labels"]
            outputs = model(batch)
            total_loss += criterion(outputs, batch_labels).item()
            total_correct += evaluate(batch_labels, outputs)
            total_seen += len(batch_labels)
    return total_loss, total_correct, total_seen


def train(train_ds, eval_ds, model, epochs, cfg, type, lr, loss=None):
    """Train ``model`` with Adam and print validation / training metrics after every epoch.

    Args:
        train_ds: Iterable of training batches (each batch supports ``.to(device)``
            and has a ``"labels"`` entry).
        eval_ds: Iterable of validation batches of the same form.
        model: Module called as ``model(batch)`` that returns per-class scores.
        epochs: Number of passes over ``train_ds``.
        cfg: Accepted for call compatibility; not read by this function.
        type: Label inserted into the printed messages (e.g. ``"EI"``).
        lr: Initial learning rate for Adam.
        loss: Optional criterion. When ``None``, a ``CrossEntropyLoss`` weighted
            by the module-level ``LABEL_RATIO`` is used.

    Returns:
        None. Progress is reported through ``print``.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Honour a caller-supplied criterion; previously the argument was ignored.
    criterion = loss if loss is not None else CrossEntropyLoss(weight=LABEL_RATIO)
    if isinstance(criterion, torch.nn.Module):
        criterion.to(device)

    optimizer = Adam(model.parameters(), lr=lr)
    # Stepped once per epoch below, so T_0 is measured in epochs.
    scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=500, eta_min=1e-15)

    # Guard the averages against empty loaders (e.g. a tiny dataset whose validation split is empty).
    n_train_batches = max(len(train_ds), 1)
    n_eval_batches = max(len(eval_ds), 1)

    # Baseline validation metrics before any weight update.
    totalevalloss, totalcorrect, totaldata = _run_validation(model, eval_ds, criterion, device)
    totalcorrect_rate = totalcorrect / max(totaldata, 1)
    print("probability that our prediction of ", type, " is correct: ", totalcorrect_rate)
    print(f'Initial Val Loss: {totalevalloss / n_eval_batches: .3f} | current lr: {scheduler.get_last_lr()}' )

    for e in range(epochs):
        totaltrainloss = 0
        totaltraincorrect = 0
        totaltraindata = 0
        # Validation leaves the model in eval mode; switch back once per epoch.
        model.train()
        for batch in train_ds:
            optimizer.zero_grad()
            batch = batch.to(device)
            labels = batch["labels"]
            outputs = model(batch)
            bloss = criterion(outputs, labels)
            bloss.backward()
            optimizer.step()
            totaltrainloss += bloss.item()
            totaltraincorrect += evaluate(labels, outputs)
            totaltraindata += len(labels)
        scheduler.step()

        totalevalloss, totalcorrect, totaldata = _run_validation(model, eval_ds, criterion, device)
        totalcorrect_rate = totalcorrect / max(totaldata, 1)
        totaltraincorrect_rate = totaltraincorrect / max(totaltraindata, 1)
        print("probability that our prediction of ", type, " is correct: ", totalcorrect_rate)
        print("probability that our prediction of ", type, " is correct in training dataset: ", totaltraincorrect_rate)
        print(f'Epoch: {e+ 1} | Train Loss: {totaltrainloss / n_train_batches: .8f} | Val Loss: {totalevalloss / n_eval_batches: .8f} | current lr: {scheduler.get_last_lr()}' )

In [None]:
# The encoder loaded with AutoModel.from_pretrained is available as `bertmodel`.
model=myModel(cfg, 0.2, bertmodel)
train(train_dl_EI, val_dl_EI, model, epochs=cfg.epoch, cfg=cfg, type="EI", lr=cfg.lr, loss=None)

probability that our prediction of  EI  is correct:  tensor(0.7662, device='cuda:0')
Initial Val Loss:  0.696 | current lr: [0.0001]


In [None]:
# Serialises the whole module object to a file named "new" in the working directory.
# NOTE(review): reloading this file needs the myModel class to be importable; saving
# model.state_dict() is the usual alternative - confirm what the loading side expects.
torch.save(model, "new")

In [None]:
#m=BertForSequenceClassification.from_pretrained(cfg.model)
# Stock sequence-classification model for the same checkpoint; the bare `m`
# below displays its layer structure as the cell output.
m=DistilBertForSequenceClassification.from_pretrained(cfg.model)
m

In [None]:
# Display the encoder that was loaded with AutoModel.from_pretrained.
bertmodel

In [None]:

# Sanity check on the first validation batch: three equivalent ways of counting correct predictions.
_device = next(model.parameters()).device  # run on whatever device the model currently lives on
model.eval()
with torch.no_grad():
    for i,batch in enumerate(val_dl_EI):
        batch = batch.to(_device)
        batch_labels=batch["labels"]
        # myModel.forward takes the whole batch and already returns the score
        # tensor; it accepts no `input_ids=` keyword and has no `.logits`.
        outputs=model(batch)
        print(outputs)
        x=torch.argmax(outputs,dim=1)
        print(x)
        print(batch_labels)
        print(torch.sum(x==batch_labels))
        print((x==batch_labels).sum())
        print(evaluate(batch_labels, outputs))
        break