In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

torch.manual_seed(1)


<torch._C.Generator at 0x7fbc15327330>

In [28]:
foldername= "/content/drive/My Drive/nlpproject/"
path = foldername+"dataset2.csv"

In [29]:
df = pd.read_csv(path).dropna()
items = df.iloc[:]["post"]
old_word_to_ix = dict()
for i,sent in enumerate(items):
    for word in sent.split():
        if word not in old_word_to_ix: 
            old_word_to_ix[word] = 1
        else:
            old_word_to_ix[word] = old_word_to_ix[word]+1
# Add unknown word "UNK" to word_to_ix.
# Doing this in case you find an unknown word in testing.
word_to_ix = {}
for word in old_word_to_ix:
  count=old_word_to_ix[word]
  if count>10:
    word_to_ix[word]=len(word_to_ix)
word_to_ix["UNK"] = len(word_to_ix)
print(len(word_to_ix))

33842


In [30]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] if w in to_ix else to_ix["UNK"] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [31]:
#the dataset class for the first dataset, tokenized, and labeled
class Ds(Dataset):
    def __init__(self, path, id_mbti, word_to_ix):
        self.df = pd.read_csv(path).dropna()
        self.labelstrdicts={1:"ESTJ", 0:"INFP"}
        self.id_mbti=0  
        self.word_to_ix=word_to_ix
    def __len__(self):
        return (len(self.df))
    def __getitem__(self, index):
        item=self.df.iloc[index]
        text=item["post"]
        type=item["type"]
        labels=self.str2label(type)
        try:
          tokens=text.split()
        except:
          print(text)
          quit()
        return {"input_ids": prepare_sequence(text,self.word_to_ix), "labels":labels}
    def str2label(self, string):
        letter=string[self.id_mbti]
        if letter in "ESTJ":
            return 1
        else:
            return 0
    def label2str(self, label):
        return self.labelstrdicts[label][self.id_mbti]

In [32]:
from torch.nn.utils.rnn import pad_sequence
def c(data): 
    inputs = [torch.tensor(d['input_ids']) for d in data] 
    labels = [d['labels'] for d in data]
    inputs = pad_sequence(inputs, batch_first=True)
    labels = torch.tensor(labels) 
    return { 
        'input_ids': inputs, 
        'labels': labels
    }

In [33]:
def getdl(ds):
    total_len=len(ds)
    train_len=int(len(ds)*0.9)
    val_len=int((total_len-train_len)/2)
    test_len=total_len-train_len-val_len
    [train_ds, val_ds, test_ds]=torch.utils.data.random_split(ds, [train_len, val_len, test_len])
    #return (training dataloader, validation dataloader, test dataloader)
    return len(train_ds), len(val_ds), len(test_ds), DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True,collate_fn=c), DataLoader(val_ds, batch_size=cfg.batch_size, shuffle=False,collate_fn=c), DataLoader(test_ds, batch_size=cfg.batch_size, shuffle=False, collate_fn=c)
    #return DataLoader(ds, batch_size=cfg.batch_size, shuffle=False, collate_fn=data_collator)

In [34]:
class CFG():
  lr=1e-4
  min_lr=1e-10
  batch_size=1024
  epoch=1000
  embedding_dim=512
  hidden_dim=256
  drop=0.15
cfg=CFG()

In [35]:
ds=Ds(path,0,word_to_ix)
len_train, len_val, len_test, train_dl_EI,val_dl_EI, test_dl_EI=getdl(ds)

In [36]:
l=[x['labels'] for x in ds]

In [37]:
num_one=torch.count_nonzero(torch.tensor(l), dim=0)
num_zero=len(l)-num_one

print(num_zero)
print(num_one)
LABEL_RATIO=torch.tensor([num_one/num_one, num_zero/num_one]).to("cuda")
print(LABEL_RATIO)
print(len(ds))

tensor(303868)
tensor(92655)
tensor([1.0000, 3.2796], device='cuda:0')
396523


In [38]:
def evaluate(labels, outputs):
  answers=(torch.argmax(outputs, dim=1))
  allcorrect=torch.sum(answers==labels)
  return allcorrect

In [39]:
class Model(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, drop):
        super(Model, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dropout= nn.Dropout(drop)
        self.relu = nn.ReLU()
        self.fc1= nn.Linear(hidden_dim, 128)
        self.fc2= nn.Linear(128, 32)
        self.fc3= nn.Linear(32, 8)
        self.fc_final= nn.Linear(8, 2)
    def forward(self, input_ids):
        o =  self.word_embeddings(input_ids)
        o,_ = self.lstm(o)
        o = o[:,-1]
        o = self.fc1(o)
        o = self.dropout(self.relu(o))
        o = self.fc2(o)
        o = self.dropout(self.relu(o))
        o = self.fc3(o)
        o = self.dropout(self.relu(o))
        o = self.fc_final(o)
        return o

In [40]:
from torch.optim import lr_scheduler
from torch.nn import CrossEntropyLoss
from torch import optim
from torch.optim import Adam
from tqdm.notebook import tqdm
def train(train_ds, eval_ds, model, epochs, cfg, type, lr, loss=None):
    if torch.cuda.is_available():  
        dev = "cuda:0" 
    else:  
        dev = "cpu" 
    device = torch.device(dev)
    model = model.to(device)

    #weights=torch.tensor([1., 3.]).cuda()
    #criterion = nn.MSELoss()
    criterion = CrossEntropyLoss(weight=LABEL_RATIO)
    #criterion = CrossEntropyLoss()

    criterion.to(device)
    #criterion = loss
    
    optimizer = Adam(model.parameters(), lr=lr)
    #optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=500, eta_min=1e-15)
    #scheduler= optim.lr_scheduler.ExponentialLR(optimizer, 0.99, last_epoch=- 1, verbose=False)
        
    totalevalloss=0
    totalcorrect=0
    totaldata=0
    with torch.no_grad():
        model.eval()
        for batch in eval_ds:
            blabels=batch["labels"].to(device)
            input_ids=batch["input_ids"].to(device)
            outputs=model(input_ids=input_ids)
            eloss=criterion(outputs, blabels).item()
            totalevalloss+=eloss
            totalcorrect+=evaluate(blabels, outputs)
            totaldata+=len(blabels)
    totalcorrect_rate=(totalcorrect/(totaldata))
    print("probability that our prediction of ", type, " is correct: ", totalcorrect_rate)
    #print(f'Initial Val Loss: {totalevalloss / len(eval_ds): .3f} ' ) 
    print(f'Initial Val Loss: {totalevalloss / len(eval_ds): .3f} | current lr: {scheduler.get_last_lr()}' ) 
    
    for e in range(epochs):
        totaltrainloss=0
        totaltraincorrect=0
        totaltraindata=0
        for i,batch in enumerate(train_ds):
            model.train()
            optimizer.zero_grad()

            blabels=batch["labels"].to(device)
            input_ids=batch["input_ids"].to(device)
            outputs=model(input_ids=input_ids)

            bloss=criterion(outputs, blabels)
            bloss.backward()
            optimizer.step()
            totaltrainloss+=bloss.item()
            totaltraincorrect+=evaluate(blabels, outputs)
            totaltraindata+=len(blabels)
        scheduler.step()
        totalevalloss=0
        totalcorrect=0
        totaldata=0
        with torch.no_grad():
            model.eval()
            for batch in eval_ds:
                blabels=batch["labels"].to(device)
                input_ids=batch["input_ids"].to(device)
                outputs=model(input_ids=input_ids)
                eloss=criterion(outputs, blabels).item()
                totalevalloss+=eloss
                totalcorrect+=evaluate(blabels, outputs)
                totaldata+=len(blabels)
        totalcorrect_rate=(totalcorrect/(totaldata))
        totaltraincorrect_rate = (totaltraincorrect/(totaltraindata))
        print("probability that our prediction of ", type, " is correct: ", totalcorrect_rate)
        print("probability that our prediction of ", type, " is correct in training dataset: ", totaltraincorrect_rate)
        #print(f'Epoch: {e+ 1} | Train Loss: {totaltrainloss / len(train_ds): .8f} | Val Loss: {totalevalloss / len(eval_ds): .3f}' ) 
        print(f'Epoch: {e+ 1} | Train Loss: {totaltrainloss / len(train_ds): .8f} | Val Loss: {totalevalloss / len(eval_ds): .8f} | current lr: {scheduler.get_last_lr()}' ) 

In [None]:
model = Model(cfg.embedding_dim, cfg.hidden_dim, len(word_to_ix),drop=cfg.drop)
print(model)
train(train_dl_EI, val_dl_EI, model, epochs=cfg.epoch, cfg=cfg, type="EI", lr=1e-3, loss=None)

Model(
  (word_embeddings): Embedding(33842, 512)
  (lstm): LSTM(512, 256)
  (dropout): Dropout(p=0.15, inplace=False)
  (relu): ReLU()
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=8, bias=True)
  (fc_final): Linear(in_features=8, out_features=2, bias=True)
)


  This is separate from the ipykernel package so we can avoid doing imports until


probability that our prediction of  EI  is correct:  tensor(0.2363, device='cuda:0')
Initial Val Loss:  0.693 | current lr: [0.001]
probability that our prediction of  EI  is correct:  tensor(0.2363, device='cuda:0')
probability that our prediction of  EI  is correct in training dataset:  tensor(0.5312, device='cuda:0')
Epoch: 1 | Train Loss:  0.69317654 | Val Loss:  0.69313941 | current lr: [0.0009999901304280686]
probability that our prediction of  EI  is correct:  tensor(0.7637, device='cuda:0')
probability that our prediction of  EI  is correct in training dataset:  tensor(0.6415, device='cuda:0')
Epoch: 2 | Train Loss:  0.69315898 | Val Loss:  0.69316504 | current lr: [0.0009999605221019082]
probability that our prediction of  EI  is correct:  tensor(0.7637, device='cuda:0')
probability that our prediction of  EI  is correct in training dataset:  tensor(0.5458, device='cuda:0')
Epoch: 3 | Train Loss:  0.69315241 | Val Loss:  0.69337246 | current lr: [0.0009999111761904045]
probabi

In [None]:
criterion = CrossEntropyLoss()
model = Model(cfg.embedding_dim, cfg.hidden_dim, len(word_to_ix))
model.to("cuda")
print(model)
with torch.no_grad():
  device="cuda"
  for batch in train_dl_EI:
    print(batch)
    blabels=batch["labels"].to(device)
    print(blabels)
    input_ids=batch["input_ids"].to(device)
    print(input_ids)
    outputs=model(input_ids=input_ids)
    print(outputs)
    print(blabels)
    eloss=criterion(outputs, blabels).item()
    break