In [None]:
# Install the Hugging Face libraries imported by the cells below.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve to releases
# that no longer match the imports used in this notebook — confirm and pin.
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive; the project folder used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project folder on the mounted Drive. The input CSV path and the saved tensor
# files are all built by string-concatenating onto this prefix (note the trailing slash).
foldername= "/content/drive/My Drive/nlpproject/"

In [None]:
import numpy as np 
import pandas as pd 
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from datasets import load_metric
import datetime
from torch import nn
from transformers import AutoConfig
from transformers import AutoModel
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

In [None]:
class CFG:
    """Notebook-wide configuration constants, stored as class attributes.

    The cells visible in this part of the notebook read only ``model``,
    ``max_len`` and ``batch_size``. The remaining fields are presumably
    consumed by training code elsewhere — TODO confirm before pruning.
    """
    # Timestamp string fixed once, when this class body executes.
    str_now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    basic_lr=1e-3
    train = True
    debug = False
    offline = False
    models_path = "bert-base-uncased"
    epochs = 50
    save_all_models = False
    apex = True
    print_freq = 20
    num_workers = 4
    # Checkpoint name passed to the tokenizer and encoder loaders.
    model = "bert-base-uncased"
    loss_func = 'SmoothL1'
    scheduler = 'cosine'
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    llrd = True
    layerwise_lr = 5e-5
    layerwise_lr_decay = 0.9
    layerwise_weight_decay = 0.01
    layerwise_adam_epsilon = 1e-6
    layerwise_use_bertadam = False
    #pooling
    pooling = 'mean' # mean, max, min, attention, weightedlayer
    layer_start = 4
    #init_weight
    init_weight = 'normal' # normal, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal
    #re-init
    reinit = True
    reinit_n = 1
    #adversarial
    fgm = False
    awp = False
    adv_lr = 1
    adv_eps = 0.2
    unscale = False
    eps = 1e-6
    betas = (0.9, 0.999)
    # Token length every sample is truncated/padded to by the Ds_* dataset classes.
    max_len = 512
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    target_cols = ['EI', 'SN', 'TF', 'JP']
    seed = 42
    cv_seed = 42
    n_fold = 4
    # All fold indices, 0 .. n_fold-1.
    trn_fold = list(range(n_fold))
    # Batch size used by getdl when building DataLoaders.
    batch_size = 128
    # NOTE(review): equals len(target_cols) today; nothing here keeps the two in sync.
    n_targets = 4
    gpu_id = 0
    # NOTE(review): always names a CUDA device; the feature-extraction cell picks its
    # own device with a CPU fallback instead of reading this value.
    device = f'cuda:{gpu_id}'
# Convenience instance; every value lives on the class, so cfg.x and CFG.x read the same constant.
cfg=CFG()

In [None]:

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
# Load the tokenizer for the checkpoint named in cfg.model; it is shared by the
# dataset classes and by the padding collator defined later.
tokenizer=BertTokenizer.from_pretrained(cfg.model)
# Dataset classes: one per MBTI axis, all sharing the loading/tokenizing logic below.
class _MbtiAxisDataset(Dataset):
    """Tokenized posts labelled with one binary MBTI axis.

    Reads a CSV with a ``post`` (text) column and a ``type`` (four-letter MBTI
    code) column and drops rows containing missing values. Subclasses choose the
    axis by setting ``loc``, the index of that axis' letter inside the type code.

    Args:
        path: CSV file to read.
        tokenizer: Callable tokenizer invoked on each post.
        max_token_len: Length every sample is truncated/padded to.
    """

    # Index of this axis' letter in the four-letter type code; set by subclasses.
    loc = None

    def __init__(self, path, tokenizer, max_token_len=cfg.max_len):
        self.df = pd.read_csv(path).dropna()
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        # Letter that means label 1 / label 0 at each position of the type code.
        self.labelstrdicts = {1: "ESTJ", 0: "INFP"}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``index``.

        Raises:
            ValueError: if the tokenizer rejects the row's text; the original
                tokenizer error is chained as the cause.
        """
        row = self.df.iloc[index]
        text = row["post"]
        label = self.str2label(row["type"])
        try:
            tokens = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=self.max_token_len,
                padding="max_length",
            )
        except Exception as exc:
            # Surface the offending row instead of printing it and killing the kernel.
            raise ValueError(f"could not tokenize row {index}: {text!r}") from exc
        return {
            # Drop only the leading batch axis added by return_tensors="pt".
            "input_ids": tokens.input_ids.squeeze(0),
            "attention_mask": tokens.attention_mask.squeeze(0),
            "labels": label,
        }

    def str2label(self, string):
        """Map a type code to 1.0 if its letter at ``loc`` is the positive one, else 0.0.

        The letter is compared with the positive letter *for this axis* only, so a
        letter that belongs to a different axis (or an empty string) is not
        mistaken for a positive label.
        """
        letter = string[self.loc]
        return 1. if letter == self.labelstrdicts[1][self.loc] else 0.

    def label2str(self, label):
        """Map a 0/1 label (int, float or 0-d tensor) back to this axis' letter."""
        return self.labelstrdicts[int(label)][self.loc]


class Ds_EI(_MbtiAxisDataset):
    """First letter of the type code: E -> 1, otherwise 0."""
    loc = 0


class Ds_SN(_MbtiAxisDataset):
    """Second letter of the type code: S -> 1, otherwise 0."""
    loc = 1


class Ds_TF(_MbtiAxisDataset):
    """Third letter of the type code: T -> 1, otherwise 0."""
    loc = 2


class Ds_JP(_MbtiAxisDataset):
    """Fourth letter of the type code: J -> 1, otherwise 0."""
    loc = 3



In [None]:
# CSV handed to the dataset constructor in the next cells.
path=foldername+"dataset2(sep).csv"
# Collator used by every DataLoader that getdl builds.
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
def getdl(ds, batch_size=None, shuffle=False):
    """Wrap a dataset in a DataLoader that batches with the notebook's collator.

    The whole dataset goes into a single loader; no train/val/test split is made.

    Args:
        ds: Dataset to iterate over.
        batch_size: Samples per batch. ``None`` (the default) uses ``cfg.batch_size``.
        shuffle: Whether to reshuffle every epoch. Defaults to ``False`` so that
            extracted features keep the dataset's row order.

    Returns:
        A ``DataLoader`` over ``ds`` using ``data_collator`` as ``collate_fn``.
    """
    if batch_size is None:
        batch_size = cfg.batch_size
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, collate_fn=data_collator)

In [None]:
# NEED TO CHANGE WHEN SWITCHING TASK: swap Ds_EI for Ds_SN / Ds_TF / Ds_JP to
# extract another axis. The 'EI_' prefix in the saved file names further down is
# hard-coded as well and has to be changed together with this cell.
ds=Ds_EI(path, tokenizer)
dl_EI=getdl(ds)

In [None]:
# Load the pretrained encoder used purely as a frozen feature extractor.
# Ask for per-layer hidden states in addition to the pooled output.
# NOTE(review): the extraction loop later in this notebook only reads
# pooler_output; drop this flag if the per-layer tensors are not needed, since
# keeping them costs extra GPU memory per batch.
config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
model = AutoModel.from_pretrained(cfg.model, config=config)
# Freeze every parameter. requires_grad_() is the nn.Module API for this; assigning
# an arbitrary attribute on the module does not touch the parameters.
model.requires_grad_(False)
# Make inference mode explicit (dropout disabled) rather than relying on loader defaults.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
device = torch.device(dev)
cpu = torch.device("cpu")

# One pooled-output tensor and one label tensor per sample; a later cell stacks
# both lists, so they are kept per-sample rather than per-batch.
features = []
labels = []

model.to(device)
model.eval()  # inference only: keep dropout disabled
with torch.no_grad():
    # tqdm renders a single progress bar instead of one printed counter per batch.
    for batch in tqdm(dl_EI, desc="Extracting pooler_output"):
        batch = batch.to(device)
        output = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
        # Move results off the GPU right away so device memory does not grow with the dataset.
        # Iterating a 2-D tensor yields its rows, so extend() adds one entry per sample.
        features.extend(output.pooler_output.to(cpu))
        labels.extend(batch["labels"].to(cpu))

0
128
256
384
512
640
768
896
1024
1152
1280
1408
1536
1664
1792
1920
2048
2176
2304
2432
2560
2688
2816
2944
3072
3200
3328
3456
3584
3712
3840
3968
4096
4224
4352
4480
4608
4736
4864
4992
5120
5248
5376
5504
5632
5760
5888
6016
6144
6272
6400
6528
6656
6784
6912
7040
7168
7296
7424
7552
7680
7808
7936
8064
8192
8320
8448
8576


In [None]:
# Collapse the per-sample lists into single tensors (one row per sample).
# The isinstance guards make this cell safe to re-run: after the first run the
# names already hold tensors, and torch.stack() only accepts a sequence of tensors.
if isinstance(features, list):
    features = torch.stack(features)
if isinstance(labels, list):
    labels = torch.stack(labels)

In [None]:
# Persist the stacked features and labels under the project folder so they can be
# reloaded without re-running the encoder.
# NOTE: 'Extraced' is misspelled in the file names; it is left as-is because the
# load cell below reads the same name.
torch.save(features, foldername+'EI_Extraced_feature_tensors_pooleroutput.pt')
torch.save(labels, foldername+'EI_Extraced_label_tensors_pooleroutput.pt')

In [None]:
# Reload the saved feature tensor to check the round trip through torch.save.
f=torch.load(foldername+'EI_Extraced_feature_tensors_pooleroutput.pt')

In [None]:
# Round-trip check over the whole tensor: a single True/False (same shape and
# identical values in every row) instead of a per-element mask for row 0 only.
torch.equal(f, features)

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr