In [2]:
import pandas as pd

data = pd.read_csv(r'C:\Users\ameen\Documents\Projects\mbti\mbti-type-detector\data\mbti\mbti_1.csv')

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base", num_labels=16)
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from peft import LoraConfig, TaskType

peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

In [5]:
from peft import get_peft_model

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 307,216 || all params: 184,741,664 || trainable%: 0.1663


In [6]:
def label_encode(type):
    types = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP', 'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
    return types.index(type)

data['label'] = data['type'].apply(label_encode)

In [7]:
data.drop(['type'], axis=1, inplace=True)

In [8]:
data

Unnamed: 0,posts,label
0,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0
1,'I'm finding the lack of me in these posts ver...,1
2,'Good one _____ https://www.youtube.com/wat...,2
3,"'Dear INTP, I enjoyed our conversation the o...",3
4,'You're fired.|||That's another silly misconce...,4
...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,8
8671,'So...if this thread already exists someplace ...,7
8672,'So many questions when i do these things. I ...,2
8673,'I am very conflicted right now when it comes ...,6


In [9]:
from datasets import Dataset

dataset = Dataset.from_pandas(data)

In [10]:
def tokenize(batch):
    return tokenizer(batch['posts'], padding=True, truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)

Map: 100%|██████████| 8675/8675 [00:08<00:00, 973.13 examples/s] 


In [11]:
tokenized_dataset.remove_columns(['posts'])

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8675
})

In [12]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

In [13]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['posts', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6940
    })
    test: Dataset({
        features: ['posts', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1735
    })
})

In [14]:
from transformers import Trainer, TrainingArguments


def compute_accuracy(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": (preds == labels).mean()}

training_args = TrainingArguments(
    output_dir="/outputs",
    learning_rate=1e-3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy,
)

trainer.train()

  0%|          | 1/3470 [00:09<8:51:30,  9.19s/it]

: 