In [1]:
import pandas as pd
import numpy as np
kaggle_data_DL = pd.read_csv('/kaggle/input/mbti-type/mbti_1.csv')

kaggle_data_DL.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [None]:
# Replace the '|||' separator with a space
kaggle_data_DL['posts'] = kaggle_data_DL['posts'].str.replace('|||', ' ', regex=False)

# Verify (no.4 specifically)
print(kaggle_data_DL['posts'].head(6))

In [None]:
# Remove URLs
import re

def remove_urls(text):
    return re.sub(r'http\S+', '', text)

kaggle_data_DL['posts'] = kaggle_data_DL['posts'].apply(remove_urls)

In [None]:
import re

mbti_types = [
    'infj', 'entp', 'intp', 'intj', 'entj', 'enfj', 'infp', 'enfp',
    'isfp', 'istp', 'isfj', 'istj', 'estp', 'esfp', 'estj', 'esfj',
    'introvert', 'extrovert'  
]

def remove_leakage_words(text):
    pattern = r'\b(?:' + '|'.join(mbti_types) + r')\b'
    return re.sub(pattern, '', text, flags=re.IGNORECASE)  # ← key change

kaggle_data_DL['posts'] = kaggle_data_DL['posts'].apply(remove_leakage_words)
print(kaggle_data_DL['posts'].head())


In [None]:
# normalize whitespace for a single text

import re

def normalize_whitespace(text):
    return re.sub(r"\s+", " ", text).strip()

kaggle_data_DL['posts'] = kaggle_data_DL['posts'].apply(normalize_whitespace)
print(kaggle_data_DL['posts'].head())

In [None]:
# Save CSV 
kaggle_data.to_csv('mbti_deeplearning_new.csv', index=False)

print("File saved successfully as 'mbti_deeplearning_new.csv'")

# **Deberta**

In [None]:
# !pip install --upgrade protobuf==4.25.3
# !pip install --upgrade transformers datasets sentencepiece
# !pip install evaluate

In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using Device: {device}")

df = pd.read_csv("/kaggle/working/mbti_deeplearning_new.csv")

# Encode Labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["type"])
print(f"Classes: {le.classes_}")
print("Original Class Distribution:\n", df["label"].value_counts())

# Split 80/20
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["label"]
)

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

2025-11-22 15:17:19.079315: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763824639.101893    8256 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763824639.108709    8256 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using Device: cuda
Classes: ['Extrovert' 'Introvert']
Original Class Distribution:
 label
1    6676
0    1999
Name: count, dtype: int64


In [3]:
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(batch):
    return tokenizer(
        batch["posts"],
        truncation=True,
        max_length=512,
        padding=False  
    )

print("Tokenizing...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.rename_column("label", "labels")
val_dataset = val_dataset.rename_column("label", "labels")

columns_to_keep = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=columns_to_keep)
val_dataset.set_format(type="torch", columns=columns_to_keep)




Tokenizing...


Map:   0%|          | 0/6940 [00:00<?, ? examples/s]

Map:   0%|          | 0/1735 [00:00<?, ? examples/s]

In [4]:
# weighted sampler
train_labels = train_df["label"].values
class_counts = np.bincount(train_labels)
class_weights = 1. / class_counts
sample_weights = class_weights[train_labels]

sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
).to(device)


class CustomTrainer(Trainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=sampler,  
            collate_fn=self.data_collator,
            num_workers=0     
        )

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro")
    
    return {"accuracy": acc, "f1_macro": f1}


training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro", 
    save_total_limit=1,
    report_to="none",
    fp16=torch.cuda.is_available()
)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Starting Training...")
trainer.train()

trainer.save_model("/kaggle/working/debertav3")
tokenizer.save_pretrained("/kaggle/working/debertav3")

def predict_personality(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    
    model.eval()
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_idx = torch.argmax(probs, dim=-1).item()
    
    label = le.inverse_transform([pred_idx])[0]
    confidence = probs[0][pred_idx].item()
    
    return f"{label} (Confidence: {confidence:.2%})"

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = CustomTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Starting Training...




Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,0.464375,0.808646,0.746124
2,0.532700,0.420043,0.854179,0.790604
3,0.329900,0.524779,0.841499,0.782074




In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np

# Run prediction on validation set
pred_output = trainer.predict(val_dataset)

# Extract logits and true labels
logits = pred_output.predictions
y_true = pred_output.label_ids

# Convert logits → predicted class indices
y_pred = np.argmax(logits, axis=-1)

precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro")

print(f"Precision (macro): {precision:.4f}")
print(f"Recall (macro):    {recall:.4f}")
print(f"F1 Score (macro):  {f1:.4f}")

print("\nClassification Report:\n")
print(classification_report(
    y_true, 
    y_pred, 
    target_names=le.classes_
))

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)




Precision (macro): 0.7964
Recall (macro):    0.7853
F1 Score (macro):  0.7906

Classification Report:

              precision    recall  f1-score   support

   Extrovert       0.69      0.66      0.68       400
   Introvert       0.90      0.91      0.91      1335

    accuracy                           0.85      1735
   macro avg       0.80      0.79      0.79      1735
weighted avg       0.85      0.85      0.85      1735

Confusion Matrix:
 [[ 263  137]
 [ 116 1219]]


In [9]:
def predict_personality(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    
    model.eval()
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_idx = torch.argmax(probs, dim=-1).item()
    
    label = le.inverse_transform([pred_idx])[0]
    confidence = probs[0][pred_idx].item()
    
    return f"{label} (Confidence: {confidence:.2%})"


In [22]:
# Textbook extrovert vs "Reddit-style" extrovert (casual, opinionated)
textbook_extrovert = "I feel energized when i am around people."
reddit_extrovert = "Lmao that is hilarious! I literally shouted at my screen. We should totally do a meetup for this sub, it would be chaotic but fun."

print(f"Textbook Extrovert: {predict_personality(textbook_extrovert)}")
print(f"Reddit Extrovert:   {predict_personality(reddit_extrovert)}")

Textbook Extrovert: Introvert (Confidence: 64.95%)
Reddit Extrovert:   Extrovert (Confidence: 62.23%)


In [23]:
# textbook introvert vs "Reddit-style" introvert (casual, opinionated)
textbook_introvert = "I recharge my energy by spending time alone. Social interactions often feel draining to me, and I prefer deep, one-on-one conversations over large groups."
reddit_introvert = "Ugh, honestly I just want to stay in my room and play video games all weekend. People are so exhausting lol. Does anyone else feel like hiding when the doorbell rings?"

print(f"Textbook Introvert: {predict_personality(textbook_introvert)}")
print(f"Reddit Introvert:   {predict_personality(reddit_introvert)}")

Textbook Introvert: Introvert (Confidence: 69.10%)
Reddit Introvert:   Introvert (Confidence: 64.35%)
