In [17]:
%pip install --upgrade transformers accelerate



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [10]:
import os, re, pandas as pd, numpy as np
import torch
from datasets import Dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer, TrainingArguments
)
import evaluate
import kagglehub


In [2]:
# Download the Kaggle dataset through kagglehub. The call returns the local
# directory holding the files (a kagglehub cache directory in the run recorded
# below, not a folder next to the notebook).
path = kagglehub.dataset_download("datasnaek/mbti-type")
print("MBTI files at:", path)

# Load the CSV of MBTI types and their concatenated posts
mbti_df = pd.read_csv(os.path.join(path, "mbti_1.csv"))
print(mbti_df.shape)    # (rows, columns) -- 8675 x 2 in the recorded run
mbti_df.head(3)  # trailing expression: rendered as the cell's table output


MBTI files at: /Users/adrian/.cache/kagglehub/datasets/datasnaek/mbti-type/versions/1
(8675, 2)


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...


In [3]:
def clean_text(s: str) -> str:
    """Flatten one user's "|||"-separated posts into a single clean string.

    The separator is replaced by a space, URL-like tokens (``http...`` /
    ``www....``) and ``@``/``#``-prefixed tokens are removed, and every run of
    whitespace is collapsed to one space with the ends trimmed.
    """
    merged = s.replace("|||", " ")
    # Strip URLs first, then mentions/hashtags.
    for noise_pattern in (r"http\S+|www\.\S+", r"[@#]\S+"):
        merged = re.sub(noise_pattern, "", merged)
    collapsed = re.sub(r"\s+", " ", merged)
    return collapsed.strip()

# Store the cleaned, single-string version of each row's posts in a new
# "text" column (the raw "posts" column is left as is).
mbti_df["text"] = mbti_df["posts"].map(clean_text)



In [4]:
def mbti_to_psycho(mbti: str):
    """Turn an MBTI code into five heuristic psychometric scores.

    The code is compared position by position against "I", "N", "T", "J";
    each mismatch is treated as the opposite pole (E, S, F, P).

    Returns:
        Tuple ``(awareness, conscientiousness, stress, neuroticism, risk)``:
          * awareness          -- 0.8 for N, otherwise 0.2
          * conscientiousness  -- 0.8 for J, otherwise 0.2
          * neuroticism        -- 0.3 base, +0.3 for F, +0.2 for I
          * stress             -- stress tolerance, i.e. 1 - neuroticism
          * risk               -- E/T 0.8, I/T 0.6, E/F 0.4, I/F 0.2

    Raises:
        ValueError: if fewer than four characters are supplied (the
            four-way unpacking below fails).
    """
    poles = ("I", "N", "T", "J")
    is_intro, is_intuitive, is_thinker, is_judger = (
        letter == pole for letter, pole in zip(mbti, poles)
    )

    if is_intuitive:
        awareness = 0.8
    else:
        awareness = 0.2

    if is_judger:
        conscientiousness = 0.8
    else:
        conscientiousness = 0.2

    # Accumulate in the same order as base + feeling bonus + introvert bonus.
    neuro = 0.3
    if not is_thinker:
        neuro = neuro + 0.3
    if is_intro:
        neuro = neuro + 0.2

    stress = 1 - neuro

    # Risk tolerance is fully determined by the (introvert?, thinker?) pair.
    risk_by_pair = {
        (False, True): 0.8,   # E & T
        (False, False): 0.4,  # E & F
        (True, True): 0.6,    # I & T
        (True, False): 0.2,   # I & F
    }
    risk = risk_by_pair[(bool(is_intro), bool(is_thinker))]

    return awareness, conscientiousness, stress, neuro, risk

# Score every row's MBTI type, then spread the resulting 5-tuples over five
# named columns (row alignment is kept by reusing the frame's index).
mapped = mbti_df["type"].map(mbti_to_psycho)
mbti_df[
    ["awareness", "conscientiousness", "stress", "neuroticism", "risk_tolerance"]
] = pd.DataFrame(mapped.tolist(), index=mbti_df.index)


In [5]:
# Write the cleaned text plus the five derived scores to disk, without the
# index column. Change `out_csv` to write somewhere else.
out_csv = "mbti_psychometrics.csv"
(
    mbti_df[
        ["text", "awareness", "conscientiousness", "stress", "neuroticism", "risk_tolerance"]
    ]
    .to_csv(out_csv, index=False)
)
print("Saved:", out_csv)


Saved: mbti_psychometrics.csv


In [6]:
# Wrap the text column and the five score columns in a Hugging Face Dataset.
ds = Dataset.from_pandas(
    mbti_df[
        ["text", "awareness", "conscientiousness", "stress", "neuroticism", "risk_tolerance"]
    ]
)

# Seeded split: 20% of the rows are held out for evaluation.
split = ds.train_test_split(seed=42, test_size=0.2)
train_ds = split["train"]
eval_ds = split["test"]


In [15]:
# Base checkpoint; the tokenizer is loaded from it here.
checkpoint = "bert-base-uncased"
tokenizer  = BertTokenizerFast.from_pretrained(checkpoint)

def tokenize_fn(x):
    """Tokenize the "text" field, padding/truncating every entry to 128 tokens.

    Called through ``Dataset.map(..., batched=True)``, so ``x["text"]`` is
    presumably a list of strings for one batch of rows -- confirm against the
    `datasets` documentation if this is reused elsewhere.
    """
    return tokenizer(x["text"], padding="max_length", truncation=True, max_length=128)

# Add the tokenizer outputs as new columns on both splits.
train_ds = train_ds.map(tokenize_fn, batched=True)
eval_ds  = eval_ds.map(tokenize_fn, batched=True)

# Expose the two model-input columns and the five score columns as torch
# tensors; the same column list is applied to both splits.
train_ds.set_format(type="torch",
                    columns=["input_ids","attention_mask",
                             "awareness","conscientiousness","stress","neuroticism","risk_tolerance"])
eval_ds.set_format(type="torch",
                   columns=["input_ids","attention_mask",
                            "awareness","conscientiousness","stress","neuroticism","risk_tolerance"])


Map: 100%|██████████| 6940/6940 [00:08<00:00, 863.25 examples/s]
Map: 100%|██████████| 1735/1735 [00:02<00:00, 796.41 examples/s]


In [17]:
# Rebuild train_ds so it has a single 'labels' list field:
import torch

def pack_labels(example):
    """Gather the five score fields of one row into a single "labels" list.

    The list order is awareness, conscientiousness, stress, neuroticism,
    risk_tolerance. The incoming mapping is updated in place and returned.
    """
    score_fields = (
        "awareness",
        "conscientiousness",
        "stress",
        "neuroticism",
        "risk_tolerance",
    )
    example["labels"] = [example[field] for field in score_fields]
    return example

# Add the packed "labels" column to both splits (row-by-row map).
train_ds = train_ds.map(pack_labels)
eval_ds  = eval_ds.map(pack_labels)

# Now reset formats: only the two model inputs and the packed labels are
# exposed as torch tensors from here on.
train_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
eval_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])


Map: 100%|██████████| 6940/6940 [00:01<00:00, 3511.94 examples/s]
Map: 100%|██████████| 1735/1735 [00:00<00:00, 3824.95 examples/s]


In [18]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# 1. Define a metrics function
# MSE is computed directly with NumPy, so no `evaluate` metric has to be
# downloaded or loaded for it.
def compute_metrics(eval_pred):
    """Return the mean squared error over every prediction/label element.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. It is unpacked as a
            2-tuple. If ``predictions`` is itself a tuple of model outputs,
            its first entry is used.

    Returns:
        ``{"mse": <float>}`` -- the squared error averaged over all rows and
        all output columns at once (not per column).
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        # Some models return several outputs; use the first one.
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    return {"mse": float(np.mean((preds - labels) ** 2))}

# 2. Build the model
# The model must be created in this notebook so that a fresh
# Restart & Run All has a `model` to hand to the Trainer.
# - same base checkpoint the tokenizer was loaded from
# - num_labels=5: one output per entry of the packed "labels" list
# - problem_type="regression": same setting used when the saved model is
#   reloaded for inference further down
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,
    problem_type="regression",
)

# 3. Set up TrainingArguments
training_args = TrainingArguments(
    output_dir="mbti-psych-reg",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=50,
    save_total_limit=2,
    # no `evaluation_strategy`, just evaluate at the end
)

# 4. Initialize Trainer
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# in recent transformers releases (this notebook upgrades transformers in its
# first cell).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# 5. Train
trainer.train()

# 6. Evaluate on the held-out split; the Trainer reports the "mse" value
# returned by compute_metrics under the key "eval_mse".
metrics = trainer.evaluate()
print("Final Eval MSE:", metrics["eval_mse"])


  trainer = Trainer(


Step,Training Loss
50,0.0836
100,0.0698
150,0.0614
200,0.0579
250,0.058
300,0.0575
350,0.0577
400,0.0536
450,0.0562
500,0.0548




Final Eval MSE: 0.04042550548911095


In [19]:
# Persist the trained model and the tokenizer side by side in one directory,
# so both can later be reloaded from "psychometric_model".
trainer.save_model("psychometric_model")
tokenizer.save_pretrained("psychometric_model")  # trailing expression: the written tokenizer file paths are shown as output


('psychometric_model/tokenizer_config.json',
 'psychometric_model/special_tokens_map.json',
 'psychometric_model/vocab.txt',
 'psychometric_model/added_tokens.json',
 'psychometric_model/tokenizer.json')

In [20]:
# 11. Test the trained model on new texts

from transformers import BertTokenizerFast, BertForSequenceClassification
import torch

# 11.1 Load the fine-tuned model and tokenizer
# NOTE: this rebinds the notebook-level `checkpoint`, `tokenizer` and `model`
# names that the tokenization/training cells also use.
checkpoint = "psychometric_model"  # wherever you saved it
tokenizer  = BertTokenizerFast.from_pretrained(checkpoint)
model      = BertForSequenceClassification.from_pretrained(checkpoint, problem_type="regression")
model.eval()  # inference mode (e.g. disables dropout)

# 11.2 Define some new sentences to score
test_texts = [
    "I always double-check every link before clicking on it.",
    "I often ignore software updates if I'm busy."
]

# 11.3 Tokenize with the same padding/truncation settings used for training
inputs = tokenizer(
    test_texts,
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# 11.4 Run the model (no grad)
with torch.no_grad():
    outputs = model(**inputs)
preds = outputs.logits  # shape: (batch_size, 5)

# 11.5 Print out each of the five psychometric scores
# The five values are unpacked in the same order pack_labels used to build the
# training labels. "Stress" is the `stress` target, which mbti_to_psycho
# defines as stress tolerance (1 - neuroticism), not a stress level.
for text, scores in zip(test_texts, preds):
    awareness, conscientiousness, stress, neuroticism, risk_tolerance = scores.tolist()
    print(f"Text: {text}")
    print(f"  Awareness:         {awareness:.2f}")
    print(f"  Conscientiousness: {conscientiousness:.2f}")
    print(f"  Stress:            {stress:.2f}")
    print(f"  Neuroticism:       {neuroticism:.2f}")
    print(f"  Risk tolerance:    {risk_tolerance:.2f}\n")


Text: I always double-check every link before clicking on it.
  Awareness:         0.67
  Conscientiousness: 0.46
  Stress:            0.35
  Neuroticism:       0.67
  Risk tolerance:    0.40

Text: I often ignore software updates if I'm busy.
  Awareness:         0.54
  Conscientiousness: 0.53
  Stress:            0.42
  Neuroticism:       0.61
  Risk tolerance:    0.49

