In [1]:
!pip install -q peft bitsandbytes accelerate

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import re
import torch
import os
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
DATA_FILE_PATH = "/content/drive/MyDrive/AI & DS LAB/preprocessed_personality_data.csv"
MODEL_OUTPUT_DIR = "./my_mbti_classifier"

In [None]:
def clean_text_for_training(text: str) -> str:

    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+', '', text)

    return text.strip()

In [None]:
def train_model(sample_size=100000, model_name="distilbert-base-uncased", max_length=256,
                train_batch_size=8, eval_batch_size=8, num_epochs=10, lr=8e-4, output_dir="./my_mbti_classifier"):

    import torch
    from datasets import Dataset
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
    from peft import LoraConfig, get_peft_model

    print("--- Loading data ---")
    df = pd.read_csv(DATA_FILE_PATH)
    if sample_size and len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=42)
    print(f"Using {len(df)} rows (sample_size={sample_size})")


    # Build label maps
    MBTI_TYPES = sorted(df['type'].unique())
    label2id = {label: i for i, label in enumerate(MBTI_TYPES)}
    id2label = {i: label for i, label in enumerate(MBTI_TYPES)}
    df['label'] = df['type'].map(label2id)
    NUM_LABELS = len(MBTI_TYPES)
    print(f"Detected {NUM_LABELS} labels: {MBTI_TYPES}")


    # Tokenizer and tokenization
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def tokenize_function(examples):
        texts = [clean_text_for_training(t) for t in examples['text']]
        return tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)

    train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
    train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
    test_ds = Dataset.from_pandas(test_df.reset_index(drop=True))

    train_ds = train_ds.map(tokenize_function, batched=True)
    test_ds = test_ds.map(tokenize_function, batched=True)

    # Remove unnecessary columns if present
    remove_cols = [c for c in ['text','type','__index_level_0__'] if c in train_ds.column_names]
    if remove_cols:
        train_ds = train_ds.remove_columns(remove_cols)
        test_ds = test_ds.remove_columns(remove_cols)

    # Ensure label column is present and integer
    train_ds = train_ds.map(lambda x: {'labels': int(x['label'])} if isinstance(x['label'], (int,)) else {'labels': x['label']}, batched=False)
    test_ds = test_ds.map(lambda x: {'labels': int(x['label'])} if isinstance(x['label'], (int,)) else {'labels': x['label']}, batched=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    print("--- Loading base model ---")
    base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)

    # Apply LoRA / PEFT
    print("--- Applying LoRA (PEFT) ---")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_lin", "v_lin"],
        lora_dropout=0.1,
        bias="none",
        task_type="SEQ_CLS",
    )
    model = get_peft_model(base_model, lora_config)
    model.to(device)

    # Print trainable params
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable params: {trainable_params:,} / {total_params:,} ({100*trainable_params/total_params:.4f}%)")


    # Training args
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        gradient_accumulation_steps=2,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_steps=100,
        fp16=True,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        save_total_limit=3,
        report_to="none",
    )

    # Use a simple compute_metrics (accuracy)
    import numpy as np
    from sklearn.metrics import accuracy_score, f1_score

    def compute_metrics(pred):
        labels = pred.label_ids
        preds = np.argmax(pred.predictions, axis=1)
        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average='weighted')
        return {'accuracy': acc, 'f1': f1}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    print("--- Starting training ---")
    trainer.train()

    print("--- Saving PEFT adapter and tokenizer ---")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)   # saves peft adapter config & adapter weights
    tokenizer.save_pretrained(output_dir)

    print(f"Saved PEFT adapter to {output_dir}")
    return trainer


In [None]:
class PersonalityClassifier:


    def __init__(self, model_path: str = "./my_mbti_classifier"):

        print(f"Loading model from {model_path}...")
        if not os.path.exists(model_path):
             print(f"--- FATAL ERROR ---")
             print(f"Model directory not found at: {model_path}")
             print("Please run the training script first to create this directory.")
             raise FileNotFoundError(f"Model directory not found: {model_path}")

        try:
            # Load the tokenizer and model from the saved directory
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        except OSError as e:
            print(f"--- FATAL ERROR ---")
            print(f"Could not load model files from directory: {model_path}")
            print(f"Error details: {e}")
            raise


        self.model.eval()
        print("Model loaded successfully.")

    def _preprocess_text(self, text: str) -> str:

        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+', '', text)
        text = text.replace("post text:", "")
        return text.strip()


    def classify_personality(self, text: str) -> str:

        if not text:
            print("Warning: No text provided for classification.")
            return "UNKNOWN"

        TOKEN_MAX_LENGTH = 512
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding=True,
            max_length=TOKEN_MAX_LENGTH,
            return_tensors="pt"
        )

        # inference
        with torch.no_grad():
            outputs = self.model(**inputs)

        logits = outputs.logits

        predicted_class_id = torch.argmax(logits, dim=1).item()

        # Map the ID back to its string label (e.g., 5 -> "ISFJ")
        return self.model.config.id2label[predicted_class_id]


    def process_stage_1_output(self, stage_1_json: dict) -> dict:

        profile_data = stage_1_json.get("profile_data", {})
        text_parts = []

        if profile_data.get("headline"):
            text_parts.append(profile_data["headline"])

        if profile_data.get("career_history"):
            for job in profile_data["career_history"]:
                if job.get("description"):
                    text_parts.append(job["description"])

        if profile_data.get("skills"):
            text_parts.append(" ".join(profile_data["skills"]))

        if profile_data.get("technical_contributions"):
            for contrib in profile_data["technical_contributions"]:
                if contrib.get("title"):
                    text_parts.append(contrib["title"])

        if profile_data.get("recent_activity"):
            text_parts.append(" ".join(profile_data["recent_activity"]))

        # Join all text parts into one large string
        full_text_to_classify = " ".join(text_parts)
        print(f"\n--- Combined Text for Classification ---\n{full_text_to_classify}\n")


        # Pre_process the full text
        cleaned_text = self._preprocess_text(full_text_to_classify)

        if not cleaned_text:
            print("Warning: No classifiable text found in profile.")
            predicted_type = "UNKNOWN"
        else:
            predicted_type = self.classify_personality(cleaned_text)

        # Add the new key to the profile data
        profile_data["inferred_mbti"] = predicted_type
        stage_1_json["profile_data"] = profile_data

        return stage_1_json


In [None]:
def check_classifier():

    print("========== STARTING CLASSIFICATION CHECK ==========")

    # The input JSON from Stage 1
    sample_stage_1_json = {
      "objective": "Send a job offer for a Senior AI Researcher role at our new robotics lab.",
      "optional_context": "Highlight our focus on multi-modal AI and mention the competitive publication bonus.",
      "profile_data": {
        "name": "Dr. Aris Thorne",
        "headline": "Principal AI Scientist @ QuantumLeap AI | Driving innovation in NLP and Generative Models",
        "career_history": [
          {
            "role": "Principal AI Scientist",
            "company": "QuantumLeap AI",
            "duration": "2022-Present",
            "description": "Leading the core research team for the 'Odyssey' large language model. Published 3 papers at NeurIPS."
          },
          {
            "role": "Senior Machine Learning Engineer",
            "company": "DataWeave Inc.",
            "duration": "2019-2022",
            "description": "Developed and deployed scalable NLP solutions for sentiment analysis, reducing inference costs by 30%."
          }
        ],
        "skills": [
          "Large Language Models (LLMs)",
          "Transformers",
          "PyTorch",
          "Distributed Training",
          "Computer Vision",
          "Python"
        ],
        "recent_activity": [
          "Post text: 'Just released our latest research on efficient attention mechanisms. The key challenge wasn't just accuracy, but computational feasibility. Excited to see how the community builds on this. #AI #NLP #Research'"
        ],
        "technical_contributions": [
          {
            "type": "GitHub",
            "title": "efficient-attention"
          },
          {
            "type": "Medium",
            "title": "Beyond Transformers: The Future of Sequence Modeling"
          }
        ]
      }
    }

    print("Input JSON (from Stage 1):")
    print(json.dumps(sample_stage_1_json, indent=2))

    try:

        classifier = PersonalityClassifier(model_path=MODEL_OUTPUT_DIR)

        enriched_json = classifier.process_stage_1_output(sample_stage_1_json)

        print("--- Enriched JSON (Output for Stage 3) ---")
        print(json.dumps(enriched_json, indent=2))
        print(f"\nSuccessfully classified personality as: {enriched_json['profile_data']['inferred_mbti']}")
        print("===================================================\n")

    except FileNotFoundError:
        print("\n--- CLASSIFICATION FAILED ---")
        print(f"Could not find the model directory '{MODEL_OUTPUT_DIR}'.")
    except Exception as e:
        print(f"\nAn unexpected error occurred during classification: {e}")

In [None]:
if __name__ == "__main__":
    # train the model
    training_success = train_model()

    # If training was successful, run the check
    if training_success:
        check_classifier()
    else:
        print("--- Workflow Halted ---")
        print("Model training failed or was skipped. Skipping classification check.")

--- Loading data ---
Using 100000 rows (sample_size=100000)
Detected 16 labels: ['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP', 'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP']


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Using device: cuda
--- Loading base model ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Applying LoRA (PEFT) ---
Trainable params: 750,352 / 67,716,128 (1.1081%)


  trainer = Trainer(


--- Starting training ---


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.5538,2.528984,0.17745,0.110787
2,2.4883,2.495435,0.18185,0.134306
3,2.4786,2.473972,0.1926,0.145339
4,2.4475,2.471256,0.19815,0.159498
5,2.4187,2.462174,0.20135,0.168378
6,2.3674,2.464698,0.20345,0.173257
7,2.3551,2.469103,0.20645,0.173538


--- Saving PEFT adapter and tokenizer ---
Saved PEFT adapter to ./my_mbti_classifier
Input JSON (from Stage 1):
{
  "objective": "Send a job offer for a Senior AI Researcher role at our new robotics lab.",
  "optional_context": "Highlight our focus on multi-modal AI and mention the competitive publication bonus.",
  "profile_data": {
    "name": "Dr. Aris Thorne",
    "headline": "Principal AI Scientist @ QuantumLeap AI | Driving innovation in NLP and Generative Models",
    "career_history": [
      {
        "role": "Principal AI Scientist",
        "company": "QuantumLeap AI",
        "duration": "2022-Present",
        "description": "Leading the core research team for the 'Odyssey' large language model. Published 3 papers at NeurIPS."
      },
      {
        "role": "Senior Machine Learning Engineer",
        "company": "DataWeave Inc.",
        "duration": "2019-2022",
        "description": "Developed and deployed scalable NLP solutions for sentiment analysis, reducing inferenc

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



An unexpected error occurred during classification: Error(s) in loading state_dict for DistilBertForSequenceClassification:
	size mismatch for classifier.modules_to_save.default.weight: copying a param with shape torch.Size([16, 768]) from checkpoint, the shape in current model is torch.Size([2, 768]).
	size mismatch for classifier.modules_to_save.default.bias: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([2]).


In [None]:
!zip -r my_mbti_classifier.zip ./my_mbti_classifier

updating: my_mbti_classifier/ (stored 0%)
updating: my_mbti_classifier/tokenizer.json (deflated 71%)
updating: my_mbti_classifier/vocab.txt (deflated 53%)
updating: my_mbti_classifier/README.md (deflated 66%)
updating: my_mbti_classifier/tokenizer_config.json (deflated 75%)
updating: my_mbti_classifier/adapter_model.safetensors (deflated 7%)
updating: my_mbti_classifier/special_tokens_map.json (deflated 42%)
updating: my_mbti_classifier/adapter_config.json (deflated 56%)
  adding: my_mbti_classifier/checkpoint-35000/ (stored 0%)
  adding: my_mbti_classifier/checkpoint-35000/tokenizer.json (deflated 71%)
  adding: my_mbti_classifier/checkpoint-35000/rng_state.pth (deflated 26%)
  adding: my_mbti_classifier/checkpoint-35000/scaler.pt (deflated 64%)
  adding: my_mbti_classifier/checkpoint-35000/scheduler.pt (deflated 61%)
  adding: my_mbti_classifier/checkpoint-35000/vocab.txt (deflated 53%)
  adding: my_mbti_classifier/checkpoint-35000/README.md (deflated 66%)
  adding: my_mbti_classifie

In [None]:
from google.colab import files
files.download("my_mbti_classifier.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
MODEL_ZIP_PATH = "/content/drive/MyDrive/AI & DS LAB/mbti_classifier_model.zip"
EXTRACT_DIR = "./my_mbti_classifier"

# Unzip the model if not already extracted
if not os.path.exists(EXTRACT_DIR):
    !unzip -o "$MODEL_ZIP_PATH" -d "$EXTRACT_DIR"
else:
    print("Model folder already exists, skipping unzip.")



Archive:  /content/drive/MyDrive/AI & DS LAB/mbti_classifier_model.zip
   creating: ./my_mbti_classifier/my_mbti_classifier/
  inflating: ./my_mbti_classifier/my_mbti_classifier/tokenizer.json  
  inflating: ./my_mbti_classifier/my_mbti_classifier/vocab.txt  
   creating: ./my_mbti_classifier/my_mbti_classifier/checkpoint-250/
  inflating: ./my_mbti_classifier/my_mbti_classifier/checkpoint-250/tokenizer.json  
  inflating: ./my_mbti_classifier/my_mbti_classifier/checkpoint-250/rng_state.pth  
  inflating: ./my_mbti_classifier/my_mbti_classifier/checkpoint-250/scaler.pt  
  inflating: ./my_mbti_classifier/my_mbti_classifier/checkpoint-250/scheduler.pt  
  inflating: ./my_mbti_classifier/my_mbti_classifier/checkpoint-250/vocab.txt  
  inflating: ./my_mbti_classifier/my_mbti_classifier/checkpoint-250/README.md  
  inflating: ./my_mbti_classifier/my_mbti_classifier/checkpoint-250/optimizer.pt  
  inflating: ./my_mbti_classifier/my_mbti_classifier/checkpoint-250/tokenizer_config.json  
  in

In [15]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel
import torch

BASE_MODEL = "distilbert-base-uncased"
MODEL_DIR = "./my_mbti_classifier"


base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=16
)


model = PeftModel.from_pretrained(base_model, MODEL_DIR)


tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

print("‚úÖ Model and tokenizer loaded successfully!")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model and tokenizer loaded successfully!


In [19]:
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

test_path = "/content/MBTI_test_data.csv"
test_df = pd.read_csv(test_path)

MBTI_TYPES = sorted(test_df['inferred_mbti'].unique())
label2id = {label: i for i, label in enumerate(MBTI_TYPES)}
id2label = {i: label for i, label in enumerate(MBTI_TYPES)}

model.config.label2id = label2id
model.config.id2label = id2label
NUM_LABELS = len(MBTI_TYPES)

print(f"Detected {NUM_LABELS} labels: {MBTI_TYPES}")

texts = test_df["recent_activity"].fillna("").tolist()
true_labels = test_df["inferred_mbti"].tolist()

predictions = []
model.eval()

for text in tqdm(texts, desc="Predicting MBTI types"):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        predictions.append(model.config.id2label[pred])

test_df["predicted_mbti"] = predictions

acc = accuracy_score(test_df["inferred_mbti"], test_df["predicted_mbti"])
f1 = f1_score(test_df["inferred_mbti"], test_df["predicted_mbti"], average="weighted")

print(f"\n‚úÖ Test Accuracy: {acc*100:.2f}%")
print(f"‚úÖ Weighted F1-Score: {f1:.4f}")

output_path = "/content/drive/MyDrive/AI & DS LAB/MBTI_test_predictions.csv"
test_df.to_csv(output_path, index=False)
print(f"üìÅ Predictions saved to: {output_path}")

test_df.head(10)

Detected 16 labels: ['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP', 'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP']


Predicting MBTI types: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:23<00:00,  2.21it/s]


‚úÖ Test Accuracy: 5.88%
‚úÖ Weighted F1-Score: 0.0454
üìÅ Predictions saved to: /content/drive/MyDrive/AI & DS LAB/MBTI_test_predictions.csv





Unnamed: 0,recent_activity,inferred_mbti,predicted_mbti
0,Q3 engineering metrics are in. We've hit 99.99...,ENTJ,ISFJ
1,Just published a new blog post on 'The 'Why' B...,INFP,INTJ
2,Completed the quarterly compliance audit for o...,ISTJ,INFJ
3,Hot take: Is the traditional banking model com...,ENTP,ISTJ
4,So incredibly proud to announce our team's 'Gr...,ESFJ,ISFJ
5,Spent the weekend debugging a race condition i...,INTP,ISTP
6,Brainstormed so many amazing ideas with the te...,ENFP,INTJ
7,The current enterprise cloud security model is...,INTJ,ISFJ
8,The payment gateway server was down for 3 minu...,ISTP,ISTP
9,Truly inspired by our team's alignment with th...,ENFJ,INTJ
