In [3]:
import pandas as pd
import re
import torch
import os
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

In [4]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
DATA_FILE_PATH ="/content/drive/MyDrive/AI_and_DS_lab Project/preprocessed_personality_data.csv"
MODEL_OUTPUT_DIR = "./my_mbti_classifier"

In [6]:
def clean_text_for_training(text: str) -> str:

    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+', '', text)

    return text.strip()

In [12]:
def train_model():

    print("==========  STARTING MODEL TRAINING  ==========")

    SAMPLE_SIZE = 100000
    MODEL_NAME = "distilbert-base-uncased"
    TEST_SIZE = 0.2
    RANDOM_STATE = 42
    MAX_TOKEN_LENGTH = 256
    TRAIN_BATCH_SIZE = 16
    EVAL_BATCH_SIZE = 16
    NUM_EPOCHS = 3
    # ------------------------------

    # Load Data
    print(f"--- Loading Data from {DATA_FILE_PATH} ---")
    try:
        df = pd.read_csv(DATA_FILE_PATH)
    except FileNotFoundError:
        print(f"ERROR: File not found at '{DATA_FILE_PATH}'")
        print("Please update the 'DATA_FILE_PATH' variable at the top of this script.")
        return False
    except Exception as e:
        print(f"An error occurred loading the file: {e}")
        return False

    print(f"Successfully loaded {len(df)} total rows.")

    # Sample Data
    if SAMPLE_SIZE > 0 and len(df) > SAMPLE_SIZE:
        print(f"Taking a random sample of {SAMPLE_SIZE} rows...")
        df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)
    else:
        print("Using all rows.")

    # Create Label Mappings
    print("--- Creating Label Mappings ---")
    MBTI_TYPES = df['type'].unique()
    MBTI_TYPES.sort()
    label2id = {label: i for i, label in enumerate(MBTI_TYPES)}
    id2label = {i: label for i, label in enumerate(MBTI_TYPES)}
    NUM_LABELS = len(MBTI_TYPES)

    print(f"Found {NUM_LABELS} unique labels. e.g., {MBTI_TYPES[:3]}")
    df['label'] = df['type'].map(label2id)

    # Load Tokenizer
    print(f"--- Loading Tokenizer ({MODEL_NAME}) ---")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Tokenization Function ---
    def tokenize_function(examples):
        cleaned_text = [clean_text_for_training(t) for t in examples["text"]]
        return tokenizer(
            cleaned_text,
            padding="max_length",
            truncation=True,
            max_length=MAX_TOKEN_LENGTH
        )

    # Prepare Datasets
    print("--- Preparing Datasets ---")
    train_df, test_df = train_test_split(
        df,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=df['label']
    )
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    print("Tokenizing datasets (this may take a few minutes)...")
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    train_dataset = train_dataset.remove_columns(["text", "type", "__index_level_0__"])
    test_dataset = test_dataset.remove_columns(["text", "type", "__index_level_0__"])
    print("Data preparation complete.")

    # Load Model
    print(f"--- Loading Model ({MODEL_NAME}) ---")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        id2label=id2label,
        label2id=label2id
    )
    model.to(device)


    print("--- Setting up Trainer with Early Stopping ---")


    training_args = TrainingArguments(
        output_dir="./mbti_training_checkpoints",
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        warmup_steps=300,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=50,
        report_to="none",


        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        early_stopping_patience=1
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    # --- Step 9: Start Training ---
    print("--- Starting Model Training (with Early Stopping) ---")
    trainer.train()
    print("Training complete.")
    # The trainer will automatically have the *best* model loaded
    # (from Epoch 2) because we set load_best_model_at_end=True.

    # --- Step 10: Save Final Model ---
    print(f"--- Saving Best Model to {MODEL_OUTPUT_DIR} ---")
    os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
    trainer.save_model(MODEL_OUTPUT_DIR) # This now saves the best model
    tokenizer.save_pretrained(MODEL_OUTPUT_DIR)

    # Set Up Trainer
    print("--- Setting up Trainer ---")
    training_args = TrainingArguments(
        output_dir="./mbti_training_checkpoints",
        eval_strategy="epoch",
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        warmup_steps=300,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=50,
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    # Start Training ---
    print("--- Starting Model Training ---")
    trainer.train()
    print("Training complete.")

    # Save Final Model
    print(f"--- Saving Final Model to {MODEL_OUTPUT_DIR} ---")
    os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
    trainer.save_model(MODEL_OUTPUT_DIR)
    tokenizer.save_pretrained(MODEL_OUTPUT_DIR)

    print(f"Model and tokenizer saved successfully to '{MODEL_OUTPUT_DIR}'.")
    print("============================================\n")
    return True # Return success


In [13]:
class PersonalityClassifier:


    def __init__(self, model_path: str = "./my_mbti_classifier"):

        print(f"Loading model from {model_path}...")
        if not os.path.exists(model_path):
             print(f"--- FATAL ERROR ---")
             print(f"Model directory not found at: {model_path}")
             print("Please run the training script first to create this directory.")
             raise FileNotFoundError(f"Model directory not found: {model_path}")

        try:
            # Load the tokenizer and model from the saved directory
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        except OSError as e:
            print(f"--- FATAL ERROR ---")
            print(f"Could not load model files from directory: {model_path}")
            print(f"Error details: {e}")
            raise


        self.model.eval()
        print("Model loaded successfully.")

    def _preprocess_text(self, text: str) -> str:

        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+', '', text)
        text = text.replace("post text:", "")
        return text.strip()


    def classify_personality(self, text: str) -> str:

        if not text:
            print("Warning: No text provided for classification.")
            return "UNKNOWN"

        TOKEN_MAX_LENGTH = 512
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding=True,
            max_length=TOKEN_MAX_LENGTH,
            return_tensors="pt"
        )

        # inference
        with torch.no_grad():
            outputs = self.model(**inputs)

        logits = outputs.logits

        predicted_class_id = torch.argmax(logits, dim=1).item()

        # Map the ID back to its string label (e.g., 5 -> "ISFJ")
        return self.model.config.id2label[predicted_class_id]


    def process_stage_1_output(self, stage_1_json: dict) -> dict:

        profile_data = stage_1_json.get("profile_data", {})
        text_parts = []

        if profile_data.get("headline"):
            text_parts.append(profile_data["headline"])

        if profile_data.get("career_history"):
            for job in profile_data["career_history"]:
                if job.get("description"):
                    text_parts.append(job["description"])

        if profile_data.get("skills"):
            text_parts.append(" ".join(profile_data["skills"]))

        if profile_data.get("technical_contributions"):
            for contrib in profile_data["technical_contributions"]:
                if contrib.get("title"):
                    text_parts.append(contrib["title"])

        if profile_data.get("recent_activity"):
            text_parts.append(" ".join(profile_data["recent_activity"]))

        # Join all text parts into one large string
        full_text_to_classify = " ".join(text_parts)
        print(f"\n--- Combined Text for Classification ---\n{full_text_to_classify}\n")


        # Pre_process the full text
        cleaned_text = self._preprocess_text(full_text_to_classify)

        if not cleaned_text:
            print("Warning: No classifiable text found in profile.")
            predicted_type = "UNKNOWN"
        else:
            predicted_type = self.classify_personality(cleaned_text)

        # Add the new key to the profile data
        profile_data["inferred_mbti"] = predicted_type
        stage_1_json["profile_data"] = profile_data

        return stage_1_json


In [14]:
def check_classifier():

    print("========== STARTING CLASSIFICATION CHECK ==========")

    # The input JSON from Stage 1
    sample_stage_1_json = {
      "objective": "Send a job offer for a Senior AI Researcher role at our new robotics lab.",
      "optional_context": "Highlight our focus on multi-modal AI and mention the competitive publication bonus.",
      "profile_data": {
        "name": "Dr. Aris Thorne",
        "headline": "Principal AI Scientist @ QuantumLeap AI | Driving innovation in NLP and Generative Models",
        "career_history": [
          {
            "role": "Principal AI Scientist",
            "company": "QuantumLeap AI",
            "duration": "2022-Present",
            "description": "Leading the core research team for the 'Odyssey' large language model. Published 3 papers at NeurIPS."
          },
          {
            "role": "Senior Machine Learning Engineer",
            "company": "DataWeave Inc.",
            "duration": "2019-2022",
            "description": "Developed and deployed scalable NLP solutions for sentiment analysis, reducing inference costs by 30%."
          }
        ],
        "skills": [
          "Large Language Models (LLMs)",
          "Transformers",
          "PyTorch",
          "Distributed Training",
          "Computer Vision",
          "Python"
        ],
        "recent_activity": [
          "Post text: 'Just released our latest research on efficient attention mechanisms. The key challenge wasn't just accuracy, but computational feasibility. Excited to see how the community builds on this. #AI #NLP #Research'"
        ],
        "technical_contributions": [
          {
            "type": "GitHub",
            "title": "efficient-attention"
          },
          {
            "type": "Medium",
            "title": "Beyond Transformers: The Future of Sequence Modeling"
          }
        ]
      }
    }

    print("Input JSON (from Stage 1):")
    print(json.dumps(sample_stage_1_json, indent=2))

    try:

        classifier = PersonalityClassifier(model_path=MODEL_OUTPUT_DIR)

        enriched_json = classifier.process_stage_1_output(sample_stage_1_json)

        print("--- Enriched JSON (Output for Stage 3) ---")
        print(json.dumps(enriched_json, indent=2))
        print(f"\nSuccessfully classified personality as: {enriched_json['profile_data']['inferred_mbti']}")
        print("===================================================\n")

    except FileNotFoundError:
        print("\n--- CLASSIFICATION FAILED ---")
        print(f"Could not find the model directory '{MODEL_OUTPUT_DIR}'.")
    except Exception as e:
        print(f"\nAn unexpected error occurred during classification: {e}")

In [11]:
if __name__ == "__main__":
    # train the model
    training_success = train_model()

    # If training was successful, run the check
    if training_success:
        check_classifier()
    else:
        print("--- Workflow Halted ---")
        print("Model training failed or was skipped. Skipping classification check.")

--- Loading Data from /content/drive/MyDrive/AI_and_DS_lab Project/preprocessed_personality_data.csv ---
Successfully loaded 2332229 total rows.
Taking a random sample of 100000 rows...
--- Creating Label Mappings ---
Found 16 unique labels. e.g., ['ENFJ' 'ENFP' 'ENTJ']
--- Loading Tokenizer (distilbert-base-uncased) ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

--- Preparing Datasets ---
Tokenizing datasets (this may take a few minutes)...


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Data preparation complete.
--- Loading Model (distilbert-base-uncased) ---
Using device: cuda


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Setting up Trainer ---
--- Starting Model Training ---


Epoch,Training Loss,Validation Loss
1,2.4743,2.485764


Epoch,Training Loss,Validation Loss
1,2.4743,2.485764
2,2.3255,2.458359
3,2.1315,2.548272


Training complete.
--- Saving Final Model to ./my_mbti_classifier ---
Model and tokenizer saved successfully to './my_mbti_classifier'.

Input JSON (from Stage 1):
{
  "objective": "Send a job offer for a Senior AI Researcher role at our new robotics lab.",
  "optional_context": "Highlight our focus on multi-modal AI and mention the competitive publication bonus.",
  "profile_data": {
    "name": "Dr. Aris Thorne",
    "headline": "Principal AI Scientist @ QuantumLeap AI | Driving innovation in NLP and Generative Models",
    "career_history": [
      {
        "role": "Principal AI Scientist",
        "company": "QuantumLeap AI",
        "duration": "2022-Present",
        "description": "Leading the core research team for the 'Odyssey' large language model. Published 3 papers at NeurIPS."
      },
      {
        "role": "Senior Machine Learning Engineer",
        "company": "DataWeave Inc.",
        "duration": "2019-2022",
        "description": "Developed and deployed scalable NLP