In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import re

In [2]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
print("=== Milestone 2: Loading Preprocessed Data  ===")

def clean_text(text: str) -> str:

    if not isinstance(text, str):
        return ""

    text = text.lower()

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    text = re.sub(r'\@\w+', '', text)

    return text.strip()

=== Milestone 2: Loading Preprocessed Data  ===


In [4]:
file_path = "/content/drive/MyDrive/AI_and_DS_lab Project/preprocessed_personality_data.csv"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"ERROR: File not found at '{file_path}'")
    print("Please make sure 'preprocessed_personality_data.csv' is in your 'My Drive' folder.")
    exit()

In [5]:
sample_size = 50000
if len(df) > sample_size:
    print(f"Taking a random sample of {sample_size} rows...")
    df = df.sample(n=sample_size, random_state=42)
else:
    print("Dataset is smaller than sample size, using all rows.")

Taking a random sample of 50000 rows...


In [6]:
# Create Label Mappings
MBTI_TYPES = df['type'].unique()
MBTI_TYPES.sort()
label2id = {label: i for i, label in enumerate(MBTI_TYPES)}
id2label = {i: label for i, label in enumerate(MBTI_TYPES)}
NUM_LABELS = len(MBTI_TYPES)

print(f"Found {NUM_LABELS} unique labels. e.g., {MBTI_TYPES[:3]}")

Found 16 unique labels. e.g., ['ENFJ' 'ENFP' 'ENTJ']


In [7]:
print(f"Found {NUM_LABELS} unique labels. e.g., {MBTI_TYPES[:3]}")

# Map string labels to integer IDs for training
df['label'] = df['type'].map(label2id)

# Split Data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Convert to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Found 16 unique labels. e.g., ['ENFJ' 'ENFP' 'ENTJ']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
def tokenize_function(examples):

    # Apply the simplified text cleaning
    cleaned_text = [clean_text(t) for t in examples["text"]]


    return tokenizer(
        cleaned_text,
        padding="max_length",
        truncation=True,
        max_length=256 )

In [13]:
#Tokenize Datasets
print("Tokenizing datasets...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unwanted columns
train_dataset = train_dataset.remove_columns(["text", "type", "__index_level_0__"])
test_dataset = test_dataset.remove_columns(["text", "type", "__index_level_0__"])

print("Data preparation complete.")

Tokenizing datasets...


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Data preparation complete.


In [14]:

# Milestone 3: Model Architecture

print("\n--- Milestone 3: Model Architecture ---")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print(f"Loading {model_name} architecture for {NUM_LABELS} labels...")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id
)

model.to(device)


--- Milestone 3: Model Architecture ---
Using device: cuda
Loading distilbert-base-uncased architecture for 16 labels...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [15]:

# Milestone 4: Model Training

print("\n--- Milestone 4: Model Training ---")

training_args = TrainingArguments(
    output_dir="./mbti_training_checkpoints",
    eval_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=300,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

print("Starting model training...")
trainer.train()
print("Training complete.")

output_model_path = "./my_mbti_classifier"
print(f"Saving final model to {output_model_path}...")
trainer.save_model(output_model_path)
tokenizer.save_pretrained(output_model_path)

print(f"'Stage 2' model is now saved in '{output_model_path}'.")


--- Milestone 4: Model Training ---
Starting model training...


Epoch,Training Loss,Validation Loss
1,2.535,2.503884
2,2.3937,2.480118


Epoch,Training Loss,Validation Loss
1,2.535,2.503884
2,2.3937,2.480118
3,2.1215,2.571627


Training complete.
Saving final model to ./my_mbti_classifier...
'Stage 2' model is now saved in './my_mbti_classifier'.


In [16]:
# Zip the model for easy download from Colab

print("Zipping model for download...")
!zip -r my_mbti_classifier.zip ./my_mbti_classifier
print("--- All Done! ---")


Zipping model for download...
  adding: my_mbti_classifier/ (stored 0%)
  adding: my_mbti_classifier/model.safetensors (deflated 8%)
  adding: my_mbti_classifier/tokenizer_config.json (deflated 75%)
  adding: my_mbti_classifier/training_args.bin (deflated 53%)
  adding: my_mbti_classifier/config.json (deflated 57%)
  adding: my_mbti_classifier/special_tokens_map.json (deflated 42%)
  adding: my_mbti_classifier/tokenizer.json (deflated 71%)
  adding: my_mbti_classifier/vocab.txt (deflated 53%)
--- All Done! ---


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import json

class PersonalityClassifier:

    def __init__(self, model_path: str = "./my_mbti_classifier"):

        print(f"Loading model from {model_path}...")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        except OSError:
            print(f"--- FATAL ERROR ---")
            print(f"Could not find model files in directory: {model_path}")
            print("Did you download and unzip 'my_mbti_classifier.zip' into the same folder as this script?")
            raise

        self.model.eval()
        print("Model loaded successfully.")

    def _preprocess_text(self, text: str) -> str:

        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+', '', text)
        return text.strip()


    def classify_personality(self, text: str) -> str:

        if not text:
            return "UNKNOWN"

        cleaned_text = self._preprocess_text(text)

        inputs = self.tokenizer(
            cleaned_text,
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt"
        )

        # Model Prediction
        with torch.no_grad():
            outputs = self.model(**inputs)

        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()

        return self.model.config.id2label[predicted_class_id]



    def process_stage_1_output(self, stage_1_json: dict) -> dict:

        profile_data = stage_1_json.get("profile_data", {})
        activity_list = profile_data.get("recent_activity", [])
        text_to_classify = " ".join(activity_list)

        # classifier
        predicted_type = self.classify_personality(text_to_classify)

        profile_data["inferred_mbti"] = predicted_type
        stage_1_json["profile_data"] = profile_data

        return stage_1_json


In [18]:
# --- Example of how to use this module ---
if __name__ == "__main__":

    #  input from Stage 1
    sample_stage_1_json = {
      "objective": "Send a job offer for a Senior AI Researcher role at our new robotics lab.",
      "optional_context": "Highlight our focus on multi-modal AI and mention the competitive publication bonus.",
      "profile_data": {
        "name": "Dr. Aris Thorne",
        "headline": "Principal AI Scientist @ QuantumLeap AI | Driving innovation in NLP and Generative Models",
        "career_history": [
          {
            "role": "Principal AI Scientist",
            "company": "QuantumLeap AI",
            "duration": "2022-Present",
            "description": "Leading the core research team for the 'Odyssey' large language model. Published 3 papers at NeurIPS."
          },
          {
            "role": "Senior Machine Learning Engineer",
            "company": "DataWeave Inc.",
            "duration": "2019-2022",
            "description": "Developed and deployed scalable NLP solutions for sentiment analysis, reducing inference costs by 30%."
          }
        ],
        "skills": [
          "Large Language Models (LLMs)",
          "Transformers",
          "PyTorch",
          "Distributed Training",
          "Computer Vision",
          "Python"
        ],
        "recent_activity": [
          "Post text: 'Just released our latest research on efficient attention mechanisms. The key challenge wasn't just accuracy, but computational feasibility. Excited to see how the community builds on this. #AI #NLP #Research'"
        ],
        "company_updates": [
          "QuantumLeap AI just secured $150M in Series C funding to expand its generative video platform."
        ],
        "technical_contributions": [
          {
            "type": "GitHub",
            "title": "efficient-attention",
            "url": "https://github.com/aris-thorne/efficient-attention"
          },
          {
            "type": "Medium",
            "title": "Beyond Transformers: The Future of Sequence Modeling",
            "url": "https://medium.com/ai-frontiers/beyond-transformers-..."
          }
        ]
      }
    }

    print("--- Stage 2 Workflow Started ---")
    print("Input JSON (from Stage 1):")
    print(json.dumps(sample_stage_1_json, indent=2))

    # Initialize the classifier
    try:
        classifier = PersonalityClassifier(model_path="./my_mbti_classifier")

        enriched_json = classifier.process_stage_1_output(sample_stage_1_json)


        print("\n--- Enriched JSON (Output for Stage 3) ---")
        print(json.dumps(enriched_json, indent=2))


        print(f"\nSuccessfully classified personality as: {enriched_json['profile_data']['inferred_mbti']}")

    except OSError:
        print("\n--- ERROR ---")
        print("Could not find the model directory './my_mbti_classifier'.")
        print("Please run the 'train_classifier.py' (Part 1) script first to train and save the model.")

--- Stage 2 Workflow Started ---
Input JSON (from Stage 1):
{
  "objective": "Send a job offer for a Senior AI Researcher role at our new robotics lab.",
  "optional_context": "Highlight our focus on multi-modal AI and mention the competitive publication bonus.",
  "profile_data": {
    "name": "Dr. Aris Thorne",
    "headline": "Principal AI Scientist @ QuantumLeap AI | Driving innovation in NLP and Generative Models",
    "career_history": [
      {
        "role": "Principal AI Scientist",
        "company": "QuantumLeap AI",
        "duration": "2022-Present",
        "description": "Leading the core research team for the 'Odyssey' large language model. Published 3 papers at NeurIPS."
      },
      {
        "role": "Senior Machine Learning Engineer",
        "company": "DataWeave Inc.",
        "duration": "2019-2022",
        "description": "Developed and deployed scalable NLP solutions for sentiment analysis, reducing inference costs by 30%."
      }
    ],
    "skills": [
    