# S-BERT and S-FastText for Bloom's Taxonomy Classification
**Final Implementation**

This notebook implements the research paper's methodology for classifying educational questions into Bloom's Taxonomy levels.
It uses an **Augmented Dataset** to address class imbalance and implements:
1. **S-FastText**: Supervised training using Facebook's FastText (Algorithm 2).
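2. **S-BERT**: Fine-tuning BERT for sequence classification (Algorithm 3). The semantic-enrichment step (QW / ROOT / NOUN / PROPN tagging) is implemented in `TextPreprocessor.semantic_enrichment`, but the models below are trained on the cleaned text only.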

**Dataset**: `../data/raw/bloom_questions_augmented.csv`


In [None]:
import os
import re
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_recall_fscore_support,
)
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import fasttext
from tqdm.auto import tqdm

# Ensure NLP resources are available.
# spaCy raises OSError when the requested model package is not installed, so
# only that case triggers a download; any other failure is left to surface.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy model...")
    # Download through spaCy's own CLI helper so the model is installed into
    # the interpreter running this kernel, not whatever "python" is on PATH.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Set Device: use the GPU when one is visible to PyTorch, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


In [None]:
# Hyperparameters and Configuration
# Every value a reader might want to tune lives in this one dictionary.
config = {
    'data': {
        'path': '../data/raw/bloom_questions_augmented.csv', # AUGMENTED DATASET
        'test_size': 0.2,
        'random_state': 42,
        'labels': ['Remember', 'Understand', 'Apply', 'Analyze', 'Evaluate', 'Create']
    },
    'fasttext': {
        'lr': 0.3,          # Paper Table 2
        'epochs': 10,       # Paper Table 2
        'wordNgrams': 2,    # Paper Table 2
        'dim': 100          # Default
    },
    'bert': {
        'model_name': 'bert-base-uncased',
        'max_length': 128,
        'batch_size': 16,
        'lr': 4e-5,         # Paper Table 3 (0.00004)
        'epochs': 3
    }
}

# Label Mappings: the position in config['data']['labels'] is the integer class id.
label_to_id = {label: idx for idx, label in enumerate(config['data']['labels'])}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Seed the random number generators used later (classifier-head initialisation,
# DataLoader shuffling) so that Restart & Run All gives repeatable results.
np.random.seed(config['data']['random_state'])
torch.manual_seed(config['data']['random_state'])

print("Configuration Loaded.")


In [None]:
class TextPreprocessor:
    """Text normalisation and optional semantic tagging for question strings.

    Args:
        nlp_model: A callable that turns a string into an iterable of tokens
            exposing ``text``, ``tag_``, ``dep_`` and ``pos_`` (a loaded spaCy
            pipeline in this notebook). Only ``semantic_enrichment`` uses it.
    """

    def __init__(self, nlp_model):
        self.nlp = nlp_model

    def clean_text(self, text):
        """Return ``text`` lower-cased, stripped of punctuation, on a single line.

        The input is coerced with ``str()``, every character that is not an
        ASCII letter, digit or whitespace is removed, and every run of
        whitespace (including newlines and tabs) is collapsed to one space.
        Collapsing matters because the FastText training file holds one
        example per line and FastText prediction rejects text with a newline.
        """
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        # Collapse interior whitespace so the result is always a single line.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def semantic_enrichment(self, text):
        """Return ``text`` with selected tokens suffixed by a role tag.

        Each token is lower-cased, then tagged by the first matching rule:

        * fine-grained tag starts with ``W`` (wh-words)  -> ``<word>_QW``
        * dependency label is ``ROOT``                   -> ``<word>_ROOT``
        * coarse POS is ``NOUN`` or ``PROPN``            -> ``<word>_NOUN`` / ``<word>_PROPN``

        All other tokens are kept unchanged, and tokens are joined with single
        spaces. For example, "What is an atom" could become
        "what_QW is_ROOT an atom_NOUN" (the exact tags depend on the parser).

        Note: the cells visible in this notebook train both models on
        ``clean_text`` output and do not call this method.
        """
        doc = self.nlp(text)
        enriched_tokens = []
        for token in doc:
            word = token.text.lower()
            if token.tag_.startswith('W'): # Wh-words
                word = f"{word}_QW"
            elif token.dep_ == 'ROOT':
                word = f"{word}_ROOT"
            elif token.pos_ in ['NOUN', 'PROPN']:
                word = f"{word}_{token.pos_}"
            enriched_tokens.append(word)
        return " ".join(enriched_tokens)

# Single shared preprocessor, backed by the spaCy pipeline loaded above.
preprocessor = TextPreprocessor(nlp_model=nlp)
print("Preprocessor Initialized.")


In [None]:
# Load Data
df = pd.read_csv(config['data']['path'])
print(f"Loaded Dataset: {len(df)} samples")
print(df['level'].value_counts())

# Map Labels (levels missing from label_to_id become NaN)
df['label_id'] = df['level'].map(label_to_id)

# Drop rows that cannot be used: unknown level, or no question text at all
# (a missing question would otherwise be cleaned into the literal token "nan").
# .copy() gives an independent frame so the column assignments below are unambiguous.
n_loaded = len(df)
df = df.dropna(subset=['question', 'label_id']).copy()
n_dropped = n_loaded - len(df)
if n_dropped:
    print(f"Dropped {n_dropped} rows with a missing question or an unknown level.")
df['label_id'] = df['label_id'].astype(int)

# Preprocess
tqdm.pandas(desc="Preprocessing")
df['clean_text'] = df['question'].progress_apply(preprocessor.clean_text)

# Stratified Split: hold out the test set first, then carve validation out of
# the remainder, keeping class proportions in every split.
# NOTE(review): if the augmented CSV contains paraphrases of the same source
# question, a row-level random split can put near-duplicates in both train and
# test — confirm how the augmentation was produced.
train_val_df, test_df = train_test_split(
    df,
    test_size=config['data']['test_size'],
    stratify=df['label_id'],
    random_state=config['data']['random_state']
)

train_df, val_df = train_test_split(
    train_val_df,
    # Fraction of the training portion used for validation (10% unless configured).
    test_size=config['data'].get('val_size', 0.1),
    stratify=train_val_df['label_id'],
    random_state=config['data']['random_state']
)

# Sanity check: the three splits partition the filtered dataset.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


## 1. S-FastText Model
Implementing **Algorithm 2** from the paper using Supervised FastText.


In [None]:
# Prepare Data for FastText (.txt files with __label__)
def prepare_fasttext_file(df, filename):
    """Write ``df`` to ``filename`` in FastText supervised format.

    One line is written per row: ``__label__<level> <clean_text>``.

    Args:
        df: DataFrame with a ``level`` column (class name) and a
            ``clean_text`` column (preprocessed question text).
        filename: Path of the UTF-8 text file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        # Iterate the two columns directly; cheaper than building a Series per row.
        for level, text in zip(df['level'], df['clean_text']):
            # Replace spaces in label with underscore so the label stays one token.
            label = str(level).replace(' ', '_')
            # Collapse all whitespace: an embedded newline would otherwise split
            # the example across two lines and leave the second one unlabeled.
            one_line_text = ' '.join(str(text).split())
            f.write(f"__label__{label} {one_line_text}\n")

# Write each split to disk in FastText's labelled-line format.
prepare_fasttext_file(train_df, 'bloom_train.txt')
prepare_fasttext_file(val_df, 'bloom_val.txt')
prepare_fasttext_file(test_df, 'bloom_test.txt')

print("Training S-FastText (Supervised)...")
ft_model = fasttext.train_supervised(
    input='bloom_train.txt',
    lr=config['fasttext']['lr'],
    epoch=config['fasttext']['epochs'],
    wordNgrams=config['fasttext']['wordNgrams'],
    # Pass the configured embedding size explicitly so editing the config
    # actually changes the model.
    dim=config['fasttext']['dim'],
    verbose=2
)
print("FastText Training Completed.")


In [None]:
# Evaluate FastText
print("Evaluating FastText on Test Set...")
test_texts = test_df['clean_text'].tolist()
true_labels = test_df['level'].tolist()

# Predict: for a list input, element [0] of the result holds one list of
# predicted labels per text; keep the top label and strip FastText's prefix.
ft_preds_raw = ft_model.predict(test_texts)
pred_labels = [p[0].replace('__label__', '') for p in ft_preds_raw[0]]

# Metrics — rows follow the Bloom order used by the confusion matrix below;
# zero_division=0 reports 0 (without a warning) for classes never predicted.
print(classification_report(
    true_labels, pred_labels,
    labels=config['data']['labels'],
    zero_division=0,
))

# Confusion Matrix
cm = confusion_matrix(true_labels, pred_labels, labels=config['data']['labels'])
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=config['data']['labels'], yticklabels=config['data']['labels'],
            ax=ax)
ax.set_title("S-FastText Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

# Paper-Style Metrics (Table 7): support-weighted averages over all classes.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, pred_labels, average='weighted', zero_division=0
)
acc = accuracy_score(true_labels, pred_labels)
print("\n[Table 7 Style] S-FastText Performance Metrics:")
print(pd.DataFrame({'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
                    'Value': [acc, precision, recall, f1]}))



## 2. S-BERT Model
Fine-tuning BERT with the augmented dataset.


In [None]:
# BERT Dataset Class
class BloomDataset(Dataset):
    """Map-style dataset that tokenizes one question per item.

    Args:
        texts: Indexable collection of question strings.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer (a Hugging Face tokenizer in this notebook).
        max_len: Every item is padded or truncated to this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly; this is the current API and replaces
        # the deprecated ``encode_plus``.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis of size 1; drop it so
            # the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(int(label), dtype=torch.long)
        }

# Initialize Tokenizer & DataLoaders
tokenizer = BertTokenizer.from_pretrained(config['bert']['model_name'])


def _split_to_dataset(frame):
    """Wrap one split's cleaned text and label ids in a BloomDataset."""
    return BloomDataset(
        frame['clean_text'].values,
        frame['label_id'].values,
        tokenizer,
        config['bert']['max_length'],
    )


train_ds = _split_to_dataset(train_df)
val_ds = _split_to_dataset(val_df)
test_ds = _split_to_dataset(test_df)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=config['bert']['batch_size'])
val_loader = DataLoader(val_ds, batch_size=config['bert']['batch_size'])
test_loader = DataLoader(test_ds, batch_size=config['bert']['batch_size'])


In [None]:
# Initialize BERT Model: pretrained encoder plus a classification head with
# one output per Bloom level, placed on the selected device.
bert_model = BertForSequenceClassification.from_pretrained(
    config['bert']['model_name'],
    num_labels=len(config['data']['labels']),
).to(device)

optimizer = AdamW(bert_model.parameters(), lr=config['bert']['lr'])

# One scheduler step per batch, so the schedule spans batches-per-epoch x epochs.
total_steps = len(train_loader) * config['bert']['epochs']
# Linear decay of the learning rate over the whole run, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

# Training Loop
def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Run one training pass over ``data_loader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: Yields dict batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors, and exposes ``.dataset``.
        optimizer: Optimizer over the model's parameters.
        device: Device the batches are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 scalar tensor
        (correct predictions / dataset size) and mean_loss is the mean of the
        per-batch losses. An empty loader yields ``(0.0, nan)``.
    """
    model = model.train()
    n_examples = len(data_loader.dataset)
    losses = []
    correct_predictions = 0

    for d in tqdm(data_loader, desc="Training"):
        # Clear gradients first, so anything left over from an interrupted
        # earlier run cannot leak into this update.
        optimizer.zero_grad()

        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )

        loss = outputs.loss
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        correct_predictions += torch.sum(preds == targets).item()
        losses.append(loss.item())

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    # Leave no gradients behind once the epoch is over.
    optimizer.zero_grad()

    if n_examples == 0 or not losses:
        # Nothing was processed: report zero accuracy and an undefined loss.
        return torch.tensor(0.0, dtype=torch.float64), float('nan')

    accuracy = torch.tensor(correct_predictions / n_examples, dtype=torch.float64)
    return accuracy, np.mean(losses)

# Fine-tune for the configured number of epochs, reporting training loss and
# accuracy after each pass.
num_epochs = config['bert']['epochs']
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_acc, train_loss = train_epoch(
        model=bert_model,
        data_loader=train_loader,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
    )
    print(f"Train loss {train_loss} accuracy {train_acc}")


In [None]:
# Evaluate BERT
def eval_model(model, data_loader, device):
    """Collect gold and predicted class ids over ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is a dict with ``input_ids``, ``attention_mask`` and ``labels``;
    the prediction is the argmax over the model's ``.logits``.

    Returns:
        ``(all_labels, all_preds)``: two flat lists of class ids in loader order.
    """
    model = model.eval()
    gold_ids, predicted_ids = [], []

    with torch.no_grad():
        for batch in data_loader:
            batch_targets = batch["labels"].to(device)
            batch_outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            batch_preds = batch_outputs.logits.argmax(dim=1)

            # Move results back to host memory before accumulating them.
            predicted_ids.extend(batch_preds.cpu().numpy())
            gold_ids.extend(batch_targets.cpu().numpy())

    return gold_ids, predicted_ids

print("Evaluating BERT on Test Set...")
y_true, y_pred = eval_model(bert_model, test_loader, device)

# Convert IDs to Names
y_true_names = [id_to_label[i] for i in y_true]
y_pred_names = [id_to_label[i] for i in y_pred]

# Rows follow the Bloom order used by the confusion matrix below;
# zero_division=0 reports 0 (without a warning) for classes never predicted.
print(classification_report(
    y_true_names, y_pred_names,
    labels=config['data']['labels'],
    zero_division=0,
))

cm = confusion_matrix(y_true_names, y_pred_names, labels=config['data']['labels'])
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=config['data']['labels'], yticklabels=config['data']['labels'],
            ax=ax)
ax.set_title("S-BERT Confusion Matrix")
# Axis labels match the S-FastText plot so the two matrices read the same way.
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

# Paper-Style Metrics (Table 7): support-weighted averages over all classes.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true_names, y_pred_names, average='weighted', zero_division=0
)
acc = accuracy_score(y_true_names, y_pred_names)
print("\n[Table 7 Style] S-BERT Performance Metrics:")
print(pd.DataFrame({'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
                    'Value': [acc, precision, recall, f1]}))



In [None]:
# Combined Inference
def predict_bloom(question):
    """Print the Bloom level predicted for ``question`` by both models.

    The question is cleaned with the shared preprocessor, then classified by
    the FastText model and by the fine-tuned BERT model. For each model the
    top label and its confidence are printed; nothing is returned.

    Args:
        question: Raw question text.
    """
    print(f"Question: \"{question}\"")
    # FastText prediction handles one line at a time, so make sure the cleaned
    # text contains no interior newlines or other whitespace runs.
    clean_q = " ".join(preprocessor.clean_text(question).split())

    # 1. FastText
    ft_res = ft_model.predict(clean_q)
    ft_label = ft_res[0][0].replace('__label__', '')
    ft_conf = ft_res[1][0]
    print(f"  [S-FastText] {ft_label} ({ft_conf:.2%})")

    # 2. BERT — call the tokenizer directly (replaces the deprecated encode_plus).
    inputs = tokenizer(
        clean_q,
        return_tensors='pt',
        max_length=config['bert']['max_length'],
        truncation=True,
        padding='max_length'
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Put the model in eval mode here instead of relying on an earlier cell
    # having done so; in train mode dropout would make predictions random.
    bert_model.eval()
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs.logits, dim=1)
        pred_idx = torch.argmax(probs, dim=1).item()

    bert_label = id_to_label[pred_idx]
    bert_conf = probs[0][pred_idx].item()
    print(f"  [S-BERT]     {bert_label} ({bert_conf:.2%})\n")

# Tests: quick smoke check of both models on a few hand-written questions.
sample_questions = (
    "Design a new experiment to test the hypothesis.",
    "What is the capital of France?",
    "Compare and contrast the two theories.",
)
for sample_question in sample_questions:
    predict_bloom(sample_question)


In [None]:
# --- SAVE MODELS ---
print("Saving Models...")

# 1. Save S-FastText Model
s_fasttext_path = "s-fasttext.bin"
ft_model.save_model(s_fasttext_path)
print(f"S-FastText model saved to '{s_fasttext_path}'")

# 2. Save S-BERT Model
s_bert_dir = "./s-bert_model"
# exist_ok=True makes this safe to re-run and avoids a check-then-create race.
os.makedirs(s_bert_dir, exist_ok=True)

# Save full pretrained model (config + weights) together with its tokenizer,
# so the directory can be reloaded with from_pretrained().
bert_model.save_pretrained(s_bert_dir)
tokenizer.save_pretrained(s_bert_dir)

# Also save specific .pth state dictionary
torch.save(bert_model.state_dict(), "s-bert.pth")
print(f"S-BERT model saved to directory '{s_bert_dir}' and file 's-bert.pth'")
