# Project Title & Overview

This notebook implements a supervised text classification pipeline to automatically classify educational questions into Bloom's Taxonomy cognitive levels using a Multinomial Logistic Regression model. The workflow includes data loading, preprocessing, TF-IDF feature extraction, model training, and evaluation using standard classification metrics.

The Bloom's Taxonomy levels considered are: **Remembering**, **Understanding**, **Applying**, **Analyzing**, **Evaluating**, and **Creating**.

# Importing Required Libraries

This section installs and imports all libraries required to run the notebook in Google Colab.

In [1]:
!pip install nltk scikit-learn pandas matplotlib seaborn



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Set global visual style and random seed for reproducibility
sns.set(style='whitegrid')
np.random.seed(42)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# Dataset Loading

In this section, the Bloom's Taxonomy dataset is uploaded and loaded from a CSV file. The dataset is expected to contain the following columns:

- `question`: the educational question text
- `label`: the corresponding Bloom's Taxonomy level

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd

dataset_path = '/content/drive/MyDrive/dataset/blooms_taxonomy_dataset.csv'
df = pd.read_csv(dataset_path)
print(df.head())

                                           Questions Category
0  About what proportion of the population of the...      BT1
1  Correctly label the brain lobes indicated on t...      BT1
2                          Define compound interest.      BT1
3                  Define four types of traceability      BT1
4                               Define mercantilism.      BT1


# Exploratory Data Overview

In [None]:
print('Columns:', df.columns.tolist())

print('\nClass distribution (label counts):')
print(df['label'].value_counts())

# Compute question length in number of tokens (approximate, based on whitespace)
df['question_length'] = df['question'].astype(str).str.split().apply(len)

print('\nQuestion length summary (in tokens):')
print(df['question_length'].describe())

df.head()

In [None]:
# Plot class distribution
plt.figure(figsize=(8, 4))
sns.countplot(x='label', data=df, order=sorted(df['label'].unique()))
plt.title("Distribution of Bloom's Taxonomy Levels")
plt.xlabel("Bloom's Level")
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Plot distribution of question lengths
plt.figure(figsize=(8, 4))
sns.histplot(df['question_length'], bins=30, kde=True)
plt.title('Distribution of Question Lengths (in Tokens)')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Text Preprocessing


In [None]:
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text: str) -> str:
    """Preprocess a question string by normalizing, tokenizing,
    removing stopwords and punctuation, and lemmatizing tokens."""
    # Ensure input is a string and convert to lowercase
    text = str(text).lower()

    # Tokenization
    tokens = nltk.word_tokenize(text)

    cleaned_tokens = []
    for token in tokens:
        if token.isalpha() and token not in stop_words:
            lemma = lemmatizer.lemmatize(token)
            cleaned_tokens.append(lemma)

    # Join tokens back into a single string
    return ' '.join(cleaned_tokens)

# Apply preprocessing to all questions
df['cleaned_question'] = df['question'].astype(str).apply(preprocess_text)

df[['question', 'cleaned_question']].head()

# Feature Extraction using TF-IDF

In [None]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

tfidf_vectorizer

# Train–Test Split

In [None]:
# Encode Bloom's Taxonomy labels numerically
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

# Use the preprocessed text as input features
X_text = df['cleaned_question'].values

# Train–test split with stratification
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Fit TF-IDF only on the training data and transform both splits
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

print('Training TF-IDF shape:', X_train_tfidf.shape)
print('Test TF-IDF shape:', X_test_tfidf.shape)

# Multinomial Logistic Regression Model


In [None]:
log_reg_model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

log_reg_model.fit(X_train_tfidf, y_train)

print('Model training complete.')

# Model Evaluation

In [None]:
# Generate predictions on the test set
y_pred = log_reg_model.predict(X_test_tfidf)

# Compute evaluation metrics 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

print('Evaluation Metrics (Macro-Averaged):')
print(f'Accuracy:  {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall:    {recall:.4f}')
print(f'F1-score:  {f1:.4f}')

# Detailed per-class report
class_names = label_encoder.classes_
print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=class_names, zero_division=0))

# Confusion matrix visualization
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues', values_format='d')
plt.title("Confusion Matrix for Bloom's Taxonomy Classification")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()