# Educational Reading Level Classification

## Import Libraries

In [176]:
import datasets, evaluate, accelerate
import random
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import AutoTokenizer
from datasets import Dataset

## Load Data

In [177]:
# Load fiction and nonfiction data
all_fiction_data = pd.read_csv("data/fiction.csv")
all_nonfiction_data = pd.read_csv("data/nonfiction.csv")

In [178]:
# Shuffle data randomly
all_fiction_data = all_fiction_data.sample(frac=1)
all_nonfiction_data = all_nonfiction_data.sample(frac=1)

## Clean Data

In [184]:
# Rename columns for consistency
all_fiction_data = all_fiction_data.rename(columns={"passage": "text",
                                                    "reading_level": "label"})

In [186]:
# Check unique classes
all_fiction_data['label'].unique()

array(['High', 'Middle ', 'High ', 'Elementary', 'Middle'], dtype=object)

In [187]:
# Combine classes that are the same
map_levels = {
    'Middle ': 'Middle',
    'High ': 'High'
}

all_fiction_data['label'] = all_fiction_data['label'].replace(map_levels)

In [188]:
# Check amount of data in each class
all_fiction_data['label'].value_counts()

label
Middle        1746
High          1741
Elementary    1661
Name: count, dtype: int64

In [189]:
# View data
all_fiction_data

Unnamed: 0,text,label
3692,"When she wrote about that night, she held no a...",High
4899,I talked to her almost every day of working ha...,High
4910,"However, though he took these freedoms with me...",High
3154,She really does care about the Doctor but she ...,Middle
4413,"The chauffeur, who had been opening something,...",High
...,...,...
413,"""No, but he wishes he'd never been born. Mothe...",Elementary
402,She led him round the laurel path and to the w...,Elementary
4428,"Even if they took him, she said, she would go ...",High
2084,Esperanza smiled. When the grapes delivered th...,Middle


In [190]:
# Convert to Hugging Face Dataset
all_fiction_data = Dataset.from_pandas(all_fiction_data)
all_nonfiction_data = Dataset.from_pandas(all_nonfiction_data)

## Prepare Data for Classification

In [191]:
id2label = {0: "Elementary", 1: "Middle", 2: "High"}
label2id = {"Elementary": 0, "Middle": 1, "High": 2}

In [192]:
all_fiction_data

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 5148
})

In [193]:
# Turn labels into 0, 1, or 2
def map_labels_to_number(example):
  example["label"] = label2id[example["label"]]
  return example

fiction_data = all_fiction_data.map(map_labels_to_number)

Map:   0%|          | 0/5148 [00:00<?, ? examples/s]

In [194]:
# Split data into training and test sets
fiction_data = fiction_data.train_test_split(test_size=0.2, seed=42)

In [195]:
fiction_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4118
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1030
    })
})

## Tokenize Data

In [196]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="distilbert/distilbert-base-uncased",
                                          use_fast=True)

tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [198]:
def tokenize_text(examples):
    """
    Tokenize given example text and return the tokenized text.
    """
    return tokenizer(examples["text"],
                     padding=True,
                     truncation=True)

In [199]:
# Map tokenize_text function to the dataset
tokenized_dataset = fiction_data.map(function=tokenize_text,
                              batched=True,
                              batch_size=1000)

tokenized_dataset

Map:   0%|          | 0/4118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1030 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 4118
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1030
    })
})

In [200]:
# Get two samples from the tokenized dataset
train_tokenized_sample = tokenized_dataset["train"][0]
test_tokenized_sample = tokenized_dataset["test"][0]

for key in train_tokenized_sample.keys():
    print(f"[INFO] Key: {key}")
    print(f"Train sample: {train_tokenized_sample[key]}")
    print(f"Test sample: {test_tokenized_sample[key]}")
    print("")


[INFO] Key: text
Train sample: Dorset greeted the sally with delight. “Oh, abominably—you’ve just hit it—keeps me awake at night. The doctors tell me that’s what has knocked my digestion out—being so infernally jealous of her.—I can’t eat a mouthful of this stuff, you know,” he added suddenly, pushing back his plate with a clouded countenance; and Lily, unfailingly adaptable, accorded her radiant attention to his prolonged denunciation of other people’s cooks, with a supplementary tirade on the toxic qualities of melted butter.
Test sample: I never saw so much expression in an inanimate thing before, and we all know how much expression they have! I used to lie awake as a child and get more entertainment and terror out of blank walls and plain furniture than most children could find in a toy-store. I remember what a kindly wink the knobs of our big, old bureau used to have, and there was one chair that always seemed like a strong friend.

[INFO] Key: label
Train sample: 1
Test sample: 1

## Set Up Evaluation Metric

In [201]:
import evaluate
import numpy as np
from typing import Tuple

In [202]:
accuracy_metric = evaluate.load("accuracy")

def compute_accuracy(predictions_and_labels: Tuple[np.array, np.array]):
  """
  Computes the accuracy of a model by comparing the predictions and labels.
  """
  predictions, labels = predictions_and_labels

  # Get highest prediction probability of each prediction if predictions are probabilities
  if len(predictions.shape) >= 2:
    predictions = np.argmax(predictions, axis=1)

  return accuracy_metric.compute(predictions=predictions, references=labels)


## Set Up Model for Training

In [203]:
from transformers import AutoModelForSequenceClassification

# Setup model for fine-tuning with classification head (top layers of network)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [204]:
def count_params(model):
    """
    Count the parameters of a PyTorch model.
    """
    trainable_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_parameters = sum(p.numel() for p in model.parameters())

    return {"trainable_parameters": trainable_parameters, "total_parameters": total_parameters}

# Count # parameters of the model
count_params(model)

{'trainable_parameters': 66955779, 'total_parameters': 66955779}

All parameters in the model are trainable!

### Create Directory for Saving Models

In [210]:
# Create model output directory
from pathlib import Path

# Create models directory
models_dir = Path("models")
models_dir.mkdir(exist_ok=True)

# Create model save name
model_save_name = "reading_level_text_classifier-distilbert-base-uncased"

# Create model save path
model_save_dir = Path(models_dir, model_save_name)

model_save_dir

PosixPath('models/reading_level_text_classifier-distilbert-base-uncased')

### Set Up Training Arguments

In [225]:
from transformers import TrainingArguments

print(f"[INFO] Saving model checkpoints to: {model_save_dir}")

# Create training arguments
training_args = TrainingArguments(
    output_dir=model_save_dir,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    fp16=False,
    bf16=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    gradient_checkpointing=False,
    no_cuda=True,
    save_total_limit=3, # limit the total amount of save checkpoints (so we don't save num_epochs checkpoints)
    use_cpu=False, # set to False by default, will use CUDA GPU or MPS device if available
    seed=42, # set to 42 by default for reproducibility
    load_best_model_at_end=True, # load the best model when finished training
    logging_strategy="epoch", # log training results every epoch
    report_to="none", # optional: log experiments to Weights & Biases/other similar experimenting tracking services (we'll turn this off for now) 
    # push_to_hub=True # optional: automatically upload the model to the Hub (we'll do this manually later on)
    # hub_token="your_token_here" # optional: add your Hugging Face Hub token to push to the Hub (will default to huggingface-cli login)
    hub_private_repo=False # optional: make the uploaded model private (defaults to False)
)

[INFO] Saving model checkpoints to: models/reading_level_text_classifier-distilbert-base-uncased


### Set Up Trainer Instance

In [226]:
tokenized_dataset["train"]

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4118
})

In [227]:
from transformers import Trainer

# Setup Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy
)

  trainer = Trainer(


## Training Text Classification Model

In [None]:
# Train a text classification model
results = trainer.train()

Epoch,Training Loss,Validation Loss
