# Educational Reading Level Classification

## Import Libraries

In [1]:
import datasets, evaluate, accelerate
import random
import numpy as np
import pandas as pd
import torch
import transformers

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
# Load fiction and nonfiction data
all_fiction_data = pd.read_csv("data/fiction.csv")
all_nonfiction_data = pd.read_csv("data/nonfiction.csv")

In [3]:
# Shuffle data randomly
all_fiction_data = all_fiction_data.sample(frac=1)
all_nonfiction_data = all_nonfiction_data.sample(frac=1)

## Clean Data

In [4]:
# Check unique classes
all_fiction_data['reading_level'].unique()

array(['High', 'Elementary', 'Middle', 'High ', 'Middle '], dtype=object)

In [5]:
# Combine classes that are the same
map_levels = {
    'Middle ': 'Middle',
    'High ': 'High'
}

all_fiction_data['reading_level'] = all_fiction_data['reading_level'].replace(map_levels)

In [6]:
# Check amount of data in each class
all_fiction_data['reading_level'].value_counts()

reading_level
Middle        1746
High          1741
Elementary    1661
Name: count, dtype: int64

In [7]:
# View data
all_fiction_data

Unnamed: 0,passage,reading_level
5110,Its somewhat ambitious title was “The Book of ...,High
643,"That night, and for many nights after, the Vel...",Elementary
3576,"But now, when the boat swept under the merchan...",High
1966,"It’s real easy, he doesn’t weigh much and I’m ...",Middle
1403,"“Let's make a slide,” cried Little Joe Otter.",Elementary
...,...,...
956,"But the Cricket, who was a wise old philosophe...",Elementary
465,"""I wonder where I am,"" said Milo in a very wor...",Elementary
2161,My uncle continued counting and writing ; his ...,Middle
456,"""I think I can find my own way,"" said Milo, no...",Elementary


## Prepare Data for Classification

In [8]:
# Make a copy of the original data to modify
prepared_fiction_data = all_fiction_data.copy(deep=True)

In [9]:
# Map data from labels to numbers
map_levels = {
    'Elementary': 0,
    'Middle': 1,
    'High': 2
}
prepared_fiction_data['reading_level'] = prepared_fiction_data['reading_level'].replace(map_levels)

  prepared_fiction_data['reading_level'] = prepared_fiction_data['reading_level'].replace(map_levels)


In [10]:
# Split data into training and test sets
test = prepared_fiction_data.sample(frac=0.2)
train = prepared_fiction_data.drop(test.index)

In [11]:
test['reading_level'].value_counts()

reading_level
2    354
1    345
0    331
Name: count, dtype: int64

In [12]:
train['reading_level'].value_counts()

reading_level
1    1401
2    1387
0    1330
Name: count, dtype: int64