# Welcome to recommended age classification
This is a project where I try to classify the recommended reading age for children stories based on their descriptions 

We start by importing the necessary libraries.

In [31]:
import numpy as np
import pandas as pd
import plotly.express as px
import spacy # Used for lemmatization & stop word removal

from tensorflow.keras.preprocessing.text import Tokenizer # Used to tokenize thte words into sequences of integers
from tensorflow.keras.preprocessing.sequence import pad_sequences # Used to get the same length of all samples after tokenized
from sklearn.model_selection import train_test_split # Sklearn has a way to split train and test data which is easy to use
import tensorflow as tf


In [32]:
df = pd.read_csv("children_stories.Csv", encoding='ISO-8859-1')
# df # uncomment to look at the dataset

In [33]:
# What a description can look like
df["desc"][0]

'Was it just another game of hide and seek? No. It was not. First she fell into a deep, dark hole in the ground and then they found a treasure. Did it end there? No! It did not. Read more about this thrilling adventure of Sally and friends in this free illustrated kidsâ\x80\x99 book. The fun never ends when Sallyâ\x80\x99s around! '

We need the unique age categories in order to group them

In [34]:
sorted(list(df['cats'].unique())) 

['Age ',
 'Age  0-3',
 'Age 0+',
 'Age 0-2',
 'Age 0-3',
 'Age 0-4',
 'Age 0-5',
 'Age 0-6',
 'Age 1+',
 'Age 1-2',
 'Age 1-3',
 'Age 1-4',
 'Age 1-5',
 'Age 1-6',
 'Age 10+',
 'Age 10-14',
 'Age 11+',
 'Age 11-14',
 'Age 11-15',
 'Age 12+',
 'Age 13+',
 'Age 2+',
 'Age 2-4',
 'Age 2-5',
 'Age 2-6',
 'Age 2-7',
 'Age 2-9',
 'Age 3+',
 'Age 3-4',
 'Age 3-5',
 'Age 3-6',
 'Age 3-7',
 'Age 4+',
 'Age 4-11',
 'Age 4-5',
 'Age 4-6',
 'Age 4-7',
 'Age 4-8',
 'Age 5+',
 'Age 5-8',
 'Age 5-9',
 'Age 6+',
 'Age 6-11',
 'Age 6-8',
 'Age 6-9',
 'Age 6months+',
 'Age 7+',
 'Age 7-10',
 'Age 7-11',
 'Age 7-12',
 'Age 7-9',
 'Age 8+',
 'Age 8-11',
 'Age 8-12',
 'Age 9+',
 'Age 9-11',
 'Age 9-12',
 'Age 9-13',
 'Age 9-14']

Right now we are only interested in the recommended reading age since that is what we are going to predict
we create the labels by getting unique age categories
There is one entry that has no age so we remove that one. (it only says "Age ")

In [35]:
df = df.drop(df.query("cats == 'Age '").index, axis=0).reset_index(drop=True)
df['cats'].value_counts() / len(df['cats']) # See the percentage in the split of categories
# The split is quite low for each of the current categories so we should group them together

Age 9+          0.074766
Age 3+          0.074766
Age 7+          0.063084
Age 8+          0.060748
Age 10+         0.060748
Age 6+          0.049065
Age 2-9         0.046729
Age 4+          0.046729
Age 11+         0.044393
Age 5+          0.039720
Age 0-4         0.037383
Age 0+          0.030374
Age 2+          0.028037
Age 12+         0.028037
Age 0-3         0.025701
Age 2-6         0.023364
Age 3-6         0.021028
Age 2-5         0.021028
Age 0-5         0.018692
Age 3-5         0.016355
Age 1-3         0.014019
Age 1-5         0.011682
Age 3-7         0.011682
Age 1-4         0.009346
Age 8-12        0.009346
Age 7-11        0.009346
Age 13+         0.007009
Age 6-11        0.007009
Age 4-8         0.007009
Age 9-11        0.007009
Age 5-8         0.007009
Age 4-6         0.007009
Age 1+          0.004673
Age 4-7         0.004673
Age 6-8         0.004673
Age 9-13        0.004673
Age 7-10        0.004673
Age 5-9         0.004673
Age 9-12        0.004673
Age 7-9         0.004673


In [36]:
def group(entry):
    zero = ['Age  0-3', 'Age 0+', 'Age 0-2', 'Age 0-3', 'Age 0-4', 'Age 0-5', 'Age 0-6', 'Age 6months+']
    one = ['Age 1+', 'Age 1-2', 'Age 1-3', 'Age 1-4', 'Age 1-5', 'Age 1-6']
    two = ['Age 2+', 'Age 2-4', 'Age 2-5', 'Age 2-6', 'Age 2-7', 'Age 2-9']
    three = [ 'Age 3+', 'Age 3-4', 'Age 3-5', 'Age 3-6', 'Age 3-7']
    four = [ 'Age 4+', 'Age 4-11', 'Age 4-5', 'Age 4-6', 'Age 4-7', 'Age 4-8']
    five = ['Age 5+', 'Age 5-8', 'Age 5-9']
    six = ['Age 6+', 'Age 6-11', 'Age 6-8', 'Age 6-9']
    seven = [ 'Age 7+', 'Age 7-10', 'Age 7-11', 'Age 7-12', 'Age 7-9']
    eight = ['Age 8+', 'Age 8-11', 'Age 8-12']
    nine = ['Age 9+', 'Age 9-11', 'Age 9-12', 'Age 9-13', 'Age 9-14']
    ten = [ 'Age 10+', 'Age 10-14', 'Age 11+', 'Age 11-14', 'Age 11-15', 'Age 12+', 'Age 13+']
    return 0 if entry in zero else 1 if entry in one else 2 if entry in two else 3 if entry in three else 4 if entry in four else 5 if entry in five else 6 if entry in six else 7 if entry in seven else 8 if entry in eight else 9 if entry in nine else 10
df['cats'] = df['cats'].apply(group)
print(df['cats'].value_counts() / len(df['cats']))
print(df['cats'])
# Still pretty bad for some, I would like 4 categories to not have them too small, we can get the starting age as the label
# we could probably group 0-2, 3-5, 6-8, 9+

10    0.147196
3     0.126168
2     0.123832
0     0.123832
9     0.093458
7     0.084112
8     0.072430
4     0.070093
6     0.063084
5     0.051402
1     0.044393
Name: cats, dtype: float64
0       2
1       2
2       2
3       2
4       2
       ..
423    10
424     8
425     9
426     9
427     7
Name: cats, Length: 428, dtype: int64


In [37]:
# Now we get a more even split with 5 categories, they are named after their starting age respectively
def group2(entry):
    return 0 if entry < 3 else 3 if entry < 6 else 6 if entry < 9 else 9

df['cats'] = df['cats'].apply(group2)
print(df['cats'].value_counts() / len(df['cats']))
print(df['cats'])

0    0.292056
3    0.247664
9    0.240654
6    0.219626
Name: cats, dtype: float64
0      0
1      0
2      0
3      0
4      0
      ..
423    9
424    6
425    9
426    9
427    6
Name: cats, Length: 428, dtype: int64


We need a one-hot representation for a keras categorical crossentropy loss so lets make df['cats'] to a one-hot representation

In [38]:
df['cats'] = pd.get_dummies(df['cats']).to_numpy().tolist()
# We get the following split
# 3 = [0, 1, 0, 0] - 29.2%
# 0 = [1, 0, 0, 0] - 24.8%
# 9 = [0, 0, 0, 1] - 24.1%
# 6 = [0, 0, 1, 0] - 22%

In [39]:
print(df['cats'].value_counts() / len(df['cats']))
print(df['cats'])

[1, 0, 0, 0]    0.292056
[0, 1, 0, 0]    0.247664
[0, 0, 0, 1]    0.240654
[0, 0, 1, 0]    0.219626
Name: cats, dtype: float64
0      [1, 0, 0, 0]
1      [1, 0, 0, 0]
2      [1, 0, 0, 0]
3      [1, 0, 0, 0]
4      [1, 0, 0, 0]
           ...     
423    [0, 0, 0, 1]
424    [0, 0, 1, 0]
425    [0, 0, 0, 1]
426    [0, 0, 0, 1]
427    [0, 0, 1, 0]
Name: cats, Length: 428, dtype: object


## Pre-processing

With our labels fixed we preprocess the descriptions using SpaCy

In [40]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) # Used for stop word removal, nummerical removal and lemmatization

In [41]:
def preprocess(text):
    doc = nlp(text)
    # replace .lemma_ with .orht_ to skip lemmatization
    text = ' '.join([token.lemma_ for token in doc if not token.is_stop and token.lemma_.isalpha()]) # we still want a string returned so we join on ws before returning
    return text

We create copies so we dont have to work with the entire dataframe any longer since we unly use 2 of its 3 columns and we also apply the pre-processing to the descriptions

In [42]:
descriptions = df['desc'].copy().apply(preprocess)
labels_unprocessed = df['cats'].copy()

Before the labels are converted into a numpy ndarray which we need them to be in order to send as input to the network

In [43]:
labels_unprocessed

0      [1, 0, 0, 0]
1      [1, 0, 0, 0]
2      [1, 0, 0, 0]
3      [1, 0, 0, 0]
4      [1, 0, 0, 0]
           ...     
423    [0, 0, 0, 1]
424    [0, 0, 1, 0]
425    [0, 0, 0, 1]
426    [0, 0, 0, 1]
427    [0, 0, 1, 0]
Name: cats, Length: 428, dtype: object

labels need to be of the right type in order to be passed to the network, an ndarray

In [44]:
labels = np.vstack(labels_unprocessed.values)
labels

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0]])

We use keras Tokenizer to turn our words in to a integer sequence with a unitue integer for each word is represented through a unique integer.
In case we missed some filterings for puncuations and special character when using spacy the tokenizer will remove them. It will also split the string on whitespaces and form words.
We are fitting the tokenizer on the descriptions.

In [45]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(descriptions)
descriptions = tokenizer.texts_to_sequences(descriptions)

We get this from the length of the tokenizer's word_index, and add 1 for the zero padding.
We also get the longest description and pad the descriptions to this length.

In [46]:
vocab_len = len(tokenizer.word_index) + 1
max_desc_length = np.max(list(map(lambda desc: len(desc), descriptions)))
descriptions = pad_sequences(descriptions, maxlen=max_desc_length, padding='post')
print("Vocab len:", vocab_len)
print("Max len:", max_desc_length)

Vocab len: 5320
Max len: 144


In [47]:
print(descriptions.shape) #Shape
descriptions # We see all padded now

(428, 144)


array([[ 209,  210,  349, ...,    0,    0,    0],
       [  21,  109,   12, ...,    0,    0,    0],
       [   7, 1314,   23, ...,    0,    0,    0],
       ...,
       [ 302,   23, 1506, ...,    0,    0,    0],
       [ 190,   90,  537, ...,    0,    0,    0],
       [   9,   45, 5312, ...,    0,    0,    0]])

We use sklearn's train_test_split() function to split into train and test sets. Random state 50 so these results can be reproduced.

In [48]:
descriptions_train, descriptions_test, labels_train, labels_test = train_test_split(descriptions, labels, train_size=0.8, random_state=50)

In [49]:
# We can try to augment the training data by duplicating it to get more samples, did not give any improvements tho
# labels_train = np.vstack((labels_train, labels_train))
# descriptions_train = np.vstack((descriptions_train, descriptions_train))

## Model

Descriptions is our input and with the padding all have the same size, "max_desc_length"

The embedding layer is used to embed the descriptions to a high-dimensional vectro space
This allow learning representations for words, rather than manually creating them.

With a GRU layer we capture time-dependent information. So we pass the descriptions through one.

In [50]:
input = tf.keras.Input(shape=(max_desc_length,), name="descriptions")

embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_len,
    output_dim=128,
    input_length=max_desc_length,
    name="embedding"
)(input)

gru_layer = tf.keras.layers.GRU(
    units=256,
    return_sequences=True,
    name="gru_layer"
)(embedding_layer)

desc_flatten = tf.keras.layers.Flatten(name="flatten")(gru_layer)

output = tf.keras.layers.Dense(len(labels[0]), activation='sigmoid', name="output")(desc_flatten)

In [51]:
model = tf.keras.Model(inputs=input, outputs=output)
print(model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
descriptions (InputLayer)    [(None, 144)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 144, 256)          1361920   
_________________________________________________________________
gru_layer (GRU)              (None, 144, 128)          148224    
_________________________________________________________________
flatten (Flatten)            (None, 18432)             0         
_________________________________________________________________
output (Dense)               (None, 4)                 73732     
Total params: 1,583,876
Trainable params: 1,583,876
Non-trainable params: 0
_________________________________________________________________
None


## Train

We train the network with NAdam as an optimizer and categorical_crossentropy loss since we use multiple categories

In [52]:
batch_size = 32
epochs = 10

model.compile(
    optimizer='nadam',
    loss='categorical_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)

[print(i.shape, i.dtype) for i in model.inputs]
[print(o.shape, o.dtype) for o in model.outputs]

(None, 144) <dtype: 'float32'>
(None, 4) <dtype: 'float32'>


[None]

In [53]:
history = model.fit(
    descriptions_train,
    labels_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(),
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3) # when there is no improvement we should terminate early
    ]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Lastly we evaluate on the test set.

In [54]:
results = model.evaluate(descriptions_test, labels_test)

print("Accuracy:", results[1])
print("ROC AUC:", results[2])

Accuracy: 0.5813953280448914
ROC AUC: 0.8194745182991028
