# This notebook contains all the data preprocessing steps for the Data Centric AI competition

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from keras import Sequential, layers
from sklearn.model_selection import train_test_split

### Importing training data

In [2]:
# Importing the data
train =  pd.read_csv('train.csv')
train.head()

Unnamed: 0,review_id,review_text,label
0,57,I have been a Maximum PC reader since its begi...,5
1,97,"If you into gaming, into the latest PC tech in...",5
2,147,This is a great magazine.... Subscribed for m...,4
3,148,sucks (period),1
4,151,too many ads the whole magazine is ads. I cant...,2


### Importing validation data

In [3]:
val = pd.read_csv('validation.csv')
val.head()

Unnamed: 0,review_id,review_text
0,33826,"love this magazine, so glad I have to option t..."
1,87313,Highly dissatisfied. Ordered BYOU magazine for...
2,80853,This magazine fans the flame of the Airstream ...
3,86459,My granddaughter loves this...
4,18307,Only after taking my money for a Kindle Editon...


In [4]:
# extract the necessary features and labels from train dataset
sentences = list(train.review_text)
labels = list(train.label)
labels = [x-1 for x in labels]

In [5]:
# Extract the necessary features and labels from validation dataset
test_sentences = list(val.review_text)
print(f"No. of validation samples: {len(test_sentences)}")

No. of validation samples: 500


In [6]:
# Get some information about the dataset
print(f"Number of total samples: {len(sentences)}")

Number of total samples: 12652


In [7]:
# Find the values counts of labels 
train.label.value_counts()

5    4071
1    3440
4    1994
3    1873
2    1274
Name: label, dtype: int64

Therefore, this is a classification problem with 5 classes.

In [8]:
# Visualize random samples
import random

rand_ind = random.choice(np.arange(len(sentences)))
print(f"Random review text: {sentences[rand_ind]}\n")
print(f"Label for the sample review text: {labels[rand_ind]}")

Random review text: Kindle or news stand edition, it's still a rag suitable only for the aluminum foil brigade or Obamamaniacs. May be useful if have a birdcage to line.

Label for the sample review text: 0


In [9]:
# Check for any null values
train.review_text.isnull().sum()

0

### Preprocessing text


In [10]:
# splitting the data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.1)

len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(11386, 1266, 11386, 1266)

In [11]:
# Find the average length of the sentences
avg_size = train['review_text'].str.len().mean()
print(avg_size)

# let's round it and make the max_tokens to 100
num_tokens = 100

# Let's set the embedding dimension parameter
embedding_dim = 512

92.32888080935821


In [12]:
#  Create a tokenizer to convert words to tokens.
tokenizer = layers.TextVectorization(max_tokens=10000,
                                     output_sequence_length=100)

tokenizer.adapt(train_sentences)

In [13]:
# Try to tokenize a random sentence
tokenizer(random.choice(train_sentences))

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([   7,   12, 1158,    5,   41, 3973,    5,  186,  580,  359,   10,
        447,  697,  739, 2672,    2,   48,   14,  162,  428,    5, 1327,
         27,    8, 2668,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int64)>

## Building and training a model

In [14]:
# Create a embedding layer
embedding_layer = layers.Embedding(input_dim=10000,
                                   output_dim=embedding_dim,
                                   input_length=num_tokens,
                                   )


embedding_layer(tokenizer(random.choice(train_sentences))).shape


TensorShape([100, 512])

In [15]:
# Create a models with lstm layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = tokenizer(inputs) # turn the input text into numbers
x = embedding_layer(x) # create an embedding of the numerized numbers
x = layers.LSTM(64, return_sequences=True)(x)
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(5, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
lstm_model = tf.keras.Model(inputs, outputs, name="lstm_model_dense") # construct the model

In [16]:
# compile the model
lstm_model.compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics='accuracy')

# Train the model
lstm_model.fit(train_sentences, train_labels,
                epochs=100,
                validation_data=[val_sentences, val_labels])

Epoch 1/100

KeyboardInterrupt: 