##### Google Colab Notebook
# DistilBERT with ARI
## Inputs: Review + ARI

In [1]:
# Report whether PyTorch can see a CUDA device in this runtime.
# NOTE(review): the model cells in this notebook are built with TensorFlow/Keras, so this
# only reflects PyTorch's view of the GPU — confirm that this is the intended check.
import torch
torch.cuda.is_available()

True

In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read the dataset from it and
# save the trained model to it.
# This will prompt you to click on a link and get an authentication code
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

import textstat
# NOTE(review): this second import rebinds the name `tqdm` and therefore shadows the
# `tqdm.auto` import above; only one of the two imports is needed.
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras import layers, Input, Model

In [6]:
# Pandas progress bar: registers tqdm with pandas so that `progress_apply`
# (used below when computing the ARI column) is available.
tqdm.pandas()

In [7]:
# Read the book-review CSV from the mounted Google Drive into a DataFrame.
dataset_path = '/content/drive/My Drive/Datasets/Modified_Books_rating.csv'
df = pd.read_csv(dataset_path)

In [8]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,review/helpfulness,Rating,Review,helpful_0,helpful_1,helpful_ratio,ratio_percent,review_len,IsHelpful
0,36/37,5.0,Ruth Allman has written an excellent book abou...,36,37,0.972973,97,153,1
1,29/30,5.0,"I have been using this book since 1988, the ei...",29,30,0.966667,96,63,1
2,25/28,5.0,"My poor dogeared, stained copy of this book ca...",25,28,0.892857,89,206,1
3,3/20,1.0,This book in my opinion is biased and takes an...,3,20,0.15,15,92,0
4,20/20,5.0,If you're already a fan of the Eyewitness Trav...,20,20,1.0,100,234,1


## ARI

In [9]:
def calculate_ari(text):
    """Return the Automated Readability Index (ARI) of ``text``.

    Thin wrapper around ``textstat.automated_readability_index``; the score is
    passed through unchanged.
    """
    return textstat.automated_readability_index(text)

In [10]:
df['ARI'] = df['Review'].progress_apply(calculate_ari)

100%|██████████| 181104/181104 [00:18<00:00, 9551.67it/s]


In [11]:
# Count the reviews whose ARI is strictly below the readability threshold of 9.5.
# (The comment and printed label previously said 8, but the filter has always used 9.5.)
# Note: `convert_ari` labels ARI <= 9.5 as readable, so reviews with an ARI of exactly
# 9.5 are not included in this count.
low_ari_reviews = df[df['ARI'] < 9.5]
num_low_ari_reviews = len(low_ari_reviews)

print(f"Anzahl der Reviews mit ARI < 9.5: {num_low_ari_reviews}")

Anzahl der Reviews mit ARI < 8: 86477


In [12]:
# Count the reviews whose ARI is strictly above the readability threshold of 9.5.
# Stored under `high_*` names so the low-ARI subset from the previous cell is not
# overwritten by the opposite selection; the printed label now matches the 9.5 filter.
high_ari_reviews = df[df['ARI'] > 9.5]
num_high_ari_reviews = len(high_ari_reviews)

print(f"Anzahl der Reviews mit ARI > 9.5: {num_high_ari_reviews}")

Anzahl der Reviews mit ARI > 8: 92596


In [13]:
def convert_ari(df, threshold=9.5):
    """Map a row's ARI score to a binary readability flag.

    Despite its name, ``df`` is a single row (anything indexable by ``'ARI'``),
    as passed by ``DataFrame.apply(..., axis=1)``.

    Args:
        df: Row-like object with an ``'ARI'`` entry.
        threshold: Highest ARI still counted as readable. Defaults to 9.5.

    Returns:
        1 if ``df['ARI'] <= threshold`` (readable), otherwise 0 (not readable).
        A NaN score compares false and therefore yields 0.
    """
    return 1 if df['ARI'] <= threshold else 0

In [14]:
# Binary readability flag: 1 when ARI <= 9.5 (readable), 0 otherwise.
# Computed as one vectorised comparison instead of a row-wise `apply(axis=1)`;
# the result is the same (NaN compares false -> 0) without a Python call per row.
df['IsReadable'] = (df['ARI'] <= 9.5).astype(int)

In [15]:
# Preview the frame again, now including the ARI and IsReadable columns.
df.head()

Unnamed: 0,review/helpfulness,Rating,Review,helpful_0,helpful_1,helpful_ratio,ratio_percent,review_len,IsHelpful,ARI,IsReadable
0,36/37,5.0,Ruth Allman has written an excellent book abou...,36,37,0.972973,97,153,1,8.1,1
1,29/30,5.0,"I have been using this book since 1988, the ei...",29,30,0.966667,96,63,1,5.7,1
2,25/28,5.0,"My poor dogeared, stained copy of this book ca...",25,28,0.892857,89,206,1,7.6,1
3,3/20,1.0,This book in my opinion is biased and takes an...,3,20,0.15,15,92,0,10.9,0
4,20/20,5.0,If you're already a fan of the Eyewitness Trav...,20,20,1.0,100,234,1,18.3,0


## Model

In [16]:
# Split data: hold out 20% of the rows as test_df; random_state=42 makes the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [17]:
# Tokenize the text using DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# padding='max_length' pads every review to exactly `max_length` (512) tokens, so the
# resulting arrays always match the model's Input(shape=(512,)). padding=True would only
# pad to the longest review in each call, which is not guaranteed to be 512 wide.
# Longer reviews are truncated to 512 tokens.
train_text_tokens = tokenizer(
    list(train_df['Review']),
    padding='max_length',
    truncation=True,
    return_tensors='tf',
    max_length=512,
)
test_text_tokens = tokenizer(
    list(test_df['Review']),
    padding='max_length',
    truncation=True,
    return_tensors='tf',
    max_length=512,
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [18]:
# Define DistilBERT model: load the pretrained 'distilbert-base-uncased' weights
# into the TensorFlow model class (downloaded on first use, see output below).
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [21]:
# Define input layers
# input_text: int32 token ids, fixed at 512 positions per example.
input_text = Input(shape=(512,), name='input_text', dtype=tf.int32)
# input_len: a single float32 value per example.
# NOTE(review): despite the name, the cell that prepares train_inputs/test_inputs
# feeds the binary `IsReadable` flag into this input, not a length.
input_len = Input(shape=(1,), name='input_len', dtype=tf.float32)

In [22]:
# Token embeddings from DistilBERT
# Derive the attention mask from the token ids (1 = real token, 0 = [PAD]) so that
# self-attention ignores padding. When no mask is passed, the encoder attends to
# every position, including the padding added by the tokenizer.
attention_mask = tf.cast(tf.not_equal(input_text, tokenizer.pad_token_id), tf.int32)
# Keep only the hidden state at position 0 (the [CLS] token) as the review embedding.
text_embeddings = distilbert_model(input_text, attention_mask=attention_mask).last_hidden_state[:, 0, :]

In [23]:
# Join the DistilBERT embedding and the scalar numerical input along the last
# (feature) axis into one feature vector per example.
combined_features = layers.Concatenate(axis=-1)([text_embeddings, input_len])

In [24]:
# Classification head: one 256-unit ReLU hidden layer followed by a single
# sigmoid unit that outputs a value in (0, 1).
x = layers.Dense(units=256, activation='relu')(combined_features)
output = layers.Dense(units=1, activation='sigmoid')(x)

In [25]:
# Build the model: a functional Keras model with two inputs (token ids and the
# scalar numerical feature) and the sigmoid output defined above.
model = Model(inputs=[input_text, input_len], outputs=output)

In [26]:
# Binary classification setup: Adam with learning rate 1e-5,
# binary cross-entropy loss, accuracy reported as the metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [27]:
# Prepare input data: dictionaries keyed by the names of the model's Input layers.
# 'input_text' receives the token ids; 'input_len' receives the binary IsReadable
# flag (the layer name does not reflect what is actually fed in).
train_inputs = {'input_text': train_text_tokens['input_ids'], 'input_len': train_df['IsReadable'].values}
test_inputs = {'input_text': test_text_tokens['input_ids'], 'input_len': test_df['IsReadable'].values}

In [28]:
# Train the model on the IsHelpful label for 3 epochs with batch size 8.
# NOTE(review): the test split is passed as validation_data, so there is no separate
# held-out set that was unseen during training — confirm this is intended.
model.fit(train_inputs, train_df['IsHelpful'].values, epochs=3, batch_size=8, validation_data=(test_inputs, test_df['IsHelpful'].values))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7d0ed6a39a20>

In [29]:
# Save the trained model so that training does not have to be repeated.

# Save the model to a directory on the Colab VM.
# This location is temporary: it lives on the VM's local disk, not on Google Drive.
model.save('/content/my_model')



In [30]:
# Save to Google Drive (under the mounted /content/drive) so a copy exists outside the Colab VM.
model.save('/content/drive/MyDrive/Datasets/DistilBERT_Model_withARI')



In [31]:
from google.colab import files

# Zip the saved model files (shell command run on the Colab VM; -r recurses into the directory)
!zip -r /content/my_model.zip /content/my_model

# Download the zipped file through the browser
files.download("/content/my_model.zip")

  adding: content/my_model/ (stored 0%)
  adding: content/my_model/variables/ (stored 0%)
  adding: content/my_model/variables/variables.data-00000-of-00001 (deflated 12%)
  adding: content/my_model/variables/variables.index (deflated 77%)
  adding: content/my_model/keras_metadata.pb (deflated 94%)
  adding: content/my_model/saved_model.pb (deflated 92%)
  adding: content/my_model/assets/ (stored 0%)
  adding: content/my_model/fingerprint.pb (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>