##### Google Colab Notebook
# DistilBERT with Rating
## Inputs: Review + Rating

In [1]:
# Sanity-check that the runtime has a CUDA device visible to PyTorch.
# NOTE(review): the model in this notebook is built with TensorFlow/Keras, so
# this is only an indirect signal for the training cells further down.
import torch

cuda_ready = torch.cuda.is_available()
cuda_ready

True

In [2]:
# Attach Google Drive to the Colab VM so files under it (the dataset, the saved
# model) are reachable from this notebook. Running this prompts for an
# authorization link / code.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

import textstat
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras import layers, Input, Model

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [4]:
# Pandas Progress bar
# Registers tqdm's pandas integration on whichever `tqdm` is currently bound.
# The imports cell runs `from tqdm import tqdm` after `from tqdm.auto import tqdm`,
# so the name refers to the plain tqdm class here, not the auto-selected variant.
tqdm.pandas()

In [5]:
# Read the pre-processed book-review table from the mounted Drive into `df`.
dataset_path = '/content/drive/My Drive/Datasets/Modified_Books_rating.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [6]:
# Peek at the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,Title,helpful,Rating,Title.1,Review,helpful_0,helpful_1,ratio_percent,review_len,IsHelpful,cleaned_reviews
0,Alaska Sourdough,36/37,5.0,Real Alaskan Sourdough,Ruth Allman has written an excellent book abou...,36,37,97,153,1,ruth allman written excellent book alaskan sou...
1,Alaska Sourdough,29/30,5.0,True Alaskan cooking,"I have been using this book since 1988, the ei...",29,30,96,63,1,i using book since eighth printing i honestly ...
2,Alaska Sourdough,25/28,5.0,Cheechako to Sourdough in 190 Pages,"My poor dogeared, stained copy of this book ca...",25,28,89,206,1,my poor dogeared stained copy book came way da...
3,Eyewitness Travel Guide to Europe,3/20,1.0,Disappointed Romanian!,This book in my opinion is biased and takes an...,3,20,15,92,0,this book opinion biased take angle europe cle...
4,Eyewitness Travel Guide to Europe,20/20,5.0,Going to Europe? Get this book!,If you're already a fan of the Eyewitness Trav...,20,20,100,234,1,if youre already fan eyewitness travel guide s...


In [7]:
# Normalize Rating
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the `Rating` column into [0, 1] so it can be fed to the model
# as a numeric feature next to the text representation.
# NOTE(review): the scaler is fitted on the full DataFrame, i.e. before the
# train/test split performed in a later cell.
scaler = MinMaxScaler()

# The scaler expects a 2D (n_samples, n_features) array; selecting with a list
# of column names yields that shape. ravel() flattens the (n, 1) result so a
# plain 1-D array is assigned to the new column.
df['rating_normalized'] = scaler.fit_transform(df[['Rating']].to_numpy()).ravel()

In [8]:
# Confirm that `rating_normalized` now appears alongside the original columns.
df.head(n=5)

Unnamed: 0,Title,helpful,Rating,Title.1,Review,helpful_0,helpful_1,ratio_percent,review_len,IsHelpful,cleaned_reviews,rating_normalized
0,Alaska Sourdough,36/37,5.0,Real Alaskan Sourdough,Ruth Allman has written an excellent book abou...,36,37,97,153,1,ruth allman written excellent book alaskan sou...,1.0
1,Alaska Sourdough,29/30,5.0,True Alaskan cooking,"I have been using this book since 1988, the ei...",29,30,96,63,1,i using book since eighth printing i honestly ...,1.0
2,Alaska Sourdough,25/28,5.0,Cheechako to Sourdough in 190 Pages,"My poor dogeared, stained copy of this book ca...",25,28,89,206,1,my poor dogeared stained copy book came way da...,1.0
3,Eyewitness Travel Guide to Europe,3/20,1.0,Disappointed Romanian!,This book in my opinion is biased and takes an...,3,20,15,92,0,this book opinion biased take angle europe cle...,0.0
4,Eyewitness Travel Guide to Europe,20/20,5.0,Going to Europe? Get this book!,If you're already a fan of the Eyewitness Trav...,20,20,100,234,1,if youre already fan eyewitness travel guide s...,1.0


### Model

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical between runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Tokenize the text using DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Must equal the sequence length declared on the model's `input_text` layer.
MAX_SEQ_LEN = 512


def tokenize_reviews(reviews):
    """Encode review texts as fixed-length TensorFlow tensors.

    Every sequence is truncated and padded to exactly MAX_SEQ_LEN tokens.
    `padding=True` would only pad to the longest review in the given call, so
    the resulting width could be shorter than the model's declared input length
    and could differ between the train and test splits.

    Args:
        reviews: Iterable of review strings (e.g. a DataFrame column).

    Returns:
        The tokenizer output (including 'input_ids'), with a sequence
        dimension of MAX_SEQ_LEN.
    """
    return tokenizer(
        list(reviews),
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_tensors='tf',
    )


train_text_tokens = tokenize_reviews(train_df['Review'])
test_text_tokens = tokenize_reviews(test_df['Review'])

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [11]:
# Load the pre-trained DistilBERT encoder that produces the text features.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'
distilbert_model = TFDistilBertModel.from_pretrained(PRETRAINED_CHECKPOINT)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [12]:
# Symbolic model inputs: 512 token ids per review, plus one scalar rating.
input_text = Input(name='input_text', dtype=tf.int32, shape=(512,))
input_rating = Input(name='input_rating', dtype=tf.float32, shape=(1,))

In [13]:
# Token embeddings from DistilBERT
# Derive the attention mask in-graph from the token ids: 1 for real tokens,
# 0 for padding. Without an explicit mask DistilBERT attends to every position,
# so the padding added by the tokenizer would leak into the representation.
pad_token_id = tokenizer.pad_token_id
attention_mask = tf.cast(tf.not_equal(input_text, pad_token_id), tf.int32)

encoder_output = distilbert_model(input_text, attention_mask=attention_mask)

# Keep only the hidden state at position 0 (the [CLS] token) as the
# fixed-size representation of the whole review.
text_embeddings = encoder_output.last_hidden_state[:, 0, :]

In [14]:
# Join the text representation and the rating into one feature vector
# (concatenated along the last axis).
feature_parts = [text_embeddings, input_rating]
combined_features = layers.Concatenate()(feature_parts)

In [15]:
# Classification head: one 256-unit ReLU layer followed by a single sigmoid
# unit that outputs the predicted probability.
hidden_layer = layers.Dense(units=256, activation='relu')
output_layer = layers.Dense(units=1, activation='sigmoid')

x = hidden_layer(combined_features)
output = output_layer(x)

In [16]:
# Wire the two inputs to the sigmoid output as a Keras functional model.
model_inputs = [input_text, input_rating]
model = Model(inputs=model_inputs, outputs=output)

In [17]:
# Binary classification setup: Adam with a small learning rate (1e-5),
# binary cross-entropy loss, accuracy reported during training.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [18]:
# Prepare input data
def _build_inputs(tokens, frame):
    """Map the model's named input layers to the arrays for one data split."""
    return {
        'input_text': tokens['input_ids'],
        'input_rating': frame['rating_normalized'].values,
    }


train_inputs = _build_inputs(train_text_tokens, train_df)
test_inputs = _build_inputs(test_text_tokens, test_df)

In [19]:
# Fit for 3 epochs with batches of 8 reviews.
# NOTE(review): the held-out test split doubles as the validation set here.
train_labels = train_df['IsHelpful'].values
test_labels = test_df['IsHelpful'].values

model.fit(
    train_inputs,
    train_labels,
    epochs=3,
    batch_size=8,
    validation_data=(test_inputs, test_labels),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f3edee312a0>

In [20]:
# Persist the trained model so training does not have to be repeated.
# This copy lives on the Colab VM's local disk and is therefore only temporary.
local_model_dir = '/content/my_model'
model.save(local_model_dir)



In [21]:
# Write a second copy of the trained model into the mounted Google Drive.
drive_model_dir = '/content/drive/MyDrive/Datasets/DistilBERT_Model_withRating'
model.save(drive_model_dir)

