# Install Hugging Face Transformers on Google Colab


In [None]:
# Colab setup: install the third-party packages imported further down
# (transformers for the tokenizer/model classes, wget for fetching the dataset).
# NOTE(review): versions are unpinned, so a fresh install may pull releases that
# differ from the ones this notebook was written against — consider pinning.
!pip install transformers
!pip install wget

# Data pre-processing

We start by reading the dataset and extracting the most important features: Rating, Review, Verified Purchase, and Label. We then turn the label and the verified-purchase entries into numerical values.

In [2]:
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
import numpy as np
import os
import wget
import shutil

In [None]:
# Fetch and unpack the training data only when the extracted file is missing.
if not os.path.isfile('amazon_reviews.txt'):
    url = 'https://drive.google.com/uc?id=1-LYI_s6oZ0OTe3I0vFYRYIBVhjFswReY&export=download' # downloads db to train
    print('Downloading DB to train')
    # Ask for an explicit archive name and keep the path wget actually wrote to:
    # without `out` the name comes from the server, and wget adds a numeric
    # suffix instead of overwriting an existing file, so a hard-coded name in
    # the unpack step could point at a missing or stale archive.
    archive_path = wget.download(url, out='amazon_reviews.zip')
    print('Download Completed!\nUnzipping...')
    shutil.unpack_archive(archive_path)

In [None]:
# Load the tab-separated review dump, skipping rows that cannot be parsed.
# `on_bad_lines='skip'` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which newer pandas releases no longer accept.
data = pd.read_csv('amazon_reviews.txt', delimiter="\t", on_bad_lines='skip')

features = ['RATING', 'REVIEW_TEXT', 'VERIFIED_PURCHASE', 'LABEL']

# Work on an explicit copy so the column assignments below modify an
# independent frame rather than a slice of `data` (avoids SettingWithCopyWarning).
data_shortened = data[features].copy()

# Encode the verified-purchase flag: 'N' -> 0, 'Y' -> 1.
data_shortened['VERIFIED_PURCHASE'] = data_shortened['VERIFIED_PURCHASE'].replace({'N': 0, 'Y': 1})

# Encode the class label: '__label1__' -> 1, '__label2__' -> 0.
data_shortened['LABEL'] = data_shortened['LABEL'].replace({'__label1__': 1, '__label2__': 0})

In [4]:
# Preview the first rows of the selected, numerically encoded columns.
data_shortened.head()

Unnamed: 0,RATING,REVIEW_TEXT,VERIFIED_PURCHASE,LABEL
0,4,"When least you think so, this product will sav...",0,1
1,4,Lithium batteries are something new introduced...,1,1
2,3,I purchased this swing for my baby. She is 6 m...,0,1
3,4,I was looking for an inexpensive desk calcolat...,0,1
4,4,I only use it twice a week and the results are...,0,1


In [5]:
# Separate the target from the features. `pop` removes the 'LABEL' column from
# `data_shortened` in place, so re-running this cell without first re-running
# the loading cell raises KeyError.
labels = data_shortened.pop('LABEL')

## Enriching the data
As we have seen in the previous notebook, using just the review text as our only feature achieves a validation accuracy of only 60%. Here we enrich our data by adding suitable text based on the rating value and whether the purchase is verified or not. This is based on the results obtained by Gu et al. in which they tried different ways to combine numerical modalities with text in NLP tasks [1]

[1] https://github.com/georgian-io/Multimodal-Toolkit

In [None]:
def enrich_data(rating, verified_purchase, review_text):
  '''
    Prefix a review with a sentence describing its purchase status and rating.

    rating: values 1-4 are spelled out as "one" .. "four"; any other value is
      described as "five".
    verified_purchase: 0 is described as "not verified", anything else as
      "verified".
    review_text: the review string that is appended after the generated prefix.

    Returns the enriched review string.
  '''
  rating_words = {1: 'one', 2: 'two', 3: 'three', 4: 'four'}
  # Every rating outside 1-4 falls back to "five" (catch-all branch).
  new_text = 'has a rating of ' + rating_words.get(rating, 'five') + '. ' + review_text

  status = 'not verified' if verified_purchase == 0 else 'verified'
  return 'This purchase is ' + status + ' and ' + new_text


def _enrich_row(row):
  '''Build the enriched text for one dataframe row from its rating, verification flag and review.'''
  return enrich_data(row['RATING'], row['VERIFIED_PURCHASE'], row['REVIEW_TEXT'])


# Apply row by row (axis=1) and store the result as a new column.
data_shortened['ENRICHED_TEXT'] = data_shortened.apply(_enrich_row, axis=1)


In [None]:
# Widen pandas' display limits so the long enriched reviews can be read in full.
# Fully qualified option names are used because a short pattern such as
# 'max_rows' can match more than one registered option, in which case
# set_option raises instead of picking one.
pd.set_option('display.max_rows', 99999)
pd.set_option('display.max_colwidth', 400)
pd.describe_option('display.max_colwidth')

In [8]:
# Partition the reviews by their encoded purchase flag (1 = verified, 0 = not verified).
is_verified = data_shortened['VERIFIED_PURCHASE'] == 1
is_unverified = data_shortened['VERIFIED_PURCHASE'] == 0

data_verified = data_shortened[is_verified]
data_unverified = data_shortened[is_unverified]

In [9]:
# Show a few enriched reviews from verified purchases next to the fields they were built from.
data_verified[["ENRICHED_TEXT", "RATING", "VERIFIED_PURCHASE"]].head()

Unnamed: 0,ENRICHED_TEXT,RATING,VERIFIED_PURCHASE
1,This purchase is verified and has a rating of four. Lithium batteries are something new introduced in the market there average developing cost is relatively high but Stallion doesn't compromise on quality and provides us with the best at a low cost.<br />There are so many in built technical assistants that act like a sensor in their particular forté. The battery keeps my phone charged up and i...,4,1
7,"This purchase is verified and has a rating of four. Great vitamin C serum... I really like the oil feeling, not too sticky. I used it last week on some of my recent bug bites and it helps heal the skin faster than normal.",4,1
16,"This purchase is verified and has a rating of four. Video quality if superb, fits just fine, looks like a real mirror (I believe they enhanced this), my only gripe is I wish they put the cameras not side by side, but one on one side, the other on the other side. I only say this because due to where the left camera is place, there is wasted recording room when you start to turn it towards the d...",4,1
18,"This purchase is verified and has a rating of four. just to be fair, i really believe that this is a great slicer , and the idea was there, but unfortunately there are tricks when using this spiral slicer, i mean manual or something would have been helpful.<br /><br />I'm not saying something negative about this slicer because it does work, but however it did it took me almost a week in trying...",4,1
19,This purchase is verified and has a rating of four. These tablets are especially helpful if you use the Secure dental adhesive.<br />I bought this stuff for my father. and he was satisfied with this.,4,1


In [10]:
# Show a few enriched reviews from unverified purchases next to the fields they were built from.
data_unverified[["ENRICHED_TEXT", "RATING", "VERIFIED_PURCHASE"]].head()

Unnamed: 0,ENRICHED_TEXT,RATING,VERIFIED_PURCHASE
0,"This purchase is not verified and has a rating of four. When least you think so, this product will save the day. Just keep it around just in case you need it for something.",4,0
2,"This purchase is not verified and has a rating of three. I purchased this swing for my baby. She is 6 months now and has pretty much out grown it. It is very loud and doesn't swing very well. It is beautiful though. I love the colors and it has a lot of settings, but I don't think it was worth the money.",3,0
3,This purchase is not verified and has a rating of four. I was looking for an inexpensive desk calcolatur and here it is. It works and does everything I need. Only issue is that it tilts slightly to one side so when I hit any keys it rocks a little bit. Not a big deal.,4,0
4,"This purchase is not verified and has a rating of four. I only use it twice a week and the results are great. I have used other teeth whitening solutions and most of them, for the same results I would have to use it at least three times a week. Will keep using this because of the potency of the solution and also the technique of the trays, it keeps everything in my teeth, in my mouth.",4,0
5,This purchase is not verified and has a rating of three. I'm not sure what this is supposed to be but I would recommend that you do a little more research into the culture of using pipes if you plan on giving this as a gift or using it yourself.,3,0


## Train-Validation Split
We then convert the dataframe columns into NumPy arrays to shuffle the data and split it into training and validation sets. We use 80% of the data for training and 20% for validation. In case we are limited in computing power, we also prepare a small subset of the data.

In [11]:
# Converting dataframe to numpy for shuffling and splitting into training-validation set
reviews_array = data_shortened["ENRICHED_TEXT"].to_numpy(dtype=object)
labels_array = labels.to_numpy(dtype=np.int32)

# Shuffle reviews and labels with one shared permutation so they stay aligned.
# The generator is seeded so the split is the same on every run.
SPLIT_SEED = 42
data_length = len(reviews_array)
idx = np.random.default_rng(SPLIT_SEED).permutation(data_length)
shuffled_reviews_array = reviews_array[idx]
shuffled_labels_array = labels_array[idx]

# Train-test split: the first 80% of the shuffled data is used for training,
# the remainder for validation.
train_ratio = 0.8
train_data_len = int( train_ratio * data_length )

train_reviews_array = shuffled_reviews_array[:train_data_len]
train_labels_array = shuffled_labels_array[:train_data_len]

test_reviews_array = shuffled_reviews_array[train_data_len:]
test_labels_array = shuffled_labels_array[train_data_len:]

# The tokenizer is fed plain Python lists of strings.
train_reviews = train_reviews_array.tolist()
test_reviews = test_reviews_array.tolist()

#! If we have limited computation power, we can just train on a smaller subset of the data
training_size = 50
train_reviews_small = train_reviews[0:training_size]
train_labels_array_small = train_labels_array[0:training_size]
test_reviews_small = test_reviews[0:training_size]
test_labels_array_small = test_labels_array[0:training_size]



## Tokenization
We tokenize the data using the tokenizer of the language model we are going to use. In our tests we originally used BERT, but then switched to the smaller `distilbert-base-uncased` for computational efficiency.

We pad the data to have the maximum possible length that the model can accept. 

In [None]:
# Encode the reviews into the inputs the model expects. Every review is padded
# (or truncated) to the maximum length the model can take.

model_name = 'distilbert-base-uncased' # or bert-base-cased
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Same settings for both splits.
tokenizer_options = {"padding": "max_length", "truncation": True}

train_tokenized_df = tokenizer(train_reviews, **tokenizer_options)
test_tokenized_df = tokenizer(test_reviews, **tokenizer_options)


In [23]:
# Inspect the encoding of the first training review: which fields the tokenizer
# produced, how long the padded sequence is, and the raw ids / mask values.
print(train_tokenized_df.keys())

embeddings_input = train_tokenized_df['input_ids'][0]
attention_input = train_tokenized_df['attention_mask'][0]
embedding_len = len(embeddings_input)

print(f'Embedding length: {embedding_len}')
print(f'Embeddings input: {embeddings_input}')
print(f'Attention input: {attention_input}')

dict_keys(['input_ids', 'attention_mask'])
Embedding length: 512
Embeddings input: [101, 2023, 5309, 2003, 2025, 20119, 1998, 2038, 1037, 5790, 1997, 2176, 1012, 2023, 12109, 4966, 2447, 2038, 1037, 3376, 3861, 1010, 6581, 2614, 3737, 1006, 1045, 2109, 4540, 19093, 1007, 1010, 2003, 2200, 3733, 2000, 5452, 1998, 2038, 2035, 1996, 2838, 1045, 2359, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [25]:

def _as_tensor_dict(encodings):
    """Convert each model input listed by the tokenizer into a TensorFlow tensor."""
    return {name: tf.convert_to_tensor(encodings[name]) for name in tokenizer.model_input_names}


# Input dictionaries for the model (input_ids, attention_mask and, for models
# that use them, token_type_ids).
train_features = _as_tensor_dict(train_tokenized_df)
test_features = _as_tensor_dict(test_tokenized_df)

# Each dataset element is a tuple: (dictionary of model inputs, label tensor).
# The training set is shuffled over its full length and served in batches of 16.
train_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, train_labels_array))
    .shuffle(train_data_len)
    .batch(16)
)

# The validation set keeps its order and uses the same batch size.
test_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((test_features, test_labels_array))
    .batch(16)
)

print(train_tf_dataset)

<BatchDataset shapes: ({input_ids: (None, 512), attention_mask: (None, 512)}, (None,)), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>


# Training the model
We then load our base model from the Hugging Face collection. We set suitable callbacks to ensure we do not overfit the data: early stopping halts training once the validation loss stops improving, and a checkpoint callback saves the model only when the validation loss improves.

Since our labels are not one-hot encoded, we use the sparse categorical cross-entropy loss.

In [28]:
import tensorflow.keras as keras

# Pretrained base model with a two-class sequence-classification head.
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Stop as soon as the validation loss fails to improve for one epoch and
# restore the weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode="min",
    patience=1,
    restore_best_weights=True,
    verbose=1,
)

# Write the model to disk only on epochs that improve on the best result so far.
best_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="transformer_model/model_distillbert_total",
    save_best_only=True,
    verbose=1,
)

callbacks = [early_stopping, best_checkpoint]

# Labels are integer class ids and the loss is told to expect raw logits,
# hence the sparse loss with from_logits=True.
adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
sparse_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=adam, loss=sparse_loss, metrics=accuracy)

model.fit(
    train_tf_dataset,
    epochs=10,
    validation_data=test_tf_dataset,
    callbacks=callbacks,
    verbose=1,
)

# Persist the fine-tuned model in the transformers format as well.
model.save_pretrained("transformer_model")

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_59', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/10

























Epoch 00001: val_loss improved from inf to 0.41446, saving model to transformer_model/model_distillbert_total
























































































INFO:tensorflow:Assets written to: transformer_model/model_distillbert_total/assets


INFO:tensorflow:Assets written to: transformer_model/model_distillbert_total/assets


Epoch 2/10
Restoring model weights from the end of the best epoch.

Epoch 00002: val_loss did not improve from 0.41446
Epoch 00002: early stopping


# Results
We see that we reach a high validation accuracy of 83% after just two epochs. Training stopped after two epochs to prevent overfitting. We reach a good balance between 85% training accuracy and 83% validation accuracy.

This shows that when we train transformers with enriched text, we can reach much higher accuracies than with text alone, and in much less time.

# Testing on toy examples
We then test the output on some toy examples and see how our predicted labels match with the correct labels.

For each example, the first value is the predicted probability for label 0 (Fake), and the second is for label 1 (Non-fake).

In [29]:

# Ground-truth labels of the examples scored below.
print(train_labels_array[200:205])

for i in range(200, 205):
    encoded = tokenizer(train_reviews[i], padding="max_length", truncation=True)
    # Wrap every field the tokenizer produced in a list to add a batch
    # dimension of 1. Converting all fields generically (instead of checking
    # for one specific model name) also covers models that emit token_type_ids.
    y = {name: tf.convert_to_tensor([values]) for name, values in encoded.items()}

    outputs = model(y)
    # Turn the two logits into class probabilities.
    probs = tf.nn.softmax(outputs.logits)
    print(probs)



[0 1 0 1 1]
tf.Tensor([[0.8336636 0.1663364]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.07943762 0.9205624 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.37431285 0.6256872 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.45089296 0.549107  ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.05695679 0.94304323]], shape=(1, 2), dtype=float32)


# Downloading model from google colab

In [30]:
# Bundle the saved model directory into one zip archive (shell command; the
# paths are hard-coded under /content).
!zip -r /content/file.zip /content/transformer_model

# `google.colab` is imported here rather than at the top of the notebook.
# NOTE(review): presumably because it only exists inside Colab — confirm.
from google.colab import files
# Hand the archive to Colab's file-download helper.
files.download("/content/file.zip")

  adding: content/transformer_model/ (stored 0%)
  adding: content/transformer_model/config.json (deflated 44%)
  adding: content/transformer_model/tf_model.h5 (deflated 8%)
  adding: content/transformer_model/model_distillbert_total/ (stored 0%)
  adding: content/transformer_model/model_distillbert_total/assets/ (stored 0%)
  adding: content/transformer_model/model_distillbert_total/variables/ (stored 0%)
  adding: content/transformer_model/model_distillbert_total/variables/variables.index (deflated 79%)
  adding: content/transformer_model/model_distillbert_total/variables/variables.data-00000-of-00001 (deflated 17%)
  adding: content/transformer_model/model_distillbert_total/saved_model.pb (deflated 92%)
  adding: content/transformer_model/model_distillbert_total/keras_metadata.pb (deflated 94%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>