# Libraries

In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_datasets as tfds

from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification

# Data Fetching

In [4]:
ds = tfds.load('amazon_us_reviews/Mobile_Electronics_v1_00', split='train', shuffle_files=True)
assert isinstance(ds, tf.data.Dataset)
print(ds)

INFO:absl:Generating dataset amazon_us_reviews (C:\Users\adils\tensorflow_datasets\amazon_us_reviews\Mobile_Electronics_v1_00\0.1.0)


[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\adils\tensorflow_datasets\amazon_us_reviews\Mobile_Electronics_v1_00\0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]
[AINFO:absl:Downloading https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz into C:\Users\adils\tensorflow_datasets\downloads\s3_amaz-revi-pds_tsv_amaz_revi_us_Mobi_ElecMsyuExqUz0P7xpw1tDot8G2LJN4cnaWfwz8hjbvd16c.gz.tmp.419a033a231749918beaace4ef919743...
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:01<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:02<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:03<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:04<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:04<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:05<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:06<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:06<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:07<?, ? url/s]
Dl Completed...:   0%|        

[1mDataset amazon_us_reviews downloaded and prepared to C:\Users\adils\tensorflow_datasets\amazon_us_reviews\Mobile_Electronics_v1_00\0.1.0. Subsequent calls will reuse this data.[0m
<_OptionsDataset shapes: {data: {customer_id: (), helpful_votes: (), marketplace: (), product_category: (), product_id: (), product_parent: (), product_title: (), review_body: (), review_date: (), review_headline: (), review_id: (), star_rating: (), total_votes: (), verified_purchase: (), vine: ()}}, types: {data: {customer_id: tf.string, helpful_votes: tf.int32, marketplace: tf.string, product_category: tf.string, product_id: tf.string, product_parent: tf.string, product_title: tf.string, review_body: tf.string, review_date: tf.string, review_headline: tf.string, review_id: tf.string, star_rating: tf.int32, total_votes: tf.int32, verified_purchase: tf.int64, vine: tf.int64}}>


In [8]:
#convert the dataset into a pandas dataframe
df = tfds.as_dataframe(ds)
df.head()

Unnamed: 0,data/customer_id,data/helpful_votes,data/marketplace,data/product_category,data/product_id,data/product_parent,data/product_title,data/review_body,data/review_date,data/review_headline,data/review_id,data/star_rating,data/total_votes,data/verified_purchase,data/vine
0,b'20980074',0,b'US',b'Mobile_Electronics',b'B00D1847NE',b'274617424',b'Teenage Mutant Ninja Turtles Boombox CD Play...,b'Does not work',b'2015-01-09',b'One Star',b'R1OVS0D6SEXPW7',1,0,0,1
1,b'779273',0,b'US',b'Mobile_Electronics',b'B00KMO6DYG',b'397452138',b'4 Gauge Amp Kit Amplifier Install Wiring Com...,b'This is a great wiring kit i used it to set ...,b'2015-08-06',b'Great kit',b'R9VSD0ET8FERB',4,0,0,1
2,b'15410531',0,b'US',b'Mobile_Electronics',b'B000GWLL0K',b'948304826',b'Travel Wall Charger fits Creative Zen Vision...,b'It works great so much faster than USB charg...,b'2007-03-15',b'A/C Charger for Creative Zen Vision M',b'R3ISXCZHWLJLBH',5,0,0,1
3,b'27389005',0,b'US',b'Mobile_Electronics',b'B008L3JE6Y',b'466340015',b'High Grade Robust 360\xc2\xb0 Adjustable Car...,b'This product was purchased to hold a monitor...,b'2013-07-30',b'camera stand',b'R1TWVUDOFJSQAW',5,0,0,1
4,b'2663569',0,b'US',b'Mobile_Electronics',b'B00GHZS4SC',b'350592810',b'HDE Multifunctional Bluetooth FM Audio Car K...,"b""it works but it has really bad sound quality...",b'2014-12-31',b'bad sound quality',b'R2PEOEUR1LP0GH',3,0,0,1


## Data Scrubbing

In [9]:
df["Sentiment"] = df["data/star_rating"].apply(lambda score: "positive" if score >= 3 else "negative")
df['Sentiment'] = df['Sentiment'].map({'positive':1, 'negative':0})
df['short_review'] =df['data/review_body'].str.decode("utf-8")
df = df[["short_review", "Sentiment"]]

In [10]:
df.head()

Unnamed: 0,short_review,Sentiment
0,Does not work,0
1,This is a great wiring kit i used it to set up...,1
2,It works great so much faster than USB charger...,1
3,This product was purchased to hold a monitor o...,1
4,it works but it has really bad sound quality. ...,1


In [12]:
# dropping last n rows using drop
df.info

<bound method DataFrame.info of                                             short_review  Sentiment
0                                          Does not work          0
1      This is a great wiring kit i used it to set up...          1
2      It works great so much faster than USB charger...          1
3      This product was purchased to hold a monitor o...          1
4      it works but it has really bad sound quality. ...          1
...                                                  ...        ...
49995                                            Awesome          1
49996  I got this cover for my daughter's ipod. It is...          1
49997  This battery does not fit the T-mobile G2 is i...          0
49998  It has good sound but , the charge only last 1...          1
49999  What I imaginaed getting from this machine was...          1

[50000 rows x 2 columns]>

# Data preprocessing

In [13]:
reviews = df['short_review'].values.tolist()
labels = df['Sentiment'].tolist()

In [15]:
training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(reviews, labels, test_size=.2)

In [16]:
# assign tokenizer object to the tokenizer class
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading: 100%|██████████| 226k/226k [00:00<00:00, 354kB/s]  
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 653kB/s] 
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 9.24kB/s]
Downloading: 100%|██████████| 483/483 [00:00<00:00, 163kB/s]


In [19]:
train_encodings = tokenizer(training_sentences,
                            truncation=True,
                            padding=True)
val_encodings = tokenizer(validation_sentences,
                            truncation=True,
                            padding=True)

In [20]:
train_dataset = tf.data.Dataset.from_tensor_slices((
                            dict(train_encodings),
                            training_labels
                            ))
val_dataset = tf.data.Dataset.from_tensor_slices((
                            dict(val_encodings),
                            validation_labels
                            ))

# Model building

In [21]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)

Downloading: 100%|██████████| 347M/347M [05:16<00:00, 1.15MB/s]  
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should p

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.fit(train_dataset.shuffle(100).batch(16),
          epochs=2,
          batch_size=16,
          validation_data=val_dataset.shuffle(100).batch(16))

In [None]:
model.save_pretrained("./sentiment")
loaded_model = TFDistilBertForSequenceClassification.from_pretrained("./sentiment")

# Model evaluation

In [None]:
test_sentence = "This is a really good product. I love it"
predict_input = tokenizer.encode(test_sentence,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

In [None]:
tf_output = loaded_model.predict(predict_input)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)
labels = ['Negative','Positive']
label = tf.argmax(tf_prediction, axis=1)
label = label.numpy()
print(labels[label[0]])