# Sentiment Classification Using TinyBERT 

## Load Data

#### Load Data with Hugging Face Datasets Library

In [2]:
import pandas as pd
from datasets import Dataset

# Download the IMDB review CSV into a DataFrame and preview the first rows.
data = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Wrap the DataFrame in a Hugging Face Dataset and hold out 30% of the rows
# for evaluation. The fixed seed makes the shuffle deterministic, so the split
# (and every metric computed on it) is the same after "Restart & Run All".
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [7]:
# Class-balance check: count the reviews per sentiment value.
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [8]:
# Two-way mapping between the sentiment string and its integer class id.
label2id = {'negative': 0, 'positive': 1}
id2label = {class_id: name for name, class_id in label2id.items()}


def _add_label(example):
    """Return the integer class id for one example's ``sentiment`` string."""
    return {'label': label2id[example['sentiment']]}


dataset = dataset.map(_add_label)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [10]:
# Inspect one training example to confirm the integer 'label' field is present.
dataset['train'][0]

{'review': 'I gather at least a few people watched it on Sept.2 on TCM. If you did you know that Hedy had to change her name to avoid being associated with this movie when she came the U.S. It was a huge scandal and I gather that the original release in the U.S. was so chopped up by censors that it was practically unintelligible. I watched because I had just seen a documentary on "bad women", actresses in the U.S. pre- movie censorship board set up in the early \'30s. It looked to me as though they got away with a lot more than Hedy\'s most "sensational" shots in "Ecstasy". In fact Hedy looked positively innocent in this, by today\'s standards, and it was nice to see her early unspoiled beauty. It was a nice, lyrical movie to relax to. I loved it for what it was: a simple romance. I watched it after pre- recording it during a sleepless early A.M. I would love to see the first version released in the U.S. for comparison\'s sake.',
 'sentiment': 'positive',
 'label': 1}

## Data Tokenization

In [11]:
from transformers import AutoTokenizer
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained checkpoint identifier and the matching tokenizer.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [16]:
# Tokenize a single review as a quick sanity check.
# NOTE(review): this is not the last expression in the cell, so the notebook
# does not display its result.
tokenizer(dataset['train'][0]['review'])

def tokenize(batch):
    """Tokenize the ``review`` texts of ``batch``.

    Sequences are truncated to at most 300 tokens, and padding is enabled so
    the sequences in the batch come back at a common length.
    """
    return tokenizer(
        batch['review'],
        padding=True,
        truncation=True,
        max_length=300,
    )

# batched=True with batch_size=None hands each split to tokenize() as a single batch.
dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [19]:
# List the fields of one example to confirm the tokenizer outputs were added.
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

### Building Model Evaluation Functions


In [21]:
# !pip install evaluate

import evaluate
import numpy as np

# Load the accuracy metric that compute_metrics() delegates to.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    for each row is the argmax of ``predictions`` along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Model Building

In [24]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a sequence-classification head sized to the label
# set; the label maps are passed so they are attached to the model.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(label2id), label2id=label2id, id2label=id2label)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Fine-tuning hyperparameters; training output goes under 'train_dir'.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Run evaluation once per epoch.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch'
)

# The held-out 'test' split doubles as the evaluation set during training, and
# compute_metrics() supplies the accuracy reported at each evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [27]:
# Fine-tune the model using the Trainer configured above.
trainer.train()

  0%|          | 0/3282 [00:00<?, ?it/s]

{'loss': 0.4573, 'grad_norm': 7.713020324707031, 'learning_rate': 1.695307739183425e-05, 'epoch': 0.46}
{'loss': 0.356, 'grad_norm': 7.362026691436768, 'learning_rate': 1.3906154783668494e-05, 'epoch': 0.91}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.3203084170818329, 'eval_accuracy': 0.8662, 'eval_runtime': 27.8459, 'eval_samples_per_second': 538.679, 'eval_steps_per_second': 16.843, 'epoch': 1.0}
{'loss': 0.3067, 'grad_norm': 10.578359603881836, 'learning_rate': 1.0859232175502743e-05, 'epoch': 1.37}
{'loss': 0.3017, 'grad_norm': 9.168842315673828, 'learning_rate': 7.81230956733699e-06, 'epoch': 1.83}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.3019493818283081, 'eval_accuracy': 0.8788666666666667, 'eval_runtime': 27.8835, 'eval_samples_per_second': 537.952, 'eval_steps_per_second': 16.82, 'epoch': 2.0}
{'loss': 0.2685, 'grad_norm': 12.404754638671875, 'learning_rate': 4.765386959171238e-06, 'epoch': 2.29}
{'loss': 0.2632, 'grad_norm': 8.259003639221191, 'learning_rate': 1.7184643510054846e-06, 'epoch': 2.74}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.3005996644496918, 'eval_accuracy': 0.8802666666666666, 'eval_runtime': 27.9011, 'eval_samples_per_second': 537.613, 'eval_steps_per_second': 16.809, 'epoch': 3.0}
{'train_runtime': 592.2551, 'train_samples_per_second': 177.288, 'train_steps_per_second': 5.542, 'train_loss': 0.319290198326692, 'epoch': 3.0}


TrainOutput(global_step=3282, training_loss=0.319290198326692, metrics={'train_runtime': 592.2551, 'train_samples_per_second': 177.288, 'train_steps_per_second': 5.542, 'total_flos': 882184338000000.0, 'train_loss': 0.319290198326692, 'epoch': 3.0})

In [28]:
# Score the trained model on the evaluation split given to the Trainer.
trainer.evaluate()

  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.3005996644496918,
 'eval_accuracy': 0.8802666666666666,
 'eval_runtime': 28.0824,
 'eval_samples_per_second': 534.142,
 'eval_steps_per_second': 16.701,
 'epoch': 3.0}

## Model Save and Load for Inference

In [29]:
# Persist the trained model to the 'tinybert-sentiment-analysis' directory; the
# inference pipeline and the S3 upload further down both read from this path.
trainer.save_model('tinybert-sentiment-analysis')

In [30]:
# Hand-written reviews used to smoke-test the saved model.
# NOTE(review): this rebinds `data`, which earlier in the notebook held the
# IMDB DataFrame; cells that expect the DataFrame must not run after this one.
data = ['this movie was horrible, the plot was really boring. acting was okay',
        'the movie is really sucked. there is not plot and acting was bad',
        'what a beautiful movie. great plot. acting was good. will see it again']

In [33]:
from transformers import pipeline
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-classification pipeline from the saved model directory.
classifier = pipeline(
    'text-classification',
    model='tinybert-sentiment-analysis',
    device=device,
)

classifier(data)

[{'label': 'negative', 'score': 0.991597592830658},
 {'label': 'negative', 'score': 0.9916531443595886},
 {'label': 'positive', 'score': 0.9883714318275452}]

device(type='cuda')

## Push Model to AWS S3

In [40]:
import boto3

# S3 client and target bucket name used by the helpers below.
s3 = boto3.client('s3')

bucket_name = 'mlops-kgptalkie'

def create_bucket(bucket_name, client=None):
    """Create ``bucket_name`` unless it is already listed for this account.

    Args:
        bucket_name: Name of the S3 bucket that should exist.
        client: Optional S3 client to use. Defaults to the notebook-level
            ``s3`` client.

    Prints a short status message in either case and returns ``None``.
    """
    client = s3 if client is None else client

    owned = {entry['Name'] for entry in client.list_buckets()['Buckets']}
    if bucket_name in owned:
        print("Bucket already exists in your account!!! Feel free to use it.")
        return

    # Outside us-east-1, S3 rejects a CreateBucket request that does not name
    # the target region, so mirror the client's region in the request.
    # us-east-1 itself must NOT be sent as a LocationConstraint.
    request = {'Bucket': bucket_name}
    region = client.meta.region_name
    if region and region != 'us-east-1':
        request['CreateBucketConfiguration'] = {'LocationConstraint': region}

    client.create_bucket(**request)
    print("Bucket is created")

# Make sure the target bucket exists before anything is uploaded to it.
create_bucket(bucket_name)

Bucket already exists in your account!!! Feel free to use it.


In [42]:
# Upload the saved model folder to the S3 bucket under the key prefix ml-models/tinybert-sentiment-analysis
import os
import boto3

# NOTE(review): `s3` and `bucket_name` repeat the definitions from the
# bucket-creation cell, which lets this cell run without it.
s3 = boto3.client('s3')
bucket_name = 'mlops-kgptalkie'

def upload_directory(directory_path, s3_prefix, bucket=None, client=None):
    """Upload every file under ``directory_path`` to S3 beneath ``s3_prefix``.

    The directory tree is walked recursively, and each file's path relative to
    ``directory_path`` is appended to ``s3_prefix`` to form its object key.

    Args:
        directory_path: Local directory whose files are uploaded.
        s3_prefix: Key prefix under which the files are stored.
        bucket: Optional destination bucket. Defaults to the notebook-level
            ``bucket_name``.
        client: Optional S3 client to use. Defaults to the notebook-level
            ``s3`` client.

    Raises:
        FileNotFoundError: If ``directory_path`` is not an existing directory.
    """
    import posixpath  # S3 keys use '/', whatever separator the local OS uses

    if not os.path.isdir(directory_path):
        # os.walk() yields nothing for a missing path, which would make a
        # typo in the path look like a successful (empty) upload.
        raise FileNotFoundError(f"Not a directory: {directory_path!r}")

    bucket = bucket_name if bucket is None else bucket
    client = s3 if client is None else client

    for root, _dirs, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file)
            relpath = os.path.relpath(file_path, directory_path)
            # Translate only the local separator: a blanket "\\" -> "/"
            # replace would corrupt POSIX file names containing a backslash.
            s3_key = posixpath.join(s3_prefix, *relpath.split(os.sep))

            client.upload_file(file_path, bucket, s3_key)


# Push the saved model directory to the bucket under ml-models/tinybert-sentiment-analysis.
upload_directory('tinybert-sentiment-analysis', 'ml-models/tinybert-sentiment-analysis')


In [None]:
# s3://mlops-kgptalkie/ml-models/tinybert-sentiment-analysis/