In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


## Importing required libraries

In [3]:
import pandas as pd
import numpy as np
from datasets import Dataset

In [4]:
df = pd.read_csv(r'/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## To use this dataset to Transformers we transform this csv to dataset

In [6]:
Dataset = Dataset.from_pandas(df)
Dataset

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})

## Split this data to train and test

In [7]:
dataset = Dataset.train_test_split(test_size = 0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

## IMP:<br> Hugginhface model work on the following structure: <br>input ids, attention_mask, label --> Numericals
## We can use sentiment column as label

In [8]:
# Checking the values from sentiment column
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

## Encode the label column

In [9]:
# We are using this to change to numerical
label2id={'negative':0, 'positive': 1}
# We will use this to change back to string
id2label={0:'negative',1:'positive'}

## Using label encoder

In [10]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# le.fit(df['sentiment'])

In [11]:
# we can use label Encoder but we try mapping here
dataset = dataset.map(lambda x: {'label': label2id[x['sentiment']]})

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'label'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment', 'label'],
        num_rows: 15000
    })
})

## Data Tokanization

In [13]:
from transformers import AutoTokenizer
import torch


In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [15]:
# # Load model directly
# from transformers import AutoModel
# model = AutoModel.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")

# OR use following method

model_ckpt = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True) # we use fast method for tokenization


config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [16]:
tokenizer(dataset['train'][0]['review'])

{'input_ids': [101, 1000, 14574, 2152, 1000, 2003, 1010, 3383, 1010, 1996, 2087, 2104, 9250, 18296, 2121, 17312, 1997, 1996, 3865, 1012, 2009, 2003, 2028, 1997, 1996, 2261, 3152, 1999, 1996, 6907, 2008, 2003, 4372, 2705, 7941, 2989, 2802, 1012, 2008, 2108, 2056, 1010, 2009, 2036, 16803, 4600, 2006, 1996, 3115, 18296, 2121, 5675, 1024, 1037, 2177, 1997, 2402, 2273, 1998, 2308, 2131, 2730, 2028, 2011, 2028, 24665, 15808, 8462, 2135, 2127, 1996, 2345, 24419, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 3114, 2339, 1000, 14574, 2152, 1000, 4832, 2682, 2087, 5691, 1999, 2049, 6907, 2003, 2008, 2009, 3632, 2062, 2058, 1011, 1996, 1011, 2327, 1012, 12578, 1010, 1996, 6359, 1010, 2038, 2204, 3114, 2000, 2907, 1037, 24665, 15979, 2114, 2010, 2280, 19846, 1012, 2027, 16175, 26869, 2094, 2032, 2004, 2002, 2768, 6248, 1999, 1037, 2611, 1005, 1055, 12625, 2282, 6457, 1010, 14855, 15499, 2012, 2010, 28629, 2007, 1037, 23426, 1010, 1998, 1010, 2000, 2327, 2009, 2125, 1010, 25216, 2010,

In [17]:
# now we can make function to tokenize the data

def tokenize(batch):
    temp = tokenizer(batch['review'], padding=True, truncation = True, max_length = 300)
    return temp

dataset = dataset.map(tokenize, batched=True, batch_size = None)
    

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [18]:
# Now we can check the tokenized data
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

### Now here 'label' is the target variable and 'input_ids', 'token_type_ids', 'attention_mask' are input features

## Model evaluation function

In [23]:
#  pip install evaluate

In [24]:
import evaluate
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [25]:
# Reference from huggingface library 
# https://huggingface.co/docs/transformers/main/en/tasks/sequence_classification#evaluate
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## Model Building

In [26]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(label2id), label2id=label2id, id2label=id2label)

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## set the arguments

In [27]:
import os

# Define the path for the output directory
output_dir = '/kaggle/working/output_dir'

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Verify that the directory was created
print(f"Directory '{output_dir}' created.")


Directory '/kaggle/working/output_dir' created.


In [28]:
args = TrainingArguments(
    output_dir='/kaggle/working/output_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)



In [29]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4408,0.340212,0.853533
2,0.3219,0.321123,0.866933
3,0.2862,0.311301,0.8718


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=1641, training_loss=0.34295090577138915, metrics={'train_runtime': 594.0273, 'train_samples_per_second': 176.76, 'train_steps_per_second': 2.762, 'total_flos': 882184338000000.0, 'train_loss': 0.34295090577138915, 'epoch': 3.0})

## To Evaluate the model

In [30]:
trainer.evaluate()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.3113008439540863,
 'eval_accuracy': 0.8718,
 'eval_runtime': 24.2074,
 'eval_samples_per_second': 619.645,
 'eval_steps_per_second': 9.708,
 'epoch': 3.0}

## Model Save and Load for Inference

In [31]:
trainer.save_model('tinybert-sentiment-analysis')

### Test the model

In [32]:
sample_data = data = ['this movie was horrible, the plot was really boring. acting was okay',
        'the movie is really sucked. there is not plot and acting was bad',
        'what a beautiful movie. great plot. acting was good. will see it again']

In [33]:
from transformers import pipeline
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

classifier = pipeline('text-classification', model = 'tinybert-sentiment-analysis', device = device)

classifier(data)


[{'label': 'negative', 'score': 0.9847753643989563},
 {'label': 'negative', 'score': 0.9842326641082764},
 {'label': 'positive', 'score': 0.98093181848526}]

## To download the model

In [36]:
import shutil

# Path of the folder you want to download
folder_path = '/kaggle/working/tinybert-sentiment-analysis'

# Path where the zip file will be saved
output_zip_path = '/kaggle/working/output_dir.zip'

# Zipping the folder
shutil.make_archive(output_zip_path.replace('.zip', ''), 'zip', folder_path)

print(f"Folder {folder_path} has been zipped and saved as {output_zip_path}")

Folder /kaggle/working/tinybert-sentiment-analysis has been zipped and saved as /kaggle/working/output_dir.zip
