# **1. Setup environment and google drive**



In [2]:
from google.colab import drive
import os
import torch
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/NLP')
torch.cuda.is_available()

Mounted at /content/drive


True

Setup necessary packes

In [3]:
!pip install datasets transformers[torch] huggingface_hub
!pip install accelerate -U
!apt-get install git-lfs

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[torch]
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━

# **2. Data Processing**

Here we are creating the necessary splits in the data and replacing necessary variables with new names so we can later use them in our model.

In [4]:
from tables import tests
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

imdb_dataset = pd.read_csv('/content/drive/My Drive/NLP/IMDB Dataset.csv')
imdb_dataset.rename(columns = {'review':'text'}, inplace = True)
imdb_dataset.rename(columns = {'sentiment':'label'}, inplace = True)

imdb_dataset['label'].replace('positive', 1, inplace=True)
imdb_dataset['label'].replace('negative', 0, inplace=True)

train, test = train_test_split(imdb_dataset,test_size=0.1)
train, val = train_test_split(train,test_size=(1/9))

train.to_csv('/content/drive/My Drive/NLP/train.csv', index = False)
val.to_csv('/content/drive/My Drive/NLP/val.csv', index = False)
test.to_csv('/content/drive/My Drive/NLP/test.csv', index = False)

dataset = load_dataset('csv', data_files={'train': '/content/drive/My Drive/NLP/train.csv',
                                          'val': '/content/drive/My Drive/NLP/val.csv',
                                          'test': '/content/drive/My Drive/NLP/test.csv'})



Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Creates our datasets individually instead of in an entire dict

In [22]:

train_dataset = dataset["train"].shuffle(seed=42).select([i for i in list(range(40000))])
val_dataset   = dataset["val"].shuffle(seed=42).select([i for i in list(range(5000))])
test_dataset  = dataset["test"].shuffle(seed=42).select([i for i in list(range(5000))])

Set up our tokenizer

In [6]:
# Set DistilBERT tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Here we preprocess the data using the map function of the dataset to have the correct inputs for the model

In [7]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val   = val_dataset.map(preprocess_function, batched=True)
tokenized_test  = test_dataset.map(preprocess_function, batched=True)



Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Creates the necessary tensors so we run this faster and adds the padding

In [23]:

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Use the DistilBERT model

In [9]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Create our evaluation metrics

In [10]:
# Define the evaluation metrics
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}


Adds the hugging face API so we can save the model

In [11]:
# Log in to your Hugging Face account
# Get your API token here https://huggingface.co/settings/token
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# **3. Create the Model and Evaluate**

Creates a new model with the necessary arguments

In [12]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

repo_name = "Sentiment-Analysis"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Train our model

In [13]:
# Train the model
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.3224
1000,0.2455
1500,0.2288
2000,0.2146
2500,0.2004
3000,0.1324
3500,0.1484
4000,0.1278
4500,0.133
5000,0.1232


TrainOutput(global_step=5000, training_loss=0.18763758392333985, metrics={'train_runtime': 3800.9892, 'train_samples_per_second': 21.047, 'train_steps_per_second': 1.315, 'total_flos': 1.0485961972573056e+16, 'train_loss': 0.18763758392333985, 'epoch': 2.0})

Evaluate on the val dataset

In [14]:
# Compute the evaluation metrics
trainer.evaluate()

  load_accuracy = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.21278105676174164,
 'eval_accuracy': 0.937,
 'eval_f1': 0.938126104890984,
 'eval_runtime': 89.1426,
 'eval_samples_per_second': 56.09,
 'eval_steps_per_second': 3.511,
 'epoch': 2.0}

Pushes our model to the hub

In [15]:
# Upload the model to the Hub
trainer.push_to_hub()

'https://huggingface.co/Benlitzen43/Sentiment-Analysis/tree/main/'

Predicts our test F1 scores using our model

In [19]:
trainer.predict(tokenized_test)

PredictionOutput(predictions=array([[ 1.7706089, -1.3731587],
       [ 3.489526 , -2.9561894],
       [ 3.575431 , -3.064788 ],
       ...,
       [-1.6761444,  2.0894654],
       [-2.6762483,  3.1080992],
       [-2.9708714,  3.3922803]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 1]), metrics={'test_loss': 0.23133352398872375, 'test_accuracy': 0.9356, 'test_f1': 0.9353153877059058, 'test_runtime': 88.3361, 'test_samples_per_second': 56.602, 'test_steps_per_second': 3.543})

Predicts our val F1 scores using our model

In [20]:
trainer.predict(tokenized_val)

PredictionOutput(predictions=array([[-2.9652977,  3.3470838],
       [-2.9097128,  3.3125756],
       [ 3.1212382, -2.801854 ],
       ...,
       [ 1.7138125, -1.2114831],
       [ 3.4473019, -2.982126 ],
       [ 3.4895256, -3.0307813]], dtype=float32), label_ids=array([1, 1, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.21278105676174164, 'test_accuracy': 0.937, 'test_f1': 0.938126104890984, 'test_runtime': 87.8736, 'test_samples_per_second': 56.9, 'test_steps_per_second': 3.562})

Predicts our train F1 scores using our model

In [21]:
trainer.predict(tokenized_train)

PredictionOutput(predictions=array([[-2.746052 ,  3.1731355],
       [-2.9837062,  3.3365796],
       [-2.4713664,  2.8521552],
       ...,
       [ 3.4872863, -2.9868793],
       [ 3.4608846, -2.9418588],
       [-1.5390328,  2.0253768]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 0, 1]), metrics={'test_loss': 0.07669205963611603, 'test_accuracy': 0.97865, 'test_f1': 0.9787392949611631, 'test_runtime': 689.2066, 'test_samples_per_second': 58.038, 'test_steps_per_second': 3.627})