In [None]:
import torch

# Pick the compute device for the rest of the notebook: CUDA when PyTorch can
# see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print("No GPU available, using the CPU instead")
    device = torch.device("cpu")
else:
    device = torch.device('cuda')

    gpu_count = torch.cuda.device_count()
    print('there are %d GPU(s) available.' % gpu_count)

    # Only the name of device 0 is reported.
    first_gpu_name = torch.cuda.get_device_name(0)
    print('we will use the GPU: ', first_gpu_name)

there are 1 GPU(s) available.
we will use the GPU:  Tesla T4


In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset CSV paths used by this
# notebook live under that mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Path of the training CSV on the mounted Google Drive; it is read into
# `train_df` with pd.read_csv in a later cell.
train_data = '/content/gdrive/MyDrive/datasetML/train2.csv'

In [None]:
# Path of the test CSV on the mounted Google Drive; it is read into
# `test_df` with pd.read_csv in a later cell.
test_data = '/content/gdrive/MyDrive/datasetML/test2.csv'

In [None]:
!pip install transformers



In [None]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=36b38b5955c71a6767ecef85090ecda612f244f1c8272bfda9016cfb4872ff88
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
import pandas as pd
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
# Read both splits with the same parser options. `on_bad_lines='skip'` is
# passed to pandas for both files, so malformed rows are handled identically.
csv_read_options = {'on_bad_lines': 'skip'}
train_df = pd.read_csv(train_data, **csv_read_options)
test_df = pd.read_csv(test_data, **csv_read_options)


In [None]:
# Raw label string that maps to class 0; every other value maps to class 1.
NEGATIVE_LABEL = '__label__1'
# Sample only 10k from train data and 2k from test data
TRAIN_SAMPLE_SIZE = 10000
TEST_SAMPLE_SIZE = 2000
SAMPLE_SEED = 42


def binarize_and_sample(df, n_rows, seed=SAMPLE_SEED):
    """Encode the `label` column as 0/1 and draw a reproducible random sample.

    Args:
        df: DataFrame with a `label` column holding either the raw label
            strings or the already-encoded integers.
        n_rows: number of rows to sample (passed to `DataFrame.sample`, which
            raises if the frame has fewer rows).
        seed: `random_state` passed to `DataFrame.sample`.

    Returns:
        A new DataFrame of `n_rows` rows whose `label` column is 0 where the
        raw label equals `NEGATIVE_LABEL` and 1 otherwise.
    """
    labels = df['label']
    # Idempotence guard: once the column is integer-coded, comparing it to the
    # raw label string again would turn every label into 1 when this cell is
    # re-run, so the encoding step is skipped in that case.
    if not pd.api.types.is_integer_dtype(labels):
        df = df.assign(label=(labels != NEGATIVE_LABEL).astype('int64'))
    return df.sample(n=n_rows, random_state=seed)


train_df = binarize_and_sample(train_df, TRAIN_SAMPLE_SIZE)
test_df = binarize_and_sample(test_df, TEST_SAMPLE_SIZE)


In [None]:
# Pull the review texts and their encoded labels out of each frame as plain
# value arrays, which is what the tokenization and tensor cells consume.
train_sentences = train_df.loc[:, 'reviews'].values
train_labels = train_df.loc[:, 'label'].values

test_sentences = test_df.loc[:, 'reviews'].values
test_labels = test_df.loc[:, 'label'].values


In [None]:
# Tokenizer matching the multilingual uncased BERT checkpoint that is
# fine-tuned later in the notebook.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-uncased',
    do_lower_case=True,
)

# Containers for the token-id sequences of the train and test sentences.
train_input_ids, test_input_ids = [], []


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
def encode_sentences(sentences):
    """Tokenize every sentence into a list of token ids.

    Args:
        sentences: iterable of raw text strings.

    Returns:
        A new list with one token-id list per sentence, produced by
        `tokenizer.encode(..., add_special_tokens=True)`. Sequences are not
        padded or truncated here.
    """
    return [
        tokenizer.encode(sent, add_special_tokens=True)
        for sent in sentences
    ]


# Build the lists from scratch on every run. Appending to lists created in
# another cell made this cell non-idempotent: running it twice doubled the
# encoded data and broke the one-to-one alignment with the labels.
train_input_ids = encode_sentences(train_sentences)
test_input_ids = encode_sentences(test_sentences)



In [None]:
# Every sequence is padded with 0 / truncated at the end to this many token ids.
MAX_LEN = 64
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype='long', value=0, truncating='post', padding='post')
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype='long', value=0, truncating='post', padding='post')

# Attention masks: 1 where the token id is > 0 (a real token), 0 where it is
# the padding value. Computed with one array comparison per split instead of a
# Python loop over every token.
train_attention_masks = (train_input_ids > 0).astype('int64')
test_attention_masks = (test_input_ids > 0).astype('int64')

import torch


def to_label_tensor(values):
    """Return `values` as a new tensor, whether it is an array or a tensor.

    Calling `torch.tensor()` on something that is already a tensor (which
    happens when this cell is re-run) emits a copy-construct UserWarning, so
    tensors are copied with `clone().detach()` instead.
    """
    if torch.is_tensor(values):
        return values.clone().detach()
    return torch.tensor(values)


train_inputs = torch.tensor(train_input_ids)
train_masks = torch.tensor(train_attention_masks)
train_labels = to_label_tensor(train_labels)

test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)
test_labels = to_label_tensor(test_labels)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of examples per batch for both loaders.
batch_size = 32

# The datasets get their own names. Binding them to `train_data` / `test_data`
# overwrote the CSV path strings defined at the top of the notebook, so the
# pd.read_csv cell could not be re-run afterwards.
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)

# Training batches are drawn in random order; test batches keep dataset order.
train_sampler = RandomSampler(train_dataset)
test_sampler = SequentialSampler(test_dataset)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)


In [None]:
import torch
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import datetime

# The labels are encoded as 0/1 earlier in the notebook, so the classification
# head needs exactly two outputs. With num_labels=3 the third class never
# receives a training target but can still be picked by argmax at inference.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# NOTE(review): `transformers.AdamW` is deprecated in favour of
# `torch.optim.AdamW` — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8)

# Number of full passes over the training data.
epochs = 10

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * epochs steps.
total_steps = len(train_dataloader) * epochs

# Linear schedule over `total_steps` steps with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring column matches the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of integer class ids; flattened before comparing.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: duration in seconds; rounded to the nearest whole second.

    Returns:
        The string form of the matching `datetime.timedelta`.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import random
import time
import numpy as np



# The test split doubles as the validation set. It was already tokenized,
# padded and converted to tensors in an earlier cell (`test_inputs`,
# `test_masks`, `test_labels`), so those tensors are reused here instead of
# repeating the tokenization. Reuse also avoids calling torch.tensor() on
# `test_labels`, which is already a tensor and triggered a copy-construct
# UserWarning.
test_input_ids = test_inputs
test_attention_masks = test_masks

validation_batch_size = 32

validation_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Validation batches are served in dataset order.
validation_sampler = SequentialSampler(validation_data)

validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=validation_batch_size)


# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value.
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

# Average training loss of each completed epoch, in epoch order.
loss_values = []

for epoch_i in range(0, epochs):

    # ===================================
    #              Training
    # ===================================

    print("======= Epoch {:} / {:} =======".format(epoch_i + 1, epochs))
    print("Training...")

    t0 = time.time()

    # Sum of the per-batch losses for this epoch.
    total_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress line every 40 batches (not for the very first batch).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            print("Batch {:>5,} of {:>5,}.     Elapsed: {:}".format(step, len(train_dataloader), elapsed))

        # Each batch is (input ids, attention mask, labels); move all to `device`.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # Labels were passed, so the first output is used as the loss.
        loss = outputs[0]

        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    loss_values.append(avg_train_loss)

    print("   Average training loss: {0:.2f}".format(avg_train_loss))
    print("   Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ===================================
    #             Validation
    # ===================================

    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Accuracy is accumulated per example, not per batch: a plain mean of the
    # batch accuracies gives every example in a smaller final batch more
    # weight than the others.
    eval_accuracy, nb_eval_examples = 0, 0

    for batch in validation_dataloader:

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        # No labels are passed here, so the first output is the logits.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight this batch's accuracy by its size (number of correct predictions).
        batch_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

    print("   Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_examples))
    print("   Validation took: {:}".format(format_time(time.time() - t0)))

print("Training complete!")


  test_labels = torch.tensor(test_labels)


Training...
Batch    40 of   313.     Elapsed: 0:00:14
Batch    80 of   313.     Elapsed: 0:00:29
Batch   120 of   313.     Elapsed: 0:00:43
Batch   160 of   313.     Elapsed: 0:00:58
Batch   200 of   313.     Elapsed: 0:01:12
Batch   240 of   313.     Elapsed: 0:01:27
Batch   280 of   313.     Elapsed: 0:01:41
   Average training loss: 0.16
   Training epoch took: 0:01:53
Running Validation...
   Accuracy: 0.89
   Validation took: 0:00:07
Training...
Batch    40 of   313.     Elapsed: 0:00:14
Batch    80 of   313.     Elapsed: 0:00:29
Batch   120 of   313.     Elapsed: 0:00:43
Batch   160 of   313.     Elapsed: 0:00:58
Batch   200 of   313.     Elapsed: 0:01:12
Batch   240 of   313.     Elapsed: 0:01:26
Batch   280 of   313.     Elapsed: 0:01:41
   Average training loss: 0.14
   Training epoch took: 0:01:53
Running Validation...
   Accuracy: 0.91
   Validation took: 0:00:07
Training...
Batch    40 of   313.     Elapsed: 0:00:14
Batch    80 of   313.     Elapsed: 0:00:29
Batch   120 of

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Put the model in evaluation mode before scoring the test split.
model.eval()

# One array of logits / labels is collected per batch.
predictions, true_labels = [], []

print("Predicting labels for {:,} test sentences".format(len(test_input_ids)))

for batch in test_dataloader:
    # Each batch is (input ids, attention mask, labels); move all to `device`.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Keep the logits and labels on the CPU as NumPy arrays.
    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

# Stack the per-batch arrays, then take the highest-scoring class per example.
flat_predictions = np.concatenate(predictions, axis=0).argmax(axis=1).flatten()
flat_true_labels = list(np.concatenate(true_labels, axis=0))

accuracy = accuracy_score(flat_true_labels, flat_predictions)
print("Accuracy:", accuracy)

print(classification_report(flat_true_labels, flat_predictions))

from sklearn.metrics import matthews_corrcoef
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print("MCC: %.3f" % mcc)


Predicting labels for 2,000 test sentences
Accuracy: 0.899
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1001
           1       0.90      0.90      0.90       999

    accuracy                           0.90      2000
   macro avg       0.90      0.90      0.90      2000
weighted avg       0.90      0.90      0.90      2000

MCC: 0.798


In [None]:

def predict_sentiment(text):
    """Classify one piece of text with the fine-tuned model.

    The text is tokenized, padded/truncated to `MAX_LEN`, and run through the
    notebook-level `model` on `device` without tracking gradients.

    Args:
        text: the raw string to classify.

    Returns:
        "Negative" when the highest logit belongs to class 0, otherwise
        "Positive".
    """
    token_ids = tokenizer.encode(text, add_special_tokens=True)

    padded = pad_sequences([token_ids], maxlen=MAX_LEN, dtype='long', value=0, truncating='post', padding='post')

    # 1 for real tokens, 0 for the padding value.
    mask_row = [int(token_id > 0) for token_id in padded[0]]

    input_ids = torch.tensor(padded).to(device)
    attention_mask = torch.tensor([mask_row]).to(device)

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)

    scores = outputs[0].detach().cpu().numpy()

    return "Negative" if np.argmax(scores) == 0 else "Positive"

# Quick manual check: classify one hand-written review and print the result.
input_text = "DAMN I LOVE THIS PRODUCT"
print("Predicted sentiment:", predict_sentiment(input_text))


Predicted sentiment: Positive
