In [None]:
%pip install transformers
%pip install torch

In [5]:
# necessary imports
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

In [6]:
import torch

class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

In [7]:
# dataset that we'll be working with
df = pd.read_csv('sorted_data_sheet_1.csv').rename(columns={"Input": "text", "Sentiment": "label"})
df['label'] = df['label'].transform(lambda sentiment: 0 if sentiment == 'Positive' else 1)
train = df.iloc[:int(df.shape[0] * .80)] # train on 80% of data
test = df.iloc[int(df.shape[0]*.80):] # test on 20% of data
train_texts = list(train['text'])
test_texts = list(test['text'])
train_labels = list(train['label'])
test_labels = list(test['label'])

In [8]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [None]:
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [10]:
train_texts_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_texts_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_texts_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [49]:
train_dataset = ReviewDataset(train_texts_encodings, train_labels)
val_dataset = ReviewDataset(val_texts_encodings, val_labels)
test_dataset = ReviewDataset(test_texts_encodings, test_labels)

In [12]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer", logging_dir="logs")



In [13]:
%pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting datasets>=2.0.0
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m9.0 MB/s[0

In [14]:
import evaluate

In [15]:
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [16]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [19]:
trainer.train()

***** Running training *****
  Num examples = 236
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 90
  Number of trainable parameters = 355361794


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=90, training_loss=0.7207984076605902, metrics={'train_runtime': 3528.5817, 'train_samples_per_second': 0.201, 'train_steps_per_second': 0.026, 'total_flos': 158508419617296.0, 'train_loss': 0.7207984076605902, 'epoch': 3.0})

In [20]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 74
  Batch size = 8


{'eval_loss': 0.7196896076202393,
 'eval_accuracy': 0.6081081081081081,
 'eval_runtime': 102.766,
 'eval_samples_per_second': 0.72,
 'eval_steps_per_second': 0.097,
 'epoch': 3.0}

In [55]:
# without any training, model works at 72% accuracy
# 0.7162162162162162 # I got this number when I used the base model and manually calculated accuracy
# it went down somehow

0.28378378378378377

In [24]:
output_2 = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 74
  Batch size = 8


In [44]:
# predicted_labels = [model.config.id2label[id] for id in predicted_label_classes.squeeze().tolist()]
output_2

PredictionOutput(predictions=array([[-0.8613183 ,  0.24755043],
       [-0.86128736,  0.24754797],
       [-0.8612461 ,  0.24767019],
       [-0.8612961 ,  0.24762008],
       [-0.86129063,  0.24761021],
       [-0.86132944,  0.24762067],
       [-0.8612844 ,  0.24767333],
       [-0.86123616,  0.24765946],
       [-0.8612639 ,  0.24763536],
       [-0.86131775,  0.24762605],
       [-0.8612436 ,  0.24766028],
       [-0.8612886 ,  0.24756084],
       [-0.8612789 ,  0.2476416 ],
       [-0.8612262 ,  0.2476017 ],
       [-0.8613261 ,  0.24757005],
       [-0.8612609 ,  0.24753945],
       [-0.8613705 ,  0.24762875],
       [-0.8613602 ,  0.24757008],
       [-0.8612317 ,  0.24766499],
       [-0.8613538 ,  0.24768442],
       [-0.8612973 ,  0.2475798 ],
       [-0.8613498 ,  0.24759641],
       [-0.86135185,  0.247552  ],
       [-0.86131907,  0.24760315],
       [-0.8612528 ,  0.24761754],
       [-0.8613193 ,  0.24763542],
       [-0.86131954,  0.24758255],
       [-0.8612528 ,  0.24

In [50]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_labels, output_2[1])

array([[29,  0],
       [ 0, 45]])

In [48]:
trainer.save_model('senti_model')

Saving model checkpoint to senti_model
Configuration saved in senti_model/config.json
Model weights saved in senti_model/pytorch_model.bin
