In [1]:
!pip install transformers[torch] accelerate -U

Collecting transformers[torch]
  Downloading transformers-4.42.3-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using

#First, we import the necessary libraries and modules.
#Pandas: for working with data.
#Sklearn: for splitting the data into training and test sets and for model evaluation.
#Transformers: for loading the BERT model and tokenizer and for training and prediction.
#torch: for GPU support and tensor management.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import TextClassificationPipeline
import torch
from torch.utils.data import Dataset, DataLoader

#This block checks if a GPU is available and sets the device accordingly.

In [3]:
# Select the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available, using the CPU instead.")

There are 1 GPU(s) available.
GPU: Tesla T4


#data.csv: dataset containing tweets and their associated emotional tags.
#header=None, names=['ID', 'Game', 'Sentiment', 'Text']: Specifies that the dataset has no header and provides column names.


In [4]:
# Read the raw CSV; the file has no header row, so column names are supplied here
data = pd.read_csv(
    'data.csv',
    names=['ID', 'Game', 'Sentiment', 'Text'],
    header=None,
)

#Converts text to lowercase and handles missing values by replacing them with an empty string.

In [5]:
# `data` is already loaded by the previous cell, so the CSV is not read a
# second time here; this cell only transforms the existing frame.

# Text normalisation applied to every row before tokenization
def preprocess_text(text):
    """Return `text` as a lowercase string, or "" when it is missing.

    "Missing" is whatever `pd.isna` reports as NA (for example None or NaN).
    Any other value is converted with `str()` and then lowercased.
    """
    return "" if pd.isna(text) else str(text).lower()

# Normalise every entry of the Text column element by element
data['Text'] = data['Text'].map(preprocess_text)

#It divides the data into training (80%) and test (20%) sets.

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

#Loads the tokenizer and BERT model with three output labels (negative, neutral, positive).
#If available, it transfers the model to the GPU.

In [7]:
# Pre-trained 'bert-base-uncased' checkpoint: its tokenizer, plus the encoder
# topped with a 3-way sequence-classification head
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)
# Move the model to the selected device; as the last expression of the cell,
# the returned module is also shown as the cell output
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

#Tokenizes the text data, truncating sequences longer than 512 tokens and padding shorter ones to the length of the longest sequence.

In [8]:
# Encode both splits with the same settings: truncate at 512 tokens and pad
# each split to a common length
train_encodings = tokenizer(
    train_data['Text'].tolist(),
    max_length=512,
    truncation=True,
    padding=True,
)
test_encodings = tokenizer(
    test_data['Text'].tolist(),
    max_length=512,
    truncation=True,
    padding=True,
)

#Converts sentiment labels (negative, neutral, positive) to numeric values (0, 1, 2).

In [9]:
# Convert sentiment labels to numerical format
def sentiment_to_label(sentiment):
    """Map a sentiment name to the integer class id used by the model.

    'Negative' -> 0, 'Neutral' -> 1, 'Positive' -> 2. Matching ignores case
    and surrounding whitespace.

    Any other value (a different class name, a typo, a missing value) still
    falls back to 2 so existing callers keep working, but a warning is now
    emitted so that rows being folded into the positive class are visible.

    NOTE(review): the evaluation report shows a noticeably larger support for
    the positive class than for the other two -- confirm whether the CSV
    contains extra sentiment values that end up in this fallback.
    """
    import warnings

    known = {'negative': 0, 'neutral': 1, 'positive': 2}
    key = sentiment.strip().lower() if isinstance(sentiment, str) else None
    if key in known:
        return known[key]

    warnings.warn(
        f"Unrecognised sentiment {sentiment!r} mapped to 2 ('Positive'); "
        "filter or relabel such rows before training."
    )
    return 2

# Integer class ids for each split, in the same row order as the split itself
train_labels = [sentiment_to_label(name) for name in train_data['Sentiment']]
test_labels = [sentiment_to_label(name) for name in test_data['Sentiment']]

#Defines a custom Dataset class that returns the tokenized inputs together with their labels.

In [10]:
class CustomDataset(Dataset):
    """Pairs tokenizer output with labels so it can be indexed sample by sample.

    `encodings` is a mapping from field name to a per-sample sequence, and
    `labels` is a sequence with one entry per sample. The dataset length is
    the number of labels.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field at position `idx` as a tensor,
        # followed by the matching label under the 'labels' key
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels for use by the Trainer
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CustomDataset(encodings=test_encodings, labels=test_labels)

#It specifies training parameters such as the number of epochs, batch size and logging.

In [11]:
# Define training arguments with optimization
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Accumulate gradients to effectively increase batch size
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # `eval_strategy` replaces the deprecated `evaluation_strategy` spelling
    eval_strategy="epoch",
    save_strategy="epoch",
    # Mixed precision training needs a CUDA device; enabling it only when one
    # is present keeps the CPU fallback chosen earlier in the notebook usable
    fp16=torch.cuda.is_available(),
)



#Initializes the Trainer with the model, training arguments, and dataset.

In [12]:
# Bind the model, its training configuration and both datasets into a Trainer;
# the test split doubles as the evaluation set
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)

#It trains the model.

In [13]:
# Train the model.
# The call is left as the last expression of the cell so that its return value
# is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.3781,0.330494
1,0.2127,0.251823


TrainOutput(global_step=7468, training_loss=0.4083721682980026, metrics={'train_runtime': 2105.7764, 'train_samples_per_second': 56.744, 'train_steps_per_second': 3.546, 'total_flos': 1.934115643168191e+16, 'train_loss': 0.4083721682980026, 'epoch': 1.9997322265363504})

#Evaluates the model on the test set.

In [14]:
# Evaluate the model on the evaluation dataset given to the Trainer
eval_result = trainer.evaluate()
# Display the metrics instead of discarding them silently
eval_result

#It predicts sentiment labels for the test set and extracts the predicted label with the highest score.

In [15]:
# Run the model over the test split, then pick the highest-scoring class for
# each sample along the last axis
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=-1)

#Prints a classification report showing the precision, recall, and F1 score for each sentiment class.

In [16]:
# Per-class precision / recall / F1; class names are listed in label-id order (0, 1, 2)
print(
    classification_report(
        y_true=test_labels,
        y_pred=preds,
        target_names=['Negative', 'Neutral', 'Positive'],
    )
)

              precision    recall  f1-score   support

    Negative       0.95      0.91      0.93      4519
     Neutral       0.92      0.90      0.91      3596
    Positive       0.91      0.95      0.93      6822

    accuracy                           0.92     14937
   macro avg       0.93      0.92      0.92     14937
weighted avg       0.93      0.92      0.92     14937



#Creates a pipeline for sentiment analysis using a trained model.

In [17]:
# Wrap the fine-tuned model and its tokenizer in an inference pipeline.
# `return_all_scores=True` asks for a score per label rather than only the best one;
# device 0 is the first GPU, -1 selects the CPU.
pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1,
)



#The pipeline applies sentiment analysis to the original dataset and stores the predictions.

In [18]:
# Predict sentiment for the original dataset.
# Inputs are truncated to 512 tokens, matching the tokenization used for
# training; without this, a longer text would exceed BERT's 512 position
# embeddings and fail at inference time.
data['Predicted_Sentiment'] = data['Text'].apply(
    lambda x: pipeline(x, truncation=True, max_length=512)[0]
)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


#Saves results to a CSV file, including predicted sentiment.

In [19]:
# Write the frame, including the prediction column, to disk without the row index
data.to_csv(path_or_buf='predicted_sentiments.csv', index=False)

#The model has been successfully trained.

Training results:
Global Step: 7468
Training Loss: 0.4083
Train Runtime: 2105.7764 seconds
Train Samples per Second: 56.744
Train Steps per Second: 3.546
Total FLOPs: $1.9341 \times 10^{16}$
Epochs: 2.0

This model has performed very well with an overall accuracy of 92%.

Detailed performance metrics:
Negative sentiment:
Precision: 0.95
Recall: 0.91
F1 score: 0.93
Support: 4519

Neutral sentiment:
Precision: 0.92
Recall: 0.90
F1 score: 0.91
Support: 3596

Positive sentiment:
Precision: 0.91
Recall: 0.95
F1 score: 0.93
Support: 6822

Overall:
Accuracy: 0.92

Macro Average:
Precision: 0.93
Recall: 0.92
F1 score: 0.92

Weighted Average:
Precision: 0.93
Recall: 0.92
F1 score: 0.92
This model has high precision and recall for the negative and positive classes, which shows that it identifies these sentiments reliably.

The predicted_sentiments.csv file contains the original data along with model predictions for sentiment analysis.

ID: A unique identifier for each record.
Game: Game name mentioned in the text.
Sentiment: The actual sentiment label assigned to the text (eg, positive, negative, neutral).
Text: The text content of the tweet or comment.
Predicted_Sentiment: model prediction for sentiment in a detailed format, showing scores for each label (negative, neutral, positive).

Example of predictions in CSV file:
ID,Game,Sentiment,Text,Predicted_Sentiment
2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,","[{'label': 'LABEL_0', 'score': 0.00022317680122796446}, {'label': 'LABEL_1', 'score': 0.00030716147739440203}, {'label': 'LABEL_2', 'score': 0.9994696974754333}]"
2401,Borderlands,Positive,"i am coming to the borders and i will kill you all,","[{'label': 'LABEL_0', 'score': 0.00018267231644131243}, {'label': 'LABEL_1', 'score': 0.0004388584347907454}, {'label': 'LABEL_2', 'score': 0.9993784427642822}]"
2401,Borderlands,Positive,"im getting on borderlands and i will kill you all,","[{'label': 'LABEL_0', 'score': 0.00019511835125740618}, {'label': 'LABEL_1', 'score': 0.0003024300967808813}, {'label': 'LABEL_2', 'score': 0.999502420425415}]"
2401,Borderlands,Positive,"im coming on borderlands and i will murder you all,","[{'label': 'LABEL_0', 'score': 0.00022581385564990342}, {'label': 'LABEL_1', 'score': 0.00030427431920543313}, {'label': 'LABEL_2', 'score': 0.9994699358940125}]"
2401,Borderlands,Positive,"im getting on borderlands 2 and i will murder you me all,","[{'label': 'LABEL_0', 'score': 0.0001595183421159163}, {'label': 'LABEL_1', 'score': 0.0003770108160097152}, {'label': 'LABEL_2', 'score': 0.9994634985923767}]"
2401,Borderlands,Positive,"im getting into borderlands and i can murder you all,","[{'label': 'LABEL_0', 'score': 0.0002854418125934899}, {'label': 'LABEL_1', 'score': 0.000294297729851678}, {'label': 'LABEL_2', 'score': 0.9994202852249146}]"
2402,Borderlands,Positive,so i spent a few hours making something for fun. . . if you don't know i am a huge @borderlands fan and maya is one of my favorite characters. so i decided to make myself a wallpaper for my pc. . here is the original image versus the creation i made :) enjoy! pic.twitter.com/mlsi5wf9jg,"[{'label': 'LABEL_0', 'score': 0.0001660394627833739}, {'label': 'LABEL_1', 'score': 0.0003505792119540274}, {'label': 'LABEL_2', 'score': 0.9994833469390869}]"
2402,Borderlands,Positive,"so i spent a couple of hours doing something for fun... if you don't know that i'm a huge @ borderlands fan and maya is one of my favorite characters, i decided to make a wallpaper for my pc.. here's the original picture compared to the creation i made:) have fun! pic.twitter.com / mlsi5wf9jg","[{'label': 'LABEL_0', 'score': 0.00017550277698319405}, {'label': 'LABEL_1', 'score': 0.0003316322108730674}, {'label': 'LABEL_2', 'score': 0.999492883682251}]"
2402,Borderlands,Positive,so i spent a few hours doing something for fun... if you don't know i'm a huge @ borderlands fan and maya is one of my favorite characters.,"[{'label': 'LABEL_0', 'score': 0.0001786408683983609}, {'label': 'LABEL_1', 'score': 0.0003270627639722079}, {'label': 'LABEL_2', 'score': 0.9994943141937256}]"
Each entry in the Predicted_Sentiment column is a JSON-like string containing a list of dictionaries. Each dictionary represents a label (sentiment category) with its corresponding score. The labels are:
LABEL_0: negative
LABEL_1: Neutral
LABEL_2: Positive
Each score is the model's confidence for that label. The label with the highest score is the predicted sentiment for that text.

For example, in the first row:
"[{'label': 'LABEL_0', 'score': 0.00022317680122796446}, {'label': 'LABEL_1', 'score': 0.00030716147739440203}, {'label': 'LABEL_2', 'score': 0.9994696974754333}]"
LABEL_0 (negative): score = 0.00022317680122796446
LABEL_1 (neutral): score = 0.00030716147739440203
LABEL_2 (positive): score = 0.9994696974754333
Since LABEL_2 has the highest score, the model predicts a positive sentiment.

The CSV file provides a detailed view of the model's sentiment prediction for each text input. This format is useful for analyzing the model's performance and understanding its confidence in its predictions across the different sentiment classes.
