In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [5]:
from datasets import Dataset, DatasetDict

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
path = '/content/drive/MyDrive/emotions_dataset/'

In [8]:
test_df = pd.read_csv(path + 'test.csv', encoding='ISO-8859-1')
train_df = pd.read_csv(path + 'train.csv', encoding='ISO-8859-1')

In [9]:
# 1. Load the dataset
train_df = train_df[['text', 'sentiment']].dropna()
test_df = test_df[['text', 'sentiment']].dropna()

# Convert text to string (in case it's float or something else)
train_df['text'] = train_df['text'].astype(str)
test_df['text'] = test_df['text'].astype(str)


# Encode labels to integers
label2id = {'negative': 0, 'neutral': 1, 'positive': 2}
id2label = {v: k for k, v in label2id.items()}
train_df['label'] = train_df['sentiment'].map(label2id)
test_df['label'] = test_df['sentiment'].map(label2id)

In [10]:
train_df

Unnamed: 0,text,sentiment,label
0,"I`d have responded, if I were going",neutral,1
1,Sooo SAD I will miss you here in San Diego!!!,negative,0
2,my boss is bullying me...,negative,0
3,what interview! leave me alone,negative,0
4,"Sons of ****, why couldn`t they put them on t...",negative,0
...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,0
27477,I`ve wondered about rake to. The client has ...,negative,0
27478,Yay good for both of you. Enjoy the break - y...,positive,2
27479,But it was worth it ****.,positive,2


In [11]:
len(train_df), len(test_df)

(27480, 3534)

In [12]:
# 3. Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label']])
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [13]:
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 27480
})

In [14]:
test_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 3534
})

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 27480
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 3534
    })
})

In [16]:
print(type(dataset['train'][0]['text']))  # should be str
print(dataset['train'][0]['text'])        # print one
print(dataset['train'][:2]['text'])       # should be list of 2 strings


<class 'str'>
 I`d have responded, if I were going
[' I`d have responded, if I were going', ' Sooo SAD I will miss you here in San Diego!!!']


In [18]:
import warnings
warnings.filterwarnings("ignore")

In [19]:
# 4. Load tokenizer
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(batch):
  texts = [str(t) for t in batch["text"]]
  return tokenizer(texts, padding=True, truncation=True) # Changed code

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [20]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/27480 [00:00<?, ? examples/s]

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [21]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 27480
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 3534
    })
})

In [22]:
label2id

{'negative': 0, 'neutral': 1, 'positive': 2}

In [25]:
# 5. Load model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# 6. Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    logging_dir="./logs"
)

In [34]:
# 7. Metrics function
from sklearn.metrics import classification_report

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    return classification_report(labels, preds, target_names=id2label.values(), output_dict=True)

In [35]:
# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 9. Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Negative,Neutral,Positive,Accuracy,Macro avg,Weighted avg
1,0.4696,0.503579,"{'precision': 0.8169934640522876, 'recall': 0.7492507492507493, 'f1-score': 0.7816571130797291, 'support': 1001.0}","{'precision': 0.7489768076398363, 'recall': 0.7678321678321678, 'f1-score': 0.7582872928176796, 'support': 1430.0}","{'precision': 0.8165217391304348, 'recall': 0.8513145965548504, 'f1-score': 0.833555259653795, 'support': 1103.0}",0.788625,"{'precision': 0.7941640036075196, 'recall': 0.7894658378792557, 'f1-score': 0.7911665551837346, 'support': 3534.0}","{'precision': 0.7893239305891837, 'recall': 0.7886247877758913, 'f1-score': 0.7883987125976871, 'support': 3534.0}"
2,0.3372,0.551502,"{'precision': 0.7543221110100091, 'recall': 0.8281718281718282, 'f1-score': 0.7895238095238095, 'support': 1001.0}","{'precision': 0.7402159244264508, 'recall': 0.7671328671328671, 'f1-score': 0.7534340659340659, 'support': 1430.0}","{'precision': 0.8887722980062959, 'recall': 0.7679057116953762, 'f1-score': 0.8239299610894941, 'support': 1103.0}",0.784663,"{'precision': 0.7944367778142519, 'recall': 0.7877368023333572, 'f1-score': 0.7889626121824564, 'support': 3534.0}","{'precision': 0.7905775466190685, 'recall': 0.7846632710809281, 'f1-score': 0.7856589685061571, 'support': 3534.0}"
3,0.1958,0.812416,"{'precision': 0.7550644567219152, 'recall': 0.8191808191808192, 'f1-score': 0.785816962146622, 'support': 1001.0}","{'precision': 0.7463917525773196, 'recall': 0.7594405594405594, 'f1-score': 0.7528596187175043, 'support': 1430.0}","{'precision': 0.8721047331319235, 'recall': 0.785131459655485, 'f1-score': 0.8263358778625954, 'support': 1103.0}",0.78438,"{'precision': 0.7911869808103861, 'recall': 0.7879176127589546, 'f1-score': 0.7883374862422405, 'support': 3534.0}","{'precision': 0.7880846768558901, 'recall': 0.7843803056027164, 'f1-score': 0.7851274779731869, 'support': 3534.0}"
4,0.1248,0.939824,"{'precision': 0.7812187812187812, 'recall': 0.7812187812187812, 'f1-score': 0.7812187812187812, 'support': 1001.0}","{'precision': 0.7419137466307277, 'recall': 0.76993006993007, 'f1-score': 0.755662319835278, 'support': 1430.0}","{'precision': 0.8512869399428027, 'recall': 0.8096101541251133, 'f1-score': 0.8299256505576208, 'support': 1103.0}",0.785512,"{'precision': 0.7914731559307705, 'recall': 0.7869196684246548, 'f1-score': 0.78893558387056, 'support': 3534.0}","{'precision': 0.7871834047648139, 'recall': 0.7855121675155631, 'f1-score': 0.786079544405632, 'support': 3534.0}"
5,0.0693,1.117684,"{'precision': 0.7811550151975684, 'recall': 0.7702297702297702, 'f1-score': 0.7756539235412475, 'support': 1001.0}","{'precision': 0.7395973154362416, 'recall': 0.7706293706293706, 'f1-score': 0.7547945205479452, 'support': 1430.0}","{'precision': 0.8486281929990539, 'recall': 0.8132366273798731, 'f1-score': 0.8305555555555556, 'support': 1103.0}",0.783814,"{'precision': 0.7897935078776213, 'recall': 0.7846985894130047, 'f1-score': 0.7870013332149162, 'support': 3534.0}","{'precision': 0.7853981969905343, 'recall': 0.7838143746462931, 'f1-score': 0.7843487605054126, 'support': 3534.0}"


Trainer is attempting to log a value of "{'precision': 0.8169934640522876, 'recall': 0.7492507492507493, 'f1-score': 0.7816571130797291, 'support': 1001.0}" of type <class 'dict'> for key "eval/negative" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7489768076398363, 'recall': 0.7678321678321678, 'f1-score': 0.7582872928176796, 'support': 1430.0}" of type <class 'dict'> for key "eval/neutral" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8165217391304348, 'recall': 0.8513145965548504, 'f1-score': 0.833555259653795, 'support': 1103.0}" of type <class 'dict'> for key "eval/positive" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.794164

TrainOutput(global_step=4295, training_loss=0.24088234773753547, metrics={'train_runtime': 1221.6318, 'train_samples_per_second': 112.473, 'train_steps_per_second': 3.516, 'total_flos': 3692549052063648.0, 'train_loss': 0.24088234773753547, 'epoch': 5.0})

In [36]:
# 10. Evaluate
eval_results = trainer.evaluate()
print("\nClassification Report:")
predictions = trainer.predict(tokenized_dataset["test"])
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)
print(classification_report(y_true, y_pred, target_names=id2label.values()))

Trainer is attempting to log a value of "{'precision': 0.8169934640522876, 'recall': 0.7492507492507493, 'f1-score': 0.7816571130797291, 'support': 1001.0}" of type <class 'dict'> for key "eval/negative" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7489768076398363, 'recall': 0.7678321678321678, 'f1-score': 0.7582872928176796, 'support': 1430.0}" of type <class 'dict'> for key "eval/neutral" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8165217391304348, 'recall': 0.8513145965548504, 'f1-score': 0.833555259653795, 'support': 1103.0}" of type <class 'dict'> for key "eval/positive" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.794164


Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.75      0.78      1001
     neutral       0.75      0.77      0.76      1430
    positive       0.82      0.85      0.83      1103

    accuracy                           0.79      3534
   macro avg       0.79      0.79      0.79      3534
weighted avg       0.79      0.79      0.79      3534



In [37]:
eval_results

{'eval_loss': 0.5035794377326965,
 'eval_negative': {'precision': 0.8169934640522876,
  'recall': 0.7492507492507493,
  'f1-score': 0.7816571130797291,
  'support': 1001.0},
 'eval_neutral': {'precision': 0.7489768076398363,
  'recall': 0.7678321678321678,
  'f1-score': 0.7582872928176796,
  'support': 1430.0},
 'eval_positive': {'precision': 0.8165217391304348,
  'recall': 0.8513145965548504,
  'f1-score': 0.833555259653795,
  'support': 1103.0},
 'eval_accuracy': 0.7886247877758913,
 'eval_macro avg': {'precision': 0.7941640036075196,
  'recall': 0.7894658378792557,
  'f1-score': 0.7911665551837346,
  'support': 3534.0},
 'eval_weighted avg': {'precision': 0.7893239305891837,
  'recall': 0.7886247877758913,
  'f1-score': 0.7883987125976871,
  'support': 3534.0},
 'eval_runtime': 6.0353,
 'eval_samples_per_second': 585.554,
 'eval_steps_per_second': 18.392,
 'epoch': 5.0}