In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("./twitter_multi_class_sentiment.csv")
df.head()

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


In [4]:
df['Words_per_tweet'] = df['text'].str.split().apply(len)
df.head()

Unnamed: 0,text,label,label_name,Words_per_tweet
0,i didnt feel humiliated,0,sadness,4
1,i can go from feeling so hopeless to so damned...,0,sadness,21
2,im grabbing a minute to post i feel greedy wrong,3,anger,10
3,i am ever feeling nostalgic about the fireplac...,2,love,18
4,i am feeling grouchy,3,anger,4


In [5]:
from transformers import AutoTokenizer

In [6]:
model_chkpt = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_chkpt)

In [7]:
from sklearn.model_selection import train_test_split


train, test = train_test_split(df, test_size=0.3, stratify=df['label_name'])
test, val = train_test_split(test, test_size=1/3, stratify=test['label_name'])


train.shape, test.shape, val.shape

((11200, 4), (3200, 4), (1600, 4))

In [8]:
from datasets import Dataset, DatasetDict

dataset = DatasetDict(
    {
        'train':Dataset.from_pandas(train,  preserve_index=False),
        'test':Dataset.from_pandas(test,preserve_index=False),
        'val':Dataset.from_pandas(val,preserve_index=False)
    }
)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name', 'Words_per_tweet'],
        num_rows: 11200
    })
    test: Dataset({
        features: ['text', 'label', 'label_name', 'Words_per_tweet'],
        num_rows: 3200
    })
    val: Dataset({
        features: ['text', 'label', 'label_name', 'Words_per_tweet'],
        num_rows: 1600
    })
})

In [9]:
def tokenize(batch):
    temp = tokenizer(batch['text'], padding=True, truncation=True)
    return temp


print(tokenize(dataset['test'][:2]))

{'input_ids': [[101, 1045, 2113, 2045, 2015, 2053, 9241, 2000, 2131, 2009, 2589, 2021, 2009, 2145, 5683, 1037, 2978, 6881, 2000, 2025, 2022, 9361, 2041, 1996, 14751, 8983, 4180, 102], [101, 1045, 2572, 2025, 3110, 2062, 1998, 2062, 13847, 8363, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [10]:
emotion_encoded = dataset.map(tokenize, batched=True, batch_size=None)

Map: 100%|██████████| 11200/11200 [00:00<00:00, 14761.00 examples/s]
Map: 100%|██████████| 3200/3200 [00:00<00:00, 16823.86 examples/s]
Map: 100%|██████████| 1600/1600 [00:00<00:00, 20145.19 examples/s]


In [11]:
label2id = {x['label_name']:x['label'] for x in dataset["train"]}
id2label = {v:k for k, v in label2id.items()}

In [12]:
from transformers import AutoModel
import torch

In [13]:
model = AutoModel.from_pretrained(model_chkpt)

In [14]:
from transformers import AutoModelForSequenceClassification, AutoConfig


num_labels = len(label2id)
device = torch.device("cude" if torch.cuda.is_available() else "cpu")
config = AutoConfig.from_pretrained(model_chkpt, label2id=label2id, id2label=id2label)

model = AutoModelForSequenceClassification.from_pretrained(model_chkpt, config=config).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "sadness",
    "1": "joy",
    "2": "love",
    "3": "anger",
    "4": "fear",
    "5": "surprise"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "anger": 3,
    "fear": 4,
    "joy": 1,
    "love": 2,
    "sadness": 0,
    "surprise": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [None]:
from transformers import TrainingArguments

batch_size = 16
training_dir = "bert_base_train_dir"

training_args = TrainingArguments(
    output_dir=training_dir,
    overwrite_output_dir=True,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    disable_tqdm=False
)



In [17]:
import evaluate
import numpy as np
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis = 1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<?, ?B/s]


In [18]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics="compute_metrics",
                  train_dataset= emotion_encoded['train'],
                  eval_dataset=emotion_encoded['val'],
                  tokenizer=tokenizer)

  trainer = Trainer(model=model, args=training_args,


In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
text = "helloadfadfa"
def pred_output():
    input_encoded = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**input_encoded)
    
    logits = outputs.logits

    pred = torch.argmax(logits, dim = 1).item()
    return pred, id2label[pred]
