In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [3]:
# Load the training split. The CSV has no header row, so the two columns are
# named right after reading.
train_df = (
    pd.read_csv("/kaggle/input/datafiles/train_data.csv", header=None)
    .set_axis(["Text", "Emotion"], axis="columns")
)
train_df.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
# Count how many training rows carry each emotion label.
train_df['Emotion'].value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: Emotion, dtype: int64

In [5]:
# Load the validation split. The CSV has no header row, so the two columns are
# named right after reading.
val_df = (
    pd.read_csv("/kaggle/input/datafiles/val_data.csv", header=None)
    .set_axis(["Text", "Emotion"], axis="columns")
)
val_df.head()

Unnamed: 0,Text,Emotion
0,im feeling quite sad and sorry for myself but ...,sadness
1,i feel like i am still looking at a blank canv...,sadness
2,i feel like a faithful servant,love
3,i am just feeling cranky and blue,anger
4,i can have for a treat or if i am feeling festive,joy


In [6]:
# Count how many validation rows carry each emotion label.
val_df['Emotion'].value_counts()

joy         704
sadness     550
anger       275
fear        212
love        178
surprise     81
Name: Emotion, dtype: int64

In [7]:
# data_dict maps class id -> emotion name; my_dict is its inverse (name -> id).
data_dict = {0:'joy',1:'sadness',2:'anger',3:'fear',4:'love',5:'surprise'}
my_dict = {label: class_id for class_id, label in data_dict.items()}

# Replace emotion names by their class ids. Values that are not emotion names
# (for example ids left by a previous run of this cell) pass through unchanged,
# so re-running the cell does not corrupt the column.
train_df['Emotion'] = train_df['Emotion'].map(lambda label: my_dict.get(label, label))

# Fail loudly if anything other than a known class id remains (misspelled or
# missing labels), instead of letting it reach the training step.
unknown_train_labels = set(train_df['Emotion']) - set(data_dict)
assert not unknown_train_labels, f"Unmapped emotion labels in train_df: {unknown_train_labels}"
train_df.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,1
1,i can go from feeling so hopeless to so damned...,1
2,im grabbing a minute to post i feel greedy wrong,2
3,i am ever feeling nostalgic about the fireplac...,4
4,i am feeling grouchy,2


In [8]:
# Re-check the training label distribution now that labels are integer ids.
train_df['Emotion'].value_counts()

0    5362
1    4666
2    2159
3    1937
4    1304
5     572
Name: Emotion, dtype: int64

In [9]:
# Replace emotion names in the validation labels by their class ids, using the
# name -> id mapping in my_dict. Values that are not emotion names pass through
# unchanged, so re-running the cell is harmless.
val_df['Emotion'] = val_df['Emotion'].map(lambda label: my_dict.get(label, label))

# Fail loudly if anything other than a known class id remains.
unknown_val_labels = set(val_df['Emotion']) - set(my_dict.values())
assert not unknown_val_labels, f"Unmapped emotion labels in val_df: {unknown_val_labels}"
val_df.head()

Unnamed: 0,Text,Emotion
0,im feeling quite sad and sorry for myself but ...,1
1,i feel like i am still looking at a blank canv...,1
2,i feel like a faithful servant,4
3,i am just feeling cranky and blue,2
4,i can have for a treat or if i am feeling festive,0


In [10]:
# Re-check the validation label distribution now that labels are integer ids.
val_df['Emotion'].value_counts()

0    704
1    550
2    275
3    212
4    178
5     81
Name: Emotion, dtype: int64

In [11]:
# Persist both encoded frames to the working directory, without the index column.
for frame, out_path in ((train_df, "train.csv"), (val_df, "val.csv")):
    frame.to_csv(out_path, index=False)

In [12]:
# Split name -> CSV file path, in the form passed to load_dataset.
data_files = dict(train="train.csv", val="val.csv")

In [13]:
# Load the CSV files listed in data_files with the generic 'csv' builder; the
# result is keyed by the split names used in data_files.
dataset = load_dataset('csv', data_files=data_files)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-f41f6ad7623d60e5/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-f41f6ad7623d60e5/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Display the splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'Emotion'],
        num_rows: 16000
    })
    val: Dataset({
        features: ['Text', 'Emotion'],
        num_rows: 2000
    })
})

In [15]:
# Name of the pretrained checkpoint; it is used both for the tokenizer here and
# for the classification model loaded later.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [16]:
def tokenize_function(example):
    """Tokenize the ``Text`` field of ``example`` with the notebook's tokenizer.

    Truncation is enabled; no padding argument is passed here.

    Args:
        example: Mapping with a ``"Text"`` entry (a batch of texts when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = example["Text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [17]:
# Apply tokenize_function to every split, passing batches of rows at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [18]:
# Inspect the columns present after tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Text', 'Emotion', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    val: Dataset({
        features: ['Text', 'Emotion', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [19]:
# Keep only what the model consumes: drop the raw text and rename the target
# column to "labels".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["Text"])
    .rename_column("Emotion", "labels")
)
# Have the datasets return torch tensors.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [20]:
# Collator built from the tokenizer; it is handed to the DataLoaders as
# collate_fn so each batch is padded when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
# Both loaders share the batch size and the padding collator; only the
# training loader shuffles.
loader_kwargs = dict(batch_size=4, collate_fn=data_collator)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(tokenized_datasets["val"], **loader_kwargs)

In [22]:
# Load the checkpoint as a sequence classifier; num_labels=6 matches the six
# emotion classes listed in data_dict.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=6)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [23]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to
# the defaults of the transformers implementation (1e-6 and 0.0), so the
# optimisation settings stay the same as before.
# NOTE(review): the two implementations apply eps at slightly different points,
# so runs are equivalent in setup but not bit-identical.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [24]:
# The scheduler is stepped once per batch, so the total number of steps is
# (batches per epoch) x (epochs).
num_epochs = 4
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * num_epochs

# "linear" schedule with no warmup steps over the whole run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

16000


In [25]:
# Prefer CUDA when it is available, otherwise fall back to the CPU, then move
# the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [26]:
# Fine-tune the model: num_epochs passes over train_dataloader, with one
# optimizer step and one scheduler step per batch.
# One progress-bar tick per optimisation step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before the loop.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes "labels", and the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Order matters: apply the update, advance the LR schedule, then clear
        # the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/16000 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [27]:
# Collect gold labels (val) and predicted class ids (val_pred) over the whole
# validation loader.
val = []
val_pred = []
model.eval()
# No gradients are needed anywhere in this loop.
with torch.no_grad():
    for batch in val_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # extend() appends in place; rebuilding the lists with `+` on every
        # batch would copy everything collected so far each time.
        val_pred.extend(outputs.logits.argmax(dim=-1).flatten().tolist())
        val.extend(batch['labels'].tolist())

In [28]:
# Turn class ids back into emotion names for the report. Values that are not
# keys of data_dict are kept as they are.
val, val_pred = (
    [data_dict.get(class_id, class_id) for class_id in ids]
    for ids in (val, val_pred)
)

In [29]:
# Overall accuracy first, then the per-class precision/recall/F1 table.
val_accuracy = accuracy_score(val, val_pred)
cr_val = classification_report(val, val_pred)
print("Validation accuracy:", val_accuracy)
print(cr_val)

Validation accuracy: 0.9455
              precision    recall  f1-score   support

       anger       0.97      0.93      0.95       275
        fear       0.88      0.93      0.90       212
         joy       0.95      0.98      0.96       704
        love       0.91      0.87      0.89       178
     sadness       0.97      0.97      0.97       550
    surprise       0.93      0.79      0.85        81

    accuracy                           0.95      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.95      0.95      0.95      2000



In [30]:
# Predict one emotion per line of the test file and write the emotion names,
# one per line, to test_prediction.csv.

# Switch to inference mode here as well, so this cell does not depend on an
# earlier cell having called model.eval().
model.eval()

# The input is opened first: if it is missing, the error is raised before the
# output file is created or truncated.
with open("/kaggle/input/train-data/test_data.txt", "r", encoding="utf-8") as f2, \
        open("test_prediction.csv", "w", encoding="utf-8") as f1, \
        torch.no_grad():
    for line in f2:
        # Drop the line terminator so only the sentence itself is tokenized.
        single_tokenized_test = tokenizer(line.rstrip("\n"), truncation=True, return_tensors='pt')
        single_tokenized_test = {k: v.to(device) for k, v in single_tokenized_test.items()}
        output = model(**single_tokenized_test)
        # One input sentence, so the flattened argmax holds a single class id.
        single_test_pred = output.logits.argmax(dim=-1).flatten().tolist()
        f1.write(str(data_dict[single_test_pred[0]]) + "\n")

In [31]:
pip install emoji

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [32]:
from ipywidgets import widgets

# Build the demo UI: prompt label, text box, trigger button and result label,
# shown in that order.
lbl1 = widgets.Label("Input Sentence:")
text = widgets.Text()
btn = widgets.Button(description="The predicted emotion")
lbl2 = widgets.Label()
display(lbl1, text, btn, lbl2)

# Class id -> emoji shown in the result label; ids follow data_dict.
emotion_dict = {
    0: '\U0001F601',  # joy
    1: '\U0001F62D',  # sadness
    2: '\U0001F621',  # anger
    3: '\U0001F631',  # fear
    4: '\U0001F60D',  # love
    5: '\U0001F632',  # surprise
}
def predictedemotion(b):
    """Button callback: classify the sentence in the text box and show its emoji.

    Reads ``text.value``, runs the fine-tuned model on it and writes the emoji
    for the predicted class into ``lbl2``. Blank input clears the result label
    instead of producing a prediction for an empty string.

    Args:
        b: The argument supplied by ``Button.on_click``; it is not used.
    """
    inp = text.value.strip()
    if not inp:
        # Nothing to classify: remove any stale result and stop.
        lbl2.value = ''
        return

    # Make sure inference mode is on regardless of which cells ran before.
    model.eval()
    single_tokenized_test = tokenizer(inp, truncation=True, return_tensors='pt')
    single_tokenized_test = {k: v.to(device) for k, v in single_tokenized_test.items()}
    with torch.no_grad():
        output = model(**single_tokenized_test)
    # One input sentence, so the flattened argmax holds a single class id.
    single_test_pred = output.logits.argmax(dim=-1).flatten().tolist()
    lbl2.value = emotion_dict[single_test_pred[0]]
# Run predictedemotion each time the button is clicked.
btn.on_click(predictedemotion)

Label(value='Input Sentence:')

Text(value='')

Button(description='The predicted emotion', style=ButtonStyle())

Label(value='')