This notebook trains a RoBERTa model that is used to classify people into the Six Thinking Hats.

Our dataset consists of texts that are labelled with different emotions. Each emotion is mapped to one of the hats. A model is trained to predict the emotion of a text, and people are then categorized into the hats by using that mapping.

This notebook only trains the model; the prediction and the categorization are handled in the main.py file.

In [None]:
# Importing dependencies
import numpy as np
import pandas as pd
import os

In [None]:
# Load the raw dataset from the CSV file next to this notebook
csv_path = 'dataset.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw dataset
data.head(n=5)

In [None]:
# Pull the 'text' column out of the frame: keep it in `texts` and remove it
# from `data` in place, so that only the label columns remain in `data`.
texts = data['text']
del data['text']

In [None]:
# Converting one hot encoded labels to integers
def one_hot_to_integer(sequence):
    """Read a sequence of 0/1 flags as a binary number and return it as an int.

    Each element is converted with ``str()`` and the pieces are concatenated,
    first element first, into a base-2 literal. The result is therefore the
    binary value of the row, not the position of the set flag: for example
    ``[0, 1, 0]`` gives 2 and ``[1, 0, 0, 0]`` gives 8.

    Raises:
        ValueError: if the concatenated text is not a valid base-2 literal
            (for instance when an element renders as ``'1.0'`` or the
            sequence is empty).
    """
    bit_string = ''.join(str(flag) for flag in sequence)
    return int(bit_string, base=2)

In [None]:
# Collapse the label columns of every row into one integer, then rebuild the
# frame with exactly two columns: 'text' and 'labels'.
encoded = data.apply(one_hot_to_integer, axis=1)
data = pd.concat([texts, encoded.rename('labels')], axis=1)
# Show the last rows to check the result
data.tail()

In [None]:
# Distinct label values found in the data. Despite its name this is an array
# of the values themselves, not a count; use len(num_labels) for the count.
num_labels = pd.unique(data['labels'])

In [None]:
# Splitting the dataset into train (first 80% of the rows) and test (the rest).
# A single cut index is used for both parts so that they are disjoint and
# together cover every row; truncating 0.8*n and 0.2*n independently could
# silently drop a row (e.g. n=7 gave 5 + 1 = 6 rows).
# NOTE(review): rows are taken in file order without shuffling - if the CSV is
# sorted by label the two parts will not be representative; confirm.
split_index = int(data.shape[0] * 0.8)
train = data.iloc[:split_index]
test = data.iloc[split_index:]

In [None]:
# Preview the first five training rows
train.head(n=5)

In [None]:
# Preview the first five test rows
test.head(n=5)

In [None]:
# Importing the transformers library, and calling the tokenizer
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer, RobertaConfig

# Start from the configuration published with the 'roberta-base' checkpoint so
# that every setting not listed here matches the weights being loaded, then
# override only the values this notebook chooses explicitly.
config = RobertaConfig.from_pretrained(
    'roberta-base',
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    # `num_labels` (from an earlier cell) is the array of distinct label
    # values, so the number of classes is its length, not the array itself.
    num_labels=len(num_labels),
)

# NOTE: the two assignments below rebind the imported class names to
# instances. The names are kept because the next cell reads the tokenizer and
# the model under exactly these names.
RobertaTokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# The class count travels inside `config`. It is not repeated as a keyword
# here: when an explicit config is supplied, extra keyword arguments are
# forwarded to the model constructor instead of updating the config.
RobertaForSequenceClassification = TFRobertaForSequenceClassification.from_pretrained('roberta-base', config=config)

In [None]:
# Short, conventional names for the loaded tokenizer and model instances
tokenizer, model = RobertaTokenizer, RobertaForSequenceClassification

In [None]:
# Tokenize all training texts in one call, with padding and truncation
# enabled and TensorFlow tensors requested as the output format.
train_texts = train['text'].tolist()
tokenized_texts = tokenizer(
    train_texts,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
# Labels as a column vector of shape (n_rows, 1)
train_labels = np.expand_dims(train['labels'].to_numpy(), axis=1)

In [None]:
# Objects used when compiling the model: an accuracy metric for integer
# labels, the Adam optimizer, and a loss that works on raw logits.
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Compiling the model
# `metric` is the SparseCategoricalAccuracy instance (named 'accuracy') created
# in the previous cell. Passing it explicitly, instead of the 'accuracy'
# string, pins the metric to the sparse-label variant rather than relying on
# Keras to infer it; the reported metric name stays 'accuracy'.
model.compile(optimizer=optimizer, loss=loss_fn, metrics=[metric])

In [None]:
# Train for a single epoch with a batch size of two. The tokenizer output is
# converted to a plain dict before it is handed to Keras.
train_inputs = dict(tokenized_texts)
history = model.fit(x=train_inputs, y=train_labels, batch_size=2, epochs=1)

In [None]:
# Write the trained model to disk
output_dir = 'models/roberta'
model.save_pretrained(output_dir)