In [None]:
import pandas as pd

# Load the tweet table and discard every row that contains a missing value.
all_tweets_df = pd.read_csv('all_tweets').dropna(axis=0)
# Encode the textual condition names as integer class ids.
all_tweets_df['condition'] = all_tweets_df['condition'].replace(
    {'depression': 0, 'ptsd': 1, 'control': 2}
)

In [None]:
from sklearn.model_selection import train_test_split

# Target proportions: training = 0.7, validation = 0.1, test = 0.2.
# Step 1: keep 70% for training and hold out the remaining 30% (validation + test).
training_tweets, temp_tweets, training_labels, temp_labels = train_test_split(
    list(all_tweets_df['text']),
    list(all_tweets_df['condition']),
    test_size=0.3,
    random_state=123,
)
# Step 2: split the 30% hold-out. `test_size` is the share given to the SECOND
# returned part (the test split here), so it must be 2/3 of the hold-out to end
# up with 0.2 of the full data for test and 0.1 for validation.
val_tweets, test_tweets, val_labels, test_labels = train_test_split(
    temp_tweets,
    temp_labels,
    test_size=(2/3),
    random_state=123,
)

In [None]:
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Use the sequence-classification variant: the bare RobertaModel has no
# classification head, so it returns neither a loss (needed for training) nor
# "logits" (read at prediction time). num_labels=3 matches the three condition
# classes encoded as 0, 1 and 2.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

def tokenize_function(dataset):
    """Tokenize the 'text' entry of ``dataset`` with the notebook-level tokenizer.

    The tokenizer is called with truncation enabled and ``"max_length"``
    padding; whatever it returns is handed back unchanged.
    """
    return tokenizer(dataset['text'], truncation=True, padding="max_length")


In [None]:
# Pair each split's tweets with its labels in a column-oriented dict.
train = dict(text=training_tweets, label=training_labels)
val = dict(text=val_tweets, label=val_labels)
test = dict(text=test_tweets, label=test_labels)

In [None]:
from datasets import Dataset

# Wrap each split dict in a Dataset object (same order: train, val, test).
train_dataset, val_dataset, test_dataset = [
    Dataset.from_dict(split_columns) for split_columns in (train, val, test)
]

In [None]:
# Bare expression: the notebook renders the training Dataset as this cell's output.
train_dataset

Issue: There were NaN values in the dataset that prevented it from being converted to a Dataset type, so the condition of the data is checked in the rough cells below.

In [None]:
# Gather every entry of the 'text' column that is not a string, then report
# how many such entries exist.
integers = list(
    filter(lambda entry: not isinstance(entry, str), all_tweets_df['text'])
)
print(len(integers))

In [None]:
# Apply the tokenizer to every split in batched mode, rebinding each name to
# the mapped result.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
from transformers import TrainingArguments

# Training configuration.
training_args = TrainingArguments(
    # Lower the epoch count if training takes too long on your computer.
    num_train_epochs=3,
    # Directory where the model weights are saved at certain points during
    # training (checkpoints).
    output_dir="transformer_checkpoints",
)

In [None]:
from transformers import Trainer

# Build the trainer. `args=training_args` is required for the configured
# output directory and epoch count to take effect; without it the Trainer
# silently falls back to its own default arguments.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model on the training split.
trainer.train()

In [None]:
import numpy as np
import torch

def predict_nn(trained_model, test_dataset, batch_size=32):
    """Predict a class label for every example in ``test_dataset``.

    Inference runs in mini-batches, in eval mode and with gradient tracking
    disabled, so the whole dataset is never pushed through the model in a
    single forward pass. Inputs are moved to the device of the model's
    parameters, and logits are moved back to the CPU before conversion to
    NumPy.

    Args:
        trained_model: a ``torch.nn.Module`` accepting ``input_ids`` and
            ``attention_mask`` keyword arguments and returning an object whose
            ``["logits"]`` entry is a 2-D tensor (one row per example).
        test_dataset: object indexable by column name that provides the
            "input_ids", "attention_mask" and "label" columns as sliceable
            sequences.
        batch_size: number of examples per forward pass (must be >= 1).

    Returns:
        ``(gold_labs, pred_labs)``: the "label" column as stored in
        ``test_dataset`` and a NumPy array with the argmax class per example.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    input_ids = test_dataset["input_ids"]
    attention_mask = test_dataset["attention_mask"]
    gold_labs = test_dataset["label"]

    # Run on whatever device the model lives on (it may have been moved to a
    # GPU during training); fall back to CPU for parameter-less models.
    first_param = next(iter(trained_model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Switch to eval mode for deterministic inference (e.g. dropout off) and
    # restore the previous mode afterwards.
    was_training = trained_model.training
    trained_model.eval()
    batch_preds = []
    try:
        with torch.no_grad():
            for start in range(0, len(input_ids), batch_size):
                stop = start + batch_size
                output = trained_model(
                    attention_mask=torch.tensor(attention_mask[start:stop]).to(device),
                    input_ids=torch.tensor(input_ids[start:stop]).to(device),
                )
                logits = output["logits"].detach().cpu().numpy()
                batch_preds.append(np.argmax(logits, axis=1))
    finally:
        trained_model.train(was_training)

    if batch_preds:
        pred_labs = np.concatenate(batch_preds)
    else:
        # Empty dataset: nothing to predict.
        pred_labs = np.array([], dtype=np.int64)

    return gold_labs, pred_labs

# Run the model on the test split; keeps the gold labels and the predicted
# labels for the evaluation that follows.
gold_labs, pred_labs = predict_nn(model, test_dataset)

In [None]:
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix
print("RoBERTa on the John Hopkins Twitter dataset:")
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but swapping the arguments of multilabel_confusion_matrix
# exchanges the false-positive and false-negative counts of every class.
print(f'The accuracy score is {accuracy_score(gold_labs, pred_labs)}')
print('The confusion matrix is displayed as follows: ', multilabel_confusion_matrix(gold_labs, pred_labs))