In [81]:
import pandas as pd

all_tweets_df = pd.read_csv('all_tweets')
all_tweets_df = all_tweets_df.dropna(axis=0)
all_tweets_df['condition'] = all_tweets_df['condition'].replace({'depression':0, 'ptsd':1, 'control':2})
all_tweets_df = all_tweets_df[pd.to_numeric(all_tweets_df['condition'], errors='coerce').notnull()]

- Find out how many classes are in the set
- Get the model working

In [82]:
all_tweets_df['condition'].unique()

array([2, 0, 1], dtype=object)

In [83]:
from sklearn.model_selection import train_test_split

# Training = 0.7, validation = 0.1, test = 0.2
training_tweets, temp_tweets, training_labels, temp_labels = train_test_split(list(all_tweets_df['text']),list(all_tweets_df['condition']), test_size=0.3, random_state=123)
val_tweets, test_tweets, val_labels, test_labels = train_test_split(temp_tweets, temp_labels, test_size=(1/3), random_state=123)

In [84]:
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

def tokenize_function(dataset):
    model_inputs = tokenizer(dataset['text'], padding="max_length", truncation=True)
    return model_inputs


loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /Users/brookegrantham/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /Users/brookegrantham/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /Users/brookegrantham/.cache/huggingface/transfor

In [85]:
train = {'text': training_tweets, 'label': training_labels}
val = {'text': val_tweets, 'label': val_labels}
test = {'text': test_tweets, 'label': test_labels}

In [86]:
from datasets import Dataset

train_dataset = Dataset.from_dict(train)
val_dataset = Dataset.from_dict(val)
test_dataset = Dataset.from_dict(test)

Issue: There was NaN in the dataset not allowing it to be transfered to a Dataset type, thus checking the data condition in roufh cells below.

In [78]:
integers = [elm for elm in all_tweets_df['condition'] if not (isinstance(elm, int))]
print(len(integers))

3000


In [87]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/2821 [00:00<?, ?ba/s]

  0%|          | 0/806 [00:00<?, ?ba/s]

  0%|          | 0/403 [00:00<?, ?ba/s]

In [88]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="transformer_checkpoints",  # specify the directory where models weights will be saved a certain points during training (checkpoints)
    num_train_epochs=3,  # change this if it is taking too long on your computer
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from transformers import Trainer, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /Users/brookegrantham/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absol

Step,Training Loss


In [None]:
import numpy as np
import torch

def predict_nn(trained_model, test_dataset):

    output = trained_model(attention_mask=torch.tensor(test_dataset["attention_mask"]), input_ids=torch.tensor(test_dataset["input_ids"]))

    pred_labs = np.argmax(output["logits"].detach().numpy(), axis=1)

    gold_labs = test_dataset["label"]

    return gold_labs, pred_labs

gold_labs, pred_labs = predict_nn(model, test_dataset)

In [None]:
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix
print("RoBERTa on the John Hopkins Twitter dataset:")
print(f'The accuracy score is {accuracy_score(pred_labs, gold_labs)}')
print('The confusion matrix is displayed as follows: ', multilabel_confusion_matrix(pred_labs,gold_labs))