In [None]:
# Installing the transformers library and additional libraries if looking process 

!pip install -q transformers

In [None]:
import pandas as pd
import numpy as np
import string

In [None]:
# Importing stock ml libraries
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [None]:
# Load the raw dataset; later cells read its 'title', 'text' and 'label' columns.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [None]:
# Model input text: the row's title and text joined by ' + '.
df['content'] = df['title'].add(' + ').add(df['text'])

In [None]:
# Topics kept for classification; each name maps to its position in this list.
selected_labels = {
    name: idx
    for idx, name in enumerate([
        'relationships', 'anxiety', 'depression',
        'family-conflict', 'intimacy', 'social-relationships',
        'marriage', 'parenting', 'human-sexuality',
        'behavioral-change', 'relationship-dissolution',
    ])
}

# Keep only the rows whose label is one of the selected topics.
df = df.loc[df['label'].isin(list(selected_labels))]
# Mappings handed to the model config.
# id2label: class index -> label name.
id2label = {idx: name for name, idx in selected_labels.items()}
# label2id: label name -> class index. selected_labels already has this shape;
# the previous comprehension swapped key and value and produced a second
# index -> name mapping instead.
label2id = dict(selected_labels)

In [None]:
# Unique, non-missing contents in first-seen order.
# Rows with a missing title or text have a NaN content and are dropped explicitly.
# The previous `list(set(seqs))[1:]` discarded whichever element the set happened
# to yield first, which could be a real sample, and gave an unstable row order.
seqs = df['content'].dropna().drop_duplicates().tolist()

In [None]:
# Multi-hot label vector for each unique content.
# The same content can appear in several rows with different labels, so every
# label attached to a content sets its own position to 1.
labels = []
for seq in seqs:
    seq_labels = set(df.loc[df.content.isin([seq]), 'label'].tolist())
    # Vector width follows the label map instead of a hard-coded 11.
    multi_hot = [0] * len(selected_labels)
    for name in seq_labels:
        multi_hot[selected_labels[name]] = 1
    labels.append(multi_hot)

In [None]:
# One row per unique content, paired with its multi-hot label vector.
new_df = pd.DataFrame({'labels': labels, 'content': seqs})
new_df

Unnamed: 0,labels,content
0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",How do I deal with my alcoholic boyfriend with...
1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",How can I slow down and enjoy life? + I feel l...
2,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1]",I'm depressed because my wife is divorcing me ...
3,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]",I'm scared to go to a doctor or take anti-depr...
4,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",Is it normal to blame myself for someone else'...
...,...,...
745,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",Can I sign my brother into a mental health fac...
746,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",Why can't I let myself trust my partner? + I'v...
747,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",What should I do about my stress before track ...
748,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",I think my daughter is stressing too much + Wh...


In [None]:
# Persist the de-duplicated multi-label frame next to the notebook.
new_df.to_csv(path_or_buf='new_dataset.csv')

In [None]:
class CustomDataset(Dataset):
    """Torch dataset yielding tokenized text and multi-hot label tensors.

    Args:
        dataframe: frame with a ``content`` column (text) and a ``labels``
            column (one list of 0/1 values per row).
        max_len: length every example is padded or truncated to.
        tokenizer: optional tokenizer to reuse; when omitted the
            'bert-base-uncased' ``BertTokenizer`` is loaded.
    """

    def __init__(self, dataframe, max_len, tokenizer=None):
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer
        self.content = dataframe.content
        self.targets = dataframe.labels
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.content)

    def __getitem__(self, index):
        """Return the tokenized example at position ``index``.

        Returns:
            dict with 1-D ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` long tensors of length ``max_len`` and a float
            ``labels`` tensor.
        """
        # Positional lookup, so a frame whose index is not 0..n-1 still works.
        content = self.content.iloc[index]
        # Collapse runs of whitespace (including newlines) to single spaces.
        content = " ".join(content.split())

        inputs = self.tokenizer(
            content,
            add_special_tokens=True,
            # Honour the constructor argument; this was hard-coded to 256,
            # so max_len was silently ignored.
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )

        # The tokenizer already returns tensors with a leading batch axis of 1;
        # drop only that axis rather than re-wrapping them in torch.tensor().
        return {
            'input_ids': inputs['input_ids'].squeeze(0).long(),
            'attention_mask': inputs['attention_mask'].squeeze(0).long(),
            'token_type_ids': inputs['token_type_ids'].squeeze(0).long(),
            'labels': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

In [None]:
# Split new_df into train and test frames and wrap each one in a CustomDataset.

train_size = 0.8

# Seeded sample for the training rows; every remaining row goes to the test frame.
sampled_rows = new_df.sample(frac=train_size, random_state=22)
test_dataset = new_df.drop(index=sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)

training_set = CustomDataset(train_dataset, 128)
testing_set = CustomDataset(test_dataset, 128)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# Pretrained BERT encoder with an 11-way multi-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    # Task definition.
    problem_type="multi_label_classification",
    num_labels=11,
    id2label=id2label,
    label2id=label2id,
    # Only the default outputs are requested.
    output_attentions=False,
    output_hidden_states=False,
)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "relationships",
    "1": "anxiety",
    "2": "depression",
    "3": "family-conflict",
    "4": "intimacy",
    "5": "social-relationships",
    "6": "marriage",
    "7": "parenting",
    "8": "human-sexuality",
    "9": "behavioral-change",
    "10": "relationship-dissolution"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": "relationships",
    "1": "anxiety",
 

In [None]:
from transformers import TrainingArguments

# Evaluate and checkpoint once per epoch, then reload the checkpoint with the best 'f1'.
args = TrainingArguments(
    output_dir='./result',
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation and checkpointing.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # push_to_hub=True is intentionally left disabled.
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged multi-label metrics from raw logits.

    Args:
        predictions: array-like logits, one row per example and one column
            per label.
        labels: array-like of 0/1 targets with the same layout as
            ``predictions``.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with keys ``'f1'`` (micro F1), ``'roc_auc'`` (micro ROC AUC) and
        ``'accuracy'`` (as returned by ``accuracy_score``).
    """
    # Sigmoid turns each logit into an independent per-label probability.
    probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float)).numpy()
    # Threshold the probabilities into hard 0/1 predictions.
    y_pred = (probs >= threshold).astype(int)
    y_true = np.asarray(labels).astype(int)

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: score it on the probabilities. Feeding it the
    # thresholded predictions reduces the curve to a single operating point.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapt an ``EvalPrediction`` to ``multi_label_metrics``.

    When ``p.predictions`` is a tuple, only its first element is used as the
    predictions; otherwise it is passed through unchanged.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)

In [None]:
from transformers import Trainer

# Wire the model, training arguments, datasets and metric callback together.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=testing_set,
    train_dataset=training_set,
)

# Run fine-tuning and keep the value returned by train().
train_out = trainer.train()

***** Running training *****
  Num examples = 600
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.396935,0.132231,0.534635,0.053333
2,No log,0.354325,0.217899,0.560612,0.086667


***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to ./result/checkpoint-75
Configuration saved in ./result/checkpoint-75/config.json
Model weights saved in ./result/checkpoint-75/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to ./result/checkpoint-150
Configuration saved in ./result/checkpoint-150/config.json
Model weights saved in ./result/checkpoint-150/pytorch_model.bin
