In [None]:
# !pip install transformers

In [None]:
# !pip install datasets

In [None]:
import json

In [None]:
from transformers import AutoTokenizer

from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler

from datasets import DatasetDict

from transformers import DataCollatorWithPadding

from tqdm.auto import tqdm
import torch
from datasets import load_metric

from sklearn.utils import shuffle
import pandas as pd
from datasets import Dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", force_download=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def tokenize_function(examples):

    return tokenizer(examples["FQText"], padding="max_length", truncation=True)

In [None]:
def preprocess_pandas_dataframe(df, label):
    df = df[["FQText", label]]
    df = df.rename(columns={label: "labels"}) # IFF we uncomment this, we can change the rest of labels to hard coded string "labels"


    data = DatasetDict()
    data = Dataset.from_pandas(df).train_test_split(train_size=90, seed=42)
    data = data.class_encode_column("labels")
    cols_to_remove = [col for col in data["train"].column_names if col != "labels"]
    tokenized_datasets = data.map(tokenize_function, batched=True, remove_columns=cols_to_remove)
    tokenized_datasets.set_format('torch', columns=["labels", 'input_ids', 'attention_mask'])

    return tokenized_datasets

In [None]:
def data_collator(tokenized_datasets, tokenizer=tokenizer):

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_dataLoader = DataLoader(tokenized_datasets['train'],
                           shuffle=True,
                           batch_size=8,
                           collate_fn=data_collator)
    eval_dataLoader = DataLoader(tokenized_datasets['test'], batch_size=8, collate_fn=data_collator)

    return train_dataLoader, eval_dataLoader

In [None]:
def train_evaluate(tokenized_datasets, train_dataLoader, eval_dataLoader):

    num_labels = tokenized_datasets['train'].features["labels"].num_classes
    model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

    learning_rate = 5e-5
    num_epochs = 5

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    num_training_batches = len(train_dataLoader)
    num_training_steps = num_epochs * num_training_batches
    lr_scheduler = get_scheduler("linear",
                                optimizer=optimizer,
                                num_warmup_steps=0,
                                num_training_steps=num_training_steps)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # print(device)

    model.to(device)

    progress_bar = tqdm(range(num_training_steps))

    # Train the model with PyTorch training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataLoader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # Load metric
    metric = load_metric("glue", "mrpc")

    # Iteratively evaluate the model and compute metrics
    model.eval()
    for batch in eval_dataLoader:
        # print(batch)
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    # Get model accuracy and F1 score
    metrics = metric.compute()
    print(metrics)
    return metrics

In [None]:
def train_model(df, label, tokenizer=tokenizer):
    tokenized_datasets = preprocess_pandas_dataframe(df, label)
    train_dataLoader, eval_dataLoader = data_collator(tokenized_datasets, tokenizer)
    metrics = train_evaluate(tokenized_datasets, train_dataLoader, eval_dataLoader)
    return metrics

In [None]:
def get_instances(df, col):
  positive_class = df[df[col] == 1].shape[0]
  negative_class = df[df[col] == 0].shape[0]
  return [positive_class, negative_class]

TRANSFORMER MODEL FOR CONTENTS

In [None]:
data_contents = pd.read_csv("data_clean_contents.csv")
columns_contents = list(data_contents.columns)
columns_contents = [col for col in columns_contents if col not in ["Unnamed: 0", "FQText"]]

In [None]:
print(columns_contents)

['(A)', '(Ad)', '(H)', '(Hd)', 'A', 'Abs', 'Ad', 'Alim', 'Anat', 'Art', 'Bot', 'Elem', 'Frag', 'Ge', 'H', 'Hd', 'Id', 'Nat', 'Obj', 'Pays', 'Radio', 'Sc', 'Sex', 'Sg', 'Vet']


In [None]:
dict_results_contents = {}

for col in columns_contents:
  dict_results_contents[col] = {}
  try:
    metrics = train_model(data_contents, col)
    dict_results_contents[col]["metrics"] = metrics
    dict_results_contents[col]["{}_class_balance".format(col)] = get_instances(data_contents, col)

  except:
    dict_results_contents[col]["metrics"] = "failed"
    dict_results_contents[col]["{}_class_balance".format(col)] = get_instances(data_contents, col)
    # print("something went wrong")

Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.974124809741248, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9817351598173516, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9071537290715372, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9558599695585996, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.7945205479452054, 'f1': 0.2105263157894737}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9604261796042618, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9147640791476408, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.974124809741248, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.923896499238965, 'f1': 0.10714285714285714}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9452054794520548, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9756468797564688, 'f1': 0.2727272727272727}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.989345509893455, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8888888888888888, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8417047184170472, 'f1': 0.05454545454545454}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9863013698630136, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9269406392694064, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.928462709284627, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9512937595129376, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.989345509893455, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.893455098934551, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9832572298325722, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9817351598173516, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9117199391171994, 'f1': 0.0}


In [None]:
dict_results_contents

{'(A)': {'metrics': {'accuracy': 0.974124809741248, 'f1': 0.0},
  '(A)_class_balance': [19, 728]},
 '(Ad)': {'metrics': {'accuracy': 0.9817351598173516, 'f1': 0.0},
  '(Ad)_class_balance': [13, 734]},
 '(H)': {'metrics': {'accuracy': 0.9071537290715372, 'f1': 0.0},
  '(H)_class_balance': [69, 678]},
 '(Hd)': {'metrics': {'accuracy': 0.9558599695585996, 'f1': 0.0},
  '(Hd)_class_balance': [35, 712]},
 'A': {'metrics': {'accuracy': 0.7945205479452054, 'f1': 0.2105263157894737},
  'A_class_balance': [165, 582]},
 'Abs': {'metrics': {'accuracy': 0.9604261796042618, 'f1': 0.0},
  'Abs_class_balance': [29, 718]},
 'Ad': {'metrics': {'accuracy': 0.9147640791476408, 'f1': 0.0},
  'Ad_class_balance': [64, 683]},
 'Alim': {'metrics': {'accuracy': 0.974124809741248, 'f1': 0.0},
  'Alim_class_balance': [19, 728]},
 'Anat': {'metrics': {'accuracy': 0.923896499238965,
   'f1': 0.10714285714285714},
  'Anat_class_balance': [52, 695]},
 'Art': {'metrics': {'accuracy': 0.9452054794520548, 'f1': 0.0},
 

In [None]:
with open("results_transformers_contents.json", "w") as f:
  json.dump(dict_results_contents, f)

TRANSFORMER MODEL FOR DETERMINANTS

In [None]:
data_determinants = pd.read_csv("data_clean_determinants.csv")
columns_determinants = list(data_determinants.columns)
columns_determinants = [col for col in columns_determinants if col not in ["Unnamed: 0", "FQText"]]

In [None]:
print(columns_determinants)

['C', "C'", "C'F", 'CF', 'E', 'EF', 'F', 'FC', "FC'", 'FE', 'K', 'kan']


In [None]:
dict_results_determinants = {}

for col in columns_determinants:
  dict_results_determinants[col] = {}
  try:
    metrics = train_model(data_determinants, col)
    dict_results_determinants[col]["metrics"] = metrics
    dict_results_determinants[col]["{}_class_balance".format(col)] = get_instances(data_determinants, col)

  except:
    dict_results_determinants[col]["metrics"] = "failed"
    dict_results_determinants[col]["{}_class_balance".format(col)] = get_instances(data_determinants, col)
    # print("something went wrong")

Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9817351598173516, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9375951293759512, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9726027397260274, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9665144596651446, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.6210045662100456, 'f1': 0.336}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9056316590563166, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.9512937595129376, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8584474885844748, 'f1': 0.0}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.6621004566210046, 'f1': 0.3967391304347826}


Stringifying the column:   0%|          | 0/90 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/90 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/657 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/657 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/657 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8386605783866058, 'f1': 0.0}


In [None]:
dict_results_determinants

{'C': {'metrics': 'failed', 'C_class_balance': [15, 732]},
 "C'": {'metrics': 'failed', "C'_class_balance": [7, 740]},
 "C'F": {'metrics': {'accuracy': 0.9817351598173516, 'f1': 0.0},
  "C'F_class_balance": [16, 731]},
 'CF': {'metrics': {'accuracy': 0.9375951293759512, 'f1': 0.0},
  'CF_class_balance': [46, 701]},
 'E': {'metrics': {'accuracy': 0.9726027397260274, 'f1': 0.0},
  'E_class_balance': [19, 728]},
 'EF': {'metrics': {'accuracy': 0.9665144596651446, 'f1': 0.0},
  'EF_class_balance': [25, 722]},
 'F': {'metrics': {'accuracy': 0.6210045662100456, 'f1': 0.336},
  'F_class_balance': [253, 494]},
 'FC': {'metrics': {'accuracy': 0.9056316590563166, 'f1': 0.0},
  'FC_class_balance': [70, 677]},
 "FC'": {'metrics': {'accuracy': 0.9512937595129376, 'f1': 0.0},
  "FC'_class_balance": [40, 707]},
 'FE': {'metrics': {'accuracy': 0.8584474885844748, 'f1': 0.0},
  'FE_class_balance': [111, 636]},
 'K': {'metrics': {'accuracy': 0.6621004566210046, 'f1': 0.3967391304347826},
  'K_class_bala

In [None]:
with open("results_transformers_determinants.json", "w") as f:
  json.dump(dict_results_determinants, f)