In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
!ls ../datasets/adept/train-dev-test-split/

README.md  test.json  train.json  val.json


In [5]:
import json
import nltk
import re

In [6]:
adept_data_path = "../datasets/adept/train-dev-test-split"
split = "train"

In [7]:
train_data = json.load(open('{}/{}.json'.format(adept_data_path, split), 'r'))

In [8]:
df_train = pd.DataFrame(train_data)
df_train.head()

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620
2,A scene is painted.,A negative scene is painted.,negative,scene,2,3324
3,A bone breaks a tooth.,An alleged bone breaks a tooth.,alleged,bone,2,10610
4,A trip causes a happening.,A fabulous trip causes a happening.,fabulous,trip,2,14917


In [9]:
label_to_class_map = {0:"Impossible", 1:"Less Likely", 2:"Equally Likely", 3:"More Likely", 4:"Necessarily True"}

In [10]:
label_to_class_map.values()

dict_values(['Impossible', 'Less Likely', 'Equally Likely', 'More Likely', 'Necessarily True'])

In [11]:
df_train['sentence2_preprocessed'] = df_train['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_train['sentence2_preprocessed'] = df_train['sentence2_preprocessed'].map(lambda x: x.lower())
# df_train['class_label'] = df_train.label.map(lambda x: label_to_class_map[x])
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath


In [12]:
# We got this dataset for concreteness of 40k words (https://pubmed.ncbi.nlm.nih.gov/24142837/) from https://web.stanford.edu/class/linguist278/data/
concreteness_df = pd.read_csv('../datasets/concreteness/Concreteness_ratings_Brysbaert_et_al_BRM.csv')
concreteness_df.head(2)

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.9,0,0


In [13]:
word_to_concreteness_score_map = dict()
for idx, row in concreteness_df.iterrows():
    row = row.to_dict()
    word_to_concreteness_score_map[row['Word']] = row['Conc.M']/5.0 # Normalizing to a scale of 0 to 1

In [14]:
len(word_to_concreteness_score_map.keys())

39954

In [15]:
def get_concreteness_score(word):
    """
    Get the concreteness score of a word based on the Concreteness Ratings dataset.
    """
    # If the word is not found in the dataset, return a default score of 0.5
    return word_to_concreteness_score_map.get(word, 0.5)

In [16]:
def calculate_text_concreteness(text):
    """
    Calculate the concreteness score for a given text.
    """
    words = nltk.word_tokenize(text)
    concreteness_scores = [get_concreteness_score(word) for word in words]
    # Take the average concreteness score of all words in the text
    return sum(concreteness_scores) / len(concreteness_scores)

In [17]:
# Example usage
text = "the laws of the world can't stop him"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.5246666666666667


In [18]:
# Example usage
text = "car crash"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.868


In [19]:
df_train['concreteness_score'] = df_train.sentence2_preprocessed.apply(calculate_text_concreteness)

In [20]:
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed,concreteness_score
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation,0.435714
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath,0.526


In [21]:
df_train.shape

(12892, 8)

In [22]:
df_validation = pd.DataFrame(json.load(open('{}/{}.json'.format(adept_data_path, "val"), 'r')))
df_test = pd.DataFrame(json.load(open('{}/{}.json'.format(adept_data_path, "test"), 'r')))
print(df_validation.shape, df_test.shape)

(1611, 6) (1612, 6)


In [23]:
df_validation['sentence2_preprocessed'] = df_validation['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_validation['sentence2_preprocessed'] = df_validation['sentence2_preprocessed'].map(lambda x: x.lower())
df_validation['concreteness_score'] = df_validation.sentence2_preprocessed.apply(calculate_text_concreteness)

df_test['sentence2_preprocessed'] = df_test['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_test['sentence2_preprocessed'] = df_test['sentence2_preprocessed'].map(lambda x: x.lower())
df_test['concreteness_score'] = df_test.sentence2_preprocessed.apply(calculate_text_concreteness)

# Fine Tuning Bert Base Uncased on ADEPT

## Data Preparation

In [24]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
adept_data_path

'../datasets/adept/train-dev-test-split'

In [26]:
!ls '../datasets/adept/train-dev-test-split'

README.md  test.json  train.json  val.json


In [27]:
train_split = "train.json"
validation_split = "val.json"
test_split = "test.json"

In [28]:
adept_dataset = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'validation': Dataset.from_pandas(df_validation),
    'test': Dataset.from_pandas(df_test)
})

In [29]:
# adept_dataset = load_dataset("json", data_files=data_files)
adept_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score'],
        num_rows: 12892
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score'],
        num_rows: 1612
    })
})

In [30]:
adept_dataset['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'modifier': Value(dtype='string', id=None),
 'noun': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'idx': Value(dtype='int64', id=None),
 'sentence2_preprocessed': Value(dtype='string', id=None),
 'concreteness_score': Value(dtype='float64', id=None)}

In [31]:
adept_dataset['train'][10]

{'sentence1': 'A year is made up of 365 days.',
 'sentence2': 'An outstanding year is made up of 365 days.',
 'modifier': 'outstanding',
 'noun': 'year',
 'label': 2,
 'idx': 2825,
 'sentence2_preprocessed': 'an outstanding year is made up of 365 days',
 'concreteness_score': 0.48733333333333334}

In [32]:
best_params = {'learning_rate': 3.660515504756857e-05,
 'num_train_epochs': 3,
 'model_name': 'microsoft/deberta-base'}

In [33]:
checkpoint = best_params['model_name']

In [34]:
models_dict = {"BERT": "bert-base-uncased",
"ROBERTA": "grammarly/detexd-roberta-base",
"DEBERTA": "sileod/deberta-v3-base-tasksource-nli"}

In [35]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [36]:
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5, ignore_mismatched_sizes=True)

In [37]:
def tokenize_sentence(item):
    return tokenizer(item['sentence2'], truncation=True)

In [38]:
tokenized_dataset = adept_dataset.map(tokenize_sentence, batched=True)

Map: 100%|██████████| 12892/12892 [00:00<00:00, 43526.39 examples/s]
Map: 100%|██████████| 1611/1611 [00:00<00:00, 67098.53 examples/s]
Map: 100%|██████████| 1612/1612 [00:00<00:00, 54113.12 examples/s]


In [39]:
tokenized_dataset['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'modifier': Value(dtype='string', id=None),
 'noun': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'idx': Value(dtype='int64', id=None),
 'sentence2_preprocessed': Value(dtype='string', id=None),
 'concreteness_score': Value(dtype='float64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [40]:
tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx', 'modifier', 'noun', 'sentence2_preprocessed'])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset = tokenized_dataset.with_format("torch")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'concreteness_score', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12892
    })
    validation: Dataset({
        features: ['labels', 'concreteness_score', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['labels', 'concreteness_score', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1612
    })
})

In [41]:
# model

In [42]:
# headless_model = AutoModel.from_pretrained(checkpoint)

In [43]:
# headless_model

In [44]:
# model.config

In [45]:
from transformers import AutoConfig

In [46]:
model_config = AutoConfig.from_pretrained(checkpoint, num_labels=5)

#### Custom Model

In [47]:
import transformers

In [48]:
transformers.models.deberta.modeling_deberta.ContextPooler

transformers.models.deberta.modeling_deberta.ContextPooler

In [49]:
class CustomModel(torch.nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.deberta = AutoModel.from_pretrained('distilbert-base-uncased')
        self.linear = torch.nn.Linear(in_features=768 + 1, out_features=2)  # Adjust input size

    def forward(self, input_ids, attention_mask, feature):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        combined_input = torch.cat((bert_output, feature), dim=1)
        output = self.linear(combined_input)
        return output

In [50]:
from transformers.models.deberta.modeling_deberta import *

In [51]:
class CustomDebertaForSequenceClassification(DebertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        num_labels = getattr(config, "num_labels", 2)
        self.num_labels = num_labels

        self.deberta = DebertaModel(config)
        self.pooler = ContextPooler(config)
        output_dim = self.pooler.output_dim

        ###### +1 for concreteness score ######
        self.classifier = torch.nn.Linear(in_features=output_dim+1, out_features=num_labels) 
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = StableDropout(drop_out)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        concreteness_score: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer)
        pooled_output = self.dropout(pooled_output)
        ##### combining concreteness score input with pooled context #####
        ##### Unsqueezing concreteness score to match dimensions #####
        combined_input = torch.cat((pooled_output, concreteness_score.unsqueeze(1)), dim=1)
        logits = self.classifier(combined_input)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    # regression task
                    loss_fn = nn.MSELoss()
                    logits = logits.view(-1).to(labels.dtype)
                    loss = loss_fn(logits, labels.view(-1))
                elif labels.dim() == 1 or labels.size(-1) == 1:
                    label_index = (labels >= 0).nonzero()
                    labels = labels.long()
                    if label_index.size(0) > 0:
                        labeled_logits = torch.gather(
                            logits, 0, label_index.expand(label_index.size(0), logits.size(1))
                        )
                        labels = torch.gather(labels, 0, label_index.view(-1))
                        loss_fct = CrossEntropyLoss()
                        loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1))
                    else:
                        loss = torch.tensor(0).to(logits)
                else:
                    log_softmax = nn.LogSoftmax(-1)
                    loss = -((log_softmax(logits) * labels).sum(-1)).mean()
            elif self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )

In [52]:
custom_model = CustomDebertaForSequenceClassification(model_config)

In [53]:
custom_model

CustomDebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
     

In [54]:


# # Step 5: Adjust Training Loop
# model = CustomModel()
# criterion = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

#### Setting Up Data Loaders

In [55]:
num_epochs = 1
batch_size = 32

In [56]:
from torch.utils.data import DataLoader

In [57]:
data_collator = DataCollatorWithPadding(tokenizer)

In [58]:
train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=batch_size, shuffle=True, collate_fn=data_collator)
validation_dataloader = DataLoader(tokenized_dataset['validation'], batch_size=batch_size, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=batch_size, collate_fn=data_collator)

In [59]:
device = torch.device("cuda") if torch.cuda.is_available else torch.device("cpu")
custom_model.to(device)
print(device)

cuda


## Training

#### Setting up Optimizer with deacaying learning rate

In [60]:
from torch.optim.adamw import AdamW

In [61]:
optimizer = AdamW(custom_model.parameters(), lr=3.660515504756857e-05)

In [62]:
num_training_steps = num_epochs*len(train_dataloader)

In [63]:
learning_rate_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

#### Training Loop

In [64]:
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k:v.to(device) for k, v in batch.items()}
        outputs = custom_model(**batch)
        loss = outputs.loss
        # calculating gradients
        loss.backward()
        # optimizing weights
        optimizer.step()
        # updating learning rate
        learning_rate_scheduler.step()
        # flushing gradients
        optimizer.zero_grad()
        # updating progress bar
        progress_bar.update(1)

  0%|          | 0/1209 [00:00<?, ?it/s]You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 1209/1209 [07:36<00:00,  3.67it/s]

### Evaluation

In [65]:
custom_model.eval()

CustomDebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
     

In [66]:
import evaluate

In [67]:
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')
roc_auc =  evaluate.load("roc_auc", "multiclass")

In [68]:
metrics = [accuracy, precision, recall, f1]

#### Evaluation Loop

In [69]:
from sklearn.metrics import roc_auc_score

In [70]:
from datasets import load_metric

In [71]:
for batch in validation_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = custom_model(**batch)
    # Extract logits and predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Apply softmax to convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    # Extract probabilities for the positive class
    positive_probabilities = probabilities

    # Update metrics for accuracy, precision, recall, and F1
    for metric in metrics:
        metric.add_batch(predictions=predictions, references=batch['labels'])

    # Update ROC AUC metric
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])

# # Compute metrics for accuracy, precision, recall, and F1
validation_eval_dict = {}
# for metric in metrics.values():
#     eval_dict.update(metric.compute(average="macro"))

validation_eval_dict.update(accuracy.compute())
validation_eval_dict.update(precision.compute(average="macro"))
validation_eval_dict.update(recall.compute(average="macro"))
validation_eval_dict.update(f1.compute(average="macro"))
validation_eval_dict.update(roc_auc.compute(multi_class='ovo', average="macro"))


  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
prev_valid_3_res = {'accuracy': 0.65983860955928,
 'precision': 0.28776653062324054,
 'recall': 0.27802638010999664,
 'f1': 0.2769530906793819,
 'roc_auc': 0.6346530029171855}
prev_valid_3_res

{'accuracy': 0.65983860955928,
 'precision': 0.28776653062324054,
 'recall': 0.27802638010999664,
 'f1': 0.2769530906793819,
 'roc_auc': 0.6346530029171855}

In [72]:
prev_valid_10_res = {'accuracy': 0.5859714463066419,
 'precision': 0.2790926099980391,
 'recall': 0.27399491306003604,
 'f1': 0.2757435501839343,
 'roc_auc': 0.5916102736812134}
prev_valid_10_res

{'accuracy': 0.5859714463066419,
 'precision': 0.2790926099980391,
 'recall': 0.27399491306003604,
 'f1': 0.2757435501839343,
 'roc_auc': 0.5916102736812134}

In [73]:
validation_eval_dict

{'accuracy': 0.65983860955928,
 'precision': 0.28776653062324054,
 'recall': 0.27802638010999664,
 'f1': 0.2769530906793819,
 'roc_auc': 0.6346530029171855}

In [77]:
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = custom_model(**batch)
    # Extract logits and predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Apply softmax to convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    # Extract probabilities for the positive class
    positive_probabilities = probabilities

    # Update metrics for accuracy, precision, recall, and F1
    for metric in metrics:
        metric.add_batch(predictions=predictions, references=batch['labels'])

    # Update ROC AUC metric
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])

# # Compute metrics for accuracy, precision, recall, and F1
test_eval_dict = {}
# for metric in metrics.values():
#     eval_dict.update(metric.compute(average="macro"))

test_eval_dict.update(accuracy.compute())
test_eval_dict.update(precision.compute(average="macro"))
test_eval_dict.update(recall.compute(average="macro"))
test_eval_dict.update(f1.compute(average="macro"))
test_eval_dict.update(roc_auc.compute(multi_class='ovo', average="macro"))


  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
prev_test_3_res = {'accuracy': 0.673697270471464,
 'precision': 0.2849209452293981,
 'recall': 0.2732542150719794,
 'f1': 0.27303419884506624,
 'roc_auc': 0.6543617977047307}
prev_test_3_res

{'accuracy': 0.673697270471464,
 'precision': 0.2849209452293981,
 'recall': 0.2732542150719794,
 'f1': 0.27303419884506624,
 'roc_auc': 0.6543617977047307}

In [78]:
prev_test_10_res = {'accuracy': 0.5859714463066419,
 'precision': 0.2790926099980391,
 'recall': 0.27399491306003604,
 'f1': 0.2757435501839343,
 'roc_auc': 0.5916102736812134}
prev_test_10_res

{'accuracy': 0.5859714463066419,
 'precision': 0.2790926099980391,
 'recall': 0.27399491306003604,
 'f1': 0.2757435501839343,
 'roc_auc': 0.5916102736812134}

In [79]:
test_eval_dict

{'accuracy': 0.673697270471464,
 'precision': 0.2849209452293981,
 'recall': 0.2732542150719794,
 'f1': 0.27303419884506624,
 'roc_auc': 0.6543617977047307}