In [10]:
!pip install evaluate
import torch
import numpy as np
import evaluate
from datasets import load_dataset
from datasets import Dataset
from transformers import pipeline
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM
import os
os.environ["WANDB_DISABLED"] = "true"



In [11]:
# Load the labelled examples; later cells read the 'title' and 'label' entries of each element.
# SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects, which can execute
# code embedded in the file -- only load 'labeled_data.npy' from a trusted source.
dataset = np.load('labeled_data.npy', allow_pickle=True)

## BERT Validation

In [13]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``title`` field of a batch with the module-level ``tokenizer``.

    Intended for ``Dataset.map(tokenize_function, batched=True)``.

    Args:
        examples: Batch mapping that provides a ``"title"`` entry holding the texts.

    Returns:
        The tokenizer output for the batch, truncated to the model's maximum
        length and left unpadded.
    """
    # No padding here: the validation loop hands the Trainer a DataCollatorWithPadding,
    # which pads each batch to its own longest sequence. Padding every title to the
    # model maximum up front only adds pad tokens the model then has to process.
    return tokenizer(text=examples["title"], truncation=True)

In [14]:
from sklearn.model_selection import KFold

# 6-fold cross-validation of a fine-tuned bert-base-cased title classifier.
bert_checkpoint = "bert-base-cased"

# `tokenize_function` reads the global `tokenizer`. Fail fast if that name was rebound
# to another checkpoint (e.g. by a later cell) before this cell was (re-)run.
assert tokenizer.name_or_path == bert_checkpoint, (
    f"expected the {bert_checkpoint} tokenizer, found {tokenizer.name_or_path}"
)

accuracy = []  # one {'accuracy': value} dict per fold; read by the summary cell
k = 6
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Fold-independent helpers are created once rather than once per fold.
metric = evaluate.load("accuracy")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

for train_index, test_index in kf.split(dataset):
    train_data = dataset[train_index]
    test_data = dataset[test_index]

    # Keep only the fields the model needs and fold label 2 into label 1, so the task
    # is binary (matches num_labels=2 below).
    trainset = [
        {'title': data['title'], 'label': 1 if data['label'] == 2 else data['label']}
        for data in train_data
    ]
    testset = [
        {'title': data['title'], 'label': 1 if data['label'] == 2 else data['label']}
        for data in test_data
    ]

    train_dataset = Dataset.from_list(trainset)
    test_dataset = Dataset.from_list(testset)
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_test = test_dataset.map(tokenize_function, batched=True)

    # The classifier head is newly (randomly) initialised when the checkpoint is loaded.
    # Seed torch first so every fold, and every re-run, starts from the same head weights;
    # the Trainer's own seed is applied too late to cover a model built beforehand.
    torch.manual_seed(42)
    # No device_map: the Trainer moves the model to its training device, so the cell
    # also runs on machines without CUDA.
    model = AutoModelForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)

    # report_to="none" turns off logging integrations explicitly instead of relying on
    # the deprecated WANDB_DISABLED environment variable.
    training_args = TrainingArguments(output_dir="test_trainer", report_to="none")
    # NOTE(review): recent transformers releases appear to deprecate `tokenizer=` in
    # favour of `processing_class=` -- confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Score the held-out fold: argmax over the logits gives the predicted class.
    predictions = trainer.predict(tokenized_test)
    preds = np.argmax(predictions.predictions, axis=-1)
    accuracy.append(metric.compute(predictions=preds, references=tokenized_test['label']))

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


In [15]:
# Summarise the cross-validation run: per-fold accuracy, then mean and variance.
# `accuracy` is the list of {'accuracy': value} dicts filled by the validation loop.
acculist = [accu['accuracy'] for accu in accuracy]
# Number the folds as "Fold N": the former "{N}th fold" wording printed "1th", "2th", "3th".
for fold_no, fold_accuracy in enumerate(acculist, start=1):
    print(f"Fold {fold_no} has accuracy {fold_accuracy}")
mean = np.mean(acculist)
# np.var uses ddof=0 by default, i.e. the population variance over the folds.
var = np.var(acculist)
print(f"Mean is {mean}")
print(f"Variance is {var}")

1th fold has accuracy 0.88
2th fold has accuracy 0.92
3th fold has accuracy 0.88
4th fold has accuracy 0.96
5th fold has accuracy 0.92
6th fold has accuracy 1.0
Mean is 0.9266666666666667
Variance is 0.0018222222222222216


## RoBERTa Validation

In [16]:
# Rebinds the global `tokenizer` (and `tokenize_function`) to RoBERTa; every cell run
# after this one tokenizes with the roberta-base vocabulary.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def tokenize_function(examples):
    """Tokenize the ``title`` field of a batch with the module-level ``tokenizer``.

    Intended for ``Dataset.map(tokenize_function, batched=True)``.

    Args:
        examples: Batch mapping that provides a ``"title"`` entry holding the texts.

    Returns:
        The tokenizer output for the batch, truncated to the model's maximum
        length and left unpadded.
    """
    # No padding here: the validation loop hands the Trainer a DataCollatorWithPadding,
    # which pads each batch to its own longest sequence. Padding every title to the
    # model maximum up front only adds pad tokens the model then has to process.
    return tokenizer(text=examples["title"], truncation=True)

In [17]:
# 6-fold cross-validation of a fine-tuned roberta-base title classifier.
# KFold is imported in the BERT validation cell earlier in the notebook.
roberta_checkpoint = "roberta-base"

# `tokenize_function` reads the global `tokenizer`. Fail fast if it is not the RoBERTa
# tokenizer (e.g. the BERT tokenizer cell was re-run after the RoBERTa one).
assert tokenizer.name_or_path == roberta_checkpoint, (
    f"expected the {roberta_checkpoint} tokenizer, found {tokenizer.name_or_path}"
)

accuracy = []  # one {'accuracy': value} dict per fold; read by the summary cell
k = 6
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Fold-independent helpers are created once rather than once per fold.
metric = evaluate.load("accuracy")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

for train_index, test_index in kf.split(dataset):
    train_data = dataset[train_index]
    test_data = dataset[test_index]

    # Keep only the fields the model needs and fold label 2 into label 1, so the task
    # is binary (matches num_labels=2 below).
    trainset = [
        {'title': data['title'], 'label': 1 if data['label'] == 2 else data['label']}
        for data in train_data
    ]
    testset = [
        {'title': data['title'], 'label': 1 if data['label'] == 2 else data['label']}
        for data in test_data
    ]

    train_dataset = Dataset.from_list(trainset)
    test_dataset = Dataset.from_list(testset)
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_test = test_dataset.map(tokenize_function, batched=True)

    # The classifier head is newly (randomly) initialised when the checkpoint is loaded.
    # Seed torch first so every fold, and every re-run, starts from the same head weights;
    # the Trainer's own seed is applied too late to cover a model built beforehand.
    torch.manual_seed(42)
    # No device_map: the Trainer moves the model to its training device, so the cell
    # also runs on machines without CUDA.
    model2 = AutoModelForSequenceClassification.from_pretrained(roberta_checkpoint, num_labels=2)

    # report_to="none" turns off logging integrations explicitly instead of relying on
    # the deprecated WANDB_DISABLED environment variable.
    training_args = TrainingArguments(output_dir="test_trainer", report_to="none")
    # NOTE(review): recent transformers releases appear to deprecate `tokenizer=` in
    # favour of `processing_class=` -- confirm against the installed version.
    trainer2 = Trainer(
        model=model2,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer2.train()

    # Score the held-out fold: argmax over the logits gives the predicted class.
    predictions2 = trainer2.predict(tokenized_test)
    preds2 = np.argmax(predictions2.predictions, axis=-1)
    accuracy.append(metric.compute(predictions=preds2, references=tokenized_test['label']))

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer2 = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer2 = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer2 = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer2 = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer2 = Trainer(


Step,Training Loss


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer2 = Trainer(


Step,Training Loss


In [18]:
# Summarise the cross-validation run: per-fold accuracy, then mean and variance.
# `accuracy` is the list of {'accuracy': value} dicts filled by the validation loop.
acculist = [accu['accuracy'] for accu in accuracy]
# Number the folds as "Fold N": the former "{N}th fold" wording printed "1th", "2th", "3th".
for fold_no, fold_accuracy in enumerate(acculist, start=1):
    print(f"Fold {fold_no} has accuracy {fold_accuracy}")
mean = np.mean(acculist)
# np.var uses ddof=0 by default, i.e. the population variance over the folds.
var = np.var(acculist)
print(f"Mean is {mean}")
print(f"Variance is {var}")

1th fold has accuracy 1.0
2th fold has accuracy 0.84
3th fold has accuracy 0.92
4th fold has accuracy 0.88
5th fold has accuracy 1.0
6th fold has accuracy 1.0
Mean is 0.94
Variance is 0.0041333333333333335
