In [1]:
import torch
torch.cuda.empty_cache()
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification,\
    TrainingArguments, Trainer, pipeline, DataCollatorWithPadding, set_seed
import evaluate
import numpy as np
import torch.nn as nn
import math
import time
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
set_seed(42)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [3]:
snli = load_dataset("snli")
# snli_test = load_dataset("json", data_files="SNLI_Dataset/snli_1.0_test.jsonl")
snli

Found cached dataset snli (C:/Users/zebzi/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
100%|██████████| 3/3 [00:00<00:00, 435.30it/s]


DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

In [4]:
print(snli["train"]["premise"][0])
snli = snli.filter(lambda example: example["label"] != -1)

Loading cached processed dataset at C:\Users\zebzi\.cache\huggingface\datasets\snli\plain_text\1.0.0\1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b\cache-17e02a99a93fd477.arrow
Loading cached processed dataset at C:\Users\zebzi\.cache\huggingface\datasets\snli\plain_text\1.0.0\1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b\cache-b71979dc8e6f8a8f.arrow
Loading cached processed dataset at C:\Users\zebzi\.cache\huggingface\datasets\snli\plain_text\1.0.0\1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b\cache-3ac348b3e4f4ff95.arrow


A person on a horse jumps over a broken down airplane.


In [5]:
type(snli["train"]["hypothesis"])
textTrain = np.char.add(np.char.add(snli['train']['premise'], ' '), snli['train']['hypothesis'])
textTest = np.char.add(np.char.add(snli['test']['premise'], ' '), snli['test']['hypothesis'])
textVali = np.char.add(np.char.add(snli['validation']['premise'], ' '), snli['validation']['hypothesis'])
print(snli)


DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9824
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9842
    })
})


In [6]:
print(snli)
snli["train"] = snli["train"].remove_columns(["premise", "hypothesis"])
snli["train"] = snli["train"].add_column("text", textTrain)

snli["test"] = snli["test"].remove_columns(["premise", "hypothesis"])
snli["test"] = snli["test"].add_column("text", textTest)

snli["validation"] = snli["validation"].remove_columns(["premise", "hypothesis"])
snli["validation"] = snli["validation"].add_column("text", textVali)

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9824
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9842
    })
})


Loading cached processed dataset at C:\Users\zebzi\.cache\huggingface\datasets\snli\plain_text\1.0.0\1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b\cache-64c7ea292a984126.arrow
Loading cached processed dataset at C:\Users\zebzi\.cache\huggingface\datasets\snli\plain_text\1.0.0\1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b\cache-3a136d10d4818fd9.arrow
Loading cached processed dataset at C:\Users\zebzi\.cache\huggingface\datasets\snli\plain_text\1.0.0\1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b\cache-56d5f0c3e7bb6421.arrow


In [None]:
print(snli)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)
    
tokenized_snli= snli.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\zebzi\.cache\huggingface\datasets\snli\plain_text\1.0.0\1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b\cache-e2ef9dd11bde0188.arrow
Loading cached processed dataset at C:\Users\zebzi\.cache\huggingface\datasets\snli\plain_text\1.0.0\1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b\cache-eff9f8234b7d432d.arrow
100%|██████████| 10/10 [00:00<00:00, 13.32ba/s]


In [9]:
print(tokenized_snli)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

DatasetDict({
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9824
    })
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9842
    })
})


In [10]:
accuracy = evaluate.load("accuracy")

In [11]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = numpy.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [12]:
# Is this necessary? yes
id2label = {0: "entailment", 1: "neutral", 2:"contradiction"}
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}
model = AutoModelForSequenceClassification.from_pretrained(
    "C:/Users/zebzi/Documents/School/Senior_Year/CSCI 5541/NLP/hamza_model", num_labels=3, id2label=id2label, label2id=label2id
)

In [13]:
train_loss_list = []
train_acc_list = []
eval_loss_list = []
eval_acc_list = []
time_per_epoch_list = []

In [14]:
class CustomTrainer(Trainer):
    def _inner_training_loop(
        self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
    ):
        number_of_epochs = args.num_train_epochs
        start = time.time()
        train_loss=[]
        train_acc=[]
        eval_acc=[]

        criterion = torch.nn.CrossEntropyLoss().to(device)
        self.optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, gamma=0.9)
        
        train_dataloader = self.get_train_dataloader()
        eval_dataloader = self.get_eval_dataloader()

        for epoch in range(number_of_epochs):
            train_loss_per_epoch = 0
            train_acc_per_epoch = 0
            with tqdm(train_dataloader, unit="batch") as training_epoch:
                training_epoch.set_description(f"Training Epoch {epoch}")
                for step, inputs in enumerate(training_epoch):
                    inputs = inputs.to(device)
                    labels = inputs['labels']
                    # forward pass
                    self.optimizer.zero_grad()
                    output = model(inputs['input_ids'])
                    # get the loss
                    loss = criterion(output['logits'], labels)
                    train_loss_per_epoch += loss.item()
                    #calculate gradients
                    loss.backward()
                    #update weights
                    self.optimizer.step()
                    train_acc_per_epoch += (output['logits'].argmax(1) == labels).sum().item()
            # adjust the learning rate
            self.scheduler.step()
            train_loss_per_epoch /= len(train_dataloader)
            train_acc_per_epoch /= (len(train_dataloader)*batch_size)
            
            
            eval_loss_per_epoch = 0
            eval_acc_per_epoch = 0
            with tqdm(eval_dataloader, unit="batch") as eval_epoch:
                eval_epoch.set_description(f"Evaluation Epoch {epoch}")
                for step, inputs in enumerate(eval_epoch):
                    inputs = inputs.to(device)
                    labels = inputs['labels']
                    with torch.no_grad():
                        output = model(inputs['input_ids'])
                        loss = criterion(output['logits'], labels)
                        eval_loss_per_epoch += loss.item()
                        eval_acc_per_epoch += (output['logits'].argmax(1) == labels).sum().item()
            eval_loss_per_epoch /= (len(eval_dataloader))
            eval_acc_per_epoch /= (len(eval_dataloader)*batch_size)
        
            
            print(f'\tTrain Loss: {train_loss_per_epoch:.3f} | Train Acc: {train_acc_per_epoch*100:.2f}%')
            print(f'\tEval Loss: {eval_loss_per_epoch:.3f} | Eval Acc: {eval_acc_per_epoch*100:.2f}%')
            train_loss_list.append(train_loss_per_epoch)
            train_acc_list.append(train_acc_per_epoch)
            eval_loss_list.append(eval_loss_per_epoch)
            eval_acc_list.append(eval_acc_per_epoch)
            time_per_epoch_list.append((time.time()-start)/60)
    
        print(f'Time: {(time.time()-start)/60:.3f} minutes')

In [None]:
# https://huggingface.co/transformers/v4.4.2/main_classes/trainer.html#trainingarguments
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=1e-5,
    per_device_train_batch_size=32, # was 32
    per_device_eval_batch_size=32,  # was 32
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# https://huggingface.co/transformers/v4.4.2/main_classes/trainer.html#id1
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_snli["train"],
    eval_dataset=tokenized_snli["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
model.save_pretrained("C:/Users/zebzi/Documents/School/Senior_Year/CSCI 5541/NLP/modelSaver")

In [None]:
import matplotlib.pyplot as plt

# Data for plotting
t = np.arange(0.0, 100, 1)

# fig, ax = plt.subplots()
plt.plot(train_loss_list, label="training loss")
plt.plot(train_acc_list, label="training accuracy")
# plt.plot(eval_loss_list, label="evaluation loss")
plt.plot(eval_acc_list, label="evaluation accuracy")
plt.legend()
# ax.set(xlabel='time (s)', ylabel='voltage (mV)',
#        title='About as simple as it gets, folks')
plt.grid()

# fig.savefig("test.png")
plt.show()

In [15]:
# Load the pipeline for natural language inference tasks
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

# Define your input sentences as strings
premise = "A person is playing a guitar."
hypothesis = "The person is singing."

# Combine the premise and hypothesis into a single string
text = premise + " " + hypothesis

# Pass the input string through the pipeline to get the predicted label
result = nlp(text)[0]

# Print the prediction
print("The prediction is:", result["label"])

The prediction is: contradiction


In [16]:
nlp_base = pipeline("sentiment-analysis", model="bert-base-uncased")
result_base = nlp_base(text)[0]
print("The base prediction is: ", result_base["label"])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

The base prediction is:  LABEL_0


In [19]:
tokenized_snli["test"]["text"][0]
for i in range(10):
    temp = tokenized_snli["train"]["text"][i].split(".")
    premise = temp[0]
    hypothesis = temp[1]
    print("Premise: ",end="")
    print(premise)
    print("Hypothesis: ",end="")
    print(hypothesis)
    print()
    result = nlp(tokenized_snli["train"]["text"][i])[0]
    print("The prediction is: ", result["label"])
    print("The correct is: ", id2label[snli["test"]["label"][i]])
    print()
    print()


Premise: A person on a horse jumps over a broken down airplane
Hypothesis:  A person is training his horse for a competition

The prediction is:  contradiction
The correct is:  neutral






Premise: A person on a horse jumps over a broken down airplane
Hypothesis:  A person is at a diner, ordering an omelette

The prediction is:  contradiction
The correct is:  entailment


Premise: A person on a horse jumps over a broken down airplane
Hypothesis:  A person is outdoors, on a horse

The prediction is:  contradiction
The correct is:  contradiction




IndexError: list index out of range

In [None]:
for i in range(10):
  premise = test_dataset[i]["premise"]
  hypothesis = test_dataset[i]["hypothesis"]
  print("Premise: ",end="")
  print(premise)
  print("Hypothesis: ",end="")
  print(hypothesis)
  print()
  text = premise + " " + hypothesis
  result = nlp(text)[0]
  print("The prediction is: ", result["label"])
  print("The correct is: ", id2label[test_dataset[i]["label"]])
  print()
  print()
  print()