<a href="https://colab.research.google.com/github/amrtanair/master_thesis/blob/main/GPT2_for_text_classification(Mega_dataset).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# from IPython.display import HTML, display

# def set_css():
#   display(HTML('''
#   <style>
#     pre {
#         white-space: pre-wrap;
#     }
#   </style>
#   '''))
# get_ipython().events.register('pre_run_cell', set_css)

# !pip uninstall transformers
# !pip install transformers==4.35.2
# Mount Google Drive so the dataset, pickle, JSON and model paths under
# /content/drive/MyDrive used by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import os
import datetime
import json
import numpy as np
import random
import pandas as pd
import torch
import re
import pickle
import math
import shutil

from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed,
                          GPT2Config,
                          GPT2Tokenizer,
                          GPT2ForSequenceClassification)

from sklearn.metrics import matthews_corrcoef

In [18]:
class AcceptabilityDataset(Dataset):
  """Map-style dataset that pairs each text with its label.

  Args:
    texts: indexable collection of input texts.
    labels: indexable collection of labels, aligned with `texts` by position.

  Each item is returned as a dict with the keys 'text' and 'label'; no
  tokenization or label encoding happens here.
  """

  def __init__(self, texts, labels):
    self.texts, self.labels = texts, labels
    # The dataset size is taken from the labels once, at construction time.
    self.n_examples = len(labels)

  def __len__(self):
    return self.n_examples

  def __getitem__(self, item):
    sample = dict(text=self.texts[item], label=self.labels[item])
    return sample


class Collator(object):
    """Collate function that turns a list of dataset items into a model batch.

    Args:
        tokenizer: callable tokenizer; it is invoked with
            `text=..., return_tensors="pt", padding=True, truncation=True`
            and must return a mapping that supports `.update(...)`.
        labels_encoder: mapping from raw label value to integer class id.
    """

    def __init__(self, tokenizer, labels_encoder):
        self.tokenizer = tokenizer
        self.labels_encoder = labels_encoder

    def __call__(self, sequences):
        """Tokenize the batch texts and attach the encoded labels.

        Args:
            sequences: list of dicts with the keys 'text' and 'label'.

        Returns:
            The tokenizer output with an extra 'labels' entry holding a tensor
            of encoded label ids, in the same order as `sequences`.

        Raises:
            KeyError: if a label is missing from `labels_encoder`.
        """
        texts = [sequence['text'] for sequence in sequences]
        labels = [self.labels_encoder[sequence['label']] for sequence in sequences]

        # padding=True already pads to the longest sequence in the batch, so the
        # texts are tokenized once instead of twice. Leaving max_length unset
        # lets truncation fall back to the tokenizer's own configured limit
        # rather than a length computed without special tokens.
        inputs = self.tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True)
        inputs.update({'labels': torch.tensor(labels)})
        return inputs


In [19]:
def train(dataloader, model, optimizer, device):
    """Run one optimisation pass over `dataloader`.

    Every tensor in a batch is cast to long and moved to `device`; the model is
    called with the batch as keyword arguments and the first two outputs are
    taken as (loss, logits). Gradients are clipped to a norm of 1.0 before each
    optimizer step.

    Returns:
        (model, true_labels, predicted_labels, mean_loss_per_batch)
    """
    model.train()
    gold, predicted = [], []
    running_loss = 0

    for raw_batch in dataloader:
        # Labels are read before the batch is moved to the device.
        gold.extend(raw_batch['labels'].numpy().flatten().tolist())

        moved = {}
        for name, tensor in raw_batch.items():
            moved[name] = tensor.type(torch.long).to(device)

        model.zero_grad()
        loss, logits = model(**moved)[:2]
        running_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_logits = logits.detach().cpu().numpy()
        predicted.extend(batch_logits.argmax(axis=-1).flatten().tolist())

    return model, gold, predicted, running_loss / len(dataloader)

def validation(dataloader, model, device):
    """Evaluate `model` on `dataloader` without updating its weights.

    The model is switched to eval mode and each forward pass runs under
    `torch.no_grad()`. The first two model outputs are taken as (loss, logits).

    Returns:
        (true_labels, predicted_labels, mean_loss_per_batch)
    """
    model.eval()
    gold, predicted = [], []
    running_loss = 0

    for raw_batch in dataloader:
        gold.extend(raw_batch['labels'].numpy().flatten().tolist())
        moved = {name: tensor.type(torch.long).to(device) for name, tensor in raw_batch.items()}

        with torch.no_grad():
            loss, logits = model(**moved)[:2]

        running_loss += loss.item()
        batch_predictions = logits.detach().cpu().numpy().argmax(axis=-1)
        predicted.extend(batch_predictions.flatten().tolist())

    return gold, predicted, running_loss / len(dataloader)

def ranking_param(measure, logprob, unilp, sentence_len):
    """Turn a model log-probability into the ranking score named by `measure`.

    Args:
        measure: one of 'LogProb', 'MeanLP', 'NormLP_div', 'SLOR',
            'NormLP_sub' or 'Prob'.
        logprob: log-probability assigned by the model.
        unilp: unigram log-probability of the same text.
        sentence_len: length used for the per-length normalisations.

    Returns:
        The score as a float.

    Raises:
        ValueError: if `measure` is not one of the supported names.
        ZeroDivisionError: if the selected measure divides by a zero
            `sentence_len` or `unilp`.
    """
    if measure == 'LogProb':
        return logprob
    if measure == 'MeanLP':
        return logprob / sentence_len
    if measure == 'NormLP_div':
        return -(logprob / unilp)
    if measure == 'SLOR':
        # SLOR subtracts the unigram log-probability before normalising by
        # length; the previous `logprob - logprob` made every score 0.
        return (logprob - unilp) / sentence_len
    if measure == 'NormLP_sub':
        return logprob - unilp
    if measure == 'Prob':
        # Previously fell through and returned None.
        return math.exp(logprob)
    raise ValueError("Unknown ranking measure: %r" % (measure,))

def save_model(model, path):
    """Write `model` to `path` by delegating to its `save_pretrained` method."""
    model.save_pretrained(path)

def load_model(path):
    """Return a GPT2ForSequenceClassification loaded from `path` via `from_pretrained`."""
    return GPT2ForSequenceClassification.from_pretrained(path)

In [20]:
def run_model(device, model, tokenizer, measure):
    """Score candidate triples for each sentence and write the top-ranked ones to a file.

    Reads a unigram-frequency pickle and a JSON file of search results from
    fixed Google Drive paths, scores every candidate triple with `model`
    (class index 1 of the softmax output), ranks the candidates of each
    sentence by `measure` and writes the 4 best per sentence, tab separated.

    Args:
        device: torch device the input ids are moved to.
        model: sequence classifier whose output exposes `.logits`.
        tokenizer: tokenizer used both for the model input and for the
            unigram log-probability lookup.
        measure: ranking measure name; 'Prob' uses the raw class probability,
            anything else is delegated to `ranking_param`.

    Returns:
        The name of the result file that was written.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # pickle that comes from a trusted source.
    with open('/content/drive/MyDrive/gpt-openwebtext.pickle', "rb") as unigram_file:
        unigram_freq = pickle.load(unigram_file)
    unigram_total = sum(unigram_freq.values())

    input_file_path = '/content/drive/MyDrive/OIE2016_search_res.json'
    with open(input_file_path, 'r') as file:
        data_dict = json.load(file)

    pattern = r'\$input_txt:\$ '

    search_results = {}
    sentences = {}

    for key, value in data_dict.items():
        sentence = re.sub(pattern, '', value[0][0])
        sentences[key] = sentence
        search_results[key] = []
        for relation_key, candidate in value[0][1]["deduplicated:"].items():
            # candidate[2] holds (start, end) character spans into the sentence.
            np_pair = [sentence[start:end] for start, end in candidate[2]]
            triple_text = np_pair[0] + ' [SEP] ' + relation_key.split(' [SEP] ')[1] + ' [SEP] ' + np_pair[1]
            search_results[key].append([triple_text, candidate[1]])

    k = 4
    print("Selecting top: ", k)
    result_file = 'gpt2-finetuned-mega_' + 'tab_seperated_' + measure + '_' + str(k) + '.txt'

    # Scoring is pure inference: disable dropout and (below) gradient tracking.
    model.eval()

    with open(result_file, "w") as f:
        ID = 0
        for key, value in tqdm(sentences.items()):
            temp = []

            for triple in search_results[key]:
                input_text = triple[0].replace("[SEP] ", "")
                input_id = tokenizer(input_text, return_tensors='pt')["input_ids"].to(device)

                uni_lp = 0.0
                for w in tokenizer.tokenize(input_text):
                    try:
                        if unigram_freq[w] > 0 and unigram_total > 0:
                            uni_lp += math.log(float(unigram_freq[w]) / unigram_total)
                    except KeyError:
                        # Token missing from the unigram table: report the triple and skip the token.
                        print(triple)

                with torch.no_grad():
                    output = model(input_id)

                if measure == 'Prob':
                    ranking_value = torch.softmax(output.logits, dim=1)[0][1].item()
                else:
                    # log_softmax stays finite where log(softmax(...)) would hit log(0).
                    logprob = torch.log_softmax(output.logits, dim=1)[0][1].item()
                    # NOTE(review): the length passed here is the character count of
                    # input_text, not its token count - confirm this is intended.
                    ranking_value = ranking_param(measure, logprob, uni_lp, len(input_text))
                temp.append([triple, ranking_value])

            temp = sorted(temp, key=lambda x: x[1], reverse=True)[:k]
            f.write(value + "\n")

            ID = ID + 1
            for t in temp:
                text = t[0][0].split("[SEP] ")
                try:
                    f.write(str(ID) + '\t' +
                            ('"' + text[0] + '"') + '\t' +
                            ('"' + text[1] + '"') + '\t' +
                            ('"' + text[2] + '"') + '\t' +
                            str(t[0][1]) + '\n')
                except IndexError:
                    # The triple text did not split into three parts; report the sentence id.
                    print(ID)

    return result_file


In [21]:
if __name__ == "__main__":
    # Seed every RNG in use so folds and initialisation are repeatable.
    seed = 2024
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        device = torch.device("cuda")
        device_name = torch.cuda.get_device_name(0)
        print('GPU:', device_name)
    else:
        print('Using CPU')
        device = torch.device("cpu")
        device_name = 'cpu'

    # One path for both the existence check and the load. Previously the check
    # used '.../thesis/models/GPT2-model' while the load used
    # '.../models/GPT2-model', so the load branch pointed at a different directory.
    drive_model_dir = '/content/drive/MyDrive/thesis/models/GPT2-model'

    if not os.path.exists(drive_model_dir):
        batch_size = 124
        model_name = 'gpt2'
        lr = 2e-05
        labels_ids = {'0': 0, '1': 1}
        n_labels = len(labels_ids)
        print("Batch size: ", batch_size)

        config = GPT2Config.from_pretrained(model_name,
                                            activation_function="gelu",
                                            attn_pdrop=0.15,
                                            num_labels=n_labels)

        print('Loading tokenizer...')
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        # The tokenizer is given the EOS token as its pad token and pads on the left.
        tokenizer.padding_side = "left"
        tokenizer.pad_token = tokenizer.eos_token

        dataset_collator = Collator(tokenizer=tokenizer, labels_encoder=labels_ids)

        path = '/content/drive/MyDrive/thesis/mega_acceptability.tsv'
        df = pd.read_csv(path,
                         delimiter='\t',
                         header=None,
                         names=['sentence_source', 'label', 'label_notes', 'sentence'])
        texts = df.sentence.values
        # Labels are compared as strings because labels_ids is keyed by '0' / '1'.
        df['label'] = df['label'].astype(str)
        labels = df.label.values

        kf = KFold(n_splits=10, shuffle=True, random_state=42)
        best_mcc = 0
        results = {'fold': [], 'mcc': [], 'val_acc': [], 'epochs': []}

        for fold, (train_index, val_index) in enumerate(tqdm(kf.split(texts), total=kf.n_splits)):
            # A fresh model per fold so folds do not share trained weights.
            print('Initializing model...')
            model = GPT2ForSequenceClassification.from_pretrained(model_name, config=config)
            model.config.pad_token_id = model.config.eos_token_id
            model.to(device)
            print('Model loaded to ', device)

            print(f'Fold {fold + 1}...')
            train_texts, val_texts = texts[train_index], texts[val_index]
            train_labels, val_labels = labels[train_index], labels[val_index]

            train_dataset = AcceptabilityDataset(train_texts, train_labels)
            val_dataset = AcceptabilityDataset(val_texts, val_labels)

            train_dataloader = DataLoader(train_dataset,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          collate_fn=dataset_collator)
            val_dataloader = DataLoader(val_dataset,
                                        batch_size=batch_size,
                                        shuffle=False,
                                        collate_fn=dataset_collator)

            print('Created train and validation splits.')

            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=lr,
                                         eps=1e-08)

            all_loss = {'train_loss': [], 'val_loss': []}
            all_acc = {'train_acc': [], 'val_acc': []}

            curr_val_loss = None
            prev_val_loss = None
            epochs_run = 0  # number of epochs actually completed in this fold
            penalty = 0

            # Dynamic early stopping based on validation loss: a penalty is
            # added whenever the validation loss rises versus the previous epoch.
            while True:
                epochs_run += 1
                print("Epoch: ", epochs_run)
                print('Training on batches...')
                # Separate name so the fold's train_labels array is not overwritten.
                model, epoch_train_labels, train_predict, train_loss = train(
                    train_dataloader, model, optimizer, device)
                train_acc = accuracy_score(epoch_train_labels, train_predict)
                print('Validation on batches...')
                valid_labels, valid_predict, val_loss = validation(val_dataloader, model, device)
                val_acc = accuracy_score(valid_labels, valid_predict)
                prev_val_loss = curr_val_loss
                curr_val_loss = val_loss
                print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f" % (
                    train_loss, val_loss, train_acc, val_acc))

                # Record the epoch before any break so the saved history also
                # contains the final epoch (it used to be dropped).
                all_loss['train_loss'].append(train_loss)
                all_loss['val_loss'].append(val_loss)
                all_acc['train_acc'].append(train_acc)
                all_acc['val_acc'].append(val_acc)

                if prev_val_loss is not None:
                    # NOTE(review): the stop check runs before the penalty update, so
                    # training continues for one more epoch after the second penalty.
                    # Kept as-is to preserve the existing training schedule.
                    if penalty > 1:
                        break
                    if curr_val_loss > prev_val_loss:
                        penalty = penalty + 1
                        print("Penalty applied")

            mcc = matthews_corrcoef(valid_labels, valid_predict)
            print(f'For fold {fold + 1}, MCC is {mcc}, validation accuracy {val_acc}')

            results['fold'].append(fold + 1)
            results['mcc'].append(mcc)
            results['val_acc'].append(val_acc)
            results['epochs'].append(epochs_run)

            # Keep only the model of the fold with the best MCC so far.
            if best_mcc < mcc:
                best_mcc = mcc
                output_dir = './GPT2-model/'
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)

                training_args = {'created': datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S'),
                                 'model': model_name,
                                 'device': device_name,
                                 'batch_size': batch_size,
                                 'epochs': epochs_run,
                                 'learning_rate': lr,
                                 'optimizer': type(optimizer).__name__,
                                 'seed': seed,
                                 'MCC': float(mcc),
                                 'accuracy': float(val_acc),
                                 }
                os.makedirs(output_dir)
                print("Saving model to: ", output_dir)
                save_model(model, output_dir)

                with open(os.path.join(output_dir, 'training_args.json'), "w") as json_file:
                    json.dump(training_args, json_file)
                with open(os.path.join(output_dir, 'all_loss.json'), "w") as json_file:
                    json.dump(all_loss, json_file)
                with open(os.path.join(output_dir, 'all_acc.json'), "w") as json_file:
                    json.dump(all_acc, json_file)

        print(results)
    else:
        print("Model is present, creating evaluation file for different normalization schemes.")
        print("Loading model...")
        model = load_model(drive_model_dir)
        model.to(device)

        print('Loading tokenizer...')
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        tokenizer.padding_side = "left"
        tokenizer.pad_token = tokenizer.eos_token

    # measures = ['NormLP_sub','LogProb', 'MeanLP', 'NormLP_div', 'SLOR', 'Prob']
    # for measure in measures:
    #     print("The measure being used is: ", measure)
    #     file_path = run_model(device, model, tokenizer, measure)
    #     print("Linguistic Acceptability Model Result saved at: ", file_path)



GPU: Tesla T4
Batch size:  124




Loading tokenizer...


  0%|          | 0/10 [00:00<?, ?it/s]

Initializing model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 1...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.57518 - val_loss: 0.44552 - train_acc: 0.70876 - valid_acc: 0.80289
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.43486 - val_loss: 0.38356 - train_acc: 0.80384 - valid_acc: 0.83229
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.38922 - val_loss: 0.37457 - train_acc: 0.82588 - valid_acc: 0.83990
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.36817 - val_loss: 0.35101 - train_acc: 0.83645 - valid_acc: 0.85233
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.34842 - val_loss: 0.35366 - train_acc: 0.84603 - valid_acc: 0.84819
Penalty applied
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.33625 - val_loss: 0.35191 - train_acc: 0.85154 - valid_acc: 0.84926
Epoch:  7
Training on batches...
Validation on batches...
  tra

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 2...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.57505 - val_loss: 0.44420 - train_acc: 0.70617 - valid_acc: 0.79941
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.43950 - val_loss: 0.38272 - train_acc: 0.80218 - valid_acc: 0.83001
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.39501 - val_loss: 0.37312 - train_acc: 0.82355 - valid_acc: 0.83282
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.37043 - val_loss: 0.36298 - train_acc: 0.83544 - valid_acc: 0.84218
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.35208 - val_loss: 0.35492 - train_acc: 0.84542 - valid_acc: 0.84618
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.34117 - val_loss: 0.35749 - train_acc: 0.84872 - valid_acc: 0.84431
Penalty applied
Epoch:  7
Training on batches...
Validation on batches...
  tra

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 3...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.57924 - val_loss: 0.45667 - train_acc: 0.69391 - valid_acc: 0.79687
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.44225 - val_loss: 0.41214 - train_acc: 0.79905 - valid_acc: 0.82400
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.39778 - val_loss: 0.38268 - train_acc: 0.82251 - valid_acc: 0.83777
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.37239 - val_loss: 0.37590 - train_acc: 0.83519 - valid_acc: 0.84271
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.35511 - val_loss: 0.37043 - train_acc: 0.84230 - valid_acc: 0.84271
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.34208 - val_loss: 0.36294 - train_acc: 0.84821 - valid_acc: 0.84672
Epoch:  7
Training on batches...
Validation on batches...
  train_loss: 0.32995

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 4...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.59589 - val_loss: 0.46057 - train_acc: 0.70646 - valid_acc: 0.79914
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.44107 - val_loss: 0.39118 - train_acc: 0.80026 - valid_acc: 0.82601
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.39300 - val_loss: 0.36982 - train_acc: 0.82386 - valid_acc: 0.83576
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.36820 - val_loss: 0.38375 - train_acc: 0.83899 - valid_acc: 0.83950
Penalty applied
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.35326 - val_loss: 0.36021 - train_acc: 0.84411 - valid_acc: 0.84792
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.33815 - val_loss: 0.35854 - train_acc: 0.85093 - valid_acc: 0.84645
Epoch:  7
Training on batches...
Validation on batches...
  tra

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 5...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.60237 - val_loss: 0.46181 - train_acc: 0.69573 - valid_acc: 0.79540
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.43663 - val_loss: 0.39170 - train_acc: 0.80387 - valid_acc: 0.82560
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.38791 - val_loss: 0.37603 - train_acc: 0.82769 - valid_acc: 0.83750
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.36300 - val_loss: 0.36523 - train_acc: 0.83875 - valid_acc: 0.84365
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.34804 - val_loss: 0.35606 - train_acc: 0.84683 - valid_acc: 0.84765
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.33381 - val_loss: 0.36400 - train_acc: 0.85236 - valid_acc: 0.84699
Penalty applied
Epoch:  7
Training on batches...
Validation on batches...
  tra

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 6...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.58420 - val_loss: 0.44844 - train_acc: 0.70805 - valid_acc: 0.80155
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.43896 - val_loss: 0.37904 - train_acc: 0.80203 - valid_acc: 0.83242
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.39079 - val_loss: 0.36083 - train_acc: 0.82523 - valid_acc: 0.84204
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.36611 - val_loss: 0.35834 - train_acc: 0.83777 - valid_acc: 0.84659
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.35044 - val_loss: 0.34903 - train_acc: 0.84613 - valid_acc: 0.85100
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.33624 - val_loss: 0.35406 - train_acc: 0.85133 - valid_acc: 0.85100
Penalty applied
Epoch:  7
Training on batches...
Validation on batches...
  tra

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 7...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.59012 - val_loss: 0.48294 - train_acc: 0.69230 - valid_acc: 0.78378
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.43186 - val_loss: 0.40344 - train_acc: 0.80634 - valid_acc: 0.81932
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.38550 - val_loss: 0.38701 - train_acc: 0.82864 - valid_acc: 0.83295
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.36193 - val_loss: 0.38806 - train_acc: 0.84025 - valid_acc: 0.82788
Penalty applied
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.34330 - val_loss: 0.36436 - train_acc: 0.84903 - valid_acc: 0.84151
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.33096 - val_loss: 0.37962 - train_acc: 0.85405 - valid_acc: 0.83523
Penalty applied
Epoch:  7
Training on batches...
Validation on 

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 8...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.59657 - val_loss: 0.46265 - train_acc: 0.70492 - valid_acc: 0.79591
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.43900 - val_loss: 0.37732 - train_acc: 0.80104 - valid_acc: 0.83360
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.39121 - val_loss: 0.37300 - train_acc: 0.82740 - valid_acc: 0.83855
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.36739 - val_loss: 0.34993 - train_acc: 0.83737 - valid_acc: 0.84550
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.34957 - val_loss: 0.34958 - train_acc: 0.84570 - valid_acc: 0.84563
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.33777 - val_loss: 0.35845 - train_acc: 0.85124 - valid_acc: 0.84496
Penalty applied
Epoch:  7
Training on batches...
Validation on batches...
  tra

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 9...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.55943 - val_loss: 0.42415 - train_acc: 0.71607 - valid_acc: 0.81409
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.42433 - val_loss: 0.38568 - train_acc: 0.80722 - valid_acc: 0.83307
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.38117 - val_loss: 0.37534 - train_acc: 0.83074 - valid_acc: 0.83895
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.35719 - val_loss: 0.36238 - train_acc: 0.84137 - valid_acc: 0.84149
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.34321 - val_loss: 0.36889 - train_acc: 0.84902 - valid_acc: 0.84269
Penalty applied
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.33060 - val_loss: 0.37327 - train_acc: 0.85331 - valid_acc: 0.84496
Penalty applied
Epoch:  7
Training on batches...
Validation on 

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to  cuda
Fold 10...
Created train and validation splits.
Epoch:  1
Training on batches...
Validation on batches...
  train_loss: 0.57854 - val_loss: 0.43375 - train_acc: 0.70724 - valid_acc: 0.80954
Epoch:  2
Training on batches...
Validation on batches...
  train_loss: 0.43362 - val_loss: 0.37338 - train_acc: 0.80386 - valid_acc: 0.84015
Epoch:  3
Training on batches...
Validation on batches...
  train_loss: 0.38863 - val_loss: 0.35032 - train_acc: 0.82667 - valid_acc: 0.84469
Epoch:  4
Training on batches...
Validation on batches...
  train_loss: 0.36343 - val_loss: 0.35456 - train_acc: 0.83870 - valid_acc: 0.84456
Penalty applied
Epoch:  5
Training on batches...
Validation on batches...
  train_loss: 0.34795 - val_loss: 0.35423 - train_acc: 0.84630 - valid_acc: 0.84697
Epoch:  6
Training on batches...
Validation on batches...
  train_loss: 0.33434 - val_loss: 0.35418 - train_acc: 0.85258 - valid_acc: 0.84991
Epoch:  7
Training on batches...
Validation on batches...
  tr

In [22]:
# Copy the saved model directory (presumably the './GPT2-model/' folder written
# by the training cell, assuming the working directory is /content) to Drive.
# NOTE(review): the destination name 'gpt2_MA_seed2024' differs from the
# 'GPT2-model' directory whose existence the training cell checks - confirm.
!cp -r /content/GPT2-model/ /content/drive/MyDrive/thesis/models/gpt2_MA_seed2024