## This part of the project was done in local PC

**Loading the dev and train data**

In [1]:
import json

# File paths
dev_file_path = r'D:\Rajesh\Rajesh\Personal\AISanDiego\520-NLP\Project\OriginalData\StanfordQA\dev-v1.1.json'
train_file_path = r'D:\Rajesh\Rajesh\Personal\AISanDiego\520-NLP\Project\OriginalData\StanfordQA\train-v1.1.json'

# Load dev data
with open(dev_file_path, 'r', encoding='utf-8') as dev_file:
    dev_squad_data = json.load(dev_file)

# Load train data
with open(train_file_path, 'r', encoding='utf-8') as train_file:
    train_squad_data = json.load(train_file)

# Print the loaded data (optional)
print(f"Sample from dev-v1.1.json: {dev_squad_data['data'][0]['paragraphs'][0]['qas'][0]}")
print(f"Sample from train-v1.1.json: {train_squad_data['data'][0]['paragraphs'][0]['qas'][0]}")

Sample from dev-v1.1.json: {'answers': [{'answer_start': 177, 'text': 'Denver Broncos'}, {'answer_start': 177, 'text': 'Denver Broncos'}, {'answer_start': 177, 'text': 'Denver Broncos'}], 'question': 'Which NFL team represented the AFC at Super Bowl 50?', 'id': '56be4db0acb8001400a502ec'}
Sample from train-v1.1.json: {'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}], 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'id': '5733be284776f41900661182'}


In [2]:
# Define a function to extract question-answer pairs from the dataset.
def extract_qna_pairs_for_bert(squad_data):
    qna_pairs = []
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qas in paragraph['qas']:
                question = qas['question']
                if qas.get('is_impossible', False):
                    answer = "I don't know."
                    answer_start = -1  # No answer available
                else:
                    # Take the first answer
                    answer = qas['answers'][0]['text'] if qas['answers'] else "No answer available."
                    answer_start = qas['answers'][0]['answer_start']
                qna_pairs.append((context, question, answer, answer_start))
    return qna_pairs

# Step 5: Extract question-answer pairs from both train and dev datasets
qna_pairs_train = extract_qna_pairs_for_bert(train_squad_data)
qna_pairs_dev = extract_qna_pairs_for_bert(dev_squad_data)

# Displaying a sample of the extracted data
print(f"Sample from extracted training data: {qna_pairs_train[0]}")
print(f"Sample from extracted dev data: {qna_pairs_dev[0]}")


Sample from extracted training data: ('Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'Saint Bernadette Soubirous', 515)
Sample from extracted dev data: ('Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 20

In [3]:
import re

# Function to clean text (removes unnecessary whitespace, lowercasing, etc.)
def clean_text(text):
    text = text.lower()  # Lowercase the text (since we're using an uncased model)
    text = text.replace('\n', ' ').replace('\r', ' ')  # Remove newline and carriage returns
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.strip()  # Remove leading and trailing whitespace
    return text

# Apply cleaning to the extracted question-answer pairs before tokenization
def clean_qna_pairs(qna_pairs):
    cleaned_qna_pairs = []
    for context, question, answer, answer_start in qna_pairs:
        cleaned_context = clean_text(context)
        cleaned_question = clean_text(question)
        cleaned_answer = clean_text(answer)
        cleaned_qna_pairs.append((cleaned_context, cleaned_question, cleaned_answer, answer_start))
    return cleaned_qna_pairs

# Step 6 (Revised): Clean the extracted train and dev data before tokenization
cleaned_qna_pairs_train = clean_qna_pairs(qna_pairs_train)
cleaned_qna_pairs_dev = clean_qna_pairs(qna_pairs_dev)

# Displaying a sample of cleaned data
print(f"Cleaned sample from training data: {cleaned_qna_pairs_train[0]}")


Cleaned sample from training data: ('architecturally, the school has a catholic character. atop the main building\'s gold dome is a golden statue of the virgin mary. immediately in front of the main building and facing it, is a copper statue of christ with arms upraised with the legend "venite ad me omnes". next to the main building is the basilica of the sacred heart. immediately behind the basilica is the grotto, a marian place of prayer and reflection. it is a replica of the grotto at lourdes, france where the virgin mary reputedly appeared to saint bernadette soubirous in 1858. at the end of the main drive (and in a direct line that connects through 3 statues and the gold dome), is a simple, modern stone statue of mary.', 'to whom did the virgin mary allegedly appear in 1858 in lourdes france?', 'saint bernadette soubirous', 515)


In [8]:
from transformers import BertTokenizerFast

# Load the pre-trained BERT fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Function to tokenize the cleaned question-answer pairs and find the token positions of the answer
def tokenize_qna_pairs(qna_pairs, tokenizer, max_length=128):
    tokenized_data = []

    for context, question, answer, answer_start in qna_pairs:
        # Tokenize the question and context together
        encoding = tokenizer.encode_plus(
            question,
            context,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_offsets_mapping=True,  # Now this will work with the fast tokenizer
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze().tolist()  # Convert to list
        attention_mask = encoding['attention_mask'].squeeze().tolist()  # Convert to list
        token_type_ids = encoding['token_type_ids'].squeeze().tolist()  # Convert to list
        offset_mapping = encoding['offset_mapping'].squeeze().tolist()  # Convert to list

        # Initialize start and end positions of the answer
        start_position = 0
        end_position = 0

        # Find the token positions corresponding to the answer if it's not unanswerable
        if answer_start != -1:
            for idx, (start, end) in enumerate(offset_mapping):
                if start <= answer_start < end:
                    start_position = idx
                if start < answer_start + len(answer) <= end:
                    end_position = idx
                    break

        # Append tokenized data
        tokenized_data.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'start_position': start_position,
            'end_position': end_position
        })

    return tokenized_data

# Tokenize the cleaned question-answer pairs
tokenized_train = tokenize_qna_pairs(cleaned_qna_pairs_train, tokenizer)
tokenized_dev = tokenize_qna_pairs(cleaned_qna_pairs_dev, tokenizer)

# Display a sample of the tokenized data
print(f"Sample tokenized data from training set: {tokenized_train[0]}")


Sample tokenized data from training set: {'input_ids': [101, 2000, 3183, 2106, 1996, 6261, 2984, 9382, 3711, 1999, 8517, 1999, 10223, 26371, 2605, 1029, 102, 6549, 2135, 1010, 1996, 2082, 2038, 1037, 3234, 2839, 1012, 10234, 1996, 2364, 2311, 1005, 1055, 2751, 8514, 2003, 1037, 3585, 6231, 1997, 1996, 6261, 2984, 1012, 3202, 1999, 2392, 1997, 1996, 2364, 2311, 1998, 5307, 2009, 1010, 2003, 1037, 6967, 6231, 1997, 4828, 2007, 2608, 2039, 14995, 6924, 2007, 1996, 5722, 1000, 2310, 3490, 2618, 4748, 2033, 18168, 5267, 1000, 1012, 2279, 2000, 1996, 2364, 2311, 2003, 1996, 13546, 1997, 1996, 6730, 2540, 1012, 3202, 2369, 1996, 13546, 2003, 1996, 24665, 23052, 1010, 1037, 14042, 2173, 1997, 7083, 1998, 9185, 1012, 2009, 2003, 1037, 15059, 1997, 1996, 24665, 23052, 2012, 10223, 26371, 1010, 2605, 2073, 1996, 6261, 2984, 22353, 2135, 2596, 2000, 3002, 16595, 9648, 4674, 2061, 12083, 9711, 2271, 1999, 8517, 1012, 2012, 1996, 2203, 1997, 1996, 2364, 3298, 1006, 1998, 1999, 1037, 3622, 2240, 2008

In [4]:
import pickle

# Save tokenized data to files using pickle
with open('tokenized_train.pkl', 'wb') as f:
    pickle.dump(tokenized_train, f)

with open('tokenized_dev.pkl', 'wb') as f:
    pickle.dump(tokenized_dev, f)

print("Pre-processed data saved successfully")


Pre-processed data saved successfully!


### The files tokenized_train.pkl and tokenized_dev.pkl were uploaded to the Gooogle Drive

### This part of the project was done in Google Colabs Pro

In [11]:
import pickle
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Specify the load path from your Google Drive
load_path_train = '/content/drive/My Drive/SQuAD_datasets/tokenized_train.pkl'
load_path_dev = '/content/drive/My Drive/SQuAD_datasets/tokenized_dev.pkl'

# Load the tokenized training and dev datasets
with open(load_path_train, 'rb') as f:
    tokenized_train = pickle.load(f)

with open(load_path_dev, 'rb') as f:
    tokenized_dev = pickle.load(f)

print("Tokenized data loaded successfully from Google Drive.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Tokenized data loaded successfully from Google Drive.


In [12]:
import torch
from torch.utils.data import Dataset

# Custom Dataset class for Question Answering
class QADataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        # Return the input IDs, attention mask, token type IDs, start and end positions as a dictionary
        item = self.tokenized_data[idx]
        return {
            'input_ids': torch.tensor(item['input_ids']),
            'attention_mask': torch.tensor(item['attention_mask']),
            'token_type_ids': torch.tensor(item['token_type_ids']),
            'start_positions': torch.tensor(item['start_position']),
            'end_positions': torch.tensor(item['end_position'])
        }

# Step 2: Create Dataset objects for train and dev sets
train_dataset = QADataset(tokenized_train)
dev_dataset = QADataset(tokenized_dev)

# Display a sample from the training dataset
print(train_dataset[0])


{'input_ids': tensor([  101,  2000,  3183,  2106,  1996,  6261,  2984,  9382,  3711,  1999,
         8517,  1999, 10223, 26371,  2605,  1029,   102,  6549,  2135,  1010,
         1996,  2082,  2038,  1037,  3234,  2839,  1012, 10234,  1996,  2364,
         2311,  1005,  1055,  2751,  8514,  2003,  1037,  3585,  6231,  1997,
         1996,  6261,  2984,  1012,  3202,  1999,  2392,  1997,  1996,  2364,
         2311,  1998,  5307,  2009,  1010,  2003,  1037,  6967,  6231,  1997,
         4828,  2007,  2608,  2039, 14995,  6924,  2007,  1996,  5722,  1000,
         2310,  3490,  2618,  4748,  2033, 18168,  5267,  1000,  1012,  2279,
         2000,  1996,  2364,  2311,  2003,  1996, 13546,  1997,  1996,  6730,
         2540,  1012,  3202,  2369,  1996, 13546,  2003,  1996, 24665, 23052,
         1010,  1037, 14042,  2173,  1997,  7083,  1998,  9185,  1012,  2009,
         2003,  1037, 15059,  1997,  1996, 24665, 23052,  2012, 10223, 26371,
         1010,  2605,  2073,  1996,  6261,  2984, 

In [13]:
from torch.utils.data import DataLoader

# Create DataLoaders for the training and dev datasets
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=4, shuffle=False)

# Display a batch of data (for demonstration purposes)
for batch in train_loader:
    print(batch)
    break


{'input_ids': tensor([[ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2029, 2028,  ...,    0,    0,    0],
        [ 101, 2055, 2129,  ...,    0,    0,    0],
        ...,
        [ 101, 2043, 2064,  ...,    0,    0,    0],
        [ 101, 2054, 4654,  ...,    0,    0,    0],
        [ 101, 2006, 2029,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'start_positions': tensor([ 58,  10,  23,  62,   7,  85, 109,  55]), 'end_positions': tensor([ 59,  11,  23,  63,  11,  91, 113,  56])}


In [14]:
from transformers import BertForQuestionAnswering, AdamW, get_scheduler
import torch
from tqdm import tqdm  # Progress bar

# Load the pre-trained BERT model for question answering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5)
num_epochs = 3

# Set up the learning rate scheduler
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Set device (GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Training loop with printing average loss every 100 batches
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Train the model
    total_train_loss = 0
    batch_losses = []  # To track the losses of the last 100 batches
    progress_bar = tqdm(train_loader, desc="Training")

    for batch_idx, batch in enumerate(progress_bar):
        batch = {k: v.to(device) for k, v in batch.items()}  # Move data to the device
        outputs = model(**batch)  # Forward pass
        loss = outputs.loss
        total_train_loss += loss.item()
        batch_losses.append(loss.item())

        optimizer.zero_grad()  # Clear gradients
        loss.backward()  # Backpropagation
        optimizer.step()  # Update parameters
        lr_scheduler.step()  # Update learning rate

        # Print the average loss for the last 100 batches
        if (batch_idx + 1) % 100 == 0:
            avg_last_100_batches = sum(batch_losses[-100:]) / len(batch_losses[-100:])
            print(f"Average loss of the last 100 batches (batch {batch_idx + 1}): {avg_last_100_batches}")

        # Update progress bar with the current loss
        progress_bar.set_postfix({"loss": loss.item()})

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Average Training Loss for Epoch {epoch + 1}: {avg_train_loss}")

    # Validation loop
    model.eval()
    total_val_loss = 0
    progress_bar = tqdm(dev_loader, desc="Validating")
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)  # Forward pass (without computing gradients)
            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(dev_loader)
    print(f"Validation Loss for Epoch {epoch + 1}: {avg_val_loss}")

    # Set model back to training mode
    model.train()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training:   1%|          | 101/10950 [00:18<30:32,  5.92it/s, loss=2.99]

Average loss of the last 100 batches (batch 100): 4.231133337020874


Training:   2%|▏         | 201/10950 [00:35<30:16,  5.92it/s, loss=2.77]

Average loss of the last 100 batches (batch 200): 2.912179082632065


Training:   3%|▎         | 301/10950 [00:52<29:56,  5.93it/s, loss=1.79]

Average loss of the last 100 batches (batch 300): 2.369250613451004


Training:   4%|▎         | 401/10950 [01:09<29:41,  5.92it/s, loss=2.83]

Average loss of the last 100 batches (batch 400): 2.2766072434186935


Training:   5%|▍         | 501/10950 [01:26<29:23,  5.92it/s, loss=2.83]

Average loss of the last 100 batches (batch 500): 2.1141357856988905


Training:   5%|▌         | 601/10950 [01:43<29:08,  5.92it/s, loss=1.38]

Average loss of the last 100 batches (batch 600): 1.9195109552145004


Training:   6%|▋         | 701/10950 [02:00<28:54,  5.91it/s, loss=1.79]

Average loss of the last 100 batches (batch 700): 1.8866946029663085


Training:   7%|▋         | 801/10950 [02:16<28:43,  5.89it/s, loss=1.47]

Average loss of the last 100 batches (batch 800): 1.9747071641683578


Training:   8%|▊         | 901/10950 [02:33<28:30,  5.88it/s, loss=2.02]

Average loss of the last 100 batches (batch 900): 1.8879171204566956


Training:   9%|▉         | 1001/10950 [02:50<28:11,  5.88it/s, loss=2.11]

Average loss of the last 100 batches (batch 1000): 1.7568739259243011


Training:  10%|█         | 1101/10950 [03:07<27:47,  5.90it/s, loss=1.07]

Average loss of the last 100 batches (batch 1100): 1.828685192465782


Training:  11%|█         | 1201/10950 [03:24<27:25,  5.93it/s, loss=2.01]

Average loss of the last 100 batches (batch 1200): 1.6802969819307327


Training:  12%|█▏        | 1301/10950 [03:41<27:09,  5.92it/s, loss=1.59]

Average loss of the last 100 batches (batch 1300): 1.66411246240139


Training:  13%|█▎        | 1401/10950 [03:58<26:52,  5.92it/s, loss=1.31]

Average loss of the last 100 batches (batch 1400): 1.6818364343047143


Training:  14%|█▎        | 1501/10950 [04:15<26:42,  5.90it/s, loss=1.28]

Average loss of the last 100 batches (batch 1500): 1.6593033689260483


Training:  15%|█▍        | 1601/10950 [04:32<26:15,  5.93it/s, loss=0.978]

Average loss of the last 100 batches (batch 1600): 1.865857258439064


Training:  16%|█▌        | 1701/10950 [04:49<26:10,  5.89it/s, loss=1.39]

Average loss of the last 100 batches (batch 1700): 1.7516514253616333


Training:  16%|█▋        | 1801/10950 [05:06<25:43,  5.93it/s, loss=1.26]

Average loss of the last 100 batches (batch 1800): 1.807899168729782


Training:  17%|█▋        | 1901/10950 [05:23<25:32,  5.90it/s, loss=0.981]

Average loss of the last 100 batches (batch 1900): 1.685603232383728


Training:  18%|█▊        | 2001/10950 [05:39<25:10,  5.92it/s, loss=1.79]

Average loss of the last 100 batches (batch 2000): 1.6249693983793259


Training:  19%|█▉        | 2101/10950 [05:56<24:55,  5.92it/s, loss=1.09]

Average loss of the last 100 batches (batch 2100): 1.5806910580396651


Training:  20%|██        | 2201/10950 [06:13<24:37,  5.92it/s, loss=1.26]

Average loss of the last 100 batches (batch 2200): 1.5625329154729843


Training:  21%|██        | 2301/10950 [06:30<24:27,  5.90it/s, loss=1.98]

Average loss of the last 100 batches (batch 2300): 1.5627302926778794


Training:  22%|██▏       | 2401/10950 [06:47<24:14,  5.88it/s, loss=0.55]

Average loss of the last 100 batches (batch 2400): 1.6001496088504792


Training:  23%|██▎       | 2501/10950 [07:04<23:54,  5.89it/s, loss=2.07]

Average loss of the last 100 batches (batch 2500): 1.5792154026031495


Training:  24%|██▍       | 2601/10950 [07:21<23:30,  5.92it/s, loss=1.75]

Average loss of the last 100 batches (batch 2600): 1.5203211778402328


Training:  25%|██▍       | 2701/10950 [07:38<23:22,  5.88it/s, loss=0.943]

Average loss of the last 100 batches (batch 2700): 1.6044514656066895


Training:  26%|██▌       | 2801/10950 [07:55<23:02,  5.89it/s, loss=1.41]

Average loss of the last 100 batches (batch 2800): 1.473583105802536


Training:  26%|██▋       | 2901/10950 [08:12<22:45,  5.90it/s, loss=1.54]

Average loss of the last 100 batches (batch 2900): 1.563006012737751


Training:  27%|██▋       | 3001/10950 [08:29<22:31,  5.88it/s, loss=1.61]

Average loss of the last 100 batches (batch 3000): 1.5067556113004685


Training:  28%|██▊       | 3101/10950 [08:46<22:14,  5.88it/s, loss=1.14]

Average loss of the last 100 batches (batch 3100): 1.5565893906354904


Training:  29%|██▉       | 3201/10950 [09:03<21:57,  5.88it/s, loss=0.937]

Average loss of the last 100 batches (batch 3200): 1.450670051574707


Training:  30%|███       | 3301/10950 [09:20<21:35,  5.91it/s, loss=2.02]

Average loss of the last 100 batches (batch 3300): 1.458396223783493


Training:  31%|███       | 3401/10950 [09:37<21:18,  5.91it/s, loss=1.58]

Average loss of the last 100 batches (batch 3400): 1.5069265288114548


Training:  32%|███▏      | 3501/10950 [09:54<20:55,  5.93it/s, loss=1.26]

Average loss of the last 100 batches (batch 3500): 1.4965335160493851


Training:  33%|███▎      | 3601/10950 [10:11<20:41,  5.92it/s, loss=0.989]

Average loss of the last 100 batches (batch 3600): 1.444773304462433


Training:  34%|███▍      | 3701/10950 [10:28<20:23,  5.92it/s, loss=1.32]

Average loss of the last 100 batches (batch 3700): 1.4417149555683135


Training:  35%|███▍      | 3801/10950 [10:45<20:10,  5.91it/s, loss=0.832]

Average loss of the last 100 batches (batch 3800): 1.4794166743755341


Training:  36%|███▌      | 3901/10950 [11:01<19:55,  5.90it/s, loss=1.58]

Average loss of the last 100 batches (batch 3900): 1.4225502301752568


Training:  37%|███▋      | 4001/10950 [11:18<19:46,  5.85it/s, loss=1.52]

Average loss of the last 100 batches (batch 4000): 1.5182668191194535


Training:  37%|███▋      | 4101/10950 [11:35<19:24,  5.88it/s, loss=0.91]

Average loss of the last 100 batches (batch 4100): 1.4659151381254196


Training:  38%|███▊      | 4201/10950 [11:52<19:05,  5.89it/s, loss=1.24]

Average loss of the last 100 batches (batch 4200): 1.4658956590294838


Training:  39%|███▉      | 4301/10950 [12:09<18:42,  5.92it/s, loss=1.75]

Average loss of the last 100 batches (batch 4300): 1.4499608474969863


Training:  40%|████      | 4401/10950 [12:26<18:33,  5.88it/s, loss=1.75]

Average loss of the last 100 batches (batch 4400): 1.442686133980751


Training:  41%|████      | 4501/10950 [12:43<18:07,  5.93it/s, loss=0.758]

Average loss of the last 100 batches (batch 4500): 1.3406383687257766


Training:  42%|████▏     | 4601/10950 [13:00<17:54,  5.91it/s, loss=1.4]

Average loss of the last 100 batches (batch 4600): 1.4766210317611694


Training:  43%|████▎     | 4701/10950 [13:17<17:46,  5.86it/s, loss=2.17]

Average loss of the last 100 batches (batch 4700): 1.3942710494995116


Training:  44%|████▍     | 4801/10950 [13:34<17:22,  5.90it/s, loss=1.2]

Average loss of the last 100 batches (batch 4800): 1.4430437353253365


Training:  45%|████▍     | 4901/10950 [13:51<17:08,  5.88it/s, loss=1.12]

Average loss of the last 100 batches (batch 4900): 1.4823771661520004


Training:  46%|████▌     | 5001/10950 [14:08<16:41,  5.94it/s, loss=2.07]

Average loss of the last 100 batches (batch 5000): 1.348692879974842


Training:  47%|████▋     | 5101/10950 [14:25<16:32,  5.89it/s, loss=1.54]

Average loss of the last 100 batches (batch 5100): 1.4798288017511367


Training:  47%|████▋     | 5201/10950 [14:42<16:17,  5.88it/s, loss=1]

Average loss of the last 100 batches (batch 5200): 1.3364314818382264


Training:  48%|████▊     | 5301/10950 [14:59<15:58,  5.89it/s, loss=1.36]

Average loss of the last 100 batches (batch 5300): 1.3706478190422058


Training:  49%|████▉     | 5401/10950 [15:16<15:44,  5.88it/s, loss=0.703]

Average loss of the last 100 batches (batch 5400): 1.3737234944105148


Training:  50%|█████     | 5501/10950 [15:33<15:23,  5.90it/s, loss=0.823]

Average loss of the last 100 batches (batch 5500): 1.4413296872377395


Training:  51%|█████     | 5601/10950 [15:50<15:06,  5.90it/s, loss=1.64]

Average loss of the last 100 batches (batch 5600): 1.4723910468816757


Training:  52%|█████▏    | 5701/10950 [16:07<14:51,  5.89it/s, loss=1.57]

Average loss of the last 100 batches (batch 5700): 1.4395577186346054


Training:  53%|█████▎    | 5801/10950 [16:24<14:32,  5.90it/s, loss=0.545]

Average loss of the last 100 batches (batch 5800): 1.315556971281767


Training:  54%|█████▍    | 5901/10950 [16:41<14:17,  5.89it/s, loss=1.35]

Average loss of the last 100 batches (batch 5900): 1.368814223408699


Training:  55%|█████▍    | 6001/10950 [16:58<14:00,  5.89it/s, loss=0.581]

Average loss of the last 100 batches (batch 6000): 1.3510711264610291


Training:  56%|█████▌    | 6101/10950 [17:15<13:41,  5.91it/s, loss=1.13]

Average loss of the last 100 batches (batch 6100): 1.30744107991457


Training:  57%|█████▋    | 6201/10950 [17:32<13:24,  5.91it/s, loss=1.43]

Average loss of the last 100 batches (batch 6200): 1.312435777783394


Training:  58%|█████▊    | 6301/10950 [17:49<13:07,  5.90it/s, loss=1.57]

Average loss of the last 100 batches (batch 6300): 1.3882810223102569


Training:  58%|█████▊    | 6401/10950 [18:06<12:53,  5.88it/s, loss=1.89]

Average loss of the last 100 batches (batch 6400): 1.383493773341179


Training:  59%|█████▉    | 6501/10950 [18:23<12:33,  5.90it/s, loss=1.03]

Average loss of the last 100 batches (batch 6500): 1.422709275186062


Training:  60%|██████    | 6601/10950 [18:40<12:16,  5.90it/s, loss=1.49]

Average loss of the last 100 batches (batch 6600): 1.2815143385529517


Training:  61%|██████    | 6701/10950 [18:57<12:01,  5.89it/s, loss=0.74]

Average loss of the last 100 batches (batch 6700): 1.3922441419959068


Training:  62%|██████▏   | 6801/10950 [19:14<11:44,  5.89it/s, loss=0.674]

Average loss of the last 100 batches (batch 6800): 1.3549937349557877


Training:  63%|██████▎   | 6901/10950 [19:31<11:29,  5.87it/s, loss=1.33]

Average loss of the last 100 batches (batch 6900): 1.4413858470320702


Training:  64%|██████▍   | 7001/10950 [19:48<11:08,  5.91it/s, loss=2.13]

Average loss of the last 100 batches (batch 7000): 1.4023393937945365


Training:  65%|██████▍   | 7101/10950 [20:05<10:51,  5.91it/s, loss=1.15]

Average loss of the last 100 batches (batch 7100): 1.3116497644782066


Training:  66%|██████▌   | 7201/10950 [20:22<10:38,  5.87it/s, loss=0.728]

Average loss of the last 100 batches (batch 7200): 1.3666022819280625


Training:  67%|██████▋   | 7301/10950 [20:39<10:19,  5.89it/s, loss=0.954]

Average loss of the last 100 batches (batch 7300): 1.3402511224150657


Training:  68%|██████▊   | 7401/10950 [20:56<10:04,  5.87it/s, loss=1.26]

Average loss of the last 100 batches (batch 7400): 1.2899995079636575


Training:  69%|██████▊   | 7501/10950 [21:13<09:44,  5.90it/s, loss=0.456]

Average loss of the last 100 batches (batch 7500): 1.3808028200268745


Training:  69%|██████▉   | 7601/10950 [21:30<09:26,  5.91it/s, loss=1.81]

Average loss of the last 100 batches (batch 7600): 1.3409116458892822


Training:  70%|███████   | 7701/10950 [21:47<09:12,  5.88it/s, loss=1.3]

Average loss of the last 100 batches (batch 7700): 1.3005077937245368


Training:  71%|███████   | 7801/10950 [22:04<08:53,  5.90it/s, loss=1.65]

Average loss of the last 100 batches (batch 7800): 1.2866896617412567


Training:  72%|███████▏  | 7901/10950 [22:21<08:38,  5.88it/s, loss=1.33]

Average loss of the last 100 batches (batch 7900): 1.3470619440078735


Training:  73%|███████▎  | 8001/10950 [22:38<08:20,  5.89it/s, loss=2.54]

Average loss of the last 100 batches (batch 8000): 1.2840837585926055


Training:  74%|███████▍  | 8101/10950 [22:54<08:01,  5.91it/s, loss=0.528]

Average loss of the last 100 batches (batch 8100): 1.378340622484684


Training:  75%|███████▍  | 8201/10950 [23:11<07:47,  5.88it/s, loss=0.73]

Average loss of the last 100 batches (batch 8200): 1.3115431779623032


Training:  76%|███████▌  | 8301/10950 [23:28<07:30,  5.88it/s, loss=0.879]

Average loss of the last 100 batches (batch 8300): 1.2989031603932382


Training:  77%|███████▋  | 8401/10950 [23:45<07:14,  5.87it/s, loss=1.37]

Average loss of the last 100 batches (batch 8400): 1.4059040869772435


Training:  78%|███████▊  | 8501/10950 [24:02<06:56,  5.88it/s, loss=1.18]

Average loss of the last 100 batches (batch 8500): 1.2994653677940369


Training:  79%|███████▊  | 8601/10950 [24:19<06:39,  5.89it/s, loss=0.84]

Average loss of the last 100 batches (batch 8600): 1.1525496944785119


Training:  79%|███████▉  | 8701/10950 [24:36<06:22,  5.88it/s, loss=2.5]

Average loss of the last 100 batches (batch 8700): 1.248946667611599


Training:  80%|████████  | 8801/10950 [24:54<06:06,  5.86it/s, loss=0.905]

Average loss of the last 100 batches (batch 8800): 1.4420182833075523


Training:  81%|████████▏ | 8901/10950 [25:11<05:49,  5.87it/s, loss=1.71]

Average loss of the last 100 batches (batch 8900): 1.3348596784472466


Training:  82%|████████▏ | 9001/10950 [25:28<05:30,  5.89it/s, loss=0.862]

Average loss of the last 100 batches (batch 9000): 1.350067713856697


Training:  83%|████████▎ | 9101/10950 [25:45<05:14,  5.87it/s, loss=1.25]

Average loss of the last 100 batches (batch 9100): 1.2513131320476532


Training:  84%|████████▍ | 9201/10950 [26:02<04:58,  5.86it/s, loss=0.85]

Average loss of the last 100 batches (batch 9200): 1.2617910397052765


Training:  85%|████████▍ | 9301/10950 [26:19<04:40,  5.87it/s, loss=2.91]

Average loss of the last 100 batches (batch 9300): 1.2623129391670227


Training:  86%|████████▌ | 9401/10950 [26:36<04:23,  5.87it/s, loss=0.587]

Average loss of the last 100 batches (batch 9400): 1.303351265490055


Training:  87%|████████▋ | 9501/10950 [26:53<04:07,  5.86it/s, loss=0.926]

Average loss of the last 100 batches (batch 9500): 1.3340770304203033


Training:  88%|████████▊ | 9601/10950 [27:10<03:48,  5.90it/s, loss=1.81]

Average loss of the last 100 batches (batch 9600): 1.2101370260119437


Training:  89%|████████▊ | 9701/10950 [27:27<03:32,  5.87it/s, loss=1.3]

Average loss of the last 100 batches (batch 9700): 1.1865194511413575


Training:  90%|████████▉ | 9801/10950 [27:44<03:15,  5.87it/s, loss=1.81]

Average loss of the last 100 batches (batch 9800): 1.2753180342912673


Training:  90%|█████████ | 9901/10950 [28:01<02:58,  5.86it/s, loss=1.64]

Average loss of the last 100 batches (batch 9900): 1.4082538765668868


Training:  91%|█████████▏| 10001/10950 [28:18<02:41,  5.89it/s, loss=1.27]

Average loss of the last 100 batches (batch 10000): 1.2964188036322595


Training:  92%|█████████▏| 10101/10950 [28:35<02:23,  5.92it/s, loss=1.64]

Average loss of the last 100 batches (batch 10100): 1.2265375584363938


Training:  93%|█████████▎| 10201/10950 [28:52<02:06,  5.92it/s, loss=1.67]

Average loss of the last 100 batches (batch 10200): 1.3156971007585525


Training:  94%|█████████▍| 10301/10950 [29:09<01:50,  5.89it/s, loss=1.7]

Average loss of the last 100 batches (batch 10300): 1.2922768852114677


Training:  95%|█████████▍| 10401/10950 [29:26<01:33,  5.88it/s, loss=1.9]

Average loss of the last 100 batches (batch 10400): 1.2621716755628585


Training:  96%|█████████▌| 10501/10950 [29:43<01:15,  5.91it/s, loss=1.96]

Average loss of the last 100 batches (batch 10500): 1.2992165726423264


Training:  97%|█████████▋| 10601/10950 [30:00<00:59,  5.89it/s, loss=1.35]

Average loss of the last 100 batches (batch 10600): 1.260861890912056


Training:  98%|█████████▊| 10701/10950 [30:17<00:42,  5.91it/s, loss=0.431]

Average loss of the last 100 batches (batch 10700): 1.195641074180603


Training:  99%|█████████▊| 10801/10950 [30:34<00:25,  5.89it/s, loss=0.895]

Average loss of the last 100 batches (batch 10800): 1.3104521614313125


Training: 100%|█████████▉| 10901/10950 [30:51<00:08,  5.87it/s, loss=0.994]

Average loss of the last 100 batches (batch 10900): 1.2341378499567508


Training: 100%|██████████| 10950/10950 [30:59<00:00,  5.89it/s, loss=0.924]


Average Training Loss for Epoch 1: 1.4985162891590431


Validating: 100%|██████████| 1322/1322 [01:11<00:00, 18.56it/s]


Validation Loss for Epoch 1: 1.2199324916896048
Epoch 2/3


Training:   1%|          | 101/10950 [00:17<30:43,  5.89it/s, loss=0.331]

Average loss of the last 100 batches (batch 100): 0.924407060444355


Training:   2%|▏         | 201/10950 [00:34<30:18,  5.91it/s, loss=0.698]

Average loss of the last 100 batches (batch 200): 0.9300924651324749


Training:   3%|▎         | 301/10950 [00:51<30:03,  5.91it/s, loss=1.31]

Average loss of the last 100 batches (batch 300): 1.0771314196288586


Training:   4%|▎         | 401/10950 [01:08<30:03,  5.85it/s, loss=0.435]

Average loss of the last 100 batches (batch 400): 0.8663683373481035


Training:   5%|▍         | 501/10950 [01:25<29:26,  5.91it/s, loss=0.47]

Average loss of the last 100 batches (batch 500): 0.8846160148084163


Training:   5%|▌         | 601/10950 [01:42<29:27,  5.85it/s, loss=1.27]

Average loss of the last 100 batches (batch 600): 0.8258506479859352


Training:   6%|▋         | 701/10950 [01:59<28:54,  5.91it/s, loss=1.32]

Average loss of the last 100 batches (batch 700): 1.0181281113624572


Training:   7%|▋         | 801/10950 [02:16<28:42,  5.89it/s, loss=0.649]

Average loss of the last 100 batches (batch 800): 0.9742316399514676


Training:   8%|▊         | 901/10950 [02:33<28:36,  5.85it/s, loss=1.12]

Average loss of the last 100 batches (batch 900): 0.997510738670826


Training:   9%|▉         | 1001/10950 [02:50<28:09,  5.89it/s, loss=1.13]

Average loss of the last 100 batches (batch 1000): 0.8843038403987884


Training:  10%|█         | 1101/10950 [03:07<28:03,  5.85it/s, loss=0.41]

Average loss of the last 100 batches (batch 1100): 0.9147082915902138


Training:  11%|█         | 1201/10950 [03:24<27:26,  5.92it/s, loss=0.757]

Average loss of the last 100 batches (batch 1200): 0.9290052103996277


Training:  12%|█▏        | 1301/10950 [03:41<27:15,  5.90it/s, loss=1.29]

Average loss of the last 100 batches (batch 1300): 0.932818411141634


Training:  13%|█▎        | 1401/10950 [03:58<27:07,  5.87it/s, loss=0.788]

Average loss of the last 100 batches (batch 1400): 0.9416036546230316


Training:  14%|█▎        | 1501/10950 [04:15<26:45,  5.89it/s, loss=1.37]

Average loss of the last 100 batches (batch 1500): 0.9263731473684311


Training:  15%|█▍        | 1601/10950 [04:32<26:37,  5.85it/s, loss=0.855]

Average loss of the last 100 batches (batch 1600): 0.9530100095272064


Training:  16%|█▌        | 1701/10950 [04:49<26:10,  5.89it/s, loss=0.827]

Average loss of the last 100 batches (batch 1700): 0.9010267536342144


Training:  16%|█▋        | 1801/10950 [05:06<25:58,  5.87it/s, loss=0.679]

Average loss of the last 100 batches (batch 1800): 0.9904400339722633


Training:  17%|█▋        | 1901/10950 [05:23<25:38,  5.88it/s, loss=1.49]

Average loss of the last 100 batches (batch 1900): 0.8697358265519142


Training:  18%|█▊        | 2001/10950 [05:40<25:26,  5.86it/s, loss=0.682]

Average loss of the last 100 batches (batch 2000): 0.8760607869923115


Training:  19%|█▉        | 2101/10950 [05:57<25:06,  5.87it/s, loss=1.17]

Average loss of the last 100 batches (batch 2100): 0.8952296856790781


Training:  20%|██        | 2201/10950 [06:14<24:47,  5.88it/s, loss=2.21]

Average loss of the last 100 batches (batch 2200): 0.8714557604491711


Training:  21%|██        | 2301/10950 [06:31<24:26,  5.90it/s, loss=1.34]

Average loss of the last 100 batches (batch 2300): 1.0198830777406693


Training:  22%|██▏       | 2401/10950 [06:48<24:14,  5.88it/s, loss=0.266]

Average loss of the last 100 batches (batch 2400): 0.9741590777039528


Training:  23%|██▎       | 2501/10950 [07:05<23:59,  5.87it/s, loss=0.402]

Average loss of the last 100 batches (batch 2500): 0.8874775977432727


Training:  24%|██▍       | 2601/10950 [07:22<23:38,  5.88it/s, loss=1.18]

Average loss of the last 100 batches (batch 2600): 0.8920603650808334


Training:  25%|██▍       | 2701/10950 [07:39<23:20,  5.89it/s, loss=0.434]

Average loss of the last 100 batches (batch 2700): 0.9733079394698143


Training:  26%|██▌       | 2801/10950 [07:56<23:11,  5.86it/s, loss=1.43]

Average loss of the last 100 batches (batch 2800): 0.9265006364881992


Training:  26%|██▋       | 2901/10950 [08:13<22:47,  5.89it/s, loss=0.901]

Average loss of the last 100 batches (batch 2900): 0.9636178581416607


Training:  27%|██▋       | 3001/10950 [08:30<22:32,  5.88it/s, loss=1.17]

Average loss of the last 100 batches (batch 3000): 0.9585382680594922


Training:  28%|██▊       | 3101/10950 [08:47<22:12,  5.89it/s, loss=1.76]

Average loss of the last 100 batches (batch 3100): 0.9868783676624298


Training:  29%|██▉       | 3201/10950 [09:04<21:57,  5.88it/s, loss=0.656]

Average loss of the last 100 batches (batch 3200): 0.8899102374911309


Training:  30%|███       | 3301/10950 [09:21<21:46,  5.85it/s, loss=0.921]

Average loss of the last 100 batches (batch 3300): 0.9112424510717392


Training:  31%|███       | 3401/10950 [09:38<21:20,  5.89it/s, loss=1.14]

Average loss of the last 100 batches (batch 3400): 0.9876384434103965


Training:  32%|███▏      | 3501/10950 [09:55<21:05,  5.89it/s, loss=0.273]

Average loss of the last 100 batches (batch 3500): 0.9353512638807296


Training:  33%|███▎      | 3601/10950 [10:12<20:50,  5.88it/s, loss=1.32]

Average loss of the last 100 batches (batch 3600): 0.8639615586400032


Training:  34%|███▍      | 3701/10950 [10:29<20:31,  5.89it/s, loss=2.36]

Average loss of the last 100 batches (batch 3700): 0.967567281126976


Training:  35%|███▍      | 3801/10950 [10:46<20:20,  5.86it/s, loss=0.693]

Average loss of the last 100 batches (batch 3800): 1.0572121921181679


Training:  36%|███▌      | 3901/10950 [11:03<19:55,  5.90it/s, loss=0.55]

Average loss of the last 100 batches (batch 3900): 0.9252089658379554


Training:  37%|███▋      | 4001/10950 [11:20<19:37,  5.90it/s, loss=0.11]

Average loss of the last 100 batches (batch 4000): 0.8814530879259109


Training:  37%|███▋      | 4101/10950 [11:37<19:25,  5.88it/s, loss=0.628]

Average loss of the last 100 batches (batch 4100): 0.8632175302505494


Training:  38%|███▊      | 4201/10950 [11:54<19:18,  5.82it/s, loss=1.24]

Average loss of the last 100 batches (batch 4200): 0.9268594269454479


Training:  39%|███▉      | 4301/10950 [12:11<18:53,  5.87it/s, loss=0.805]

Average loss of the last 100 batches (batch 4300): 0.8883069033920765


Training:  40%|████      | 4401/10950 [12:28<18:31,  5.89it/s, loss=0.583]

Average loss of the last 100 batches (batch 4400): 0.8651677818596363


Training:  41%|████      | 4501/10950 [12:45<18:15,  5.89it/s, loss=1.28]

Average loss of the last 100 batches (batch 4500): 0.9326132363080979


Training:  42%|████▏     | 4601/10950 [13:02<18:03,  5.86it/s, loss=0.52]

Average loss of the last 100 batches (batch 4600): 0.9268405148386956


Training:  43%|████▎     | 4701/10950 [13:19<17:42,  5.88it/s, loss=1.05]

Average loss of the last 100 batches (batch 4700): 0.9471060232818127


Training:  44%|████▍     | 4801/10950 [13:36<17:28,  5.86it/s, loss=0.651]

Average loss of the last 100 batches (batch 4800): 0.9296570785343647


Training:  45%|████▍     | 4901/10950 [13:53<17:10,  5.87it/s, loss=1.88]

Average loss of the last 100 batches (batch 4900): 0.856698967218399


Training:  46%|████▌     | 5001/10950 [14:11<16:56,  5.85it/s, loss=0.548]

Average loss of the last 100 batches (batch 5000): 0.9553475099802017


Training:  47%|████▋     | 5101/10950 [14:28<16:37,  5.87it/s, loss=1.59]

Average loss of the last 100 batches (batch 5100): 0.8517435038089752


Training:  47%|████▋     | 5201/10950 [14:45<16:20,  5.86it/s, loss=1.44]

Average loss of the last 100 batches (batch 5200): 0.9332230095565319


Training:  48%|████▊     | 5301/10950 [15:02<16:04,  5.85it/s, loss=0.382]

Average loss of the last 100 batches (batch 5300): 0.9234274254739284


Training:  49%|████▉     | 5401/10950 [15:19<15:43,  5.88it/s, loss=0.549]

Average loss of the last 100 batches (batch 5400): 0.9259538571536541


Training:  50%|█████     | 5501/10950 [15:36<15:26,  5.88it/s, loss=2.3]

Average loss of the last 100 batches (batch 5500): 0.9424075277149677


Training:  51%|█████     | 5601/10950 [15:53<15:13,  5.86it/s, loss=0.554]

Average loss of the last 100 batches (batch 5600): 0.9622169329226017


Training:  52%|█████▏    | 5701/10950 [16:10<14:55,  5.86it/s, loss=0.295]

Average loss of the last 100 batches (batch 5700): 0.8447620984911919


Training:  53%|█████▎    | 5801/10950 [16:27<14:36,  5.88it/s, loss=1.55]

Average loss of the last 100 batches (batch 5800): 0.924390977025032


Training:  54%|█████▍    | 5901/10950 [16:44<14:20,  5.87it/s, loss=0.613]

Average loss of the last 100 batches (batch 5900): 0.8833153885602951


Training:  55%|█████▍    | 6001/10950 [17:01<14:12,  5.81it/s, loss=0.751]

Average loss of the last 100 batches (batch 6000): 0.8616415137797594


Training:  56%|█████▌    | 6101/10950 [17:18<13:48,  5.85it/s, loss=0.807]

Average loss of the last 100 batches (batch 6100): 0.9907212540507316


Training:  57%|█████▋    | 6201/10950 [17:35<13:30,  5.86it/s, loss=1.14]

Average loss of the last 100 batches (batch 6200): 0.9073341369628907


Training:  58%|█████▊    | 6301/10950 [17:52<13:14,  5.85it/s, loss=0.162]

Average loss of the last 100 batches (batch 6300): 0.9707927165925503


Training:  58%|█████▊    | 6401/10950 [18:09<12:55,  5.87it/s, loss=0.449]

Average loss of the last 100 batches (batch 6400): 0.9278716392815113


Training:  59%|█████▉    | 6501/10950 [18:26<12:39,  5.86it/s, loss=1.28]

Average loss of the last 100 batches (batch 6500): 0.8643937794864178


Training:  60%|██████    | 6601/10950 [18:44<12:21,  5.87it/s, loss=0.752]

Average loss of the last 100 batches (batch 6600): 0.929386452883482


Training:  61%|██████    | 6701/10950 [19:01<12:02,  5.88it/s, loss=0.749]

Average loss of the last 100 batches (batch 6700): 0.8864201568067074


Training:  62%|██████▏   | 6801/10950 [19:18<11:47,  5.87it/s, loss=1.13]

Average loss of the last 100 batches (batch 6800): 0.8690808126330376


Training:  63%|██████▎   | 6901/10950 [19:35<11:29,  5.87it/s, loss=1.06]

Average loss of the last 100 batches (batch 6900): 0.9725357773900032


Training:  64%|██████▍   | 7001/10950 [19:52<11:14,  5.85it/s, loss=0.702]

Average loss of the last 100 batches (batch 7000): 0.8909980948269367


Training:  65%|██████▍   | 7101/10950 [20:09<10:56,  5.86it/s, loss=1.04]

Average loss of the last 100 batches (batch 7100): 0.862824015468359


Training:  66%|██████▌   | 7201/10950 [20:26<10:39,  5.87it/s, loss=1.4]

Average loss of the last 100 batches (batch 7200): 0.9746945966780186


Training:  67%|██████▋   | 7301/10950 [20:43<10:21,  5.87it/s, loss=0.834]

Average loss of the last 100 batches (batch 7300): 0.9483191625773907


Training:  68%|██████▊   | 7401/10950 [21:00<10:06,  5.85it/s, loss=0.325]

Average loss of the last 100 batches (batch 7400): 0.8672285386919976


Training:  69%|██████▊   | 7501/10950 [21:17<09:49,  5.86it/s, loss=0.302]

Average loss of the last 100 batches (batch 7500): 0.8788147914409637


Training:  69%|██████▉   | 7601/10950 [21:34<09:32,  5.85it/s, loss=1.77]

Average loss of the last 100 batches (batch 7600): 0.9038232518732547


Training:  70%|███████   | 7701/10950 [21:51<09:12,  5.88it/s, loss=0.687]

Average loss of the last 100 batches (batch 7700): 0.9034719125926495


Training:  71%|███████   | 7801/10950 [22:08<08:57,  5.86it/s, loss=0.628]

Average loss of the last 100 batches (batch 7800): 0.8848821716010571


Training:  72%|███████▏  | 7901/10950 [22:25<08:39,  5.87it/s, loss=1.15]

Average loss of the last 100 batches (batch 7900): 0.9675944840908051


Training:  73%|███████▎  | 8001/10950 [22:42<08:24,  5.85it/s, loss=0.616]

Average loss of the last 100 batches (batch 8000): 0.9451601535081864


Training:  74%|███████▍  | 8101/10950 [23:00<08:05,  5.87it/s, loss=1.46]

Average loss of the last 100 batches (batch 8100): 0.9499806195497513


Training:  75%|███████▍  | 8201/10950 [23:17<07:49,  5.86it/s, loss=0.663]

Average loss of the last 100 batches (batch 8200): 0.9445648125559092


Training:  76%|███████▌  | 8301/10950 [23:34<07:31,  5.87it/s, loss=0.439]

Average loss of the last 100 batches (batch 8300): 0.9724266043305397


Training:  77%|███████▋  | 8401/10950 [23:51<07:13,  5.87it/s, loss=1.01]

Average loss of the last 100 batches (batch 8400): 0.9185435897111893


Training:  78%|███████▊  | 8501/10950 [24:08<06:58,  5.86it/s, loss=0.459]

Average loss of the last 100 batches (batch 8500): 0.8802515068650245


Training:  79%|███████▊  | 8601/10950 [24:25<06:40,  5.86it/s, loss=1.42]

Average loss of the last 100 batches (batch 8600): 0.9024123904109002


Training:  79%|███████▉  | 8701/10950 [24:42<06:22,  5.88it/s, loss=0.437]

Average loss of the last 100 batches (batch 8700): 0.9066160255670548


Training:  80%|████████  | 8801/10950 [24:59<06:05,  5.87it/s, loss=0.723]

Average loss of the last 100 batches (batch 8800): 0.8070033882558346


Training:  81%|████████▏ | 8901/10950 [25:16<05:51,  5.84it/s, loss=0.992]

Average loss of the last 100 batches (batch 8900): 0.8899493815004825


Training:  82%|████████▏ | 9001/10950 [25:33<05:30,  5.89it/s, loss=0.496]

Average loss of the last 100 batches (batch 9000): 0.9352989357709884


Training:  83%|████████▎ | 9101/10950 [25:50<05:14,  5.87it/s, loss=1.39]

Average loss of the last 100 batches (batch 9100): 0.8826018705219031


Training:  84%|████████▍ | 9201/10950 [26:07<04:57,  5.87it/s, loss=0.951]

Average loss of the last 100 batches (batch 9200): 0.9327086052298545


Training:  85%|████████▍ | 9301/10950 [26:24<04:41,  5.86it/s, loss=1.66]

Average loss of the last 100 batches (batch 9300): 0.8543591052293777


Training:  86%|████████▌ | 9401/10950 [26:41<04:23,  5.87it/s, loss=0.832]

Average loss of the last 100 batches (batch 9400): 0.8332450903952122


Training:  87%|████████▋ | 9501/10950 [26:59<04:07,  5.85it/s, loss=0.857]

Average loss of the last 100 batches (batch 9500): 0.9325937463343144


Training:  88%|████████▊ | 9601/10950 [27:16<03:49,  5.87it/s, loss=0.696]

Average loss of the last 100 batches (batch 9600): 0.8546927644312382


Training:  89%|████████▊ | 9701/10950 [27:33<03:32,  5.86it/s, loss=0.994]

Average loss of the last 100 batches (batch 9700): 0.8600543447583914


Training:  90%|████████▉ | 9801/10950 [27:50<03:16,  5.85it/s, loss=0.947]

Average loss of the last 100 batches (batch 9800): 0.9062267684936524


Training:  90%|█████████ | 9901/10950 [28:07<02:59,  5.85it/s, loss=0.287]

Average loss of the last 100 batches (batch 9900): 0.9218409346044063


Training:  91%|█████████▏| 10001/10950 [28:24<02:42,  5.85it/s, loss=0.769]

Average loss of the last 100 batches (batch 10000): 0.9056648007035255


Training:  92%|█████████▏| 10101/10950 [28:41<02:24,  5.86it/s, loss=0.923]

Average loss of the last 100 batches (batch 10100): 0.8852294373512268


Training:  93%|█████████▎| 10201/10950 [28:58<02:08,  5.82it/s, loss=0.553]

Average loss of the last 100 batches (batch 10200): 0.8705558234453201


Training:  94%|█████████▍| 10301/10950 [29:15<01:50,  5.86it/s, loss=0.805]

Average loss of the last 100 batches (batch 10300): 0.8627769563347101


Training:  95%|█████████▍| 10401/10950 [29:32<01:33,  5.86it/s, loss=1.34]

Average loss of the last 100 batches (batch 10400): 0.9083502152562142


Training:  96%|█████████▌| 10501/10950 [29:49<01:16,  5.87it/s, loss=1.69]

Average loss of the last 100 batches (batch 10500): 0.9421147574484349


Training:  97%|█████████▋| 10601/10950 [30:06<00:59,  5.85it/s, loss=0.75]

Average loss of the last 100 batches (batch 10600): 0.8893396571278572


Training:  98%|█████████▊| 10701/10950 [30:23<00:42,  5.83it/s, loss=0.748]

Average loss of the last 100 batches (batch 10700): 0.9022189075499774


Training:  99%|█████████▊| 10801/10950 [30:41<00:25,  5.86it/s, loss=0.371]

Average loss of the last 100 batches (batch 10800): 0.9747663232684135


Training: 100%|█████████▉| 10901/10950 [30:58<00:08,  5.87it/s, loss=0.696]

Average loss of the last 100 batches (batch 10900): 0.8945366179943085


Training: 100%|██████████| 10950/10950 [31:06<00:00,  5.87it/s, loss=0.725]


Average Training Loss for Epoch 2: 0.9181046817359859


Validating: 100%|██████████| 1322/1322 [01:11<00:00, 18.40it/s]


Validation Loss for Epoch 2: 1.147466570922313
Epoch 3/3


Training:   1%|          | 101/10950 [00:17<30:53,  5.85it/s, loss=0.415]

Average loss of the last 100 batches (batch 100): 0.6982916066795588


Training:   2%|▏         | 201/10950 [00:34<30:30,  5.87it/s, loss=0.772]

Average loss of the last 100 batches (batch 200): 0.5811004158481956


Training:   3%|▎         | 301/10950 [00:51<30:19,  5.85it/s, loss=0.138]

Average loss of the last 100 batches (batch 300): 0.5828841917961836


Training:   4%|▎         | 401/10950 [01:08<29:58,  5.87it/s, loss=0.379]

Average loss of the last 100 batches (batch 400): 0.5347166216373443


Training:   5%|▍         | 501/10950 [01:25<29:49,  5.84it/s, loss=0.576]

Average loss of the last 100 batches (batch 500): 0.6827547491341829


Training:   5%|▌         | 601/10950 [01:42<29:31,  5.84it/s, loss=0.0984]

Average loss of the last 100 batches (batch 600): 0.6078982209414243


Training:   6%|▋         | 701/10950 [01:59<29:03,  5.88it/s, loss=0.621]

Average loss of the last 100 batches (batch 700): 0.565896937251091


Training:   7%|▋         | 801/10950 [02:16<28:55,  5.85it/s, loss=0.566]

Average loss of the last 100 batches (batch 800): 0.6162205328792334


Training:   8%|▊         | 901/10950 [02:33<28:31,  5.87it/s, loss=0.514]

Average loss of the last 100 batches (batch 900): 0.656155701354146


Training:   9%|▉         | 1001/10950 [02:50<28:10,  5.88it/s, loss=0.629]

Average loss of the last 100 batches (batch 1000): 0.6414738719165325


Training:  10%|█         | 1101/10950 [03:08<28:00,  5.86it/s, loss=0.151]

Average loss of the last 100 batches (batch 1100): 0.6041491629183292


Training:  11%|█         | 1201/10950 [03:25<27:44,  5.86it/s, loss=0.723]

Average loss of the last 100 batches (batch 1200): 0.6411497510224581


Training:  12%|█▏        | 1301/10950 [03:42<27:28,  5.85it/s, loss=0.798]

Average loss of the last 100 batches (batch 1300): 0.6023632027208805


Training:  13%|█▎        | 1401/10950 [03:59<27:13,  5.85it/s, loss=0.547]

Average loss of the last 100 batches (batch 1400): 0.6063440023362636


Training:  14%|█▎        | 1501/10950 [04:16<26:53,  5.86it/s, loss=0.181]

Average loss of the last 100 batches (batch 1500): 0.616171068251133


Training:  15%|█▍        | 1601/10950 [04:33<26:34,  5.86it/s, loss=1.26]

Average loss of the last 100 batches (batch 1600): 0.6487407328933478


Training:  16%|█▌        | 1701/10950 [04:50<26:12,  5.88it/s, loss=1.01]

Average loss of the last 100 batches (batch 1700): 0.6486330512911082


Training:  16%|█▋        | 1801/10950 [05:07<25:59,  5.87it/s, loss=0.731]

Average loss of the last 100 batches (batch 1800): 0.6380220197141171


Training:  17%|█▋        | 1901/10950 [05:24<25:42,  5.87it/s, loss=0.6]

Average loss of the last 100 batches (batch 1900): 0.5883560000360012


Training:  18%|█▊        | 2001/10950 [05:41<25:23,  5.87it/s, loss=0.963]

Average loss of the last 100 batches (batch 2000): 0.590886723920703


Training:  19%|█▉        | 2101/10950 [05:58<25:16,  5.83it/s, loss=0.518]

Average loss of the last 100 batches (batch 2100): 0.6786312952637672


Training:  20%|██        | 2201/10950 [06:15<24:54,  5.85it/s, loss=0.739]

Average loss of the last 100 batches (batch 2200): 0.6284412901103497


Training:  21%|██        | 2301/10950 [06:33<24:33,  5.87it/s, loss=0.662]

Average loss of the last 100 batches (batch 2300): 0.6156789320707321


Training:  22%|██▏       | 2401/10950 [06:50<24:12,  5.89it/s, loss=0.629]

Average loss of the last 100 batches (batch 2400): 0.5869095500558614


Training:  23%|██▎       | 2501/10950 [07:07<23:57,  5.88it/s, loss=0.715]

Average loss of the last 100 batches (batch 2500): 0.6504344922304154


Training:  24%|██▍       | 2601/10950 [07:24<23:45,  5.86it/s, loss=1.35]

Average loss of the last 100 batches (batch 2600): 0.5959384576976299


Training:  25%|██▍       | 2701/10950 [07:41<23:20,  5.89it/s, loss=1.94]

Average loss of the last 100 batches (batch 2700): 0.5716405847668647


Training:  26%|██▌       | 2801/10950 [07:58<23:05,  5.88it/s, loss=0.739]

Average loss of the last 100 batches (batch 2800): 0.6258346189185977


Training:  26%|██▋       | 2901/10950 [08:15<22:50,  5.87it/s, loss=0.675]

Average loss of the last 100 batches (batch 2900): 0.6042420949041843


Training:  27%|██▋       | 3001/10950 [08:32<22:31,  5.88it/s, loss=0.858]

Average loss of the last 100 batches (batch 3000): 0.567248505204916


Training:  28%|██▊       | 3101/10950 [08:49<22:21,  5.85it/s, loss=0.606]

Average loss of the last 100 batches (batch 3100): 0.6052404288202524


Training:  29%|██▉       | 3201/10950 [09:06<22:02,  5.86it/s, loss=0.788]

Average loss of the last 100 batches (batch 3200): 0.5413550917804241


Training:  30%|███       | 3301/10950 [09:23<21:44,  5.86it/s, loss=0.519]

Average loss of the last 100 batches (batch 3300): 0.6152820011973381


Training:  31%|███       | 3401/10950 [09:40<21:27,  5.86it/s, loss=0.171]

Average loss of the last 100 batches (batch 3400): 0.6604753868281841


Training:  32%|███▏      | 3501/10950 [09:57<21:12,  5.85it/s, loss=0.479]

Average loss of the last 100 batches (batch 3500): 0.5795098905265331


Training:  33%|███▎      | 3601/10950 [10:14<20:52,  5.87it/s, loss=0.844]

Average loss of the last 100 batches (batch 3600): 0.6024498764425517


Training:  34%|███▍      | 3701/10950 [10:31<20:35,  5.87it/s, loss=0.86]

Average loss of the last 100 batches (batch 3700): 0.6346170218661428


Training:  35%|███▍      | 3801/10950 [10:48<20:22,  5.85it/s, loss=0.755]

Average loss of the last 100 batches (batch 3800): 0.6562061517685652


Training:  36%|███▌      | 3901/10950 [11:05<20:00,  5.87it/s, loss=1.14]

Average loss of the last 100 batches (batch 3900): 0.5742507843673229


Training:  37%|███▋      | 4001/10950 [11:22<19:41,  5.88it/s, loss=0.332]

Average loss of the last 100 batches (batch 4000): 0.6392145997285843


Training:  37%|███▋      | 4101/10950 [11:39<19:28,  5.86it/s, loss=0.955]

Average loss of the last 100 batches (batch 4100): 0.6263213819265365


Training:  38%|███▊      | 4201/10950 [11:57<19:13,  5.85it/s, loss=0.805]

Average loss of the last 100 batches (batch 4200): 0.6674408518150449


Training:  39%|███▉      | 4301/10950 [12:14<18:54,  5.86it/s, loss=0.459]

Average loss of the last 100 batches (batch 4300): 0.5791323121637106


Training:  40%|████      | 4401/10950 [12:31<18:39,  5.85it/s, loss=0.812]

Average loss of the last 100 batches (batch 4400): 0.5953454010933638


Training:  41%|████      | 4501/10950 [12:48<18:17,  5.87it/s, loss=1.09]

Average loss of the last 100 batches (batch 4500): 0.6650251422077418


Training:  42%|████▏     | 4601/10950 [13:05<18:00,  5.87it/s, loss=0.448]

Average loss of the last 100 batches (batch 4600): 0.6206854949891567


Training:  43%|████▎     | 4701/10950 [13:22<17:42,  5.88it/s, loss=0.515]

Average loss of the last 100 batches (batch 4700): 0.5891299946606159


Training:  44%|████▍     | 4801/10950 [13:39<17:31,  5.85it/s, loss=0.574]

Average loss of the last 100 batches (batch 4800): 0.6279450680315495


Training:  45%|████▍     | 4901/10950 [13:56<17:08,  5.88it/s, loss=0.456]

Average loss of the last 100 batches (batch 4900): 0.6252207854390144


Training:  46%|████▌     | 5001/10950 [14:13<16:53,  5.87it/s, loss=0.777]

Average loss of the last 100 batches (batch 5000): 0.5864050206542015


Training:  47%|████▋     | 5101/10950 [14:30<16:35,  5.88it/s, loss=1.01]

Average loss of the last 100 batches (batch 5100): 0.6083432152122259


Training:  47%|████▋     | 5201/10950 [14:47<16:20,  5.86it/s, loss=0.963]

Average loss of the last 100 batches (batch 5200): 0.6057437477633357


Training:  48%|████▊     | 5301/10950 [15:04<16:05,  5.85it/s, loss=0.832]

Average loss of the last 100 batches (batch 5300): 0.6199667024612426


Training:  49%|████▉     | 5401/10950 [15:21<15:44,  5.88it/s, loss=0.846]

Average loss of the last 100 batches (batch 5400): 0.5670528319478035


Training:  50%|█████     | 5501/10950 [15:38<15:29,  5.86it/s, loss=1.05]

Average loss of the last 100 batches (batch 5500): 0.5596352799981833


Training:  51%|█████     | 5601/10950 [15:55<15:10,  5.88it/s, loss=0.411]

Average loss of the last 100 batches (batch 5600): 0.6320634604245424


Training:  52%|█████▏    | 5701/10950 [16:12<14:54,  5.87it/s, loss=0.724]

Average loss of the last 100 batches (batch 5700): 0.6118320614099503


Training:  53%|█████▎    | 5801/10950 [16:30<14:37,  5.87it/s, loss=0.285]

Average loss of the last 100 batches (batch 5800): 0.5581100994721055


Training:  54%|█████▍    | 5901/10950 [16:47<14:21,  5.86it/s, loss=0.207]

Average loss of the last 100 batches (batch 5900): 0.5921534499526024


Training:  55%|█████▍    | 6001/10950 [17:04<14:05,  5.85it/s, loss=0.244]

Average loss of the last 100 batches (batch 6000): 0.5825428801774979


Training:  56%|█████▌    | 6101/10950 [17:21<13:47,  5.86it/s, loss=0.413]

Average loss of the last 100 batches (batch 6100): 0.6588654556125403


Training:  57%|█████▋    | 6201/10950 [17:38<13:30,  5.86it/s, loss=0.31]

Average loss of the last 100 batches (batch 6200): 0.5995847771316767


Training:  58%|█████▊    | 6301/10950 [17:55<13:12,  5.87it/s, loss=0.259]

Average loss of the last 100 batches (batch 6300): 0.6011570649594069


Training:  58%|█████▊    | 6401/10950 [18:12<13:00,  5.83it/s, loss=0.455]

Average loss of the last 100 batches (batch 6400): 0.6175738248974085


Training:  59%|█████▉    | 6501/10950 [18:29<12:37,  5.87it/s, loss=0.356]

Average loss of the last 100 batches (batch 6500): 0.6178614304959774


Training:  60%|██████    | 6601/10950 [18:46<12:21,  5.87it/s, loss=0.526]

Average loss of the last 100 batches (batch 6600): 0.5871547961235046


Training:  61%|██████    | 6701/10950 [19:03<12:07,  5.84it/s, loss=0.609]

Average loss of the last 100 batches (batch 6700): 0.6137628005445004


Training:  62%|██████▏   | 6801/10950 [19:20<11:51,  5.83it/s, loss=0.565]

Average loss of the last 100 batches (batch 6800): 0.6125237777084113


Training:  63%|██████▎   | 6901/10950 [19:37<11:29,  5.87it/s, loss=0.937]

Average loss of the last 100 batches (batch 6900): 0.5244058392941952


Training:  64%|██████▍   | 7001/10950 [19:54<11:16,  5.84it/s, loss=0.278]

Average loss of the last 100 batches (batch 7000): 0.6205209035053849


Training:  65%|██████▍   | 7101/10950 [20:11<10:56,  5.86it/s, loss=0.849]

Average loss of the last 100 batches (batch 7100): 0.5408771743625402


Training:  66%|██████▌   | 7201/10950 [20:28<10:40,  5.86it/s, loss=1.01]

Average loss of the last 100 batches (batch 7200): 0.6291651917248964


Training:  67%|██████▋   | 7301/10950 [20:46<10:20,  5.88it/s, loss=0.395]

Average loss of the last 100 batches (batch 7300): 0.6341906989365816


Training:  68%|██████▊   | 7401/10950 [21:03<10:06,  5.85it/s, loss=0.698]

Average loss of the last 100 batches (batch 7400): 0.5669528485834598


Training:  69%|██████▊   | 7501/10950 [21:20<09:50,  5.84it/s, loss=0.589]

Average loss of the last 100 batches (batch 7500): 0.589189855530858


Training:  69%|██████▉   | 7601/10950 [21:37<09:30,  5.87it/s, loss=0.497]

Average loss of the last 100 batches (batch 7600): 0.5908721781894565


Training:  70%|███████   | 7701/10950 [21:54<09:11,  5.89it/s, loss=0.295]

Average loss of the last 100 batches (batch 7700): 0.6046503009647131


Training:  71%|███████   | 7801/10950 [22:11<08:55,  5.88it/s, loss=0.455]

Average loss of the last 100 batches (batch 7800): 0.5745651093125344


Training:  72%|███████▏  | 7901/10950 [22:28<08:38,  5.88it/s, loss=0.262]

Average loss of the last 100 batches (batch 7900): 0.5936830637603998


Training:  73%|███████▎  | 8001/10950 [22:45<08:24,  5.84it/s, loss=1.39]

Average loss of the last 100 batches (batch 8000): 0.5555706376582384


Training:  74%|███████▍  | 8101/10950 [23:02<08:04,  5.88it/s, loss=0.493]

Average loss of the last 100 batches (batch 8100): 0.6066849126666785


Training:  75%|███████▍  | 8201/10950 [23:19<07:48,  5.87it/s, loss=1.01]

Average loss of the last 100 batches (batch 8200): 0.6068078721314669


Training:  76%|███████▌  | 8301/10950 [23:36<07:31,  5.87it/s, loss=0.999]

Average loss of the last 100 batches (batch 8300): 0.5770673416554928


Training:  77%|███████▋  | 8401/10950 [23:53<07:14,  5.86it/s, loss=0.484]

Average loss of the last 100 batches (batch 8400): 0.6199776013195515


Training:  78%|███████▊  | 8501/10950 [24:10<06:56,  5.88it/s, loss=0.728]

Average loss of the last 100 batches (batch 8500): 0.5715317110717296


Training:  79%|███████▊  | 8601/10950 [24:27<06:41,  5.85it/s, loss=0.423]

Average loss of the last 100 batches (batch 8600): 0.6018045489490033


Training:  79%|███████▉  | 8701/10950 [24:44<06:23,  5.86it/s, loss=0.217]

Average loss of the last 100 batches (batch 8700): 0.6094974578917026


Training:  80%|████████  | 8801/10950 [25:02<06:04,  5.89it/s, loss=0.706]

Average loss of the last 100 batches (batch 8800): 0.5820247787237167


Training:  81%|████████▏ | 8901/10950 [25:19<05:48,  5.88it/s, loss=0.455]

Average loss of the last 100 batches (batch 8900): 0.6212674327939749


Training:  82%|████████▏ | 9001/10950 [25:36<05:31,  5.88it/s, loss=0.491]

Average loss of the last 100 batches (batch 9000): 0.5561422229558229


Training:  83%|████████▎ | 9101/10950 [25:53<05:16,  5.84it/s, loss=0.781]

Average loss of the last 100 batches (batch 9100): 0.5843381467461586


Training:  84%|████████▍ | 9201/10950 [26:10<04:58,  5.86it/s, loss=0.452]

Average loss of the last 100 batches (batch 9200): 0.6093230287730694


Training:  85%|████████▍ | 9301/10950 [26:27<04:40,  5.87it/s, loss=0.547]

Average loss of the last 100 batches (batch 9300): 0.5683894060179591


Training:  86%|████████▌ | 9401/10950 [26:44<04:24,  5.86it/s, loss=0.636]

Average loss of the last 100 batches (batch 9400): 0.5572957564145327


Training:  87%|████████▋ | 9501/10950 [27:01<04:06,  5.88it/s, loss=0.454]

Average loss of the last 100 batches (batch 9500): 0.6409769594669342


Training:  88%|████████▊ | 9601/10950 [27:18<03:50,  5.85it/s, loss=0.876]

Average loss of the last 100 batches (batch 9600): 0.5123872531950474


Training:  89%|████████▊ | 9701/10950 [27:35<03:32,  5.86it/s, loss=0.508]

Average loss of the last 100 batches (batch 9700): 0.6233972778171301


Training:  90%|████████▉ | 9801/10950 [27:52<03:16,  5.85it/s, loss=1.33]

Average loss of the last 100 batches (batch 9800): 0.581683349609375


Training:  90%|█████████ | 9901/10950 [28:09<02:58,  5.87it/s, loss=0.988]

Average loss of the last 100 batches (batch 9900): 0.6787943036109209


Training:  91%|█████████▏| 10001/10950 [28:26<02:41,  5.88it/s, loss=0.903]

Average loss of the last 100 batches (batch 10000): 0.5493169967085123


Training:  92%|█████████▏| 10101/10950 [28:43<02:25,  5.83it/s, loss=0.303]

Average loss of the last 100 batches (batch 10100): 0.6226316587626934


Training:  93%|█████████▎| 10201/10950 [29:00<02:08,  5.84it/s, loss=0.245]

Average loss of the last 100 batches (batch 10200): 0.5330405846983194


Training:  94%|█████████▍| 10301/10950 [29:17<01:50,  5.88it/s, loss=1.04]

Average loss of the last 100 batches (batch 10300): 0.5834126925468445


Training:  95%|█████████▍| 10401/10950 [29:35<01:33,  5.89it/s, loss=0.585]

Average loss of the last 100 batches (batch 10400): 0.6033886218070984


Training:  96%|█████████▌| 10501/10950 [29:52<01:16,  5.84it/s, loss=0.206]

Average loss of the last 100 batches (batch 10500): 0.5905494809150695


Training:  97%|█████████▋| 10601/10950 [30:09<00:59,  5.83it/s, loss=0.513]

Average loss of the last 100 batches (batch 10600): 0.5066306424885988


Training:  98%|█████████▊| 10701/10950 [30:26<00:42,  5.87it/s, loss=0.769]

Average loss of the last 100 batches (batch 10700): 0.5143914371356368


Training:  99%|█████████▊| 10801/10950 [30:43<00:25,  5.85it/s, loss=1.2]

Average loss of the last 100 batches (batch 10800): 0.566112012937665


Training: 100%|█████████▉| 10901/10950 [31:00<00:08,  5.86it/s, loss=0.556]

Average loss of the last 100 batches (batch 10900): 0.625988946184516


Training: 100%|██████████| 10950/10950 [31:08<00:00,  5.86it/s, loss=0.36]


Average Training Loss for Epoch 3: 0.6020535657558267


Validating: 100%|██████████| 1322/1322 [01:11<00:00, 18.45it/s]

Validation Loss for Epoch 3: 1.3187899800967116





In [15]:
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the directory where you want to save the model
model_save_path = '/content/drive/My Drive/SQuAD_datasets/saved_model'

# Create the directory if it doesn't exist
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Save the trained model and tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")


Model and tokenizer saved to /content/drive/My Drive/SQuAD_datasets/saved_model


In [17]:
from transformers import BertForQuestionAnswering, BertTokenizer

# Load the saved model and tokenizer from Google Drive
model = BertForQuestionAnswering.from_pretrained(model_save_path)
tokenizer = BertTokenizerFast.from_pretrained(model_save_path)

print("Model and tokenizer loaded successfully.")


Model and tokenizer loaded successfully.


In [19]:
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

# Load the saved model and tokenizer from Google Drive
model_save_path = '/content/drive/My Drive/SQuAD_datasets/saved_model'
model = BertForQuestionAnswering.from_pretrained(model_save_path)
tokenizer = BertTokenizerFast.from_pretrained(model_save_path)

# Set the model to evaluation mode
model.eval()

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [20]:
def answer_question(question, context):
    # Tokenize input
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512).to(device)

    # Generate output
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the most likely beginning and end of the answer span
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # Find the tokens with the highest `start` and `end` scores
    answer_start = torch.argmax(answer_start_scores)  # Get the most likely start of the answer
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of the answer

    # Convert the tokens to text
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

    return answer


In [21]:
# Example context and question
context = """Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.
The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10
to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."""

question = "Who won Super Bowl 50?"

# Get the answer
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")


Question: Who won Super Bowl 50?
Answer: denver broncos


In [22]:
# New example context and question (completely different from training or dev data)
context = """
The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel,
whose company designed and built the tower. Constructed from 1887 to 1889 as the entrance arch for the 1889 World's Fair,
it was initially criticized by some of France's leading artists and intellectuals for its design, but it has become a global cultural icon of France
and one of the most recognizable structures in the world.
"""

question = "Who designed the Eiffel Tower?"

# Get the answer from the model
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer}")


Question: Who designed the Eiffel Tower?
Answer: gustave eiffel


In [23]:
question = "What is Eiffel Tower made up of?"

# Get the answer from the model
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What is Eiffel Tower made up of?
Answer: wrought - iron lattice tower


In [24]:
question = "How long it took to build the Eiffel Tower?"

# Get the answer from the model
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: How long it took to build the Eiffel Tower?
Answer: from 1887 to 1889


In [25]:
question = "Why was the construction of Eiffel Tower critisized initally?"

# Get the answer from the model
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: Why was the construction of Eiffel Tower critisized initally?
Answer: it was initially criticized by some of france ' s leading artists and intellectuals for its design


In [27]:
import shutil

# Zip the saved model directory
shutil.make_archive('/content/saved_model', 'zip', '/content/drive/My Drive/SQuAD_datasets/saved_model')

# Download the zip file to your PC
from google.colab import files
files.download('/content/saved_model.zip')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### After downloading the model in Local Laptop, the model was saved to the working directory and zip extracted.

### This part was executed in the local laptop

In [40]:
import sys
print(sys.path)


['C:\\Users\\CZ0068\\torch', 'C:\\Users\\CZ0068\\torch', 'C:\\Users\\CZ0068\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311, C:\\Users\\CZ0068\\torch', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\python311.zip', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\DLLs', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\Lib', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0', '', 'C:\\Users\\CZ0068\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages', 'C:\\Users\\CZ0068\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\win32', 'C:\\Users\\CZ0068\\AppD

In [39]:
import sys
print(sys.executable)


C:\Users\CZ0068\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe


In [38]:
import sys
sys.path.insert(0, 'C:\\Users\\CZ0068\\torch')

import transformers
import torch
import numpy as np

print(transformers.__version__)
print(torch.__version__)
print(np.__version__)


4.45.2
2.4.1+cpu
1.24.4


In [31]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizerFast

# Path to the extracted model on your PC
model_path = 'D:/Rajesh/Rajesh/Personal/AISanDiego/520-NLP/Project/saved_model/'

# Load the model and the fast version of the tokenizer
model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Define a function to answer questions
def answer_question(question, context):
    # Encode the inputs with offset mappings to preserve context token positions
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt", add_special_tokens=True, return_offsets_mapping=True)
    
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    offsets = inputs["offset_mapping"][0]  # Offset mapping for token positions
    
    # Get the answer using the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        answer_start_scores = outputs.start_logits
        answer_end_scores = outputs.end_logits

    # Find the start and end of the answer
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1

    # Get the token offsets for the answer (start and end positions)
    start_char = offsets[answer_start][0]
    end_char = offsets[answer_end - 1][1]

    # Extract the answer from the original context using the offsets
    answer = context[start_char:end_char]
    
    # Capitalize the first letter of the answer if needed
    if answer:
        answer = answer[0].upper() + answer[1:]

    return answer

# Test with a new question and context
context = ("Transformers are a type of deep learning model introduced in the paper 'Attention is All You Need' by Vaswani et al. "
           "in 2017. Unlike traditional recurrent neural networks (RNNs), Transformers rely entirely on self-attention mechanisms "
           "to capture the relationships between different parts of a sequence. This allows Transformers to process data in parallel, "
           "making them much faster and more efficient for tasks like natural language processing. Transformers are the foundation for "
           "many state-of-the-art models, such as BERT, GPT, and T5. They are used in a variety of applications, including machine translation, "
           "text generation, and question answering.")



question = "Name some models that are based on the Transformer architecture"
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")


question = "What mechanism does the Transformer rely on"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")

question = "Why are Transformers faster than traditional RNNs?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")


question = "What happens if selef-attention mechanism capture the relationship between different parts of a sequence ?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")

question = "State some applications of Transformers?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")

question = "Who introduced self Attention?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")

Question: Name some models that are based on the Transformer architecture
Answer: BERT, GPT, and T5
Question: What mechanism does the Transformer rely on
Answer: Self-attention mechanisms
Question: Why are Transformers faster than traditional RNNs?
Answer: Self-attention mechanisms
Question: What happens if selef-attention mechanism capture the relationship between different parts of a sequence ?
Answer: Self-attention mechanisms to capture the relationships between different parts of a sequence. This allows Transformers to process data in parallel
Question: State some applications of Transformers?
Answer: Machine translation, text generation, and question answering
Question: Who introduced self Attention?
Answer: Vaswani et al.


In [37]:
def answer_question(question, context):
    # Tokenize input
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)

    # Generate output
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the most likely beginning and end of the answer span
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # Find the tokens with the highest `start` and `end` scores
    answer_start = torch.argmax(answer_start_scores)  # Get the most likely start of the answer
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of the answer

    # Convert the tokens to text
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

    answer=answer[0].upper()+ answer[1:len(answer)]

    return answer

# Test with a new question and context
context = ("Transformers are a type of deep learning model introduced in the paper 'Attention is All You Need' by Vaswani et al. "
           "in 2017. Unlike traditional recurrent neural networks (RNNs), Transformers rely entirely on self-attention mechanisms "
           "to capture the relationships between different parts of a sequence. This allows Transformers to process data in parallel, "
           "making them much faster and more efficient for tasks like natural language processing. Transformers are the foundation for "
           "many state-of-the-art models, such as BERT, GPT, and T5. They are used in a variety of applications, including machine translation, "
           "text generation, and question answering.")
question = "Name some models that are based on the Transformer architecture"
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")


question = "What mechanism does the Transformer rely on"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")

question = "Why are Transformers faster than traditional RNNs?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")


question = "What happens if selef-attention mechanism capture the relationship between different parts of a sequence ?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")

question = "State some applications of Transformers?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")

question = "Who introduced self Attention?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer_question(question, context)}")



Question: Name some models that are based on the Transformer architecture
Answer: Bert, gpt, and t5
Question: What mechanism does the Transformer rely on
Answer: Self - attention mechanisms
Question: Why are Transformers faster than traditional RNNs?
Answer: Transformers rely entirely on self - attention mechanisms to capture the relationships between different parts of a sequence
Question: What happens if selef-attention mechanism capture the relationship between different parts of a sequence ?
Answer: Allows transformers to process data in parallel
Question: State some applications of Transformers?
Answer: Machine translation, text generation, and question answering
Question: Who introduced self Attention?
Answer: Vaswani et al.


In [45]:
# Test with a new question and context
context = ("Artificial intelligence (AI) is a field of computer science that aims to create machines that can perform tasks that would normally require human intelligence."
           "These tasks include problem-solving, learning, planning, natural language understanding, and perception. The term was first coined by John McCarthy in 1956 at "
           "the Dartmouth Conference, which is considered the founding event of AI as a field. Early efforts in AI involved symbolic AI, where knowledge was explicitly coded"
           "into machines, but progress was slow due to the complexity of human cognition. In the 1990s and 2000s, a major shift occurred with the rise of machine learning,"
            "a subset of AI that enables computers to learn from data. Instead of being explicitly programmed, machine learning algorithms, especially neural networks, became" 
            "capable of automatically improving their performance on tasks. Deep learning, a branch of machine learning involving multi-layered neural networks, has enabled"
            "breakthroughs in fields like image recognition, speech processing, and game playing.Today, AI is applied in numerous industries including healthcare, finance,"
            " and autonomous driving. While AI has made tremendous progress, it also raises ethical concerns such as bias in algorithms, job displacement, and the need for" 
            "regulations to ensure AI systems are fair and transparent. Despite these challenges, AI continues to evolve rapidly, with researchers aiming to achieve artificial" 
            "general intelligence (AGI), a level of AI that can understand and learn any intellectual task that a human being can.")

question = "What are some of the key developments in artificial intelligence from its early days to the present?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What are some of the key developments in artificial intelligence from its early days to the present?
Answer: Problem-solving, learning, planning, natural language understanding, and perception. The term was first coined by John McCarthy in 1956 at the Dartmouth Conference, which is considered the founding event of AI as a field. Early efforts in AI involved symbolic AI, where knowledge was explicitly codedinto machines, but progress was slow due to the complexity of human cognition. In the 1990s and 2000s, a major shift occurred with the rise of machine learning,a subset of AI that enables computers to learn from data. Instead of being explicitly programmed, machine learning algorithms


In [48]:
question = "What are some ethical concerns related to artificial intelligence?"
answer = answer_question(question, context)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What are some ethical concerns related to artificial intelligence?
Answer: Bias in algorithms, job displacement, and the need forregulations to ensure AI systems are fair and transparent
