In [1]:
import jsonlines
import torchaudio
from datasets import Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback, TrainerCallback
from pathlib import Path
import torch, random
import librosa, os
import IPython.display as ipd
from dotenv import load_dotenv
import evaluate
from spellchecker import SpellChecker
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import load_metric
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#load dataset

load_dotenv()

TEAM_NAME = os.getenv("TEAM_NAME", "7up")
TEAM_TRACK = os.getenv("TEAM_TRACK", "advanced")


input_dir = Path(f"/home/jupyter/{TEAM_TRACK}")
# input_dir = Path(f"../../data/{TEAM_TRACK}/train")
results_dir = Path(f"/home/jupyter/{TEAM_NAME}")
# results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)


data = []
with jsonlines.open(input_dir / "nlp.jsonl") as reader:
    for obj in reader:
        data.append(obj)

df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

In [3]:
new_contexts = []
new_questions = []
new_answers = []

# Loop through each row in the original DataFrame
for index, row in df.iterrows():
    context = row['transcript']
    for col in ['target', 'heading', 'tool']:
        new_contexts.append(context)
        if col == 'tool':
            new_questions.append(f"What is the tool to be deployed?")
        else:
            new_questions.append(f"What is the {col}?")
        new_answers.append(row[col])

new_df = pd.DataFrame({
    'context': new_contexts,
    'question': new_questions,
    'answer': new_answers
})

In [4]:
seed = 42

train_df, temp_df = train_test_split(new_df, test_size=0.2, random_state=seed, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed, shuffle=True)
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

#train_df

In [5]:
import torch
from transformers import BertForQuestionAnswering, AutoTokenizer, DefaultDataCollator, TrainingArguments, Trainer 

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# import torch
# from transformers import BertTokenizer, BertForQuestionAnswering

# # Load a pre-trained BERT model and tokenizer
# model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
context = "to interceptors, deploy interceptor jets immediately. target is an orange and yellow fighter jet . engage and intercept the target. repeat, engage and intercept the target."
question = "what is the target?"

In [20]:
train_ds["answer"][0]

'purple fighter plane'

In [17]:
def preprocess_function(examples):
    # Tokenize the text
    tokenized_inputs = tokenizer(examples['context'], padding='max_length', truncation = "only_first")
    
    # Map labels to integers
    tokenized_inputs['label_tool'] = [label_to_id['tool'][label] for label in examples['tool']]
    tokenized_inputs['label_heading'] = [label_to_id['heading'][label] for label in examples['heading']]
    tokenized_inputs['label_target'] = [label_to_id['target'][label] for label in examples['target']]
    
    return tokenized_inputs

# Create label mappings
label_to_id = {
    'tool': {label: i for i, label in enumerate(set([item['tool'] for item in data]))},
    'heading': {label: i for i, label in enumerate(set([item['heading'] for item in data]))},
    'target': {label: i for i, label in enumerate(set([item['target'] for item in data]))},
}

In [43]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=64,
        truncation=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = examples['context'][i].find(answer)
        end_char = start_char + len(answer)
        # answer = answers[i]
        # start_char = answer["answer_start"][0]
        # end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [44]:
# Apply preprocessing function

tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)
data_collator = DefaultDataCollator()

Map:   0%|          | 0/8400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1050 [00:00<?, ? examples/s]

In [51]:
## Training the model

training_args = TrainingArguments(
    output_dir="qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train[:2000],
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

KeyError: 6

In [None]:
# context = "The University of California was founded in 1868, located in Berkeley."
# question = "where was the University of California located?"

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Tokenize the context to find the exact start and end position of the answer
encoded = tokenizer.encode_plus(question, context, return_tensors="pt").to(device)
input_ids = encoded["input_ids"].tolist()[0]

model.eval()
with torch.no_grad():
    outputs = model(**encoded)

answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1

# Convert tokens to answer string
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
print("Improved Answer:", answer)

In [31]:

### Step 2: Preprocess the Data
#Tokenize the data using a BERT tokenizer and prepare the labels for each classification task.

from transformers import BertTokenizer

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_function(examples):
    # Tokenize the text
    tokenized_inputs = tokenizer(examples['transcript'], padding='max_length', truncation=True)
    
    # Map labels to integers
    tokenized_inputs['label_tool'] = [label_to_id['tool'][label] for label in examples['tool']]
    tokenized_inputs['label_heading'] = [label_to_id['heading'][label] for label in examples['heading']]
    tokenized_inputs['label_target'] = [label_to_id['target'][label] for label in examples['target']]
    
    return tokenized_inputs

# Create label mappings
label_to_id = {
    'tool': {label: i for i, label in enumerate(set([item['tool'] for item in data]))},
    'heading': {label: i for i, label in enumerate(set([item['heading'] for item in data]))},
    'target': {label: i for i, label in enumerate(set([item['target'] for item in data]))},
}

# Apply preprocessing function
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

### Step 3: Define the Model
#Define a custom model with three classification heads.

import torch
from torch import nn
from transformers import BertModel, BertPreTrainedModel

class MultiTaskBertForClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels_tool, num_labels_heading, num_labels_target):
        super().__init__(config)
        self.bert = BertModel(config)
        self.classifier_tool = nn.Linear(config.hidden_size, num_labels_tool)
        self.classifier_heading = nn.Linear(config.hidden_size, num_labels_heading)
        self.classifier_target = nn.Linear(config.hidden_size, num_labels_target)
        
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels_tool=None, labels_heading=None, labels_target=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]

        logits_tool = self.classifier_tool(pooled_output)
        logits_heading = self.classifier_heading(pooled_output)
        logits_target = self.classifier_target(pooled_output)

        loss = None
        if labels_tool is not None and labels_heading is not None and labels_target is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss_tool = loss_fct(logits_tool, labels_tool)
            loss_heading = loss_fct(logits_heading, labels_heading)
            loss_target = loss_fct(logits_target, labels_target)
            loss = loss_tool + loss_heading + loss_target

        return (loss, logits_tool, logits_heading, logits_target)

# Initialize the model
num_labels_tool = len(label_to_id['tool'])
num_labels_heading = len(label_to_id['heading'])
num_labels_target = len(label_to_id['target'])

model = MultiTaskBertForClassification.from_pretrained(
    'bert-base-uncased',
    num_labels_tool=num_labels_tool,
    num_labels_heading=num_labels_heading,
    num_labels_target=num_labels_target,
)

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Some weights of MultiTaskBertForClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier_heading.bias', 'classifier_heading.weight', 'classifier_target.bias', 'classifier_target.weight', 'classifier_tool.bias', 'classifier_tool.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
from transformers import BertForQuestionAnswering, AutoTokenizer, DefaultDataCollator, TrainingArguments, Trainer
data_collator = DefaultDataCollator()

In [29]:
training_args = TrainingArguments(
    output_dir="qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset= tokenized_train, ##train_ds, #tokenized_squad["train"], #change to train_df???
    eval_dataset= tokenized_test, #test_ds, #tokenized_squad["test"], #change to test_df???
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 