## Imports

In [2]:
import jsonlines
import torchaudio
from datasets import Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback, TrainerCallback
from pathlib import Path
import torch, random
import librosa, os
import IPython.display as ipd
from dotenv import load_dotenv
import evaluate
from spellchecker import SpellChecker
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import load_metric
import pandas as pd
from sklearn.model_selection import train_test_split

## Load Dataset

In [7]:
load_dotenv()

TEAM_NAME = os.getenv("TEAM_NAME", "7up")
TEAM_TRACK = os.getenv("TEAM_TRACK", "advanced")


input_dir = Path(f"/home/jupyter/{TEAM_TRACK}")
# input_dir = Path(f"../../data/{TEAM_TRACK}/train")
results_dir = Path(f"/home/jupyter/{TEAM_NAME}")
# results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)


data = []
with jsonlines.open(input_dir / "nlp.jsonl") as reader:
    for obj in reader:
        data.append(obj)

df = pd.DataFrame(data)

df

Unnamed: 0,key,transcript,tool,heading,target
0,0,"Turret, prepare to deploy electromagnetic puls...",electromagnetic pulse,065,grey and white fighter jet
1,1,Engage yellow drone with surface-to-air missil...,surface-to-air missiles,235,yellow drone
2,2,"Control to turrets, deploy electromagnetic pul...",electromagnetic pulse,110,blue and red fighter plane
3,3,"Alfa, Echo, Mike Papa, deploy EMP tool heading...",EMP,085,"purple, red, and silver fighter jet"
4,4,"Engage the grey, black, and green fighter plan...",machine gun,095,"grey, black, and green fighter plane"
...,...,...,...,...,...
3495,3495,Deploy electromagnetic pulse on brown commerci...,electromagnetic pulse,350,brown commercial aircraft
3496,3496,"Deploy surface-to-air missiles, heading two on...",surface-to-air missiles,215,"silver, orange, and brown helicopter"
3497,3497,"Engage target, grey, orange, and silver missil...",surface-to-air missiles,080,"grey, orange, and silver missile"
3498,3498,Engage the white drone at heading zero five fi...,machine gun,055,white drone


In [4]:
seed = 42

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=seed, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed, shuffle=True)

# Display the resulting DataFrames
print("Training Set:")
train_df

Training Set:


Unnamed: 0,key,transcript,tool,heading,target
162,162,"Control tower to turrets, deploy drone catcher...",drone catcher,095,blue drone
1001,1001,"Control to turrets, we have a purple light air...",electromagnetic pulse,340,purple light aircraft
1718,1718,"Control tower to turrets, deploy EMP, target t...",EMP,045,blue and purple fighter jet
1003,1003,"Turret Bravo, engage silver, red, and yellow m...",machine gun,235,"silver, red, and yellow missile"
1233,1233,"Engage target, white, purple, and black missil...",anti-air artillery,175,"white, purple, and black missile"
...,...,...,...,...,...
1095,1095,Deploy surface-to-air missiles to intercept an...,surface-to-air missiles,285,yellow cargo aircraft
1130,1130,"Tower to turrets, heading zero seven zero. Dep...",electromagnetic pulse,070,white and green helicopter
1294,1294,Deploy anti-air artillery to intercept green a...,anti-air artillery,065,green and purple light aircraft
860,860,"Engage orange fighter plane with machine gun, ...",machine gun,295,orange fighter plane


In [4]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering

# Load a pre-trained BERT model and tokenizer
model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Define the question and context
question = "What is the aircraft to lock?"
context = "Air defense turret, lock onto target black and yellow missile at heading zero one five. Deploy EMP."

# Encode the question and context so that they are prepared for the model
inputs = tokenizer.encode_plus(question, context, return_tensors='pt', add_special_tokens=True)

# Get model's prediction
input_ids = inputs['input_ids']
token_type_ids = inputs['token_type_ids']

# Model output handling might require adjustment based on transformers version
outputs = model(input_ids, token_type_ids=token_type_ids)
start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Find the position tokens with the highest scores
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores) + 1

# Convert tokens to the answer string
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index]))

print("Answer:", answer)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Answer: target black and yellow missile


In [2]:
def heading_text_to_string(heading_text):
    """
    Convert heading text to string representation with leading zeros.
    
    Args:
        heading_text (str): Heading text in format "zero six five".
    
    Returns:
        str: String representation of the heading with leading zeros.
    """
    # Define a mapping from text to integer representations
    heading_mapping = {
        "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
        "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9", "niner": "9"
    }
    
    # Split the heading text into words and convert each word to its integer representation
    words = heading_text.split()
    string_heading = ""
    for word in words:
        if word in heading_mapping:
            string_heading += heading_mapping[word]
    
    return string_heading

# Test the function
heading_text = "zero niner five"
heading_string = heading_text_to_string(heading_text)
print("Heading String:", heading_string)

Heading String: 095


In [6]:
def tool_text_to_string(tool):
    if tool == "emp tool" or tool == "emp":
        return "EMP"
    else:
        return tool

In [8]:
import string
count = 0
for index, row in df.iterrows():
    if row['key'] > 1000:
        
        break
    if row['key'] > -1:
        context = row['transcript'].lower()

        # Answers

        # print(context)
        context=remove_words(context)
        # print(context)
        # translator = str.maketrans('', '', string.punctuation + '-,')

        # Remove all punctuation using translate()
        # context = context.translate(translator)
        heading = predict_answer("What is the heading at?", context)
        # print(heading)
        context_heading=context.replace(heading, '')
        context_heading=context_heading.replace("heading ", '')
        # print(context_heading)
        heading = heading_text_to_string(heading)
        
        color = predict_answer("what is the colors?", context_heading)
        # print(color)
        target = predict_answer("Deploy engagement on what camouflage target?", context_heading)
        # print(target)
        
        if color == target:
            target = predict_answer("What is the flying target", context_heading) 
        if not re.search(r'\b' + re.escape(color) + r'\b', target):
            target = color + " " + target
            
        context_target=context_heading.replace(target, '')
        
        tool = predict_answer("What tool to deploy?", context_target)
        # print(context_tool)
        # print("\n")
        tool = tool_text_to_string(tool)
        
        
        

#         if "jet" not in target:
#             target = predict_answer("what aircraft?", context)
        
#         if "aircraft" not in target:
#             target = predict_answer("what missle?", context)
        
#         if "missle" not in target:
#             target = predict_answer("what drone?", context)
        
        target = target.replace(" , ", ", ")
        target = target.replace("the ", "")
        target = target.replace("a ", "")
        target = target.replace("an ", "")
        target = target.replace("engage ", "")
        tool = tool.replace(" - ", "-")
        tool = tool.replace(" target", "")
        tool = tool.replace(" system", "")
        # Print types and values
        if not heading == row['heading']:
            print(context)
            print(f'Predicted heading: {heading}', f"Actual: {row['heading']}", heading == row['heading'])
            count += 1
        if not target == row['target']:
            print(context_heading)
            print(f'Predicted target: {target}', f"Actual: {row['target']}", target == row['target'])
            count += 1
        if not tool == row['tool']:
            print(context_target)
            print(f'Predicted tool: {tool}', f"Actual: {row['tool']}", tool == row['tool'])
            count += 1
print(count, count/1000)


 to interceptors, deploy interceptor jets immediately. target is an orange and yellow fighter jet . engage and intercept the target. repeat, engage and intercept the target. .
Predicted target: target is orange and yellow fighter jet Actual: orange and yellow fighter jet False
control calling  , be advised, deploy surface-to-air missiles, , engage . execute immediately. .
Predicted tool: control calling Actual: surface-to-air missiles False
activate machine gun, , engage grey, black, and orange missile.
Predicted tool: machine gun , , engage grey , black , and orange missile Actual: machine gun False
deploy surface-to-air missiles, , engage blue, grey, and orange missile.
Predicted target: surface - to - air missiles, , blue, grey, and orange missile Actual: blue, grey, and orange missile False
air defense , deploy interceptor jets to . engage the yellow, red, and silver fighter plane.
Predicted tool: air defense Actual: interceptor jets False
deploy interceptor jets, , intercept orang

In [3]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
import nltk
from nltk.corpus import stopwords
import re

# Load a pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Function to predict the answer for a single question
def predict_answer(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']
    token_type_ids = inputs['token_type_ids']

    outputs = model(input_ids, token_type_ids=token_type_ids)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index]))
    return answer

def remove_words(context):
    # Define the patterns to match "Turret", "turret", and "turrets"
    patterns = [r'turret', r'turrets', r'turrets, ', r'control tower,*', r'over', r'tower']
    
    # Compile the patterns into regular expressions
    regex_list = [re.compile(pattern, flags=re.IGNORECASE) for pattern in patterns]

    # Iterate through the regex_list and replace the matches with an empty string
    for regex in regex_list:
        context = regex.sub('', context)
    
    words_to_remove = ["alpha", "alfa", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel", 
                  "india", "juliet", "kilo", "lima", "mike", "november", "oscar", "papa", 
                  "quebec", "romeo", "sierra", "tango", "uniform", "victor", "whiskey", 
                  "x-ray", "yankee", "zulu"]

    pattern = r'\b(?:' + '|'.join(words_to_remove) + r')(?:,|\b)'
    result = re.sub(pattern, '', context, flags=re.IGNORECASE)

    return result

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
import nltk
from nltk.corpus import stopwords

def remove_stopwords_except_and(sentence):
    stop_words = set(stopwords.words("english"))
    words = nltk.word_tokenize(sentence)
    filtered_sentence = [word for word in words if word.lower() not in stop_words or word.lower() == "and"]
    return " ".join(filtered_sentence)

sentence = "I like to dance and sing"
result = remove_stopwords_except_and(sentence)
print(result)

like dance and sing


In [9]:
import json
from sklearn.model_selection import train_test_split

# Load JSON data
data = []
with open("advanced/nlp.jsonl", 'r') as f:
    for line in f:
        # Load individual JSON object from each line
        obj = json.loads(line)
        data.append(obj)

In [10]:
# Extract transcripts and labels
texts = [entry["transcript"] for entry in data]
labels = [(entry["tool"], entry["heading"], entry["target"]) for entry in data]

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [11]:
# import torch
# from transformers import BertTokenizer, BertForSequenceClassification, AdamW
# from torch.utils.data import DataLoader, Dataset
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# model = BertForQuestionAnswering.from_pretrained('bert-base-uncased', force_download=True)
# # Define your dataset class
# class CustomDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         label = self.labels[idx]
    
#         encoding = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=self.max_length,
#             padding='max_length',
#             truncation=True,
#             return_tensors='pt'
#         )
#         label_encoder = LabelEncoder()

#         # Fit label encoder and transform labels to numerical values
#         label_numeric = label_encoder.fit_transform(label)
#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'labels': torch.tensor(label_numeric, dtype=torch.long)
#         }
    
# train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=128)
# val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length=128)

# # Define dataloaders
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# # Define optimizer and learning rate scheduler
# optimizer = AdamW(model.parameters(), lr=2e-5, no_deprecation_warning=True)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# # Training loop
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# num_epochs = 5
# for epoch in range(num_epochs):
#     model.train()
#     for batch in train_loader:
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)

#         optimizer.zero_grad()
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         scheduler.step()

#     # Validation
#     model.eval()
#     val_loss = 0
#     num_correct = 0
#     total_samples = 0
#     with torch.no_grad():
#         for batch in val_loader:
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)

#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs.loss
#             val_loss += loss.item()

#             predictions = torch.argmax(outputs.logits, dim=1)
#             num_correct += (predictions == labels).sum().item()
#             total_samples += labels.size(0)

#     val_loss /= len(val_loader)
#     accuracy = num_correct / total_samples

#     print(f'Epoch {epoch+1}/{num_epochs}, Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}')


In [12]:
data =[]
with open("advanced/nlp.jsonl", 'r') as f:
    for line in f:
        # Load individual JSON object from each line
        obj = json.loads(line)
        data.append(obj)


# Extract transcripts, headings, tools, and targets from the data
transcripts = []
for item in data:
    transcripts.append({
        'transcript': item['transcript'],
        'heading': item['heading'],
        'tool': item['tool'],
        'target': item['target']
    })

In [13]:
import torch
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

class CustomDataset(Dataset):
    def __init__(self, transcripts, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Convert list of dictionaries to dictionary of lists
        transcripts_dict = {"transcript": [], "heading": [], "tool": [], "target": []}
        for transcript in transcripts:
            for key in transcripts_dict.keys():
                transcripts_dict[key].append(transcript[key])

        # Label encoding
        self.label_encoders = {}
        for key in ['heading', 'tool', 'target']:
            self.label_encoders[key] = LabelEncoder()
            transcripts_dict[key] = self.label_encoders[key].fit_transform(transcripts_dict[key])

        self.transcripts = transcripts_dict

    def __len__(self):
        return len(self.transcripts["transcript"])

    def __getitem__(self, idx):
        transcript = {key: self.transcripts[key][idx] for key in self.transcripts.keys()}
        
        # Tokenize the transcript
        encoding = self.tokenizer(transcript["transcript"], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor([transcript["heading"], transcript["tool"], transcript["target"]])  # Stacking along dimension 0
        }

# Initialize tokenizer and dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = CustomDataset(transcripts, tokenizer, max_length=128)

# Define dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Instantiate BertForTokenClassification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 labels: heading, tool, target

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        # Print the shape of the logits and the labels
        print("Logits shape:", outputs.logits.shape)
        print("Labels shape:", labels.shape)
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs} completed')


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Expected input batch_size (2048) to match target batch_size (48).