In [1]:
import jsonlines
import torchaudio
from datasets import Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback, TrainerCallback
from pathlib import Path
import torch, random
import librosa, os
import IPython.display as ipd
from dotenv import load_dotenv
import evaluate
from spellchecker import SpellChecker
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import load_metric
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import DefaultDataCollator
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments
import inflect

p = inflect.engine()

In [2]:
load_dotenv()

TEAM_NAME = os.getenv("TEAM_NAME", "7up")
TEAM_TRACK = os.getenv("TEAM_TRACK", "advanced")


input_dir = Path(f"/home/jupyter/{TEAM_TRACK}")
# input_dir = Path(f"../../data/{TEAM_TRACK}/train")
results_dir = Path(f"/home/jupyter/{TEAM_NAME}")
# results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)


data = []
with jsonlines.open(input_dir / "nlp.jsonl") as reader:
    for obj in reader:
        data.append(obj)

df = pd.DataFrame(data)
df

Unnamed: 0,key,transcript,tool,heading,target
0,0,"Turret, prepare to deploy electromagnetic puls...",electromagnetic pulse,065,grey and white fighter jet
1,1,Engage yellow drone with surface-to-air missil...,surface-to-air missiles,235,yellow drone
2,2,"Control to turrets, deploy electromagnetic pul...",electromagnetic pulse,110,blue and red fighter plane
3,3,"Alfa, Echo, Mike Papa, deploy EMP tool heading...",EMP,085,"purple, red, and silver fighter jet"
4,4,"Engage the grey, black, and green fighter plan...",machine gun,095,"grey, black, and green fighter plane"
...,...,...,...,...,...
3495,3495,Deploy electromagnetic pulse on brown commerci...,electromagnetic pulse,350,brown commercial aircraft
3496,3496,"Deploy surface-to-air missiles, heading two on...",surface-to-air missiles,215,"silver, orange, and brown helicopter"
3497,3497,"Engage target, grey, orange, and silver missil...",surface-to-air missiles,080,"grey, orange, and silver missile"
3498,3498,Engage the white drone at heading zero five fi...,machine gun,055,white drone


In [3]:
new_contexts = []
new_questions = []
new_answers = []

# Loop through each row in the original DataFrame
for index, row in df.iterrows():
    context = row['transcript']
    for col in ['target', 'heading', 'tool']:
        new_contexts.append(context)
        if col == 'tool':
            new_questions.append(f"What is the tool to be deployed?")
        else:
            new_questions.append(f"What is the {col}?")
        new_answers.append(row[col])

new_df = pd.DataFrame({
    'context': new_contexts,
    'question': new_questions,
    'answer': new_answers
})

def convert_number_to_words(answer):
    if answer.isdigit():  # Check if the answer is a digit
        return ' '.join(p.number_to_words(digit) for digit in answer)
    return answer

# Apply the function to the answer column
new_df['answer'] = new_df['answer'].apply(convert_number_to_words)
new_df['context'] = new_df['context'].str.lower()
new_df

Unnamed: 0,context,question,answer
0,"turret, prepare to deploy electromagnetic puls...",What is the target?,grey and white fighter jet
1,"turret, prepare to deploy electromagnetic puls...",What is the heading?,zero six five
2,"turret, prepare to deploy electromagnetic puls...",What is the tool to be deployed?,electromagnetic pulse
3,engage yellow drone with surface-to-air missil...,What is the target?,yellow drone
4,engage yellow drone with surface-to-air missil...,What is the heading?,two three five
...,...,...,...
10495,engage the white drone at heading zero five fi...,What is the heading?,zero five five
10496,engage the white drone at heading zero five fi...,What is the tool to be deployed?,machine gun
10497,"turret charlie, prepare to engage. deploy emp ...",What is the target?,white cargo aircraft
10498,"turret charlie, prepare to engage. deploy emp ...",What is the heading?,two five five


In [4]:
train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def preprocess_function(examples):
    inputs = tokenizer(
        examples['question'],
        examples['context'],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        stride=128
    )
    
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(inputs['offset_mapping']):
        try:
            start_char = examples['context'][i].index(examples['answer'][i].lower())
            end_char = start_char + len(examples['answer'][i])
        except ValueError:
            # Handle cases where answer is not found in context
            try:
                start_char = examples['context'][i].index(examples['answer'][i].lower().replace("nine", "niner"))
            except ValueError:
                print(examples['context'][i], examples['answer'][i])

        sequence_ids = inputs.sequence_ids(i)
        
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1
        
        token_end_index = len(inputs['input_ids'][i]) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1
        
        while offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        
        start_positions.append(token_start_index - 1)
        end_positions.append(token_end_index + 1)
    
    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

# Preprocess the datasets
train_tokenized_dataset = train_dataset.map(preprocess_function, batched=True)
test_tokenized_dataset = test_dataset.map(preprocess_function, batched=True)
train_tokenized_dataset



Map:   0%|          | 0/8400 [00:00<?, ? examples/s]

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Dataset({
    features: ['context', 'question', 'answer', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
    num_rows: 8400
})

In [5]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

training_args = TrainingArguments(
    output_dir="./distil-bert-ft-qa-model-7up-v7",
    evaluation_strategy="steps",
    learning_rate=7e-6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    max_steps=2200,
    eval_steps=200,
    logging_steps=500,
    weight_decay=0.01,
    push_to_hub=True,
)

# Data collator
data_collator = DefaultDataCollator()

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
200,No log,0.296782
400,No log,0.087126
600,0.892100,0.050711
800,0.892100,0.041639
1000,0.056600,0.040205
1200,0.056600,0.040079
1400,0.056600,0.040241
1600,0.058800,0.039052
1800,0.058800,0.038305
2000,0.060200,0.038692


TrainOutput(global_step=2200, training_loss=0.246962609724565, metrics={'train_runtime': 1227.834, 'train_samples_per_second': 7.167, 'train_steps_per_second': 1.792, 'total_flos': 1724558594457600.0, 'train_loss': 0.246962609724565, 'epoch': 1.0476190476190477})

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# # Prepare inputs for tokenization
# questions = new_df["question"].tolist()
# contexts = new_df["context"].tolist()
# answers = new_df["answer"].tolist()

# # Tokenize the inputs
# inputs = tokenizer(
#     questions,
#     contexts,
#     max_length=384,
#     truncation="only_second",
#     return_offsets_mapping=True,
#     padding="max_length",
# )

# offset_mapping = inputs.pop("offset_mapping")

# start_positions = []
# end_positions = []

# for i, offset in enumerate(offset_mapping):
#     answer = answers[i]
#     start_char = contexts[i].find(answer)
#     end_char = start_char + len(answer)
    
#     sequence_ids = inputs.sequence_ids(i)

#     # Find the start and end of the context
#     context_start = 0
#     while sequence_ids[context_start] != 1:
#         context_start += 1
#     context_end = context_start
#     while context_end < len(sequence_ids) and sequence_ids[context_end] == 1:
#         context_end += 1
#     context_end -= 1

#     # If the answer is not fully inside the context, label it (0, 0)
#     if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
#         start_positions.append(0)
#         end_positions.append(0)
#     else:
#         # Find start token position
#         idx = context_start
#         while idx <= context_end and offset[idx][0] <= start_char:
#             idx += 1
#         start_positions.append(idx - 1)

#         # Find end token position
#         idx = context_end
#         while idx >= context_start and offset[idx][1] >= end_char:
#             idx -= 1
#         end_positions.append(idx + 1)

# new_df["start_positions"] = start_positions
# new_df["end_positions"] = end_positions

# new_df

In [None]:
# seed = 42

# train_df, test_df = train_test_split(new_df, test_size=0.1, random_state=seed, shuffle=True)

# # Display the resulting DataFrames
# print("Training Set:")
# train_df

In [None]:


train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Define a function to tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["context"], examples["question"], truncation=True, padding="max_length")

# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Ensure that the tokenized fields are included in the dataset
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["context", "question", "answer"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["context", "question", "answer"])


tokenized_train_dataset


In [7]:
# # Load model
# model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# # Training arguments
# training_args = TrainingArguments(
#     output_dir="./distil-bert-ft-qa-model-7up",
#     evaluation_strategy="steps",
#     learning_rate=2e-5,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     max_steps=10000,
#     eval_steps=1000,
#     weight_decay=0.01,
#     push_to_hub=True,
# )

# # Data collator
# data_collator = DefaultDataCollator()

# # Trainer setup
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train_dataset,
#     eval_dataset=tokenized_test_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
# )

# # Train the model
# trainer.train()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
1000,2.0312,1.82073
2000,1.5344,1.35382
3000,1.3121,1.061962
4000,1.0202,0.868066
5000,0.722,0.814651
6000,0.8664,0.716137
7000,0.7102,0.769623
8000,0.6577,0.579359
9000,0.6782,0.526296
10000,0.4637,0.493492


TrainOutput(global_step=10000, training_loss=1.078759828186035, metrics={'train_runtime': 1126.0222, 'train_samples_per_second': 8.881, 'train_steps_per_second': 8.881, 'total_flos': 1306531000320000.0, 'train_loss': 1.078759828186035, 'epoch': 1.0582010582010581})

In [17]:
from typing import Dict
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import re
import nltk
from nltk import word_tokenize, pos_tag

class NLPManager:
    def __init__(self):
        # Initialize the model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.model = AutoModelForQuestionAnswering.from_pretrained('cadzchua/distil-bert-ft-qa-model-7up-v6')  
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)
        
    def qa(self, context: str) -> Dict[str, str]:
        # Perform NLP question-answering
        context = context.lower()
        hdg_ans = self.find_heading(context)
        # hdg_ans = self.heading_text_to_string(hdg_ans)
        tool_ans = self.predict_answer("What is the tool to be deployed?", context)
        if "-" in tool_ans:
            tool_ans = self.remove_spaces_around_hyphens(tool_ans)
        if tool_ans.lower() in ['emp', 'emp tool']:
            tool_ans = 'EMP'
        tgt_ans = self.predict_answer("What is the target?", context)
        
        return {"heading": hdg_ans, "tool": tool_ans, "target": tgt_ans}
    
    def find_heading(self, context: str) -> str:
        """
        Extract the heading from the context using POS tagging.
        """
        words = word_tokenize(context)
        pos_tags = pos_tag(words)
        
        finding_heading = True
        reading_heading = False
        heading = []

        for word, pos in pos_tags:
            if finding_heading:
                if reading_heading:
                    if pos in ['CD', 'NN', 'JJ']:
                        heading.append(word)
                    # else:
                    #     reading_heading = False
                    #     finding_heading = False
                
                if word.lower() in ['heading', 'at']:
                    reading_heading = True

        return " ".join(heading)
    
    def heading_text_to_string(self, heading_text: str) -> str:
        """
        Convert heading text to string representation with leading zeros.
        """
        heading_mapping = {
            "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
            "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9", "niner": "9"
        }
        words = heading_text.split()
        string_heading = "".join([heading_mapping.get(word, "") for word in words])
        return string_heading.zfill(3)  # Ensure the heading is always three digits
    
    def predict_answer(self, question: str, context: str) -> str:
        inputs = self.tokenizer.encode_plus(question, context, return_tensors='pt', truncation=True, padding="max_length").to(self.device)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        outputs = self.model(input_ids, attention_mask=attention_mask)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores) + 1

        answer = self.tokenizer.convert_tokens_to_string(self.tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index]))
        return answer.strip()
    
    def remove_spaces_around_hyphens(self, text: str) -> str:
        return re.sub(r'\s*-\s*', '-', text)
    
nlp_manager = NLPManager()

# Test with a new context
context = """
Control tower to air defense turrets, deploy electromagnetic pulse at heading zero six five. 
Target is an incoming missile.
"""
result = nlp_manager.qa(context)
print(result)


{'heading': 'zero six five', 'tool': 'electromagnetic pulse', 'target': 'zero six five. target is an incoming missile'}


In [8]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator
# from datasets import Dataset

# # Example DataFrame
# data = {
#     'context': ['context1', 'context2'],
#     'tool': ['tool1', 'tool2'],
#     'heading': ['heading1', 'heading2'],
#     'target': ['target1', 'target2']
# }

# df = pd.DataFrame(data)

# # Initialize lists to store the new DataFrame's data
# new_contexts = []
# new_questions = []
# new_answers = []

# # Loop through each row in the original DataFrame
# for index, row in df.iterrows():
#     context = row['context']
#     for col in ['target', 'heading', 'tool']:
#         new_contexts.append(context)
#         if col == 'tool':
#             new_questions.append("What is the tool to be deployed?")
#         else:
#             new_questions.append(f"What is the {col}?")
#         new_answers.append(row[col])

# # Create the new DataFrame
# new_df = pd.DataFrame({
#     'context': new_contexts,
#     'question': new_questions,
#     'answer': new_answers
# })

# # Tokenizer setup
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# # Prepare inputs for tokenization
# questions = new_df["question"].tolist()
# contexts = new_df["context"].tolist()
# answers = new_df["answer"].tolist()

# # Tokenize the inputs
# inputs = tokenizer(
#     questions,
#     contexts,
#     max_length=384,
#     truncation="only_second",
#     return_offsets_mapping=True,
#     padding="max_length",
# )

# offset_mapping = inputs.pop("offset_mapping")

# start_positions = []
# end_positions = []

# for i, offset in enumerate(offset_mapping):
#     answer = answers[i]
#     start_char = contexts[i].find(answer)
#     end_char = start_char + len(answer)
    
#     sequence_ids = inputs.sequence_ids(i)

#     # Find the start and end of the context
#     context_start = 0
#     while sequence_ids[context_start] != 1:
#         context_start += 1
#     context_end = context_start
#     while context_end < len(sequence_ids) and sequence_ids[context_end] == 1:
#         context_end += 1
#     context_end -= 1

#     # If the answer is not fully inside the context, label it (0, 0)
#     if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
#         start_positions.append(0)
#         end_positions.append(0)
#     else:
#         # Find start token position
#         idx = context_start
#         while idx <= context_end and offset[idx][0] <= start_char:
#             idx += 1
#         start_positions.append(idx - 1)

#         # Find end token position
#         idx = context_end
#         while idx >= context_start and offset[idx][1] >= end_char:
#             idx -= 1
#         end_positions.append(idx + 1)

# new_df["start_positions"] = start_positions
# new_df["end_positions"] = end_positions

# # Split the data
# seed = 42
# train_df, test_df = train_test_split(new_df, test_size=0.1, random_state=seed, shuffle=True)

# # Convert to Hugging Face Dataset
# train_dataset = Dataset.from_pandas(train_df)
# test_dataset = Dataset.from_pandas(test_df)

# # Load model
# model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# # Training arguments
# training_args = TrainingArguments(
#     output_dir="./bert-qa-model-7up",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     push_to_hub=True,
# )

# # Data collator
# data_collator = DefaultDataCollator()

# # Trainer setup
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
# )

# # Train the model
# trainer.train()


In [30]:
from nltk import word_tokenize, sent_tokenize
from nltk import pos_tag

sentence = "Control to air defense turrets, we have a target consisting of a red, brown, and orange drone heading towards your location. Please deploy the drone catcher and intercept the target at heading zero three zero. Over."
tokenized_text = word_tokenize(sentence)
print(tokenized_text)

['Control', 'to', 'air', 'defense', 'turrets', ',', 'we', 'have', 'a', 'target', 'consisting', 'of', 'a', 'red', ',', 'brown', ',', 'and', 'orange', 'drone', 'heading', 'towards', 'your', 'location', '.', 'Please', 'deploy', 'the', 'drone', 'catcher', 'and', 'intercept', 'the', 'target', 'at', 'heading', 'zero', 'three', 'zero', '.', 'Over', '.']


In [31]:
tags = tokens_tag = pos_tag(tokenized_text)
tags

[('Control', 'NN'),
 ('to', 'TO'),
 ('air', 'NN'),
 ('defense', 'NN'),
 ('turrets', 'NNS'),
 (',', ','),
 ('we', 'PRP'),
 ('have', 'VBP'),
 ('a', 'DT'),
 ('target', 'NN'),
 ('consisting', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('red', 'JJ'),
 (',', ','),
 ('brown', 'JJ'),
 (',', ','),
 ('and', 'CC'),
 ('orange', 'NN'),
 ('drone', 'NN'),
 ('heading', 'VBG'),
 ('towards', 'NNS'),
 ('your', 'PRP$'),
 ('location', 'NN'),
 ('.', '.'),
 ('Please', 'NNP'),
 ('deploy', 'VBZ'),
 ('the', 'DT'),
 ('drone', 'NN'),
 ('catcher', 'NN'),
 ('and', 'CC'),
 ('intercept', 'VB'),
 ('the', 'DT'),
 ('target', 'NN'),
 ('at', 'IN'),
 ('heading', 'VBG'),
 ('zero', 'CD'),
 ('three', 'CD'),
 ('zero', 'NN'),
 ('.', '.'),
 ('Over', 'IN'),
 ('.', '.')]

In [None]:
# def extract_information(word_pos_pairs):
#     tool = None
#     target = []
#     heading = []

#     reading_tool = False
#     reading_target = False
#     reading_heading = False
#     finding_tool = True
#     finding_heading = True
#     finding_target = True

#     for word, pos in word_pos_pairs:
#         # Identify the tool
#         if finding_tool:
#             if reading_tool:
#                 if pos in ['JJ', 'NN', 'NNS', 'NNP']:
#                     if word.lower() not in ['towards', 'deployment']:
#                         tool = (tool + " " + word) if tool else word
#                     else:
#                         finding_tool = False
#                 elif pos in ['DT', 'IN']:
#                     continue
#                 else:
#                     reading_tool = False
#                     finding_tool = False
        
#             if word.lower() in ['deploy', 'with', 'using', 'deployment', 'initiate', 'initiating']:
#                 reading_tool = True

#         if finding_heading:
#             if reading_heading:
#                 if pos in ['CD', 'NN', 'JJ']:
#                     heading.append(word)
#                 else:
#                     reading_heading = False
#                     finding_heading = False
        
#             if word.lower() in ['heading', 'at']:
#                 reading_heading = True
        
#         if finding_target:
#             if reading_target:
#                 if pos in ['JJ', 'CC', 'NN', 'RB', ',']:  # Include more POS tags to capture full target phrase
#                     target.append(word)
#                 elif word == ".":
#                     break
#                 elif pos in ['VBZ', 'DT']:
#                     continue
#                 else:
#                     reading_target = False
#                     finding_target = False
                    
#             if (word.lower()in ["target", "engage", "towards"]):
#                 reading_target = True

#     # Join the heading and target words into strings
#     heading_str = ' '.join(heading)
#     target_str = ' '.join(target)

#     return tool, heading_str, target_str

In [34]:
tool_ans = "attack helicopter using insane missle"
ans = tool_ans.split('using', 1)[1].strip()
ans

'insane missle'

In [176]:
# hi = extract_information(tags)
# print(hi)

(None, 'one eight zero', 'bogey')
