In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Map src directory
import sys
import os
root_dir = os.path.abspath(os.path.join(os.getcwd(), "../."))
print("Root Directory: ", root_dir)
src_dir = os.path.join(root_dir,"src")
print("Src Directory: ", src_dir)
sys.path.append(src_dir)
data_dir = os.path.join(root_dir,"data")
print("Data Directory: ", data_dir)

Root Directory:  /media/dzielinski06/HDD1/AI894 - Capstone/Complete Collision Recorder
Src Directory:  /media/dzielinski06/HDD1/AI894 - Capstone/Complete Collision Recorder/src
Data Directory:  /media/dzielinski06/HDD1/AI894 - Capstone/Complete Collision Recorder/data


In [3]:
def concatenate_texts(row):
    cad_text = row['CAD_TEXT'] if pd.notna(row['CAD_TEXT']) else ""
    oh_text = row['OH1_TEXT'] if pd.notna(row['OH1_TEXT']) else ""
    
    if oh_text:  # If 'OH_TEXT' is not an empty string
        return cad_text + "POLICE NARRATIVE \n\n " + oh_text
    else:
        return cad_text  # If 'OH_TEXT' is empty, return only 'CAD_TEXT'

# Apply the function to the dataframe to create a new concatenated column
bikeCLE_input_df = pd.read_csv(os.path.join(data_dir, "processed", "GCAT_pred_df.csv"))
bikeCLE_input_df = bikeCLE_input_df[bikeCLE_input_df['concatenated_text'].str.strip() != '']
bikeCLE_input_df = bikeCLE_input_df[pd.notna(bikeCLE_input_df['BIKE_CLE_TEXT'])]
bikeCLE_input_df

Unnamed: 0,concatenated_text,BIKE_CLE_TEXT
0,CALL COMMENTS COMMENTS FROM INTERGAPH 1/1/2020...,A driver struck an adult crossing an intersect...
1,POLICE NARRATIVE \n\n WHILE UNIT1 WAS TURNING ...,A driver of a commercial truck turned left and...
2,CALL COMMENTS COMMENTS FROM INTERGAPH 1/3/2020...,A driver struck a person. The crash caused inj...
3,CALL COMMENTS COMMENTS FROM INTERGAPH 1/3/2020...,A driver turned left and struck and killed an ...
4,CALL COMMENTS COMMENTS FROM INTERGAPH 1/4/2020...,A driver struck an adult. The crash caused inj...
...,...,...
2168,CALL COMMENTS COMMENTS FROM INTERGAPH 12/29/20...,A driver of a passenger car struck an adult on...
2169,PRINT DATE/TIME: LOGIN ID: INCIDENT DATE/TIME:...,A driver of an SUV turned left and struck an a...
2170,CALL COMMENTS COMMENTS FROM INTERGAPH 12/30/20...,A driver of a transit bus struck a person.
2171,PRINT DATE/TIME: 01/05/2024 06:37 LOGIN ID: GR...,A driver struck a person.


In [4]:
dataset = Dataset.from_pandas(bikeCLE_input_df[['concatenated_text', 'BIKE_CLE_TEXT']])
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1')
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

def tokenize_inputs_and_labels(examples):
    # Tokenize input (concatenated_text)
    inputs = tokenizer(examples['concatenated_text'], padding='max_length', truncation=True, max_length=2048)
    
    # Tokenize target labels (BIKE_CLE_TEXT)
    labels = tokenizer(examples['BIKE_CLE_TEXT'], padding='max_length', truncation=True, max_length=2400)
    
    # Use input_ids from both tokenizations
    inputs["labels"] = labels["input_ids"]
    
    # Replace padding token id (assigned to -100) so it's ignored in the loss calculation
    inputs["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_seq] 
        for labels_seq in inputs["labels"]
    ]
    
    return inputs

# Apply tokenization to the entire dataset
tokenized_dataset = dataset.map(tokenize_inputs_and_labels, batched=True)

train_test_split = tokenized_dataset.train_test_split(test_size=0.999)

# Separate the datasets
train_dataset = train_test_split['train']
validation_dataset = train_test_split['test']

Map: 100%|██████████| 1315/1315 [00:01<00:00, 1039.22 examples/s]


In [5]:
train_dataset

Dataset({
    features: ['concatenated_text', 'BIKE_CLE_TEXT', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})

In [6]:
from transformers import AutoModelForCausalLM
import torch

# Load the Mistral-7B model
model = AutoModelForCausalLM.from_pretrained('mistralai/Mistral-7B-v0.1')
device = torch.device("cpu")
model.to(device)


Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.92s/it]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
     

In [7]:
from peft import get_peft_model, LoraConfig, TaskType

# Set up LoRA configuration for CPU
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8, 
    lora_alpha=16, 
    lora_dropout=0.1, 
    target_modules=["q_proj", "v_proj"]  # Fine-tune specific layers
)

# Apply LoRA configuration to the model
peft_model = get_peft_model(model, config)
peft_model.to(device)  # Ensure LoRA-adapted model is on CPU

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Lin

In [8]:
from transformers import Trainer, TrainingArguments

# Set up training arguments for CPU-only execution
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,  # Keep the batch size small for CPU training
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    no_cuda=True  # IMPORTANT: This ensures no GPU is used at all
)

# Initialize the Trainer for CPU
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model on CPU
trainer.train()


  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: Expected input batch_size (2047) to match target batch_size (639).

In [9]:
# peft_model.save_pretrained("./fine_tuned_mistralai")
# tokenizer.save_pretrained("./fine_tuned_mistralai")

