In [2]:
import os # Added import
import pickle
import random

import pandas as pd
from sentence_transformers import InputExample

# Define the output path for the pickle file
# Using a relative path from where the script is run
output_pickle_filename = 'train_examples_contrastive.pkl'

print("Loading CSV file...")
# Load your database using the provided absolute path
try:
    df = pd.read_csv('/home/joelphilip/Documents/roads-in-madras/data/raw/GPT_Input_DB(Sheet1).csv')
except FileNotFoundError:
    print("Error: CSV file not found at the specified path.")
    exit()
except Exception as e:
    print(f"Error loading CSV: {e}")
    exit()


# Drop any rows that have no data (just in case)
required_cols = ['data', 'problem', 'category', 'type']
initial_rows = len(df)
df = df.dropna(subset=required_cols)
dropped_rows = initial_rows - len(df)
if dropped_rows > 0:
    print(f"Dropped {dropped_rows} rows due to missing required data.")


train_examples = []

print(f"Creating training examples from {len(df)} valid rows...")

# Iterate over your database and create training pairs
for _, row in df.iterrows():
    # 1. Create the "query"
    # We combine the problem, category, and type to create a rich query
    query = f"Problem: {row['problem']} Category: {row['category']} Type: {row['type']}"

    # 2. Define the "positive passage"
    passage = str(row['data']) # Ensure data is string

    # --- CHANGE IS HERE ---
    # 3. Create an InputExample with label=1.0 for ContrastiveLoss
    # ContrastiveLoss expects pairs and a label (1.0 for similar, 0.0 for dissimilar)
    example = InputExample(texts=[query, passage], label=1.0)
    # ---------------------
    train_examples.append(example)

print(f"Created {len(train_examples)} training examples for ContrastiveLoss.")

# Save the examples to a file so you don't have to re-process every time
try:
    with open(output_pickle_filename, 'wb') as f:
        pickle.dump(train_examples, f)
    print(f"Training examples saved to '{output_pickle_filename}'")
except Exception as e:
    print(f"Error saving pickle file: {e}")
    exit()

if train_examples:
    print("\nExample 0:")
    print("Query:", train_examples[0].texts[0])
    print("Passage:", train_examples[0].texts[1])
    print("Label:", train_examples[0].label) # Show the label
else:
    print("No training examples generated.")

Loading CSV file...
Creating training examples from 50 valid rows...
Created 50 training examples for ContrastiveLoss.
Training examples saved to 'train_examples_contrastive.pkl'

Example 0:
Query: Problem: Damaged Category: Road Sign Type: STOP Sign
Passage: The 'STOP' sign, used on Minor Roads intersecting Major Roads, requires vehicles to stop before entering and proceed only when safe. It is octagonal with a red background, a white border, and "STOP" written centrally in white. Installed on the left side of the approach, it should be placed close to the stop line, typically 1.5 m in advance, without impairing visibility of the Major Road.
The dimensions vary by approach speed: up to 50 km/h, 750 mm height, 25 mm border, 175 mm font; 51–65 km/h, 900 mm height, 30 mm border, 210 mm font; and over 65 km/h, 1200 mm height, 40 mm border, 280 mm font.
Label: 1.0


In [3]:
import pickle
from sentence_transformers import SentenceTransformer, losses
# NOTE: Removed 'from datasets import Dataset' as it's not directly needed here
# if model.fit relies on it internally, having 'datasets' installed is sufficient.
from torch.utils.data import DataLoader
import torch # Added import
import os # Added import

# Define the path for the input pickle file
input_pickle_filename = 'train_examples_contrastive.pkl'

# Define the output path for the fine-tuned model
output_model_name = 'my-contrastive-finetuned-model'
output_model_path = f'./{output_model_name}' # Save in current directory

# 1. Load the base model
model_name = 'BAAI/bge-small-en-v1.5'

# Check for GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

try:
    model = SentenceTransformer(model_name, device=device)
    print(f"Loaded base model: {model_name}")
except Exception as e:
    print(f"Error loading base model {model_name}: {e}")
    exit()


# 2. Load your prepared training data
print(f"Loading training examples from '{input_pickle_filename}'...")
try:
    with open(input_pickle_filename, 'rb') as f:
        train_examples = pickle.load(f)
        # Check if the loaded data has labels needed for ContrastiveLoss
        if not train_examples or not hasattr(train_examples[0], 'label'):
             print(f"Error: Examples in '{input_pickle_filename}' are missing labels required for ContrastiveLoss.")
             print("Please run the updated prepare_data.py script.")
             exit()
except FileNotFoundError:
    print(f"Error: Training data file '{input_pickle_filename}' not found.")
    print("Please run the prepare_data.py script first.")
    exit()
except Exception as e:
    print(f"Error loading pickle file: {e}")
    exit()

# 3. Define the DataLoader
train_batch_size = 4 # Keep small for 6GB VRAM
print(f"Using DataLoader with batch size: {train_batch_size}")
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)

# --- CHANGE LOSS FUNCTION ---
# 4. Define the Loss Function using ContrastiveLoss
print("Using ContrastiveLoss.")
train_loss = losses.ContrastiveLoss(model=model)
# ----------------------------

# --- CHANGE NUMBER OF EPOCHS ---
# 5. Start the Training
num_epochs = 3 # Increased epochs
# -----------------------------

print(f"\n--- Starting Training ---")
print(f"Training examples: {len(train_examples)}")
print(f"Batch Size: {train_batch_size}")
print(f"Epochs: {num_epochs}")
print(f"Output Path: {output_model_path}")
print(f"-------------------------\n")

# Ensure output directory exists
os.makedirs(output_model_path, exist_ok=True)

try:
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=num_epochs,
        warmup_steps=10, # A small number of steps to warm up the learning rate
        output_path=output_model_path,
        show_progress_bar=True
    )
    print(f"\n--- Training Complete ---")
    print(f"Model saved to '{output_model_path}'")

except RuntimeError as e:
    if "CUDA out of memory" in str(e):
        print("\n--- ERROR: CUDA Out of Memory ---")
        print(f"Training failed with batch size {train_batch_size}.")
        print("Try reducing the batch size further (e.g., to 2) and re-run.")
        print("---------------------------------")
    else:
        print(f"\n--- An unexpected runtime error occurred during training ---")
        print(e)
    exit()
except Exception as e:
    print(f"\n--- An unexpected error occurred during training ---")
    print(e)
    exit()

Using device: cuda
Loaded base model: BAAI/bge-small-en-v1.5
Loading training examples from 'train_examples_contrastive.pkl'...
Using DataLoader with batch size: 4
Using ContrastiveLoss.

--- Starting Training ---
Training examples: 50
Batch Size: 4
Epochs: 3
Output Path: ./my-contrastive-finetuned-model
-------------------------



                                                                     

Step,Training Loss



--- Training Complete ---
Model saved to './my-contrastive-finetuned-model'
