In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1" 

import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from sklearn.model_selection import train_test_split
import json
from pathlib import Path

torch.set_float32_matmul_precision('high')

  from .autonotebook import tqdm as notebook_tqdm


### Fine-tuning

In [4]:
print("="*80)
print("GEMMA-2-9B-IT FINE-TUNING WITH LORA (CLEAN VERSION)")
print("="*80)

# ============================================================================
# CONFIGURATION
# ============================================================================
# Paths: the training CSV is read from data_dir; adapters and checkpoints are
# written below finetuned_dir, which is created here if it does not exist yet.
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
finetuned_dir = base_dir / "finetuned"
finetuned_dir.mkdir(parents=True, exist_ok=True)

# Model configuration
model_id = "google/gemma-2-9b-it"
# Hub access token is taken from the environment; None when HF_TOKEN is unset.
hf_token = os.environ.get("HF_TOKEN")

# Training configuration
output_dir = finetuned_dir / "gemma-2-9b-it-pm25-clean"
max_epochs = 5
batch_size = 2  # used for both the train and eval per-device batch size below
learning_rate = 1e-4
eval_steps = 50  # also reused as the checkpoint save interval
early_stopping_patience = 3  # passed to EarlyStoppingCallback when the Trainer is built

# LoRA configuration: rank-16 adapters (alpha 32, dropout 0.1) attached to the
# modules named in target_modules; bias terms are left untouched.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none"
)

# ============================================================================
# LOAD MODEL AND TOKENIZER
# ============================================================================
print("\n[STEP 1] Loading model and tokenizer...")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model in bfloat16 and place every module on device index 0.
# CUDA_VISIBLE_DEVICES is set to "1" at the top of the notebook, so index 0
# here refers to that single exposed GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"":0},
    torch_dtype=torch.bfloat16,
    token=hf_token
)

print("Model loaded successfully!")

# ============================================================================
# LOAD AND PREPARE DATA
# ============================================================================
print("\n[STEP 2] Loading and preparing training data...")

# Load training data
train_df = pd.read_csv(data_dir / "train_2022.csv")
print(f"Loaded training data: {train_df.shape}")

# Prompt templates (same as zero-shot).
# Exactly one USER_TEMPLATE variant should be active. Keep the active variant
# identical to the one used by the inference cell; otherwise the model is
# evaluated on a prompt format it was not fine-tuned on.
SYSTEM_PROMPT = (
    "You are an air pollution assistant. "
    "Strictly respond to queries with a single real number only. "
    "Do not include any units, explanation, or punctuation. Just a single number."
)
# RQ1: location given as city/state only
# USER_TEMPLATE = (
#     "What is the average PM2.5 concentration (in μg/m³) in {city}, {state} during {month}, {year}? "
#     "Give a single number only."
# )

# RQ4: city/state plus auxiliary features (temperature, NDVI, population)
USER_TEMPLATE = (
    "What is the average PM2.5 concentration (in μg/m³) in {city}, {state} during {month}, {year}? "
    "Additional context: Temperature (AT) = {at}°C, NDVI = {ndvi}, Population = {pop}. "
    "Give a single number only."
)

# lat,lon: location given as coordinates instead of city/state
# USER_TEMPLATE = (
#     "What is the average PM2.5 concentration (in μg/m³) at location (latitude={lat}, longitude={lon}) during {month}, {year}? "
#     "Give a single number only."
# )

# Convert month numbers to month names (keys are the integers 1-12)
month_names = {
    1: "January", 2: "February", 3: "March", 4: "April",
    5: "May", 6: "June", 7: "July", 8: "August", 
    9: "September", 10: "October", 11: "November", 12: "December"
}

def create_training_text(row):
    """Render one training row as a chat-formatted prompt/answer string.

    The user turn is SYSTEM_PROMPT, a blank line, then USER_TEMPLATE filled
    from the row; the assistant turn is the row's 'PM2.5' value as a string.
    The two turns are rendered with the tokenizer's chat template.

    Args:
        row: Mapping-like row (e.g. a pandas Series from ``iterrows``) that
            provides 'city', 'state', 'month', 'year', 'AT', 'avg_ndvi',
            'population', 'latitude', 'longitude' and 'PM2.5'. 'month' must
            be a key of ``month_names``.

    Returns:
        The rendered conversation as a single string (not tokenized).

    Raises:
        KeyError: If a required column is missing or 'month' is not a key of
            ``month_names``.
    """
    month_name = month_names[row['month']]

    # Auxiliary features; missing values are rendered as the literal "N/A"
    at_value = f"{row['AT']:.1f}" if pd.notna(row['AT']) else "N/A"
    ndvi_value = f"{row['avg_ndvi']:.3f}" if pd.notna(row['avg_ndvi']) else "N/A"
    pop_value = f"{int(row['population'])}" if pd.notna(row['population']) else "N/A"

    # Coordinates, also rendered as "N/A" when missing
    lat_value = f"{row['latitude']:.4f}" if pd.notna(row['latitude']) else "N/A"
    lon_value = f"{row['longitude']:.4f}" if pd.notna(row['longitude']) else "N/A"

    # str.format ignores keyword arguments that the template does not
    # reference, so the fields of every prompt variant are supplied here.
    # Passing only lat/lon (as before) raised KeyError whenever a city/state
    # template was active; now switching USER_TEMPLATE needs no change here.
    user_prompt = USER_TEMPLATE.format(
        city=row['city'],
        state=row['state'],
        month=month_name,
        year=row['year'],
        at=at_value,
        ndvi=ndvi_value,
        pop=pop_value,
        lat=lat_value,
        lon=lon_value,
    )

    # The prompt is built outside the f-string: a multi-line expression inside
    # a single-line f-string is a syntax error before Python 3.12.
    messages = [
        {"role": "user", "content": f"{SYSTEM_PROMPT}\n\n{user_prompt}"},
        {"role": "assistant", "content": str(row['PM2.5'])},
    ]

    # The conversation already ends with the assistant answer, so no
    # generation prompt is requested (the keyword was previously misspelled
    # as `add_generation_token`).
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    return text

# Render every row of the training frame into one chat-formatted string
training_texts = list(
    map(create_training_text, (record for _, record in train_df.iterrows()))
)

# Hold out 20% of the rendered examples for validation; the seed is fixed so
# the split is the same on every run
train_texts, val_texts = train_test_split(
    training_texts,
    random_state=42,
    test_size=0.2,
)

for split_label, split_texts in (("Train", train_texts), ("Validation", val_texts)):
    print(f"{split_label} samples: {len(split_texts)}")

# Tokenize datasets
def tokenize_function(examples):
    """Tokenize a batch of texts to fixed-length sequences with LM labels.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer output, truncated/padded to 512 tokens, with an extra
        "labels" entry that is a copy of "input_ids".
    """
    encoding_options = {
        "max_length": 512,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": None,
    }
    encoded = tokenizer(examples["text"], **encoding_options)
    # Causal-LM objective: the targets are the input ids themselves
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Create datasets: one "text" column per split, holding the rendered chats
train_dataset = Dataset.from_dict({"text": train_texts})
val_dataset = Dataset.from_dict({"text": val_texts})

# Tokenize in batches; the raw "text" column is dropped so only the
# tokenizer outputs and "labels" remain
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# ============================================================================
# APPLY LORA
# ============================================================================
print("\n[STEP 3] Applying LoRA...")

# Enable gradient checkpointing for memory efficiency.
# This is done on the base model, before it is wrapped with the adapters.
model.gradient_checkpointing_enable()

# Apply LoRA: wrap the base model using lora_config and report how many
# parameters remain trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# ============================================================================
# TRAINING CONFIGURATION
# ============================================================================
print("\n[STEP 4] Setting up training configuration...")

# With batch_size=2 and gradient_accumulation_steps=2, each optimizer step
# covers 4 examples per device.
training_args = TrainingArguments(
    output_dir=str(output_dir),
    num_train_epochs=max_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    local_rank=-1,
    ddp_find_unused_parameters=False, 
    no_cuda=False,
    
    # Optimizer settings
    optim="adamw_torch",
    learning_rate=learning_rate,
    weight_decay=0.01,
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    
    # Evaluation and saving: evaluation and checkpointing share the same step
    # interval, and the checkpoint with the lowest eval_loss is reloaded when
    # training ends
    eval_strategy="steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=eval_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Logging (no external experiment tracker)
    logging_steps=10,
    logging_dir=str(output_dir / "logs"),
    report_to="none",
    
    # Memory optimization: bf16 mixed precision, matching the dtype the model
    # was loaded in
    fp16=False,
    bf16=True,
    dataloader_drop_last=False,
    remove_unused_columns=False,
    dataloader_num_workers=0,
    
    # Reproducibility
    seed=42,
    data_seed=42,
)

# Data collator for causal language modeling (mlm=False).
# NOTE(review): with mlm=False this collator is expected to derive the labels
# from input_ids itself, which would supersede the "labels" column produced by
# tokenize_function - confirm against the transformers documentation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=None
)

# ============================================================================
# CREATE TRAINER
# ============================================================================
print("\n[STEP 5] Creating trainer...")

# Early stopping uses early_stopping_patience (3) together with the
# eval_loss / greater_is_better=False settings configured above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
)

# ============================================================================
# TRAINING
# ============================================================================
print("\n[STEP 6] Starting training...")
print("Training configuration:")
# Echo the effective hyperparameters so they are captured in the cell output
run_summary = (
    ("Model", model_id),
    ("Max epochs", max_epochs),
    ("Batch size", batch_size),
    ("Learning rate", learning_rate),
    ("Evaluation every", f"{eval_steps} steps"),
    ("Early stopping patience", early_stopping_patience),
    ("LoRA rank", lora_config.r),
    ("LoRA alpha", lora_config.lora_alpha),
)
for setting_name, setting_value in run_summary:
    print(f"  - {setting_name}: {setting_value}")

# Run the fine-tuning loop
trainer.train()

# ============================================================================
# SAVE MODEL
# ============================================================================
print("\n[STEP 7] Saving model...")

# Persist the LoRA adapters and the tokenizer side by side in output_dir
trainer.save_model()
tokenizer.save_pretrained(output_dir)

# The final model is saved above, so the intermediate checkpoint directories
# are deleted
import shutil
stale_checkpoints = list(output_dir.glob("checkpoint-*"))
for stale_checkpoint in stale_checkpoints:
    shutil.rmtree(stale_checkpoint)

print(f"Model saved to: {output_dir}")

GEMMA-2-9B-IT FINE-TUNING WITH LORA (CLEAN VERSION)

[STEP 1] Loading model and tokenizer...


Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.06it/s]


Model loaded successfully!

[STEP 2] Loading and preparing training data...
Loaded training data: (1440, 15)
Train samples: 1152
Validation samples: 288


Map: 100%|██████████| 1152/1152 [00:00<00:00, 2914.66 examples/s]
Map: 100%|██████████| 288/288 [00:00<00:00, 4409.56 examples/s]



[STEP 3] Applying LoRA...


The model is already on multiple devices. Skipping the move to device specified in `args`.


trainable params: 54,018,048 || all params: 9,295,724,032 || trainable%: 0.5811

[STEP 4] Setting up training configuration...

[STEP 5] Creating trainer...

[STEP 6] Starting training...
Training configuration:
  - Model: google/gemma-2-9b-it
  - Max epochs: 5
  - Batch size: 2
  - Learning rate: 0.0001
  - Evaluation every: 50 steps
  - Early stopping patience: 3
  - LoRA rank: 16
  - LoRA alpha: 32


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
50,0.7978,0.66387
100,0.5551,0.547292
150,0.4634,0.463329
200,0.4305,0.420515
250,0.4088,0.401714
300,0.3886,0.398796
350,0.3858,0.395086
400,0.3777,0.39014
450,0.3738,0.387888
500,0.3874,0.384921



[STEP 7] Saving model...
Model saved to: /home/diya.thakor/AirQuality/BASELINE/finetuned/gemma-2-9b-it-pm25-clean


### Inference with the fine-tuned model

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import os
from datetime import datetime
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import spearmanr

# ============================================================================
# CONFIGURATION
# ============================================================================
# Paths
base_dir = "/home/diya.thakor/AirQuality/BASELINE"
# All artefact locations are derived from base_dir
finetuned_model_path = "/".join((base_dir, "finetuned", "gemma-2-9b-it-pm25-clean"))
test_data_path = "/".join((base_dir, "data", "test_2023.csv"))
results_path = "/".join((base_dir, "results", "gemma2_finetuned_test_results.csv"))

# Model configuration
base_model_id = "google/gemma-2-9b-it"
hf_token = os.environ.get("HF_TOKEN")  # None when the variable is unset
device = "cuda:1"

# Batch processing configuration
BATCH_SIZE = 8           # prompts per generate() call
MAX_NEW_TOKENS = 10      # generation budget per prompt

banner_rule = "=" * 80
print(banner_rule)
print("GEMMA-2-9B-IT FINE-TUNED MODEL BATCH INFERENCE")
print(banner_rule)

# ============================================================================
# LOAD MODELS AND TOKENIZER
# ============================================================================
print("\n[STEP 1] Loading base model and fine-tuned adapters...")

tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=hf_token)
# Fall back to EOS as the padding token when the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Batched generation with a decoder-only model needs left padding: the
# generated tokens are later sliced off after the (padded) prompt length, and
# right padding would put pad tokens between a prompt and its continuation.
# Set it explicitly instead of relying on the tokenizer's default.
tokenizer.padding_side = "left"

# Place the whole base model on the configured device; deriving device_map
# from `device` keeps model placement and input placement in sync.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map={"": device},
    torch_dtype=torch.bfloat16,
    token=hf_token,
)

# Attach the LoRA adapters saved by the fine-tuning cell
model = PeftModel.from_pretrained(
    base_model, 
    finetuned_model_path,
    torch_dtype=torch.bfloat16
)

# Inference only: switch to evaluation mode
model.eval()
print(f"Fine-tuned model loaded on {device}")

# ============================================================================
# LOAD TEST DATA
# ============================================================================
print("\n[STEP 2] Loading test data...")
test_df = pd.read_csv(test_data_path)
print(f"Loaded test data: {test_df.shape}")

# ============================================================================
# PROMPT TEMPLATES
# ============================================================================
# The system prompt and the active USER_TEMPLATE must match the ones used for
# fine-tuning character for character; otherwise the model is evaluated on a
# prompt format it was not trained on. Exactly one USER_TEMPLATE variant
# should be active.
SYSTEM_PROMPT = (
    "You are an air pollution assistant. "
    "Strictly respond to queries with a single real number only. "
    "Do not include any units, explanation, or punctuation. Just a single number."
)

# RQ1: location given as city/state only

# USER_TEMPLATE = (
#     "What is the average PM2.5 concentration (in μg/m³) in {city}, {state} during {month}, {year}? "
#     "Give a single number only."
# )

# RQ4: city/state plus auxiliary features (temperature, NDVI, population)

# USER_TEMPLATE = (
#     "What is the average PM2.5 concentration (in μg/m³) in {city}, {state} during {month}, {year}? "
#     "Additional context: Temperature (AT) = {at}°C, NDVI = {ndvi}, Population = {pop}. "
#     "Give a single number only."
# )

# With lat,lon: location given as coordinates instead of city/state

USER_TEMPLATE = (
    "What is the average PM2.5 concentration (in μg/m³) at location (latitude={lat}, longitude={lon}) during {month}, {year}? "
    "Give a single number only."
)


# Month number (1-12) to English month name, used when filling {month}
month_names = {
    1: "January", 2: "February", 3: "March", 4: "April",
    5: "May", 6: "June", 7: "July", 8: "August", 
    9: "September", 10: "October", 11: "November", 12: "December"
}

# ============================================================================
# BATCH INFERENCE FUNCTIONS
# ============================================================================
def prepare_batch_prompts(batch_data):
    """Build one generation-ready prompt string per row of ``batch_data``.

    Each prompt is SYSTEM_PROMPT, a blank line, then USER_TEMPLATE filled from
    the row, rendered with the tokenizer's chat template and ending with the
    generation prompt so that the model continues with its answer.

    Args:
        batch_data: DataFrame slice providing 'city', 'state', 'month',
            'year', 'AT', 'avg_ndvi', 'population', 'latitude' and
            'longitude'. 'month' must be a key of ``month_names``.

    Returns:
        List of prompt strings, in the row order of ``batch_data``.

    Raises:
        KeyError: If a required column is missing or 'month' is not a key of
            ``month_names``.
    """
    prompts = []
    for _, row in batch_data.iterrows():
        # Missing numeric values are rendered as the literal "N/A"
        at_value = f"{row['AT']:.1f}" if pd.notna(row['AT']) else "N/A"
        ndvi_value = f"{row['avg_ndvi']:.3f}" if pd.notna(row['avg_ndvi']) else "N/A"
        pop_value = f"{int(row['population'])}" if pd.notna(row['population']) else "N/A"
        lat_value = f"{row['latitude']:.4f}" if pd.notna(row['latitude']) else "N/A"
        lon_value = f"{row['longitude']:.4f}" if pd.notna(row['longitude']) else "N/A"

        # str.format ignores keyword arguments the template does not
        # reference, so the fields of every prompt variant are supplied.
        # Switching USER_TEMPLATE then needs no edit here, and the previously
        # unused city/state/auxiliary values are put to use.
        user_prompt = USER_TEMPLATE.format(
            city=row['city'],
            state=row['state'],
            month=month_names[row['month']],
            year=row['year'],
            at=at_value,
            ndvi=ndvi_value,
            pop=pop_value,
            lat=lat_value,
            lon=lon_value,
        )

        messages = [
            {"role": "user", "content": f"{SYSTEM_PROMPT}\n\n{user_prompt}"}
        ]
        prompts.append(
            tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        )

    return prompts

def batch_inference(prompts):
    """Generate one numeric prediction per prompt with greedy decoding.

    Args:
        prompts: List of already chat-formatted prompt strings.

    Returns:
        List of floats, one per prompt and in the same order. An entry is NaN
        when the generated text contains no number.
    """
    # Nothing to tokenize or generate for an empty batch
    if not prompts:
        return []

    # Tokenize all prompts at once, padded to the longest prompt in the batch
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    # Greedy decoding. `top_k` is a sampling-only option and is not passed:
    # combined with do_sample=False it only produced an "invalid generation
    # flags" warning.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            num_beams=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1
        )

    # All rows share the same padded prompt length, so compute it once and
    # strip the prompt from every sequence in a single slice.
    # NOTE(review): this slicing assumes the tokenizer pads on the left;
    # verify tokenizer.padding_side.
    prompt_length = inputs["input_ids"].shape[1]
    generated_texts = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )

    # Take the first (optionally decimal) number from each response
    predictions = []
    for generated_text in generated_texts:
        match = re.search(r"\d+(\.\d+)?", generated_text)
        predictions.append(float(match.group()) if match else float("nan"))

    return predictions

# ============================================================================
# RESUME FUNCTIONALITY (CLEAN VERSION)
# ============================================================================
# Columns that together identify one sample for resume purposes
key_columns = ['city', 'state', 'month', 'year']

if not os.path.exists(results_path):
    print("Starting fresh batch inference run")
    existing_results = pd.DataFrame()
    processed_keys = set()
else:
    # Pick up the rows written by an earlier, interrupted run
    existing_results = pd.read_csv(results_path)
    print(f"Found existing results: {len(existing_results)} rows")
    processed_keys = set(zip(*(existing_results[column] for column in key_columns)))

# Filter unprocessed data using actual data matching: tag every test row with
# its key tuple and keep only the rows whose key has not been seen yet
test_df['data_key'] = list(zip(*(test_df[column] for column in key_columns)))
already_processed = test_df['data_key'].isin(processed_keys)
unprocessed_df = test_df[~already_processed]
print(f"Remaining samples to process: {len(unprocessed_df)}")

# ============================================================================
# BATCH PROCESSING
# ============================================================================
print(f"\n[STEP 3] Running batch inference (batch_size={BATCH_SIZE})...")

# Make sure the output directory exists before the first checkpoint is
# written; otherwise the run fails only when it first tries to save
results_dir = os.path.dirname(results_path)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# Progress is flushed to disk after every this many batches
CHECKPOINT_EVERY_N_BATCHES = 10

all_results = []
error_count = 0
start_time = datetime.now()

# Process data in batches
for start_idx in tqdm(range(0, len(unprocessed_df), BATCH_SIZE), desc="Batch Processing"):
    batch_number = start_idx // BATCH_SIZE + 1
    # iloc slicing clips at the end of the frame, so the last batch may be short
    batch_data = unprocessed_df.iloc[start_idx:start_idx + BATCH_SIZE]

    # Only prompt building and generation are guarded. Result storage and
    # checkpointing used to sit inside the try as well, so a failure there
    # appended a second, NaN copy of a batch that had already been stored.
    try:
        predictions = batch_inference(prepare_batch_prompts(batch_data))
    except Exception as e:
        # Best effort: record the failure and keep the rows with NaN predictions
        error_count += 1
        print(f"Error processing batch {batch_number}: {e}")
        predictions = [float('nan')] * len(batch_data)

    # Store exactly one result per row of the batch
    for (_, row), prediction in zip(batch_data.iterrows(), predictions):
        all_results.append({
            'city': row['city'],
            'state': row['state'],
            'month': row['month'],
            'year': row['year'],
            'pm2.5_predicted': prediction,
            'pm2.5_actual': row['PM2.5']
        })

    # Save progress every 10 batches. Counting batches instead of rows keeps
    # the schedule intact when a batch is short or has failed.
    if batch_number % CHECKPOINT_EVERY_N_BATCHES == 0:
        # Combine with existing results
        if len(existing_results) > 0:
            combined_results = pd.concat([existing_results, pd.DataFrame(all_results)], ignore_index=True)
        else:
            combined_results = pd.DataFrame(all_results)

        combined_results.to_csv(results_path, index=False)
        print(f"Saved checkpoint: {len(combined_results)} total results")

# ============================================================================
# SAVE FINAL RESULTS
# ============================================================================
print("\n[STEP 4] Saving final results...")

# Rows produced in this run, appended to whatever an earlier run left behind
new_results_frame = pd.DataFrame(all_results)
if len(existing_results) == 0:
    final_results = new_results_frame
else:
    final_results = pd.concat([existing_results, new_results_frame], ignore_index=True)

final_results.to_csv(results_path, index=False)

# ============================================================================
# SUMMARY STATISTICS
# ============================================================================
print("\n[STEP 5] Summary Statistics...")

# Wall-clock time since the batch loop started, and rows predicted in this run
end_time = datetime.now()
runtime = end_time - start_time
total_new_samples = len(all_results)

summary_lines = [
    "Batch inference completed!",
    f"Batch size used: {BATCH_SIZE}",
    f"Total samples processed: {len(final_results)}",
    f"New predictions made: {total_new_samples}",
    f"Error batches: {error_count}",
    f"Runtime: {runtime}",
    f"Speed: {total_new_samples / runtime.total_seconds():.2f} samples/second",
    f"Results saved to: {results_path}",
]
print("\n".join(summary_lines))

# ============================================================================
# EVALUATION METRICS
# ============================================================================
print("\n[STEP 6] Calculating Evaluation Metrics...")

# Remove rows with NaN predictions (errors) or missing ground truth
valid_results = final_results.dropna(subset=['pm2.5_predicted', 'pm2.5_actual'])
print(f"Valid predictions: {len(valid_results)} out of {len(final_results)}")

if len(valid_results) > 0:
    y_true = valid_results['pm2.5_actual'].values
    y_pred = valid_results['pm2.5_predicted'].values
    
    # Calculate metrics
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    spearman_corr, _ = spearmanr(y_true, y_pred)
    
    print("\n" + "="*80)
    print("EVALUATION RESULTS")
    print("="*80)
    print(f"MAE (Mean Absolute Error):       {mae:.2f} μg/m³")
    print(f"RMSE (Root Mean Squared Error):  {rmse:.2f} μg/m³")
    print(f"Spearman Correlation:             {spearman_corr:.4f}")
    print("="*80)
    
    # Save metrics to file
    metrics_dict = {
        'MAE': mae,
        'RMSE': rmse,
        'Spearman': spearman_corr,
        'Valid_Predictions': len(valid_results),
        'Total_Samples': len(final_results),
        'Error_Rate': (len(final_results) - len(valid_results)) / len(final_results) * 100
    }
    
    # Derive the report name from the file stem. str.replace('.csv', ...)
    # would also rewrite a '.csv' inside a directory name, and would return
    # the results path itself (overwriting the CSV) if it had another suffix.
    metrics_path = os.path.splitext(results_path)[0] + '_metrics.txt'
    # The report contains non-ASCII characters ("μg/m³"), so the encoding is
    # fixed instead of depending on the platform's locale default.
    with open(metrics_path, 'w', encoding='utf-8') as f:
        f.write("="*80 + "\n")
        f.write("GEMMA-2-9B-IT FINE-TUNED MODEL - EVALUATION METRICS\n")
        f.write("="*80 + "\n\n")
        f.write(f"Model: {base_model_id}\n")
        f.write(f"Fine-tuned checkpoint: {finetuned_model_path}\n")
        f.write(f"Test data: {test_data_path}\n")
        f.write(f"Inference date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("Metrics:\n")
        f.write(f"  MAE:       {mae:.2f} μg/m³\n")
        f.write(f"  RMSE:      {rmse:.2f} μg/m³\n")
        f.write(f"  Spearman:  {spearman_corr:.4f}\n\n")
        f.write("Data Statistics:\n")
        f.write(f"  Valid predictions:     {len(valid_results)}\n")
        f.write(f"  Total samples:         {len(final_results)}\n")
        f.write(f"  Error rate:            {metrics_dict['Error_Rate']:.2f}%\n")
        f.write(f"  Runtime:               {runtime}\n")
        f.write(f"  Processing speed:      {total_new_samples / runtime.total_seconds():.2f} samples/sec\n")
    
    print(f"\nMetrics saved to: {metrics_path}")
else:
    print("No valid predictions found! Check error logs.")


# ============================================================================
# CLEANUP
# ============================================================================
# Step label corrected: "[STEP 6]" is already used by the evaluation-metrics stage
print("\n[STEP 7] Cleanup...")
# Drop the references to the adapter-wrapped model and the base model, then
# ask PyTorch to release its cached GPU memory
del model
del base_model
torch.cuda.empty_cache()

print("="*80)
print("BATCH INFERENCE COMPLETED SUCCESSFULLY")
print("="*80)

  from .autonotebook import tqdm as notebook_tqdm


GEMMA-2-9B-IT FINE-TUNED MODEL BATCH INFERENCE

[STEP 1] Loading base model and fine-tuned adapters...


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.16it/s]


Fine-tuned model loaded on cuda:1

[STEP 2] Loading test data...
Loaded test data: (1932, 15)
Starting fresh batch inference run
Remaining samples to process: 1932

[STEP 3] Running batch inference (batch_size=8)...


Batch Processing:   0%|          | 0/242 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Batch Processing:   4%|▍         | 10/242 [00:10<03:41,  1.05it/s]

Saved checkpoint: 80 total results


Batch Processing:   8%|▊         | 20/242 [00:19<03:36,  1.03it/s]

Saved checkpoint: 160 total results


Batch Processing:  12%|█▏        | 30/242 [00:29<03:21,  1.05it/s]

Saved checkpoint: 240 total results


Batch Processing:  17%|█▋        | 40/242 [00:38<03:13,  1.05it/s]

Saved checkpoint: 320 total results


Batch Processing:  21%|██        | 50/242 [00:48<03:02,  1.05it/s]

Saved checkpoint: 400 total results


Batch Processing:  25%|██▍       | 60/242 [00:57<02:51,  1.06it/s]

Saved checkpoint: 480 total results


Batch Processing:  29%|██▉       | 70/242 [01:07<02:42,  1.06it/s]

Saved checkpoint: 560 total results


Batch Processing:  33%|███▎      | 80/242 [01:16<02:32,  1.06it/s]

Saved checkpoint: 640 total results


Batch Processing:  37%|███▋      | 90/242 [01:26<02:25,  1.04it/s]

Saved checkpoint: 720 total results


Batch Processing:  41%|████▏     | 100/242 [01:35<02:14,  1.06it/s]

Saved checkpoint: 800 total results


Batch Processing:  45%|████▌     | 110/242 [01:45<02:04,  1.06it/s]

Saved checkpoint: 880 total results


Batch Processing:  50%|████▉     | 120/242 [01:54<01:56,  1.05it/s]

Saved checkpoint: 960 total results


Batch Processing:  54%|█████▎    | 130/242 [02:04<01:46,  1.06it/s]

Saved checkpoint: 1040 total results


Batch Processing:  58%|█████▊    | 140/242 [02:13<01:38,  1.04it/s]

Saved checkpoint: 1120 total results


Batch Processing:  62%|██████▏   | 150/242 [02:23<01:28,  1.04it/s]

Saved checkpoint: 1200 total results


Batch Processing:  66%|██████▌   | 160/242 [02:33<01:19,  1.03it/s]

Saved checkpoint: 1280 total results


Batch Processing:  70%|███████   | 170/242 [02:42<01:09,  1.04it/s]

Saved checkpoint: 1360 total results


Batch Processing:  74%|███████▍  | 180/242 [02:52<00:59,  1.05it/s]

Saved checkpoint: 1440 total results


Batch Processing:  79%|███████▊  | 190/242 [03:02<00:50,  1.02it/s]

Saved checkpoint: 1520 total results


Batch Processing:  83%|████████▎ | 200/242 [03:11<00:40,  1.04it/s]

Saved checkpoint: 1600 total results


Batch Processing:  87%|████████▋ | 210/242 [03:21<00:31,  1.03it/s]

Saved checkpoint: 1680 total results


Batch Processing:  91%|█████████ | 220/242 [03:31<00:21,  1.01it/s]

Saved checkpoint: 1760 total results


Batch Processing:  95%|█████████▌| 230/242 [03:41<00:11,  1.03it/s]

Saved checkpoint: 1840 total results


Batch Processing:  99%|█████████▉| 240/242 [03:50<00:01,  1.05it/s]

Saved checkpoint: 1920 total results


Batch Processing: 100%|██████████| 242/242 [03:52<00:00,  1.04it/s]


[STEP 4] Saving final results...

[STEP 5] Summary Statistics...
Batch inference completed!
Batch size used: 8
Total samples processed: 1932
New predictions made: 1932
Error batches: 0
Runtime: 0:03:52.437517
Speed: 8.31 samples/second
Results saved to: /home/diya.thakor/AirQuality/BASELINE/results/gemma2_finetuned_test_results.csv

[STEP 6] Calculating Evaluation Metrics...
Valid predictions: 1932 out of 1932

EVALUATION RESULTS
MAE (Mean Absolute Error):       18.43 μg/m³
RMSE (Root Mean Squared Error):  26.82 μg/m³
Spearman Correlation:             0.7445

Metrics saved to: /home/diya.thakor/AirQuality/BASELINE/results/gemma2_finetuned_test_results_metrics.txt

[STEP 6] Cleanup...
BATCH INFERENCE COMPLETED SUCCESSFULLY



