<a href="https://colab.research.google.com/github/Sbursu/Carbon-EF/blob/temp-branch/training/notebooks/mistral_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mistral-7B Fine-Tuning

This notebook implements fine-tuning of Mistral-7B for emission factor recommendations.

## Setup
1. In Colab, select **Runtime > Change runtime type** and choose a GPU hardware accelerator
2. Run the cells in order, from top to bottom

In [1]:
# Check GPU availability: run the NVIDIA CLI to print the driver/CUDA versions
# and the model and memory usage of the GPU attached to this runtime.
!nvidia-smi

Sun Mar 16 01:52:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Install Dependencies

In [2]:
# Install core dependencies
!pip install -q transformers==4.36.2 datasets==2.16.1 peft==0.7.1 accelerate==0.25.0 bitsandbytes==0.41.3 trl==0.7.11 wandb==0.16.3
!pip install -q torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118

# Install neo4j for database access (optional, used only if Neo4j data source is enabled)
!pip install -q neo4j==5.10.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Clone Repository and Import Scripts

In [3]:
!git clone https://github.com/Sbursu/Carbon-EF.git
%cd Carbon-EF

# Add repository root to Python path
import os
import sys
sys.path.append(os.getcwd())

# Import necessary modules with error handling
try:
    from training.scripts.data_preparation import load_and_prepare_data, format_instruction
    from training.scripts.model_config import setup_model_and_tokenizer, get_training_config
    from training.scripts.training import setup_trainer, evaluate_model, save_model
    print("Successfully imported all required modules")
except ImportError as e:
    print(f"Import error: {e}")
    print("Please check that all required packages are installed")

Cloning into 'Carbon-EF'...
remote: Enumerating objects: 129, done.[K
remote: Counting objects: 100% (129/129), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 129 (delta 31), reused 115 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (129/129), 1.33 MiB | 3.49 MiB/s, done.
Resolving deltas: 100% (31/31), done.
/content/Carbon-EF


FileNotFoundError: [Errno 2] No such file or directory: '/content/Carbon-EF/training/logs/data_preparation.log'

## Prepare Training Data

In [None]:
# Instruction files expected in the cloned repository, one per dataset split
data_files = {
    "train": "training/data/instructions_train.json",
    "val": "training/data/instructions_val.json",
    "test": "training/data/instructions_test.json"
}

# Report which of the expected files are present (informational only)
for split_name, path in data_files.items():
    status = f"Found {split_name} data: {path}" if os.path.exists(path) else f"Warning: {path} not found"
    print(status)

# Load the datasets and convert every record to the training text format
try:
    # File-based loading only: Neo4j is not used in Colab
    train_data, val_data = load_and_prepare_data(use_neo4j=False)

    # Apply the instruction formatter to both datasets
    train_data = train_data.map(format_instruction)
    val_data = val_data.map(format_instruction)

    # Summary of dataset sizes.
    # NOTE(review): both objects are indexed with ['train'], i.e. each is
    # assumed to expose its records under a 'train' key — confirm against
    # load_and_prepare_data.
    for label, dataset in (("Training", train_data), ("Validation", val_data)):
        print(f"{label} examples: {len(dataset['train'])}")

    # Preview the first 300 characters of the first formatted example
    print("\nSample training example:")
    first_example = train_data["train"][0]
    print(first_example["text"][:300] + "...")
except Exception as e:
    print(f"Error preparing data: {e}")
    print("Please check that the data files exist and are properly formatted")

## Initialize Model

In [None]:
# Build the model, tokenizer, training configuration and trainer.
# Relies on train_data / val_data produced by the data preparation cell.
try:
    model, tokenizer = setup_model_and_tokenizer()
    print("Model and tokenizer successfully initialized")

    # Fetch the training configuration and list every setting it contains
    config = get_training_config()
    print("\nTraining configuration:")
    for option, setting in config.items():
        print(f"  {option}: {setting}")

    # Wire model, tokenizer, datasets and configuration into a trainer
    trainer = setup_trainer(model, tokenizer, train_data, val_data, config)
    print("\nTrainer set up successfully")
except Exception as e:
    print(f"Error setting up model: {e}")
    print("Please check your GPU availability and memory")

## Start Training

In [None]:
# Run the training loop, then persist the resulting model and tokenizer
try:
    print("Starting training...")
    trainer.train()
    print("Training completed successfully!")

    # Save model under the configured output directory.
    # NOTE(review): the message below assumes save_model writes to a
    # 'final_model' subdirectory — confirm against training/scripts/training.py.
    output_dir = config['output_dir']
    save_model(model, tokenizer, output_dir)
    print(f"Model saved to {output_dir}/final_model")
except Exception as e:
    print(f"Error during training: {e}")
    for tip in (
        "\nTroubleshooting tips:",
        "1. Check if you have enough VRAM (T4 or better GPU recommended)",
        "2. Try reducing batch size or gradient accumulation steps",
    ):
        print(tip)

## Evaluate Model

In [None]:
# Evaluate the fine-tuned model and print each query with its response
try:
    print("Running evaluation...")
    results = evaluate_model(model, tokenizer)

    # Each entry is read through its 'query' and 'response' keys
    print("\nEvaluation results:")
    for entry in results:
        print(f"\nQuery: {entry['query']}")
        print(f"Response: {entry['response']}")
        print()  # blank line between entries
except Exception as e:
    print(f"Error during evaluation: {e}")

## Test Your Own Queries

In [None]:
from training.scripts.training import generate_recommendation

# Edit this question to try the model on your own query
query = "What is the emission factor for cement production in India?"

# Generate a recommendation and show it next to the question that produced it
try:
    response = generate_recommendation(model, tokenizer, query)
    for line in (f"Query: {query}", f"Response: {response}"):
        print(line)
except Exception as err:
    print(f"Error generating recommendation: {err}")