# 🚀 LLM Dataset Preparation and Fine-tuning

This notebook works across different platforms:
- Google Colab
- Kaggle
- Local Jupyter
- Cloud clusters

## Setup Environment

In [None]:
import os
import sys
import platform

# Detect runtime environment
def get_environment():
    """Identify which notebook runtime this code is executing in.

    Returns:
        'kaggle' when the Kaggle kernel environment variable is set or the
        '/kaggle/working' directory exists, 'colab' when the ``google.colab``
        module can be imported, and 'local' otherwise.
    """
    # Kaggle is recognised from markers of the runtime itself rather than by
    # importing the `kaggle` package: that package is the Kaggle API client
    # and can be pip-installed on any machine, so its presence does not mean
    # the code is running inside a Kaggle kernel.
    if os.environ.get('KAGGLE_KERNEL_RUN_TYPE') or os.path.isdir('/kaggle/working'):
        return 'kaggle'

    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        # Only a missing module means "not Colab"; anything else (including
        # KeyboardInterrupt) should propagate instead of being swallowed.
        return 'local'
    return 'colab'

ENV = get_environment()
print(f"Running in {ENV} environment")

# Install required packages
!pip install -q transformers datasets torch accelerate bitsandbytes tqdm pandas numpy

## Clone Repository and Setup Data

In [None]:
# Clone repository
!git clone https://github.com/bentex2006/datasets-base.git
!cd datasets-base

# Setup paths based on environment
if ENV == 'colab':
    BASE_PATH = '/content/datasets-base'
elif ENV == 'kaggle':
    BASE_PATH = '/kaggle/working/datasets-base'
else:
    BASE_PATH = './datasets-base'

DATA_PATH = os.path.join(BASE_PATH, 'data')
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')
PROCESSED_DATA_PATH = os.path.join(DATA_PATH, 'processed')

## Load and Process Datasets

In [None]:
import json
from typing import List, Dict

def load_jsonl(file_path: str) -> List[Dict]:
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON value per line.

    Returns:
        The parsed values in file order. Blank and whitespace-only lines
        (for example a trailing empty line) are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # An empty string is not valid JSON, so skip blank lines rather
            # than letting json.loads fail on them.
            if not stripped:
                continue
            records.append(json.loads(stripped))
    return records

# Read the four raw sample files, in this order, from RAW_DATA_PATH.
instruction_data, conversation_data, completion_data, hinglish_data = (
    load_jsonl(os.path.join(RAW_DATA_PATH, raw_name))
    for raw_name in (
        'sample_instruction_dataset.jsonl',
        'sample_conversation_dataset.jsonl',
        'sample_completion_dataset.jsonl',
        'savage_ai_hinglish.jsonl',
    )
)

# Report how many records each file contributed.
print(
    "Loaded datasets:",
    f"- Instructions: {len(instruction_data)} examples",
    f"- Conversations: {len(conversation_data)} examples",
    f"- Completions: {len(completion_data)} examples",
    f"- Hinglish: {len(hinglish_data)} examples",
    sep="\n",
)

## Data Processing Functions

In [None]:
def format_for_mistral(data: List[Dict]) -> List[Dict]:
    """Build instruction/input/output records for Mistral fine-tuning.

    Each source record is reduced to exactly the keys "instruction", "input"
    and "output"; a key missing from the source becomes an empty string.
    """
    wanted_keys = ("instruction", "input", "output")
    formatted = []
    for record in data:
        formatted.append({key: record.get(key, "") for key in wanted_keys})
    return formatted

def format_for_pi(data: List[Dict]) -> List[Dict]:
    """Build two-turn conversation records for Pi fine-tuning.

    Each source record becomes {"conversations": [human, assistant]}, where
    the human turn carries the record's "input" and the assistant turn its
    "output". A missing key yields an empty string as the turn content.
    """
    formatted = []
    for record in data:
        human_turn = {"role": "human", "content": record.get("input", "")}
        assistant_turn = {"role": "assistant", "content": record.get("output", "")}
        formatted.append({"conversations": [human_turn, assistant_turn]})
    return formatted

def format_for_gemini(data: List[Dict]) -> List[Dict]:
    """Build prompt/completion/context records for Gemini fine-tuning.

    The source "input" is stored under "prompt", "output" under "completion"
    and "instruction" under "context"; a missing source key becomes "".
    """
    # (target key, source key) pairs, in the order the target keys are emitted.
    key_pairs = (
        ("prompt", "input"),
        ("completion", "output"),
        ("context", "instruction"),
    )
    formatted = []
    for record in data:
        formatted.append({target: record.get(source, "") for target, source in key_pairs})
    return formatted

def save_jsonl(data: List[Dict], file_path: str) -> None:
    """Write records to a UTF-8 JSON Lines file, one JSON object per line.

    Args:
        data: Records to serialise; each must be JSON-serialisable.
        file_path: Destination file. Missing parent directories are created.
            An existing file is overwritten.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If a record is not JSON-serialisable.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        for item in data:
            # The file is UTF-8, so keep non-ASCII text readable instead of
            # escaping it to \uXXXX; the parsed content is unchanged.
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

## Process Data for Different Models

In [None]:
# Run the Hinglish records through each model-specific formatter.
mistral_data, pi_data, gemini_data = (
    formatter(hinglish_data)
    for formatter in (format_for_mistral, format_for_pi, format_for_gemini)
)

# Write each formatted dataset to its own file under PROCESSED_DATA_PATH.
for _records, _out_name in (
    (mistral_data, 'hinglish_mistral.jsonl'),
    (pi_data, 'hinglish_pi.jsonl'),
    (gemini_data, 'hinglish_gemini.jsonl'),
):
    save_jsonl(_records, os.path.join(PROCESSED_DATA_PATH, _out_name))

print("Data processed and saved for all models!")