# Tech Challenge (Colab GPU): Fine-tuning OSS com Unsloth + TinyLlama

Pipeline completo para rodar no Google Colab com GPU (sem custo de API):

1) Montar Google Drive e instalar dependências
2) Carregar e limpar dados (title → content) a partir de `/content/drive/MyDrive/FineTunning/TechChallenge03/trn.json.gz`
3) Preparar dataset para SFT (prompts + splits)
4) Avaliação baseline (modelo base, sem FT)
5) Treinamento LoRA com Unsloth + TinyLlama
6) Avaliação pós-FT e gráficos
7) Salvar artefatos (adapter, métricas, amostras) no Drive

Observações:
- Use GPU (Runtime > Change runtime type > T4/L4/A100).
- A quantização em 4 bits (bitsandbytes) ajuda o modelo a caber na VRAM.
- Ajuste OSS_SUBSET/OSS_EVAL_N/OSS_EPOCHS conforme tempo e GPU.

In [None]:
# 1) Montar Drive e instalar dependências (Colab)
import sys, subprocess, os

# Treat a successful import of `google.colab` as the signal that this
# notebook is running inside Google Colab; any failure means "not Colab".
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

# Only on Colab: mount Google Drive at /content/drive.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

# Requirement specifiers installed up front for this notebook
# (minimum versions are pinned where given).
PKGS = [
    'unsloth>=2024.8.12',
    'transformers>=4.43',
    'accelerate>=0.33',
    'datasets>=2.20',
    'peft>=0.12.0',
    'trl>=0.9.6',
    'sacrebleu',
    'rouge-score',
    'matplotlib',
    'tiktoken',
]

print('Instalando dependências...')
# Invoke pip through the running interpreter; check_call raises
# CalledProcessError if the installation exits with a non-zero status.
_pip_install = [sys.executable, '-m', 'pip', 'install']
subprocess.check_call(_pip_install + PKGS)

# bitsandbytes is usually already present on Colab; install it quietly
# only when importing it fails for any reason.
try:
    import bitsandbytes as bnb  # type: ignore
except Exception:
    _bnb_install = [sys.executable, '-m', 'pip', 'install', '-q', 'bitsandbytes']
    subprocess.check_call(_bnb_install)

print('Setup concluído.')

In [None]:
# 2) Carregar e limpar dados (title → content)
import gzip, json, re
import pandas as pd
import numpy as np
from tqdm import tqdm

# Input file location and run configuration. SEED and MAX_RECORDS can be
# overridden through the SEED / MAX_RECORDS_FOR_FT environment variables.
DATA_PATH = '/content/drive/MyDrive/FineTunning/TechChallenge03/trn.json.gz'
SEED = int(os.getenv('SEED', 42))
MAX_RECORDS = int(os.getenv('MAX_RECORDS_FOR_FT', 200_000))

# Fail fast with an explicit exception: a bare `assert` is removed when
# Python runs with -O, which would defer the failure to gzip.open below.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f'Arquivo não encontrado: {DATA_PATH}')

# Stream the gzip file line by line, parsing each non-empty line as JSON and
# keeping only the `title` and `content` fields, up to MAX_RECORDS records.
rows, count = [], 0
with gzip.open(DATA_PATH, 'rt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            # Malformed lines are skipped instead of aborting the whole load.
            continue
        if not isinstance(obj, dict):
            # A line can be valid JSON without being an object (list, string,
            # number, null); those have no .get() and are skipped as well.
            continue
        rows.append({'title': obj.get('title'), 'content': obj.get('content')})
        count += 1
        if count >= MAX_RECORDS:
            break

# Drop only the records where both title and content are missing.
df_raw = pd.DataFrame(rows, columns=['title', 'content']).dropna(how='all')
print('Lidas linhas:', len(df_raw))

# Lowercase placeholder tokens that clean_text treats as a missing value
# (compared against the stripped, lowercased input string).
EMPTY_STRINGS = {
    'none',
    'nan',
    'null',
    'na',
    'n/a',
}
def clean_text(s):
    if s is None: return None
    if isinstance(s, float) and np.isnan(s): return None
    if isinstance(s, str) and s.strip().lower() in EMPTY_STRINGS: return None
    s = str(s)
    s = re.sub(r'<[^>]+>', ' ', s)
    s = re.sub(r'[`*_#>\"]', ' ', s)
    s = s.replace('
',' ').replace('
',' ')
    s = re.sub(r'\s+', ' ', s).strip(

,