# Telugu Multi-Turn Dialogue Pipeline
### Optimised for Google Colab (T4 / A100 GPU)

**Steps covered:**
1. Verify GPU
2. Mount Google Drive & set up the project
3. Install dependencies (with `faiss-gpu` + `bitsandbytes` for 4-bit quantisation)
4. (Optional) Hugging Face login for gated models
5. Enable 4-bit quantisation in the model loaders
6. Run the full pipeline (data → embeddings → generation → evaluation → plots)
7. Save outputs back to Drive

> **Before running:** Change Runtime → **T4 GPU** (Runtime > Change runtime type)

## Step 1 — Verify GPU

In [None]:
import subprocess, sys

# Probe the NVIDIA driver. When the nvidia-smi binary is not installed at all,
# subprocess.run raises FileNotFoundError instead of returning a non-zero exit
# code; both cases are folded together so the hint below is always printed
# instead of a traceback.
try:
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
except FileNotFoundError:
    result = None

if result is None or result.returncode != 0:
    print('⚠️  No GPU detected. Go to Runtime > Change runtime type and select T4 GPU.')
    print('   The pipeline will still run on CPU but will be very slow.')
else:
    print(result.stdout)
    print('✅ GPU is available!')

# Cross-check what PyTorch itself can see; this is what the pipeline relies on.
import torch
print(f'PyTorch version : {torch.__version__}')
print(f'CUDA available  : {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU device      : {torch.cuda.get_device_name(0)}')
    # total_memory is reported in bytes; shown here in decimal gigabytes.
    print(f'VRAM            : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

## Step 2 — Mount Google Drive & Set Up Project

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os

# ─────────────────────────────────────────────────────────────
# OPTION A: Project is already zipped in your Drive
#   Upload nlp_new.zip to your Drive root, then run this block.
#
# OPTION B: Upload directly from local machine (next cell)
# ─────────────────────────────────────────────────────────────

DRIVE_ZIP = '/content/drive/MyDrive/nlp_new.zip'   # change if needed
PROJECT_DIR = '/content/nlp_new'

if os.path.exists(DRIVE_ZIP):
    print(f'Found zip at {DRIVE_ZIP}. Extracting...')
    !unzip -q "{DRIVE_ZIP}" -d /content/
    print('✅ Extracted.')
else:
    print(f'❌ Zip not found at {DRIVE_ZIP}.')
    print('   Run the next cell to upload your files directly.')

os.chdir(PROJECT_DIR)
print(f'Working directory: {os.getcwd()}')

In [None]:
# ── OPTION B: Upload files directly from your local machine ──
# Run this ONLY if you did not use Option A above.

# from google.colab import files
# uploaded = files.upload()   # select nlp_new.zip from your Mac
# !unzip -q nlp_new.zip -d /content/
# import os; os.chdir('/content/nlp_new')

## Step 3 — Install Dependencies

In [None]:
# Install all required packages.
# faiss-gpu replaces faiss-cpu for CUDA acceleration.
# bitsandbytes enables 4-bit quantisation of Gemma & Sarvam models.

!pip install -q \
    pandas numpy \
    torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 \
    transformers>=4.38.0 \
    accelerate \
    bitsandbytes \
    faiss-gpu \
    seaborn matplotlib \
    sentencepiece \
    huggingface_hub

print('✅ All packages installed.')

## Step 4 — (Optional) Hugging Face Login
Required for gated models like **google/gemma-2-2b-it**.

In [None]:
# Authenticate this runtime with the Hugging Face Hub (needed for gated models).
# Paste your Hugging Face token from https://huggingface.co/settings/tokens
# Make sure you have accepted the Gemma model terms on the HF model page.

from huggingface_hub import login
login()   # will prompt for token interactively

## Step 5 — Enable 4-bit Quantisation in the Model Loaders
This patches `model_t5.py` and `model_sarvam.py` so that their `from_pretrained` calls load the models with a 4-bit `BitsAndBytesConfig`.

In [None]:
# Patch model_t5.py and model_sarvam.py to use 4-bit quantisation.
# This reduces VRAM usage from ~6 GB to ~2 GB per model.

QUANT_PATCH = '''
from transformers import BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
'''

import ast
import pathlib
import re

# Loader classes whose ``from_pretrained`` calls receive the quantisation kwargs.
_QUANT_LOADER_CLASSES = frozenset({'AutoModelForSeq2SeqLM', 'AutoModelForCausalLM'})


def _is_quantisable_load(node: ast.AST) -> bool:
    """Return True for ``<loader class>.from_pretrained(...)`` call nodes."""
    if not isinstance(node, ast.Call):
        return False
    func = node.func
    if not (isinstance(func, ast.Attribute) and func.attr == 'from_pretrained'):
        return False
    owner = func.value
    # Accept both ``AutoModelForCausalLM`` and ``transformers.AutoModelForCausalLM``.
    owner_name = getattr(owner, 'id', None) or getattr(owner, 'attr', None)
    return owner_name in _QUANT_LOADER_CLASSES


def _quant_block_line(tree: ast.Module) -> int:
    """Return how many leading source lines the ``bnb_config`` block must follow.

    Preferred anchor: the end of the first *top-level* ``from transformers
    import ...`` statement (its last line, so a parenthesised multi-line import
    is never split). Fallback: just after the module docstring and any
    ``from __future__`` imports, which must stay first in the file.
    """
    for stmt in tree.body:
        if (isinstance(stmt, ast.ImportFrom) and stmt.level == 0
                and stmt.module == 'transformers'):
            return stmt.end_lineno

    after = 0
    for stmt in tree.body:
        is_docstring = (isinstance(stmt, ast.Expr)
                        and isinstance(stmt.value, ast.Constant)
                        and isinstance(stmt.value.value, str))
        is_future = isinstance(stmt, ast.ImportFrom) and stmt.module == '__future__'
        if not (is_docstring or is_future):
            break
        after = stmt.end_lineno
    return after


def patch_model_file(filepath: str) -> None:
    """Rewrite a model-loader source file in place to load models in 4-bit.

    Two edits are applied:

    1. ``QUANT_PATCH`` (which defines ``bnb_config``) is inserted after the
       first top-level ``from transformers import ...`` statement.
    2. Every ``AutoModelForSeq2SeqLM.from_pretrained(...)`` /
       ``AutoModelForCausalLM.from_pretrained(...)`` call gains
       ``quantization_config=bnb_config`` and, unless the call already passes
       one, ``device_map="auto"``.

    The file is parsed with ``ast`` rather than matched with regexes so that
    multi-line imports, nested parentheses in the arguments, trailing commas
    and pre-existing keywords do not produce invalid Python. For the simple
    single-line case the resulting text is the same as a plain textual
    insertion.

    Args:
        filepath: Path of the Python file to patch (read and written as UTF-8).

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        SyntaxError: If the file is not valid Python and so cannot be patched
            safely.
    """
    path = pathlib.Path(filepath)
    src = path.read_text(encoding='utf-8')
    # The marker doubles as the idempotency check: a second run is a no-op.
    if 'BitsAndBytesConfig' in src:
        print(f'{filepath} already patched.')
        return

    tree = ast.parse(src, filename=str(filepath))

    # ast reports columns as UTF-8 *byte* offsets, so all splicing is done on
    # the encoded source to stay correct when lines contain non-ASCII text.
    data = src.encode('utf-8')
    line_starts = [0] + [m.end() for m in re.finditer(b'\n', data)]

    def byte_offset(lineno, col):
        return line_starts[lineno - 1] + col

    edits = []  # (byte offset, text to insert at that offset)

    # --- 1. the bnb_config block -------------------------------------------
    lines_before = _quant_block_line(tree)
    if lines_before < len(line_starts):
        block_at = line_starts[lines_before]
    else:
        block_at = len(data)
    block = QUANT_PATCH
    if block_at > 0 and data[block_at - 1:block_at] != b'\n':
        # Anchor statement is the last line and has no trailing newline.
        block = '\n' + block
    edits.append((block_at, block))

    # --- 2. the from_pretrained calls --------------------------------------
    patched_calls = 0
    for node in ast.walk(tree):
        if not _is_quantisable_load(node):
            continue
        present = {kw.arg for kw in node.keywords}
        if 'quantization_config' in present:
            continue
        extra = ['quantization_config=bnb_config']
        if 'device_map' not in present:  # a duplicate keyword is a SyntaxError
            extra.append('device_map="auto"')

        close = byte_offset(node.end_lineno, node.end_col_offset) - 1
        if data[close:close + 1] != b')':
            continue  # defensive: never splice anywhere but a closing paren

        arg_nodes = [*node.args, *(kw.value for kw in node.keywords)]
        separator = ''
        if arg_nodes:
            last_end = max(byte_offset(a.end_lineno, a.end_col_offset)
                           for a in arg_nodes)
            # Between the last argument and ")" there can only be whitespace,
            # comments, closing parens and an optional trailing comma.
            tail = re.sub(rb'#[^\n]*', b'', data[last_end:close])
            if not tail.rstrip().endswith(b','):
                separator = ','
        edits.append((close, separator + '\n        ' + ',\n        '.join(extra)))
        patched_calls += 1

    # Apply back-to-front so earlier offsets stay valid.
    for position, text in sorted(edits, key=lambda edit: edit[0], reverse=True):
        data = data[:position] + text.encode('utf-8') + data[position:]

    path.write_text(data.decode('utf-8'), encoding='utf-8')
    if not patched_calls:
        print(f'⚠️  No from_pretrained calls needed patching in {filepath}.')
    print(f'✅ Patched {filepath} with 4-bit quantisation.')

# Patch both model loader modules (paths are relative to the working directory).
for model_file in ('model_t5.py', 'model_sarvam.py'):
    patch_model_file(model_file)

## Step 6 — Run the Full Pipeline

In [None]:
# Pre-flight check: report whether each Telugu split is present (and its size)
# before starting. Paths are relative to the current working directory.
import os

TEL_TRAIN = 'IndicDialogue Dataset/dataset/Splitted_Dataset/train/tel/tel.jsonl'
TEL_TEST  = 'IndicDialogue Dataset/dataset/Splitted_Dataset/test/tel/tel.jsonl'

for p in (TEL_TRAIN, TEL_TEST):
    if os.path.exists(p):
        status = '✅'
        size = f'({os.path.getsize(p)/1024:.1f} KB)'
    else:
        status = '❌'
        size = ''
    print(f'{status} {p} {size}')

In [None]:
# ── Stage 1: Data Loading ──────────────────────────────────────
import logging, sys

# basicConfig() does nothing when the root logger already has handlers, which
# is the case whenever this cell is re-run and may also be the case in a hosted
# notebook kernel. force=True removes the existing root handlers first so the
# INFO-level stdout handler below always takes effect.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(name)s %(levelname)s %(message)s',
                    handlers=[logging.StreamHandler(sys.stdout)],
                    force=True)

from config import Config
from data_loader import DataLoader

# `config` is shared by every later stage, so it is created once here.
config = Config()
loader = DataLoader(config)
raw_df = loader.load_data(split='train')
print(f'\nLoaded {len(raw_df)} Telugu dialogue lines.')
raw_df.head()

In [None]:
# ── Stage 2: Data Cleaning ────────────────────────────────────
# Runs the project's DataCleaner over the 'text' column of raw_df (from
# Stage 1) and reports how many rows were dropped.
from data_cleaner import DataCleaner

cleaner = DataCleaner(config)
cleaned_df = cleaner.clean_dataset(raw_df, text_column='text')
# The removed count is simply the difference in row counts before and after.
print(f'After cleaning: {len(cleaned_df)} rows (removed {len(raw_df)-len(cleaned_df)} noisy lines).')

In [None]:
# ── Stage 3: Dialogue Segmentation ───────────────────────────
# Converts cleaned_df (from Stage 2) into dialogue pairs using the project's
# DialogueSegmenter.
from dialogue_segmenter import DialogueSegmenter

segmenter = DialogueSegmenter(config)
dialogue_pairs = segmenter.segment_dialogues(cleaned_df, text_column='text')
# config.context_window_size is only reported here; presumably the segmenter
# reads the same setting internally (verify in dialogue_segmenter.py).
print(f'Generated {len(dialogue_pairs)} dialogue pairs (context window = {config.context_window_size}).')

In [None]:
# ── Stage 4: Triplet Construction ────────────────────────────
# Builds triplets from the Stage 3 dialogue pairs and shows the keys of the
# first one as a quick schema check.
from tripplet_builder import TripletBuilder

builder = TripletBuilder(config)
triplets = builder.build_triplets(dialogue_pairs)
print(f'Built {len(triplets)} anchor-positive triplets.')
if triplets:
    print('Sample triplet keys:', list(triplets[0].keys()))
else:
    # Nothing was built, so there is no sample to inspect.
    print('Sample triplet keys:', 'N/A')

In [None]:
# ── Stage 5: MuRIL Embedding (GPU-accelerated) ────────────────
# Embeds the anchor text of every triplet that has a non-empty 'anchor'.
from embedder import MuRILEmbedder
import torch

print(f'Embedding on: {"CUDA" if torch.cuda.is_available() else "CPU"}')
embedder = MuRILEmbedder(config)

# Triplets with a missing or empty anchor are skipped; the rest are stringified.
anchors = (t.get('anchor') for t in triplets)
contexts = [str(anchor) for anchor in anchors if anchor]
context_embeddings = embedder.get_embeddings(contexts)
print(f'Embedding matrix shape: {context_embeddings.shape}')

In [None]:
# ── Stage 6: FAISS Vector DB ──────────────────────────────────
from vectordb_store import VectorDBStore

# context_embeddings was computed in Stage 5 only for triplets with a truthy
# 'anchor'. Apply the same filter here so the triplet list handed to the index
# has the same length and order as the embeddings; passing the unfiltered list
# would shift every entry after the first skipped triplet.
# NOTE(review): assumes build_index pairs embedding i with triplet i — confirm
# in vectordb_store.py. Keep this predicate in sync with Stage 5.
indexed_triplets = [t for t in triplets if t.get('anchor')]

vector_db = VectorDBStore(config)
vector_db.build_index(context_embeddings, indexed_triplets)
vector_db.save_index()
print('✅ FAISS index built and saved.')

In [None]:
# ── Stage 7: Response Generation (4-bit quantised models) ─────
# This is the slowest stage. Checkpointing is built-in — if it crashes,
# re-run this cell and it will resume from the last checkpoint.

# Free what can be freed before the generation models are loaded: collect
# unreachable Python objects, then ask PyTorch to release its cached CUDA
# memory. Objects still referenced from earlier cells are not affected.
import gc, torch
gc.collect()
torch.cuda.empty_cache()

from response_generator import ResponseGenerator

generator = ResponseGenerator(config)
# Takes the Stage 4 triplets; the result is consumed by the evaluation stage.
enriched_triplets = generator.generate_all_responses(triplets)
print(f'✅ Generated responses for {len(enriched_triplets)} triplets.')

In [None]:
# ── Stage 8: Evaluation ───────────────────────────────────────
# Scores the Stage 7 output with the project's Evaluator.
from evaluator import Evaluator

evaluator = Evaluator(config)
evaluation_results = evaluator.evaluate_dataset(enriched_triplets)
# evaluation_results is sized with len() here and reused by Stages 9 and 10.
print(f'✅ Evaluated {len(evaluation_results)} samples.')

In [None]:
# ── Stage 9: Metrics Logging ──────────────────────────────────
# Hands the Stage 8 results to the project's MetricsLogger.
# NOTE(review): the output location is decided inside MetricsLogger/config and
# is not visible from this cell.
from metrics_logger import MetricsLogger

metrics_logger = MetricsLogger(config)
metrics_logger.log_results(evaluation_results)
print('✅ Metrics saved.')

In [None]:
# ── Stage 10: Results Analysis & Visualisation ────────────────
from results_analyzer import ResultsAnalyzer
from visualizer import Visualizer

# Aggregate the Stage 8 results into a summary matrix and print it.
analyzer = ResultsAnalyzer(config)
summary_matrix = analyzer.analyze_metrics(evaluation_results)
analyzer.print_summary(summary_matrix)

# The same summary matrix feeds both plots.
# NOTE(review): whether the plots are shown inline, written to disk, or both is
# decided inside Visualizer — confirm there.
visualizer = Visualizer(config)
visualizer.plot_heatmap(summary_matrix)
visualizer.plot_bar_charts(summary_matrix)

print('\n✅ Pipeline completed successfully!')

## Step 7 — Save Outputs to Google Drive

In [None]:
import shutil, os

DRIVE_OUTPUT = '/content/drive/MyDrive/nlp_new_outputs'
os.makedirs(DRIVE_OUTPUT, exist_ok=True)

# Mirror each result folder of the project into Drive; folders the pipeline
# did not produce are reported and skipped.
for folder in ('outputs', 'logs', 'vector_db'):
    src = os.path.join('/content/nlp_new', folder)
    if not os.path.exists(src):
        print(f'⚠️  {folder}/ not found, skipping.')
        continue
    dst = os.path.join(DRIVE_OUTPUT, folder)
    # dirs_exist_ok lets a re-run copy over a previous export.
    shutil.copytree(src, dst, dirs_exist_ok=True)
    print(f'✅ Copied {folder}/ → Drive')

print(f'\nAll outputs saved to {DRIVE_OUTPUT}')

In [None]:
# Optional: also zip and download to your local machine
# Uncomment to run.

# import shutil
# from google.colab import files
# shutil.make_archive('/content/nlp_new_outputs', 'zip', DRIVE_OUTPUT)
# files.download('/content/nlp_new_outputs.zip')