In [None]:
# Install specific versions of libraries to match your Docker environment
# This is crucial to avoid version incompatibility issues when loading models.
# The pins cover scikit-learn, joblib, numpy and pdfminer.six, i.e. every
# third-party package imported further down in this cell.
!pip install scikit-learn==1.3.0 joblib==1.3.2 numpy==1.25.2 pdfminer.six==20221105

# IMPORTANT: After running the above pip install command,
# YOU MUST RESTART THE COLAB RUNTIME before proceeding.
# Go to "Runtime" -> "Restart runtime" in the Colab menu.
# Then, run all cells from the beginning again.

import os
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTLine, LTRect, LTFigure, LTTextBoxHorizontal # Import LTTextBoxHorizontal

# --- Configuration for GitHub Repository and Model Saving ---
# Remote repository that is cloned to obtain the sample PDFs and JSON outlines.
GITHUB_REPO_URL = "https://github.com/jhaaj08/Adobe-India-Hackathon25.git"
# Directory created by `git clone`: last URL segment with '.git' removed.
REPO_NAME = GITHUB_REPO_URL.rsplit('/', 1)[-1].replace('.git', '')
CHALLENGE_1A_DIR = os.path.join(REPO_NAME, "Challenge_1a")
# Input PDFs and their same-named ground-truth JSON files.
SAMPLE_INPUT_DIR = os.path.join(REPO_NAME, "Challenge_1a", "sample_dataset/pdfs")
SAMPLE_OUTPUT_DIR = os.path.join(REPO_NAME, "Challenge_1a", "sample_dataset/outputs")

# Where the trained classifier and its feature scaler are written.
MODEL_DIR = "colab_model_files"
MODEL_PATH = os.path.join(MODEL_DIR, "outline_classifier.joblib")
SCALER_PATH = os.path.join(MODEL_DIR, "scaler.joblib")

# --- Feature Extraction Functions (Copied and adapted from outline_extractor.py) ---
def get_font_weight_score(fontname):
    """Score a font name for emphasis: +2 for a bold marker, +0.5 for an italic marker.

    Matching is a case-insensitive substring test, so any name that merely
    contains a marker (for example 'it' inside 'Bitstream') also scores.
    A missing or empty name scores 0.
    """
    if not fontname:
        return 0

    lowered = fontname.lower()
    bold_markers = ('bold', 'bd', 'heavy', 'black')
    italic_markers = ('italic', 'it')

    looks_bold = any(marker in lowered for marker in bold_markers)
    looks_italic = any(marker in lowered for marker in italic_markers)

    # The two contributions are independent and simply add up.
    return (2 if looks_bold else 0) + (0.5 if looks_italic else 0)

def get_text_properties(element, page_width, page_height, page_font_sizes=None, prev_element_bbox=None):
    """
    Extracts a rich set of features from a PDF text element (LTTextBoxHorizontal).

    Args:
        element: Layout element to describe. Anything that is not an
            LTTextBoxHorizontal, or whose stripped text is empty, yields None.
        page_width: Page width, used to normalize the left x coordinate.
        page_height: Accepted for signature compatibility; no feature computed
            here reads it.
        page_font_sizes: Optional collection of font sizes seen on the page;
            its median is the reference for ``relative_font_size``.
        prev_element_bbox: Optional (x0, y0, x1, y1) bbox of the element that
            sits directly above this one on the same page.

    Returns:
        A dict with the keys ``text``, ``font_size``, ``is_uppercase``,
        ``is_bold``, ``is_italic``, ``font_weight_score``, ``line_length``,
        ``x_position``, ``x_position_normalized``, ``relative_font_size``,
        ``vertical_space_above``, ``has_prefix``, ``char_density``,
        ``is_numeric_only`` and ``bbox``; or None when the element is skipped.

    NOTE(review): the saved model is only valid if inference computes these
    features the same way -- confirm the extractor this was copied from agrees
    on ``vertical_space_above`` and ``has_prefix``.
    """
    if not isinstance(element, LTTextBoxHorizontal):
        return None

    text = element.get_text().strip()
    if not text:
        return None

    font_size = 0
    font_is_bold = False
    font_is_italic = False
    font_weight_score = 0

    # Font properties come from the first LTChar found in the box; the font is
    # assumed to be consistent for the rest of the box.
    for text_line in element:
        for character in text_line:
            if isinstance(character, LTChar):
                font_size = round(character.size, 2)
                fontname = character.fontname.lower()
                font_weight_score = get_font_weight_score(character.fontname)
                if 'bold' in fontname or 'bd' in fontname:
                    font_is_bold = True
                if 'italic' in fontname or 'it' in fontname:
                    font_is_italic = True
                break  # First character is representative for the line
        if font_size > 0:  # Font found, no need to look at further lines
            break

    relative_font_size = 0
    if page_font_sizes and font_size:
        median_font_size = np.median(list(page_font_sizes))
        if median_font_size > 0:
            relative_font_size = font_size / median_font_size

    vertical_space_above = 0
    if prev_element_bbox and element.bbox:
        # pdfminer bboxes are (x0, y0, x1, y1) with the origin at the bottom-left
        # of the page, so y grows upward. The gap above this box is therefore the
        # previous element's bottom edge (y0) minus this box's top edge (y1).
        # Overlapping or out-of-order boxes give a negative gap and clamp to 0.
        vertical_space_above = prev_element_bbox[1] - element.bbox[3]
        vertical_space_above = max(0, min(vertical_space_above, 100))

    x_position_normalized = element.bbox[0] / page_width if page_width > 0 else 0

    line_width = element.bbox[2] - element.bbox[0]
    char_density = len(text) / line_width if line_width > 0 else 0

    has_prefix = 0
    if len(text) > 2:
        # Split on any whitespace: a text box can span several lines, and a
        # plain split(' ') would glue the first word to the following line.
        words = text.split()
        first_word = words[0]
        stem = first_word[:-1]
        # "1." as well as multi-level numbering such as "1.1." or "2.3.4."
        dotted_number = all(part.isdigit() for part in stem.split('.'))
        keyword_heading = first_word.lower() in ("chapter", "section") and 2 <= len(words) < 5
        if (first_word.endswith('.') and (dotted_number or stem.isalpha())) or \
           (first_word.isdigit() and len(first_word) < 4) or \
           (first_word.isupper() and len(first_word) < 8 and first_word.isalpha()) or \
           keyword_heading:
            has_prefix = 1

    is_numeric_only = text.replace('.', '').replace(',', '').replace(' ', '').isdigit() and len(text) < 10

    return {
        "text": text,
        "font_size": font_size,
        "is_uppercase": text.isupper(),
        "is_bold": font_is_bold,
        "is_italic": font_is_italic,
        "font_weight_score": font_weight_score,
        "line_length": len(text),
        "x_position": element.bbox[0],
        "x_position_normalized": x_position_normalized,
        "relative_font_size": relative_font_size,
        "vertical_space_above": vertical_space_above,
        "has_prefix": has_prefix,
        "char_density": char_density,
        "is_numeric_only": is_numeric_only,
        "bbox": element.bbox
    }

def normalize_text_for_comparison(text):
    """Lower-case *text* and collapse every run of whitespace to one space.

    Leading and trailing whitespace is dropped and newlines count as
    whitespace, so parsed PDF text and ground-truth strings can be compared
    regardless of spacing or line breaks.
    """
    lowered = text.lower()
    # split() with no argument discards all whitespace, newlines included.
    tokens = lowered.split()
    return ' '.join(tokens)

def _load_outline_levels(json_path):
    """Return {normalized heading text: level} read from one ground-truth JSON file."""
    with open(json_path, 'r', encoding='utf-8') as f:
        ground_truth = json.load(f)

    outline_levels = {}
    for entry in ground_truth.get('outline', []):
        normalized_gt_text = normalize_text_for_comparison(entry.get('text') or '')
        level = entry.get('level')
        # An empty key is a substring of every text box and would label the
        # whole document as a heading, so incomplete entries are dropped.
        if normalized_gt_text and level is not None:
            outline_levels[normalized_gt_text] = level
    return outline_levels


def _match_outline_level(normalized_text, outline_levels, min_fragment_ratio=0.5):
    """Return the ground-truth level for a text box, or 'Body' when nothing matches.

    An exact match wins. Otherwise the longest substring match is used: either a
    heading contained in the box, or the box being a fragment of a heading that
    covers at least ``min_fragment_ratio`` of it (so a stray "1" page number does
    not inherit the level of "1. introduction").
    """
    if normalized_text in outline_levels:
        return outline_levels[normalized_text]

    best_level = 'Body'
    best_overlap = 0
    for gt_norm_text, gt_level in outline_levels.items():
        if not gt_norm_text:
            continue
        if gt_norm_text in normalized_text:
            overlap = len(gt_norm_text)
        elif (normalized_text in gt_norm_text
              and len(normalized_text) >= min_fragment_ratio * len(gt_norm_text)):
            overlap = len(normalized_text)
        else:
            continue
        if overlap > best_overlap:
            best_level, best_overlap = gt_level, overlap
    return best_level


def _extract_sorted_elements(pdf_path):
    """Return the text/line/rect/figure elements of a PDF, each page sorted top-to-bottom."""
    elements = []
    for page_num, page_layout in enumerate(extract_pages(pdf_path), start=1):
        page_width = page_layout.bbox[2] - page_layout.bbox[0]
        page_height = page_layout.bbox[3] - page_layout.bbox[1]

        page_elements = [
            {
                "element": element,
                "page": page_num,
                "bbox": element.bbox,
                "page_width": page_width,
                "page_height": page_height
            }
            for element in page_layout
            if isinstance(element, (LTTextBoxHorizontal, LTLine, LTRect, LTFigure))
        ]
        # Larger y is higher on the page, so descending y reads top-to-bottom.
        page_elements.sort(key=lambda item: item["bbox"][1], reverse=True)
        elements.extend(page_elements)
    return elements


def _page_font_sizes(elements):
    """Map page number -> set of positive font sizes found in that page's text boxes."""
    sizes_by_page = {}
    for item in elements:
        # Non-text elements and empty boxes yield None and are ignored.
        props = get_text_properties(item["element"], item["page_width"], item["page_height"])
        if props and props["font_size"] > 0:
            sizes_by_page.setdefault(item["page"], set()).add(props["font_size"])
    return sizes_by_page


def collect_real_training_data():
    """
    Collects features and labels from the sample input/output PDFs.

    Every PDF in SAMPLE_INPUT_DIR that has a same-named JSON file in
    SAMPLE_OUTPUT_DIR is parsed; each non-empty horizontal text box becomes one
    sample. PDFs are visited in sorted name order so the sample order (and with
    it any seeded train/test split) is reproducible.

    Returns:
        (X, y): ``X`` is a 2-D array with one 13-feature row per text box, ``y``
        the matching labels (0=H1, 1=H2, 2=H3, 3=Body). Two empty arrays are
        returned when the input directory is missing or holds no PDFs.
    """
    X = []  # Features
    y = []  # Labels
    level_to_int = {"H1": 0, "H2": 1, "H3": 2, "Body": 3}

    pdf_files = []
    if os.path.isdir(SAMPLE_INPUT_DIR):
        pdf_files = sorted(f for f in os.listdir(SAMPLE_INPUT_DIR) if f.lower().endswith(".pdf"))

    if not pdf_files:
        print(f"No PDF files found in {SAMPLE_INPUT_DIR}. Please check the repository structure.")
        return np.array([]), np.array([])

    for pdf_filename in pdf_files:
        pdf_path = os.path.join(SAMPLE_INPUT_DIR, pdf_filename)
        json_filename = os.path.splitext(pdf_filename)[0] + ".json"
        json_path = os.path.join(SAMPLE_OUTPUT_DIR, json_filename)

        if not os.path.exists(json_path):
            print(f"Warning: No corresponding JSON found for {pdf_filename} at {json_path}. Skipping.")
            continue

        print(f"Processing training data for: {pdf_filename}")

        ground_truth_outline_map = _load_outline_levels(json_path)
        all_elements_raw = _extract_sorted_elements(pdf_path)
        # Computed once per PDF instead of re-extracting properties repeatedly.
        font_sizes_by_page = _page_font_sizes(all_elements_raw)

        prev_element_bbox = None
        prev_page = None

        for elem_data in all_elements_raw:
            element = elem_data["element"]
            page_num = elem_data["page"]

            # The gap to an element on a different page is meaningless, so the
            # first element of every page starts without a predecessor.
            if page_num != prev_page:
                prev_element_bbox = None
                prev_page = page_num

            if not isinstance(element, LTTextBoxHorizontal):
                # Lines, rects and figures only act as "what is above" markers.
                prev_element_bbox = element.bbox
                continue

            props = get_text_properties(
                element,
                elem_data["page_width"],
                elem_data["page_height"],
                font_sizes_by_page.get(page_num),
                prev_element_bbox,
            )
            prev_element_bbox = element.bbox

            if not props:
                continue

            # Determine label based on ground truth
            normalized_text = normalize_text_for_comparison(props["text"])
            assigned_level = _match_outline_level(normalized_text, ground_truth_outline_map)
            label = level_to_int.get(assigned_level, 3)  # Unknown levels count as Body

            # Feature order must stay in sync with whatever consumes the model.
            X.append([
                props["font_size"],
                int(props["is_uppercase"]),
                int(props["is_bold"]),
                int(props["is_italic"]),
                props["font_weight_score"],
                props["line_length"],
                props["x_position"],
                props["x_position_normalized"],
                props["relative_font_size"],
                props["vertical_space_above"],
                int(props["has_prefix"]),
                props["char_density"],
                int(props["is_numeric_only"])
            ])
            y.append(label)

    return np.array(X), np.array(y)

def train_and_save_model():
    """
    Trains a Logistic Regression model and a StandardScaler using real sample data, then saves them.

    Steps: clone GITHUB_REPO_URL (replacing any existing checkout), collect
    features and labels, split 80/20 with a fixed seed, fit the scaler on the
    training part only, fit the classifier, print a classification report and
    dump both objects with joblib to MODEL_PATH and SCALER_PATH.

    Returns None. The function prints a message and returns early, without
    writing any file, when the clone fails, no training data is found, or the
    data contains fewer than two classes.
    """
    import shutil
    import subprocess

    # Clone the GitHub repository
    print(f"Cloning {GITHUB_REPO_URL}...")
    if os.path.exists(REPO_NAME):
        shutil.rmtree(REPO_NAME)  # Clean up if already exists
    clone = subprocess.run(
        ["git", "clone", GITHUB_REPO_URL],
        capture_output=True,
        text=True,
    )
    # Echo both streams so git's messages still show up in the cell output.
    print(clone.stdout, end="")
    print(clone.stderr, end="")
    if clone.returncode != 0:
        print("git clone failed. Exiting training process.")
        return

    print("Collecting real training data from sample PDFs and JSONs...")
    X, y = collect_real_training_data()

    if X.size == 0:
        print("No training data collected. Exiting training process.")
        return

    _, class_counts = np.unique(y, return_counts=True)
    if len(class_counts) < 2:
        print("Training data contains fewer than two classes. Exiting training process.")
        return

    # Split data into training and testing sets. A stratified split needs at
    # least two samples per class and room for every class on both sides of
    # the split; with tiny sample sets fall back to a plain random split
    # instead of raising.
    test_size = 0.2
    n_test = int(np.ceil(test_size * len(y)))
    can_stratify = (
        class_counts.min() >= 2
        and n_test >= len(class_counts)
        and len(y) - n_test >= len(class_counts)
    )
    if not can_stratify:
        print("Warning: too few samples per class for a stratified split; using a random split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y if can_stratify else None
    )

    # Initialize and train the StandardScaler (fit on the training part only)
    print("Training StandardScaler...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize and train the Logistic Regression model
    print("Training Logistic Regression model...")
    model = LogisticRegression(max_iter=2000, random_state=42, solver='lbfgs', multi_class='multinomial')
    model.fit(X_train_scaled, y_train)

    # Evaluate the model. Passing the label ids explicitly keeps the report
    # working when a class is absent from the (small) test split; without it
    # the four target names would no longer match the classes found.
    y_pred = model.predict(X_test_scaled)
    print("\nModel Classification Report:")
    label_ids = [0, 1, 2, 3]
    target_names = ['H1', 'H2', 'H3', 'Body']
    print(classification_report(
        y_test, y_pred, labels=label_ids, target_names=target_names, zero_division=0
    ))

    # Create the model directory if it doesn't exist
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Save the trained model and scaler
    print(f"Saving model to {MODEL_PATH}")
    joblib.dump(model, MODEL_PATH)
    print(f"Saving scaler to {SCALER_PATH}")
    joblib.dump(scaler, SCALER_PATH)
    print("Model and scaler saved successfully!")

if __name__ == "__main__":
    train_and_save_model()

# --- Code to download the files from Colab ---
from google.colab import files

print("\n--- Training complete. Downloading model files ---")
# Create a zip archive of the model files for easy download
!zip -r /content/colab_model_files.zip {MODEL_DIR}

# Download the zip file
files.download('/content/colab_model_files.zip')

print("Model and scaler files zipped and downloaded to your local machine.")
print("Extract 'colab_model_files.zip' and place 'outline_classifier.joblib' and 'scaler.joblib' into your local 'adobe-hackathon-round1a/model/' directory.")


Cloning https://github.com/jhaaj08/Adobe-India-Hackathon25.git...
Cloning into 'Adobe-India-Hackathon25'...
remote: Enumerating objects: 124, done.
remote: Counting objects: 100% (124/124), done.
remote: Compressing objects: 100% (104/104), done.
remote: Total 124 (delta 26), reused 52 (delta 16), pack-reused 0 (from 0)
Receiving objects: 100% (124/124), 19.38 MiB | 23.24 MiB/s, done.
Resolving deltas: 100% (26/26), done.
Collecting real training data from sample PDFs and JSONs...
Processing training data for: file02.pdf
Processing training data for: file05.pdf
Processing training data for: file01.pdf
Processing training data for: file04.pdf
Processing training data for: file03.pdf
Training StandardScaler...
Training Logistic Regression model...

Model Classification Report:
              precision    recall  f1-score   support

          H1       1.00      0.20      0.33         5
          H2       0.80      0.67      0.73         6
          H3       0.57      0.67      

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model and scaler files zipped and downloaded to your local machine.
Extract 'colab_model_files.zip' and place 'outline_classifier.joblib' and 'scaler.joblib' into your local 'adobe-hackathon-round1a/model/' directory.
