# Latex Quality Classification with YOLOv8m (Detection to Classification)

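This notebook takes a **YOLOv8 Detection Dataset** (images + `.txt` labels) and converts it into a **Classification Dataset** (one folder per class) before training. Each image is assigned the class of the first object listed in its label file, and the images of every class are then re-split into train and validation sets.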

- **Dataset:** `latex-yolov8.zip` (YOLOv8 Detection format), read from Google Drive.
- **Task:** Convert detection labels -> classification folders -> train YOLOv8m-cls.
- **Output:** `Latex.pt` saved to Google Drive.

In [None]:
# 1. Install Ultralytics
# Notebook shell escape (not plain Python): installs the `ultralytics` package,
# from which the YOLO class is imported in the training cell.
# No version is pinned, so pip installs whatever release is current.
!pip install ultralytics

In [None]:
# 2. Mount Google Drive
# The dataset archive is read from, and the trained model later written to,
# paths under /content/drive/MyDrive, so Drive must be mounted at /content/drive
# before the following cells run.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 3. Extract and Convert Dataset (Detection -> Classification)
import zipfile
import os
import shutil
import yaml
import random
from glob import glob
from tqdm import tqdm

# Input archive on Drive, scratch directory for the raw detection export, and
# the root of the classification dataset that is built from it.
zip_path = '/content/drive/MyDrive/latex-yolov8.zip'
extract_root = '/content/temp_extraction'
dataset_root = '/content/yolo_classification_data'

# Start from a clean slate so files from an earlier run cannot leak into this one.
for stale_dir in (extract_root, dataset_root):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

print(f"Extracting {zip_path}...")
try:
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_root)
except FileNotFoundError:
    # Re-raise with a message that names the expected location of the archive.
    raise FileNotFoundError(f"Error: {zip_path} not found in MyDrive.")
else:
    print("Extraction complete.")

# --- Step 3a: Read Class Names from data.yaml ---
# Result: `class_names`, a list of strings whose index is the class id used in
# the label files. Source is the first data.yaml found under extract_root, or
# failing that the first classes.txt (one class name per line).
# Sorted so the same file is picked on every run if the archive holds several.
yaml_files = sorted(glob(os.path.join(extract_root, '**', 'data.yaml'), recursive=True))
if yaml_files:
    with open(yaml_files[0], 'r', encoding='utf-8') as f:
        data_config = yaml.safe_load(f)
    raw_names = data_config['names']
    if isinstance(raw_names, dict):
        # `names` may be written as an {id: name} mapping instead of a list.
        # Iterating a dict would yield the integer ids, not the names, so it
        # is converted to a list ordered by class id.
        names_by_id = {int(class_id): label for class_id, label in raw_names.items()}
        if sorted(names_by_id) != list(range(len(names_by_id))):
            raise ValueError(
                f"Class ids in {yaml_files[0]} must be contiguous and start at 0, "
                f"got {sorted(names_by_id)}"
            )
        raw_names = [names_by_id[class_id] for class_id in range(len(names_by_id))]
    # YAML parses unquoted numeric names as numbers; later steps call string
    # methods on every name and use it as a folder name.
    class_names = [str(label) for label in raw_names]
else:
    # Fallback if no yaml found
    print("Warning: data.yaml not found. Searching for 'classes.txt'...")
    txt_files = sorted(glob(os.path.join(extract_root, '**', 'classes.txt'), recursive=True))
    if not txt_files:
        raise FileNotFoundError("Could not find data.yaml or classes.txt to determine class names!")
    with open(txt_files[0], 'r', encoding='utf-8') as f:
        class_names = [line.strip() for line in f]
    # Drop blank trailing lines only: they would otherwise become empty class
    # names. Interior lines are kept so that line number == class id holds.
    while class_names and not class_names[-1]:
        class_names.pop()

if not class_names:
    raise ValueError("No class names were found in data.yaml / classes.txt.")

print(f"Classes found: {class_names}")

# --- Step 3b: Setup Classification Directory Structure ---
def normalize_class_name(name):
    """Return the canonical spelling of a dataset class name.

    Known misspellings are matched case-insensitively and replaced by their
    corrected form; any other name is returned exactly as it was given.
    """
    # Misspelling (lower-cased) -> corrected class name.
    known_typos = {'yellow altex': 'yellow latex'}
    return known_typos.get(name.lower(), name)

# Create one folder per (split, class) pair under dataset_root. Class folders
# use the corrected class names, so a misspelled class and its corrected
# spelling share a single folder.
splits = ['train', 'valid', 'test']
class_folder_names = [normalize_class_name(raw_name) for raw_name in class_names]
for split_dir in splits:
    for class_folder in class_folder_names:
        target_dir = os.path.join(dataset_root, split_dir, class_folder)
        os.makedirs(target_dir, exist_ok=True)

# --- Step 3c: Move Images based on Labels ---
# Nothing is copied in this step: it only records, per class, which source
# images belong to it. The copy happens when the images are distributed into
# splits afterwards. The split an image came from in the detection export
# (train/valid/test) is deliberately ignored, because every class is re-split
# from scratch later on.
print("\nConverting Detection structure to Classification folders...")
stats = {normalize_class_name(name): 0 for name in class_names}
class_image_paths = {normalize_class_name(name): [] for name in class_names}
# Images that cannot be assigned a class are counted per reason and reported,
# instead of disappearing silently.
skipped = {'missing label file': 0, 'empty label file': 0, 'unusable class id': 0}

# Find all 'images' folders (standard YOLO export structure).
# Sorted so the collected order does not depend on filesystem listing order.
image_folders = sorted(glob(os.path.join(extract_root, '**', 'images'), recursive=True))

for img_folder in image_folders:
    # Labels are expected in a 'labels' folder next to the 'images' folder.
    parent_dir = os.path.dirname(img_folder)
    label_folder = os.path.join(parent_dir, 'labels')
    if not os.path.exists(label_folder):
        print(f"Warning: No labels folder found for {img_folder}, skipping...")
        continue

    for img_path in sorted(glob(os.path.join(img_folder, '*'))):
        name_no_ext = os.path.splitext(os.path.basename(img_path))[0]
        label_path = os.path.join(label_folder, name_no_ext + '.txt')
        if not os.path.exists(label_path):
            skipped['missing label file'] += 1
            continue

        # Only the first non-blank entry is needed: the image takes the class
        # of the first object listed in its label file.
        with open(label_path, 'r') as f:
            first_entry = next((line for line in f if line.strip()), None)
        if first_entry is None:
            skipped['empty label file'] += 1
            continue

        try:
            class_id = int(first_entry.split()[0])
        except ValueError:
            skipped['unusable class id'] += 1
            continue
        if not 0 <= class_id < len(class_names):
            skipped['unusable class id'] += 1
            continue

        clean_name = normalize_class_name(class_names[class_id])
        class_image_paths[clean_name].append(img_path)
        stats[clean_name] += 1

if any(skipped.values()):
    print("Images skipped during conversion:")
    for reason, count in skipped.items():
        if count:
            print(f"  {reason}: {count}")

# --- Step 3d: Distribute Images to Train/Val/Test ---
# The dataset might have come with empty val/test folders, so we enforce a split here.
# Only 'train' and 'valid' are populated; no images are written to 'test'.
TRAIN_RATIO = 0.8
VAL_RATIO = 0.2
SPLIT_SEED = 42  # fixed seed so re-running the cell reproduces the same split

if abs(TRAIN_RATIO + VAL_RATIO - 1.0) > 1e-9:
    raise ValueError("TRAIN_RATIO and VAL_RATIO must sum to 1.0")


def _copy_without_overwrite(src_path, dest_dir):
    """Copy src_path into dest_dir and return the path of the copy.

    Images gathered from different source folders can share a basename. When
    the target name is already taken, a numeric suffix is added so that the
    earlier copy is not overwritten.
    """
    os.makedirs(dest_dir, exist_ok=True)
    stem, ext = os.path.splitext(os.path.basename(src_path))
    dest_path = os.path.join(dest_dir, stem + ext)
    attempt = 1
    while os.path.exists(dest_path):
        dest_path = os.path.join(dest_dir, f"{stem}_{attempt}{ext}")
        attempt += 1
    shutil.copy(src_path, dest_path)
    return dest_path


# Private generator: reproducible shuffling without touching the global
# `random` state.
split_rng = random.Random(SPLIT_SEED)

print("\nDistributing images into Train/Val splits...")
for cls_name, paths in class_image_paths.items():
    split_rng.shuffle(paths)
    total_imgs = len(paths)
    if total_imgs == 0:
        print(f"Warning: No images found for class '{cls_name}'")
        continue

    if total_imgs == 1:
        # A single image is most useful for training; a class that is absent
        # from the training set cannot be learned at all.
        print(f"Warning: class '{cls_name}' has a single image; it goes to train only.")
        train_count = 1
    else:
        # Keep at least one image on each side of the split.
        train_count = int(total_imgs * TRAIN_RATIO)
        train_count = min(max(train_count, 1), total_imgs - 1)

    train_imgs = paths[:train_count]
    val_imgs = paths[train_count:]

    # Copy files
    for img in train_imgs:
        _copy_without_overwrite(img, os.path.join(dataset_root, 'train', cls_name))

    for img in val_imgs:
        _copy_without_overwrite(img, os.path.join(dataset_root, 'valid', cls_name))

    print(f"  {cls_name}: {len(train_imgs)} train, {len(val_imgs)} valid")

# Report how many images were assigned to each class.
print("\nConversion Stats (Total Images per Class):")
for class_label in stats:
    print(f"  {class_label}: {stats[class_label]}")

# Abort if the conversion produced nothing at all.
total_images = sum(stats.values())
if not total_images:
    raise RuntimeError("No images were converted! Check if labels match classes.")

# Abort if no file ended up in any class folder of the validation split.
valid_file_pattern = os.path.join(dataset_root, 'valid', '*', '*')
total_val = len(glob(valid_file_pattern))
if not total_val:
    raise RuntimeError("Validation set is empty! YOLO requires validation images.")

In [None]:
# 4. Train YOLOv8m Classifier
from ultralytics import YOLO

# Start from the pretrained YOLOv8m classification checkpoint.
model = YOLO('yolov8m-cls.pt')

# Training configuration: the dataset is the folder-per-class tree built in
# step 3, and `name` labels this run's output folder.
train_settings = {
    'data': dataset_root,
    'epochs': 50,
    'imgsz': 224,
    'batch': 16,
    'name': 'latex_quality_training',
}

# Train
results = model.train(**train_settings)

In [None]:
# 5. Save Model to Drive
trained_model_path = 'runs/classify/latex_quality_training/weights/best.pt'
destination_path = '/content/drive/MyDrive/Latex.pt'

if os.path.exists(trained_model_path):
    shutil.copy(trained_model_path, destination_path)
    print(f"‚úÖ SUCCESS: Model saved to {destination_path}")
    
    # Optional: Copy confusion matrix or results to drive for inspection
    results_dir = '/content/drive/MyDrive/Latex_Results'
    if not os.path.exists(results_dir): os.makedirs(results_dir)
    shutil.copy('runs/classify/latex_quality_training/results.csv', os.path.join(results_dir, 'results.csv'))
    print(f"üìä Training results saved to {results_dir}")
else:
    print("‚ùå ERROR: Trained model file not found.")
    !ls -R runs/