In [1]:
!wget https://huggingface.co/datasets/barryallen16/Sarav-real-fake-automobile-parts-dataset/resolve/main/dataset_clean.zip
!unzip -q dataset_clean

--2025-10-23 23:48:18--  https://huggingface.co/datasets/barryallen16/Sarav-real-fake-automobile-parts-dataset/resolve/main/dataset_clean.zip
Resolving huggingface.co (huggingface.co)... 108.138.246.79, 108.138.246.71, 108.138.246.67, ...
Connecting to huggingface.co (huggingface.co)|108.138.246.79|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/68dc24c5ee15b4bd54002f0b/0bed99891ab2dcf6c4186bf54c7c4dbbf9af51a86d2b1e56f33ec4a49e8872dc?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251023%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251023T234818Z&X-Amz-Expires=3600&X-Amz-Signature=6f496c5d97435c72ed1d7d7d665bc7005e5fdce9a12bfbc760abe07f5037b468&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27dataset_clean.zip%3B+filename%3D%22dataset_clean.zip%22%3B&response-content-type=application%2Fzip&x-id=GetObject&

In [3]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.220-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.17-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.220-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.17-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.220 ultralytics-thop-2.0.17


In [4]:
import os
import torch
import yaml
from ultralytics import YOLO
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd

# ============================================================================
# ENVIRONMENT SETUP
# ============================================================================

print("="*80)
print("YOLOV8 CLASSIFICATION TRAINING - COUNTERFEIT BIKE PARTS")
print("="*80)
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
print("="*80 + "\n")

# ============================================================================
# CONFIGURATION
# ============================================================================

# Model selection: n=nano, s=small, m=medium, l=large, x=extra-large
# For 443 images with counterfeit detection: medium (m) recommended
MODEL_SIZE = 'm'  # Best balance for your dataset size
MODEL_PATH = f'yolov8{MODEL_SIZE}-cls.pt'

# Dataset configuration
DATA_DIR = 'dataset_clean'  # Your cleaned dataset directory

# Training hyperparameters (optimized for counterfeit detection)
TRAINING_CONFIG = {
    # Basic settings
    'data': DATA_DIR,
    'epochs': 150,  # Increased for better convergence[web:112][web:234]
    'imgsz': 320,   # Higher than 224 for better detail recognition[web:112][web:237]
    'batch': 32,    # Adjust based on GPU memory[web:112][web:239]
    'device': 0,    # GPU device (0 for single GPU)

    # Early stopping
    'patience': 30,  # Stop if no improvement for 30 epochs[web:112][web:240]

    # Optimizer settings
    'optimizer': 'AdamW',  # Best for small datasets[web:112][web:233]
    'lr0': 0.001,          # Initial learning rate[web:112][web:233]
    'lrf': 0.01,           # Final learning rate (lr0 * lrf)[web:112]
    'momentum': 0.937,     # SGD momentum/Adam beta1[web:112][web:241]
    'weight_decay': 0.0005,  # Optimizer weight decay[web:112][web:233]

    # Data augmentation for classification[web:235]
    'auto_augment': 'randaugment',  # RandAugment for classification[web:235]
    'hsv_h': 0.015,      # HSV-Hue augmentation (0.0-1.0)[web:112][web:235]
    'hsv_s': 0.7,        # HSV-Saturation augmentation (0.0-1.0)[web:112]
    'hsv_v': 0.4,        # HSV-Value augmentation (0.0-1.0)[web:112]
    'degrees': 15.0,     # Rotation (+/- deg)[web:235][web:238]
    'translate': 0.1,    # Translation (+/- fraction)[web:235]
    'scale': 0.5,        # Image scale (+/- gain)[web:112][web:235]
    'flipud': 0.5,       # Vertical flip probability[web:235][web:241]
    'fliplr': 0.5,       # Horizontal flip probability[web:235][web:238]

    # Validation settings
    'val': True,
    'save': True,
    'save_period': -1,   # Save checkpoint every x epochs (-1 = disabled)
    'cache': True,       # Cache images for faster training[web:240]
    'workers': 8,        # Number of worker threads[web:239]
    'verbose': True,

    # Output settings
    'project': 'bike_parts_classifier',
    'name': f'yolov8{MODEL_SIZE}_counterfeit_detection',
    'exist_ok': True,
    'plots': True,       # Generate training plots[web:239]
}


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
YOLOV8 CLASSIFICATION TRAINING - COUNTERFEIT BIKE PARTS
PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA device: Tesla T4
CUDA memory: 14.7 GB



In [5]:
def validate_dataset_structure(data_dir):
    """Validate dataset structure and count images"""

    print("="*80)
    print("DATASET VALIDATION")
    print("="*80)

    required_splits = ['train', 'valid']
    class_names = []

    for split in required_splits:
        split_path = Path(data_dir) / split
        if not split_path.exists():
            raise ValueError(f"Missing required directory: {split_path}")

        # Get class names from train split
        if split == 'train':
            class_names = sorted([d.name for d in split_path.iterdir() if d.is_dir()])
            print(f"\n📂 Detected {len(class_names)} classes:")
            for cls in class_names:
                print(f"   - {cls}")

    # Count images per split and class
    print(f"\n📊 Image counts:")
    total_train = 0
    total_valid = 0

    for split in required_splits:
        print(f"\n{split.upper()}:")
        split_total = 0
        for cls in class_names:
            cls_path = Path(data_dir) / split / cls
            if cls_path.exists():
                count = len(list(cls_path.glob('*.jpg'))) + len(list(cls_path.glob('*.png')))
                split_total += count
                print(f"   {cls:30s}: {count:4d} images")
            else:
                print(f"   {cls:30s}: MISSING!")

        print(f"   {'TOTAL':30s}: {split_total:4d} images")
        if split == 'train':
            total_train = split_total
        else:
            total_valid = split_total

    print(f"\n📊 Dataset Summary:")
    print(f"   Total training images: {total_train}")
    print(f"   Total validation images: {total_valid}")
    print(f"   Train/Val ratio: {total_train/total_valid:.1f}:1")

    # Check class balance
    print(f"\n⚖️  Class Balance Check:")
    for cls in class_names:
        train_count = len(list((Path(data_dir) / 'train' / cls).glob('*.[jp][pn]g')))
        if total_train > 0:
            percentage = train_count / total_train * 100
            status = "✅" if 20 <= percentage <= 80 else "⚠️"
            print(f"   {status} {cls:30s}: {percentage:5.1f}%")

    print("="*80 + "\n")

    return class_names, total_train, total_valid

In [6]:
def train_counterfeit_detector():
    """Train YOLOv8 classification model for counterfeit detection"""

    # Validate dataset
    class_names, n_train, n_valid = validate_dataset_structure(DATA_DIR)

    # Adjust batch size if dataset is small
    if n_train < 200:
        TRAINING_CONFIG['batch'] = 16
        print(f"⚠️  Small dataset detected. Reduced batch size to 16\n")

    # Load pretrained model
    print("="*80)
    print("MODEL INITIALIZATION")
    print("="*80)
    print(f"Loading YOLOv8{MODEL_SIZE.upper()}-cls pretrained on ImageNet...")
    model = YOLO(MODEL_PATH)
    print(f"✅ Model loaded successfully!")
    print(f"   Parameters: {sum(p.numel() for p in model.model.parameters())/1e6:.1f}M")
    print("="*80 + "\n")

    # Train model
    print("="*80)
    print("STARTING TRAINING")
    print("="*80)
    print(f"Configuration:")
    for key, value in TRAINING_CONFIG.items():
        if key not in ['data']:
            print(f"   {key:20s}: {value}")
    print("="*80 + "\n")

    try:
        results = model.train(**TRAINING_CONFIG)
        print("\n✅ Training completed successfully!")
        return results, model

    except KeyboardInterrupt:
        print("\n⚠️  Training interrupted by user")
        return None, model
    except Exception as e:
        print(f"\n❌ Training failed: {e}")
        raise

In [7]:
def evaluate_model(model, results):
    """Evaluate trained model and generate reports"""

    print("\n" + "="*80)
    print("MODEL EVALUATION")
    print("="*80)

    # Validate on validation set
    print("\n📊 Running validation...")
    metrics = model.val()

    # Print metrics
    print(f"\n📈 Performance Metrics:")
    print(f"   Top-1 Accuracy: {metrics.top1:.4f} ({metrics.top1*100:.2f}%)")
    print(f"   Top-5 Accuracy: {metrics.top5:.4f} ({metrics.top5*100:.2f}%)")

    # Per-class accuracy
    if hasattr(metrics, 'class_result'):
        print(f"\n📊 Per-Class Performance:")
        class_names, _, _ = validate_dataset_structure(DATA_DIR)
        for i, cls in enumerate(class_names):
            if i < len(metrics.class_result):
                print(f"   {cls:30s}: {metrics.class_result[i]:.4f}")

    print("="*80 + "\n")

    return metrics

In [8]:
def test_inference(model, test_images):
    """Run test inference on sample images"""

    print("="*80)
    print("TEST INFERENCE")
    print("="*80)

    for img_path in test_images:
        if not os.path.exists(img_path):
            print(f"⚠️  Image not found: {img_path}")
            continue

        results = model.predict(img_path, verbose=False)
        result = results[0]

        # Get prediction
        top1_idx = result.probs.top1
        top1_conf = result.probs.top1conf
        pred_class = result.names[top1_idx]

        # Get top 3 predictions
        top3_indices = result.probs.top5[:3]

        print(f"\n📸 Image: {os.path.basename(img_path)}")
        print(f"   🎯 Prediction: {pred_class}")
        print(f"   📊 Confidence: {top1_conf:.4f} ({top1_conf*100:.2f}%)")
        print(f"   📋 Top 3 predictions:")
        for i, idx in enumerate(top3_indices, 1):
            cls_name = result.names[idx]
            conf = result.probs.data[idx]
            print(f"      {i}. {cls_name:30s}: {conf:.4f} ({conf*100:.2f}%)")

    print("="*80 + "\n")

In [9]:
def export_model(model, formats=['onnx', 'torchscript']):
    """Export model to deployment formats"""

    print("="*80)
    print("MODEL EXPORT")
    print("="*80)

    for fmt in formats:
        try:
            print(f"\n📦 Exporting to {fmt.upper()}...")
            model.export(format=fmt, imgsz=320)
            print(f"   ✅ Successfully exported to {fmt.upper()}")
        except Exception as e:
            print(f"   ❌ Export to {fmt.upper()} failed: {e}")

    print("="*80 + "\n")

In [11]:
results, model = train_counterfeit_detector()

if results is not None:
    # Evaluate model
    metrics = evaluate_model(model, results)

    # Test inference on sample images
    test_images = [
        f'{DATA_DIR}/valid/spark_plug_fake/61.jpg',
        f'{DATA_DIR}/valid/helmet_genuine/52.jpg',
        f'{DATA_DIR}/valid/air_filter_fake/31.jpg',
    ]
    test_inference(model, test_images)

    # Export model
    export_model(model, formats=['onnx'])

    print("\n🎉 Training pipeline completed successfully!")
    print(f"📁 Results saved to: {TRAINING_CONFIG['project']}/{TRAINING_CONFIG['name']}/")


DATASET VALIDATION

📂 Detected 6 classes:
   - air_filter_fake
   - air_filter_genuine
   - helmet_fake
   - helmet_genuine
   - spark_plug_fake
   - spark_plug_genuine

📊 Image counts:

TRAIN:
   air_filter_fake               :   30 images
   air_filter_genuine            :   68 images
   helmet_fake                   :   51 images
   helmet_genuine                :   58 images
   spark_plug_fake               :   37 images
   spark_plug_genuine            :   82 images
   TOTAL                         :  326 images

VALID:
   air_filter_fake               :    3 images
   air_filter_genuine            :   12 images
   helmet_fake                   :   21 images
   helmet_genuine                :   21 images
   spark_plug_fake               :    9 images
   spark_plug_genuine            :   23 images
   TOTAL                         :   89 images

📊 Dataset Summary:
   Total training images: 326
   Total validation images: 89
   Train/Val ratio: 3.7:1

⚖️  Class Balance Check:
   ⚠️ a

In [12]:
    # Test inference on sample images
    test_images = [
        f'{DATA_DIR}/valid/spark_plug_fake/61.jpg',
        f'{DATA_DIR}/valid/helmet_genuine/52.jpg',
        f'{DATA_DIR}/valid/air_filter_fake/31.jpg',
    ]
    test_inference(model, test_images)


TEST INFERENCE

📸 Image: 61.jpg
   🎯 Prediction: spark_plug_genuine
   📊 Confidence: 0.3046 (30.46%)
   📋 Top 3 predictions:
      1. spark_plug_genuine            : 0.3046 (30.46%)
      2. helmet_fake                   : 0.2201 (22.01%)
      3. helmet_genuine                : 0.1808 (18.08%)

📸 Image: 52.jpg
   🎯 Prediction: helmet_fake
   📊 Confidence: 0.3222 (32.22%)
   📋 Top 3 predictions:
      1. helmet_fake                   : 0.3222 (32.22%)
      2. helmet_genuine                : 0.1724 (17.24%)
      3. air_filter_genuine            : 0.1718 (17.18%)

📸 Image: 31.jpg
   🎯 Prediction: helmet_fake
   📊 Confidence: 0.3314 (33.14%)
   📋 Top 3 predictions:
      1. helmet_fake                   : 0.3314 (33.14%)
      2. air_filter_genuine            : 0.2103 (21.03%)
      3. spark_plug_genuine            : 0.1239 (12.39%)



In [14]:
!zip -r bike_parts_classifier.zip bike_parts_classifier/

  adding: bike_parts_classifier/ (stored 0%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/ (stored 0%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/confusion_matrix.png (deflated 25%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/val_batch0_pred.jpg (deflated 3%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/val_batch1_labels.jpg (deflated 4%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/train_batch1.jpg (deflated 7%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/confusion_matrix_normalized.png (deflated 23%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/train_batch0.jpg (deflated 6%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/train_batch2.jpg (deflated 8%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/val_batch0_labels.jpg (deflated 3%)
  adding: bike_parts_classifier/yolov8m_counterfeit_detection/results.png (deflated 6%)
  adding: bike