In [None]:
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [1]:
!nvidia-smi
!lscpu

/bin/bash: line 1: nvidia-smi: command not found
Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   2
  On-line CPU(s) list:    0,1
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.20GHz
    CPU family:           6
    Model:                79
    Thread(s) per core:   2
    Core(s) per socket:   1
    Socket(s):            1
    Stepping:             0
    BogoMIPS:             4399.99
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 cl
                          flush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc re
                          p_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3
                           fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand
                

In [1]:
import zipfile
import os
import math
import pandas as pd
import cv2  # OpenCV for image processing
import numpy as np
from sklearn.model_selection import train_test_split
import ast
from tqdm import tqdm
import re

In [None]:
def extract_zip(zip_file_path, output_folder):
    """Extract every member of a zip archive into ``output_folder``.

    Args:
        zip_file_path: Path of the ``.zip`` archive to read.
        output_folder: Destination directory; created (including parents)
            when it does not exist yet.

    Raises:
        FileNotFoundError: If ``zip_file_path`` does not exist.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # when the directory appeared between the check and the creation.
    os.makedirs(output_folder, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        archive.extractall(output_folder)

# Unpack the image archive stored on Google Drive.
# NOTE(review): `output_folder` is reassigned by later cells to the YOLOv5
# export directories, so its value depends on which cell ran last.
zip_file_path = '/content/drive/MyDrive/Copy of val.zip'  # Archive to extract
output_folder = '/content/drive/MyDrive/copy of val (unzipped)'     # Directory the archive is extracted into

extract_zip(zip_file_path, output_folder)


In [None]:
def get_directory_info(directory_path):
    """Count the files below a directory and add up their sizes.

    The tree rooted at ``directory_path`` is walked recursively.

    Args:
        directory_path: Root directory to inspect.

    Returns:
        A ``(num_files, total_size)`` tuple, with ``total_size`` in bytes.
    """
    file_sizes = [
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, names in os.walk(directory_path)
        for name in names
    ]
    return len(file_sizes), sum(file_sizes)

def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes.

    Returns:
        ``"0B"`` for zero, otherwise ``"<value> <unit>"`` where ``value`` is
        rounded to two decimals and ``unit`` is one of B, KB, MB, GB, TB.
        Sizes of 1024 TB or more are still reported in TB.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative")

    size_name = ("B", "KB", "MB", "GB", "TB")

    # Step through the units instead of using floor(log(size, 1024)): the
    # logarithm is subject to floating-point error at exact powers of 1024,
    # gives a negative index for sizes below one byte, and runs past the end
    # of `size_name` for sizes of 1024**5 bytes and above.
    scaled = float(size_bytes)
    i = 0
    while scaled >= 1024 and i < len(size_name) - 1:
        scaled /= 1024
        i += 1

    s = round(scaled, 2)
    return f"{s} {size_name[i]}"

In [None]:
# Class name -> integer class id. load_data() writes these ids as the first
# field of every label file and skips rows whose class is not listed here.
# YOLOv5 resolves an id by its position in the `names` list of the dataset
# YAML, so that list has to follow the same order as the ids below.
class_mapping = {
    'lisa_pathfinder': 0,
    'proba_3_csc': 1,
    'smart_1': 2,
    'xmm_newton': 3,
    'soho': 4,
    'earth_observation_sat_1': 5,
    'debris': 6,
    'proba_2': 7,
    'proba_3_ocs': 8,
    'cheops': 9,
    'double_star': 10
}

def convert_bbox(size, box):
    """Turn a corner-style box into normalised centre/extent values.

    Args:
        size: Pair of extents. ``size[0]`` scales the x values and
            ``size[1]`` scales the y values, i.e. ``(width, height)``.
        box: Sequence where ``box[0]``/``box[2]`` are the two x corners and
            ``box[1]``/``box[3]`` are the two y corners.

    Returns:
        ``(x_center, y_center, width, height)``, each multiplied by the
        reciprocal of the matching entry of ``size``.
    """
    x_scale = 1. / size[0]
    y_scale = 1. / size[1]

    x_center = (box[0] + box[2]) / 2.0
    y_center = (box[1] + box[3]) / 2.0
    box_w = box[2] - box[0]
    box_h = box[3] - box[1]

    return (x_center * x_scale, y_center * y_scale, box_w * x_scale, box_h * y_scale)

def load_data(csv_file, images_folder, output_folder, start_index, test_size=0.1):
    """Export a slice of the annotation CSV as a YOLOv5 train/val dataset.

    Reads ``csv_file``, keeps ``len(csv) // 2`` rows starting at position
    ``start_index`` and, for every row whose image can be read and whose class
    is in ``class_mapping``, writes the image to
    ``<output_folder>/<train|val>/images`` and a one-line label file
    (``<class_id> <x_center> <y_center> <width> <height>``) to
    ``<output_folder>/<train|val>/labels``.

    Args:
        csv_file: CSV with ``filename``, ``bbox`` and ``class`` columns;
            ``bbox`` is a Python-literal sequence of four numbers.
        images_folder: Directory holding the source images. A ``.png`` suffix
            in the CSV filename is mapped to ``.jpg``.
        output_folder: Root directory of the exported dataset.
        start_index: Position of the first CSV row to process.
        test_size: Probability that a sample is written to ``val`` instead of
            ``train``. The draw uses NumPy's global random state, so call
            ``np.random.seed(...)`` beforehand for a reproducible split.

    Returns:
        ``output_folder``.
    """
    df = pd.read_csv(csv_file)
    df = df.iloc[start_index:start_index + len(df) // 2]  # Use only 50% of the CSV file
    # Starts at start_index, so the printed total is start_index plus the
    # number of samples written by this call.
    loaded_count = start_index

    # Create the four target directories once instead of on every row.
    for subset in ('train', 'val'):
        for kind in ('images', 'labels'):
            os.makedirs(os.path.join(output_folder, subset, kind), exist_ok=True)

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Loading data"):
        # Replace .png with .jpg in image filename
        image_name = row['filename'].replace('.png', '.jpg')
        image_path = os.path.join(images_folder, image_name)

        if not os.path.exists(image_path):
            print(f"Warning: File '{image_path}' not found. Skipping.")
            continue

        # Kept in OpenCV's native BGR order: cv2.imwrite expects BGR, so
        # converting to RGB before saving would swap red and blue on disk.
        image = cv2.imread(image_path)

        if image is None:
            print(f"Warning: Unable to load image '{image_path}'. Skipping.")
            continue

        bbox = ast.literal_eval(row['bbox'])
        class_label = row['class']

        if class_label not in class_mapping:
            print(f"Warning: Class label '{class_label}' not in class mapping. Skipping.")
            continue

        class_id = class_mapping[class_label]

        # image.shape is (height, width, channels) while convert_bbox scales x
        # by size[0] and y by size[1], so pass (width, height).
        # NOTE(review): assumes bbox is [x_min, y_min, x_max, y_max] - confirm
        # against the CSV schema.
        height, width = image.shape[:2]
        yolo_bbox = convert_bbox((width, height), bbox)

        # YOLO format: [<class_index> <x_center> <y_center> <width> <height>]
        annotation = f"{class_id} {' '.join(map(str, yolo_bbox))}"

        loaded_count += 1

        # Save the image and annotation in the YOLOv5 format
        subset_folder = 'train' if np.random.rand() > test_size else 'val'
        output_image_path = os.path.join(output_folder, subset_folder, 'images', image_name)
        output_label_path = os.path.join(output_folder, subset_folder, 'labels', image_name.replace('.jpg', '.txt'))

        cv2.imwrite(output_image_path, image)
        with open(output_label_path, 'w') as f:
            f.write(annotation)

    print(f"Number of files loaded: {loaded_count}")

    return output_folder

# Build the full YOLOv5 dataset from the first half of val.csv.
# The recorded run below processed 11000 rows in about 20 minutes.
val_csv_path = '/content/drive/MyDrive/Copy of labels.zip (Unzipped Files)/val.csv'  # Annotation CSV
images_folder = '/content/drive/MyDrive/copy of val (unzipped)/val'  # Folder where the source images are stored
output_folder = '/content/drive/MyDrive/dataset_yolov5'  # Root folder of the exported YOLOv5 dataset

# Load and prepare the dataset
dataset_folder = load_data(val_csv_path, images_folder, output_folder, start_index=0)
print(f"Dataset prepared in YOLOv5 format saved in: {dataset_folder}")


Loading data: 11000it [20:43,  8.84it/s]

Number of files loaded: 11000
Dataset prepared in YOLOv5 format saved in: /content/drive/MyDrive/dataset_yolov5





In [4]:
# Class name -> integer class id written into the label files.
# NOTE(review): this repeats the class_mapping defined in an earlier cell;
# keep a single definition so the two copies cannot drift apart.
class_mapping = {
    'lisa_pathfinder': 0,
    'proba_3_csc': 1,
    'smart_1': 2,
    'xmm_newton': 3,
    'soho': 4,
    'earth_observation_sat_1': 5,
    'debris': 6,
    'proba_2': 7,
    'proba_3_ocs': 8,
    'cheops': 9,
    'double_star': 10
}

def normalize_bbox(image_shape, bbox):
    """Scale corner coordinates of a box by the image dimensions.

    Args:
        image_shape: ``(height, width)`` pair.
        bbox: Four values unpacked as ``x_min, y_min, x_max, y_max``.

    Returns:
        ``[x_min / width, y_min / height, x_max / width, y_max / height]``.
    """
    rows, cols = image_shape
    left, top, right, bottom = bbox
    return [left / cols, top / rows, right / cols, bottom / rows]

def load_data(csv_file, images_folder, output_folder):
    """Export a small fixed train/val subset of the CSV in YOLOv5 format.

    CSV rows 0-149 go to ``train`` and rows 150-199 to ``val``. For every row
    whose image can be read, whose class is in ``class_mapping`` and whose
    normalised box lies inside [0, 1], the image is written to
    ``<output_folder>/<subset>/images`` and a one-line label file
    (``<class_id> <x_center> <y_center> <width> <height>``) to
    ``<output_folder>/<subset>/labels``.

    NOTE(review): this definition replaces the ``load_data`` from the earlier
    cell, which takes additional ``start_index``/``test_size`` arguments.

    Args:
        csv_file: CSV with ``filename``, ``bbox`` and ``class`` columns;
            ``bbox`` is a Python-literal ``[x_min, y_min, x_max, y_max]``.
        images_folder: Directory holding the source images. A ``.png`` suffix
            in the CSV filename is mapped to ``.jpg``.
        output_folder: Root directory of the exported dataset.
    """
    df = pd.read_csv(csv_file)
    train_df = df.iloc[:150]
    val_df = df.iloc[150:200]

    # A separate loop name keeps the full frame `df` from being shadowed.
    for subset_df, subset_folder in zip([train_df, val_df], ['train', 'val']):
        images_out = os.path.join(output_folder, subset_folder, 'images')
        labels_out = os.path.join(output_folder, subset_folder, 'labels')
        os.makedirs(images_out, exist_ok=True)
        os.makedirs(labels_out, exist_ok=True)

        for _, row in tqdm(subset_df.iterrows(), total=len(subset_df),
                           desc=f"Loading {subset_folder} data"):
            # Replace .png with .jpg in image filename
            image_name = row['filename'].replace('.png', '.jpg')
            image_path = os.path.join(images_folder, image_name)

            if not os.path.exists(image_path):
                print(f"Warning: File '{image_path}' not found. Skipping.")
                continue

            image = cv2.imread(image_path)

            if image is None:
                print(f"Warning: Unable to load image '{image_path}'. Skipping.")
                continue

            bbox = ast.literal_eval(row['bbox'])
            class_label = row['class']

            if class_label not in class_mapping:
                print(f"Warning: Class label '{class_label}' not in class mapping. Skipping.")
                continue

            class_id = class_mapping[class_label]
            normalized_bbox = normalize_bbox(image.shape[:2], bbox)

            if any(coord < 0 or coord > 1 for coord in normalized_bbox):
                print(f"Warning: Normalized bbox out of bounds for image '{image_name}'. Skipping.")
                continue

            # YOLOv5 labels hold the box centre and size, not its corners, so
            # convert the normalised corners before writing them out.
            x_min, y_min, x_max, y_max = normalized_bbox
            yolo_bbox = (
                (x_min + x_max) / 2.0,
                (y_min + y_max) / 2.0,
                x_max - x_min,
                y_max - y_min,
            )
            bbox_str = ' '.join(map(str, yolo_bbox))

            # YOLO format: [<class_index> <x_center> <y_center> <width> <height>]
            annotation = f"{class_id} {bbox_str}"

            # Save the image and annotation in the YOLOv5 format
            output_image_path = os.path.join(images_out, image_name)
            output_label_path = os.path.join(labels_out, image_name.replace('.jpg', '.txt'))

            cv2.imwrite(output_image_path, image)
            with open(output_label_path, 'w') as f:
                f.write(annotation)

    print(f"Dataset prepared in YOLOv5 format saved in: {output_folder}")

# Build a small 150/50 train/val sample dataset in a separate folder.
csv_path = '/content/drive/MyDrive/Copy of labels.zip (Unzipped Files)/val.csv'  # Annotation CSV
images_folder = '/content/drive/MyDrive/copy of val (unzipped)/val'  # Folder where the source images are stored
output_folder = '/content/drive/MyDrive/dataset2_yolov5'  # Root folder of the exported YOLOv5 dataset

# Load and prepare the dataset
load_data(csv_path, images_folder, output_folder)


Loading train data: 150it [00:16,  9.01it/s]
Loading val data: 50it [00:05,  8.69it/s]


Dataset prepared in YOLOv5 format saved in: /content/drive/MyDrive/dataset2_yolov5


In [None]:
def create_yaml_config(csv_file, images_folder, output_yaml, class_to_id=None):
    """Write the YOLOv5 dataset YAML (train/val paths, ``nc`` and ``names``).

    ``names`` is ordered by class id so that position ``i`` in the list is the
    class whose id ``i`` appears in the label files. Ordering by first
    appearance in the CSV would not necessarily match those ids.

    Args:
        csv_file: Annotation CSV with a ``class`` column; used to warn about
            classes that have no id in the mapping.
        images_folder: Dataset root containing ``train/images`` and
            ``val/images``.
        output_yaml: Path of the YAML file to write.
        class_to_id: Optional ``{class_name: class_id}`` mapping. Defaults to
            the notebook-level ``class_mapping``.

    Raises:
        ValueError: If the class ids are not exactly ``0 .. n-1``.
    """
    if class_to_id is None:
        class_to_id = class_mapping

    ids = sorted(class_to_id.values())
    if ids != list(range(len(ids))):
        raise ValueError(f"Class ids must be contiguous from 0, got {ids}")

    # Position in `names` must equal the class id used in the label files.
    names = sorted(class_to_id, key=class_to_id.get)

    # Read CSV file to report classes that cannot be represented
    df = pd.read_csv(csv_file)
    unmapped = [c for c in df['class'].unique().tolist() if c not in class_to_id]
    if unmapped:
        print(f"Warning: classes in CSV without an id in the class mapping: {unmapped}")

    # Build the YAML line by line so no source-code indentation leaks into it.
    yaml_lines = [
        f"train: {os.path.join(images_folder, 'train', 'images')}",
        f"val: {os.path.join(images_folder, 'val', 'images')}",
        "",
        f"nc: {len(names)}  # number of classes",
        f"names: {names}  # class names",
    ]

    # Write YAML file
    with open(output_yaml, 'w') as f:
        f.write("\n".join(yaml_lines) + "\n")

    print(f"Created YAML file: {output_yaml}")

# Write dataset.yaml for the full export (dataset_yolov5).
# NOTE(review): `images_folder` here is the exported dataset root, whereas
# earlier cells use the same name for the folder of raw source images.
csv_file = '/content/drive/MyDrive/Copy of labels.zip (Unzipped Files)/val.csv'
images_folder = '/content/drive/MyDrive/dataset_yolov5'
output_yaml = '/content/drive/MyDrive/Copy of labels.zip (Unzipped Files)/dataset.yaml'

create_yaml_config(csv_file, images_folder, output_yaml)


Created YAML file: /content/drive/MyDrive/Copy of labels.zip (Unzipped Files)/dataset.yaml


In [None]:
# Report file count and total size of the exported train and val image folders.
# The commented-out lines below run the same report on the raw unzipped images.
#num_files, total_size = get_directory_info('/content/drive/MyDrive/copy of val (unzipped)/val')
#print(f"Number of files in unzipped folder: {num_files}")
#print(f"Total size: {convert_size(total_size)}")

num_files, total_size = get_directory_info('/content/drive/MyDrive/dataset_yolov5/train/images')
print(f"Number of files in training: {num_files}")
print(f"Total size: {convert_size(total_size)}")

num_files, total_size = get_directory_info('/content/drive/MyDrive/dataset_yolov5/val/images')
print(f"Number of files in validation: {num_files}")
print(f"Total size: {convert_size(total_size)}")

Number of files in training: 9840
Total size: 3.32 GB
Number of files in validation: 1160
Total size: 398.02 MB


In [None]:
def find_corrupt_images(image_folder, label_folder):
    """List images that cannot be used as training samples.

    Every entry of ``image_folder`` is flagged when OpenCV cannot decode it,
    when its label file (same name with ``.jpg`` replaced by ``.txt``) is
    missing from ``label_folder``, when that label file is empty or
    whitespace-only, or when checking it raises an exception.

    Each image is fully decoded, so scanning a large folder is slow.

    Args:
        image_folder: Directory containing the images.
        label_folder: Directory containing the matching ``.txt`` label files.

    Returns:
        List of paths of the flagged images.
    """
    corrupt_images = []

    for img_file in tqdm(os.listdir(image_folder), desc="Scanning images"):
        img_path = os.path.join(image_folder, img_file)
        label_path = os.path.join(label_folder, img_file.replace('.jpg', '.txt'))

        try:
            img = cv2.imread(img_path)
            if img is None:
                # The file was listed, so it exists; imread returning None
                # means it could not be decoded.
                corrupt_images.append(img_path)
                print(f'{img_file} img could not be read')
                continue

            # Check if label file exists
            if not os.path.exists(label_path):
                corrupt_images.append(img_path)
                print(f'{img_file} label doesnt exist')
                continue

            # An empty label file carries no annotation
            with open(label_path, 'r') as f:
                label_content = f.read()
            if not label_content.strip():
                corrupt_images.append(img_path)
                print(f'{img_file} labels empty')
                continue

        except Exception as e:
            # Still flag the image, but report why instead of hiding the error.
            corrupt_images.append(img_path)
            print(f'{img_file} check failed: {e}')

    return corrupt_images

# Scan the exported training split for unreadable images and missing or empty labels.
# NOTE(review): the recorded run was interrupted at about 8% after 37 minutes
# (roughly 1.4 s per image), so no result was produced.
train_images_folder = '/content/drive/MyDrive/dataset_yolov5/train/images'
train_labels_folder = '/content/drive/MyDrive/dataset_yolov5/train/labels'

corrupt_images = find_corrupt_images(train_images_folder, train_labels_folder)
print(f"Found {len(corrupt_images)} corrupt images.")

# Optionally, save the list of corrupt images to a file
#with open('corrupt_images.txt', 'w') as f:
#    for img in corrupt_images:
#        f.write(f"{img}\n")


Scanning images:   8%|▊         | 1603/19736 [37:55<7:09:04,  1.42s/it]


KeyboardInterrupt: 

In [5]:
# Change directory to the YOLOv5 repository
%cd /content/yolov5

# Train YOLOv5s for 100 epochs at image size 512 and batch size 32, starting from the yolov5s.pt weights.
# NOTE(review): --data points at '/content/dataset.yaml', while the YAML created
# earlier in this notebook was written to
# '/content/drive/MyDrive/Copy of labels.zip (Unzipped Files)/dataset.yaml' -
# confirm that the file at /content is an up-to-date copy.
# NOTE(review): the recorded `nvidia-smi` output near the top reports "command
# not found"; confirm a GPU runtime is attached before starting a 100-epoch run.
!python train.py --img 512 --batch 32 --epochs 100 --data '/content/dataset.yaml' --weights '/content/drive/MyDrive/Copy of labels.zip (Unzipped Files)/yolov5s.pt'


/content/yolov5
2024-06-25 14:54:01.163877: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-25 14:54:01.163950: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-25 14:54:01.166820: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mtrain: [0mweights=/content/drive/MyDrive/Copy of labels.zip (Unzipped Files)/yolov5s.pt, cfg=, data=/content/dataset.yaml, hyp=data/hyps/hyp.scratch-low.yaml, epochs=100, batch_size=32, imgsz=512, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, evolve_population=data/hyps, resume_evolv