In [None]:
# Mount Google Drive (Alihan) and switch into the project's Colab folder.
import os

from google.colab import drive

drive.mount('/content/drive')

# Later cells use paths relative to this folder (e.g. "../data/Unsplash/...").
project_colab_dir = '/content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/Colab'
os.chdir(project_colab_dir)

# Echo the directory we ended up in so a wrong path is noticed immediately.
print("Current Directory:", os.getcwd())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current Directory: /content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/Colab


In [None]:
# Mount Google Drive (Topcu) and switch into the shared project folder.
import os

from google.colab import drive

drive.mount('/content/drive')

# Location of the shared-folder shortcut inside this user's Drive;
# adjust it if the shortcut was added somewhere else.
shared_colab_dir = '/content/drive/My Drive/YZV 303E - Deep Learning/Project/Colab'
os.chdir(shared_colab_dir)

# Echo the directory we ended up in so a wrong path is noticed immediately.
print("Current Directory:", os.getcwd())

### Download image link dataset (TSVs) from Unsplash

In [None]:
# Download TSVs from Unsplash
# The three shell commands below (1) download the "lite" dataset archive into the
# Drive data folder, (2) extract it into that same folder, and (3) delete the archive.
# NOTE(review): the absolute Drive path matches the folder layout of the first mount
# cell only; presumably it does not exist for the second (shared-folder) user — confirm.

!wget -O "/content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/data/Unsplash/latest_unsplash_lite.zip" "https://unsplash.com/data/lite/latest"
!unzip "/content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/data/Unsplash/latest_unsplash_lite.zip" -d "/content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/data/Unsplash/"
!rm "/content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/data/Unsplash/latest_unsplash_lite.zip"

import os
directory = "/content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/data/Unsplash/"

# Give every extracted "*.tsv000" file a plain ".tsv" extension.
for filename in os.listdir(directory):
    if not filename.endswith(".tsv000"):
        continue

    # splitext drops only the final ".tsv000" suffix from the name.
    base = os.path.splitext(filename)[0]
    new_name = f"{base}.tsv"

    old_path = os.path.join(directory, filename)
    new_path = os.path.join(directory, new_name)
    os.rename(old_path, new_path)
    print(f"Renamed: {filename} to {new_name}")

import pandas as pd

# Path is relative to the working directory set by the mount cell.
file_path = '../data/Unsplash/photos.tsv'

# The file is tab-separated; load it and show the first rows as the cell output.
df = pd.read_csv(file_path, delimiter='\t')
df.head()

### Download images and split into L, A, B channels

In [None]:
# Download all images as RGB and create L, A, B channel images

import os
import cv2
import requests
import pandas as pd
from io import BytesIO
from PIL import Image
from datetime import datetime
import numpy as np

# Define paths: one sub-folder per saved image variant, plus the TSV bookkeeping files.
BASE_DIR = "../data/Unsplash/Images"
RGB_DIR, L_DIR, A_DIR, B_DIR = (
    os.path.join(BASE_DIR, channel) for channel in ("RGB", "L", "A", "B")
)
TSV_PATH = os.path.join(BASE_DIR, "image_links.tsv")
DOWNLOADED_LOG = os.path.join(BASE_DIR, "downloaded_images.tsv")
FAILED_LOG = os.path.join(BASE_DIR, "failed_images.tsv")  # TSV listing images that could not be processed

# Create necessary directories (no error if they already exist)
for channel_dir in (RGB_DIR, L_DIR, A_DIR, B_DIR):
    os.makedirs(channel_dir, exist_ok=True)

# 1. Load the full photo table
original_df = pd.read_csv("../data/Unsplash/photos.tsv", sep="\t")

# Keep only the columns the downloader reads, and persist them as the link list
new_df = original_df.loc[:, ['photo_id', 'photo_image_url', 'ai_description']]
new_df.to_csv(TSV_PATH, sep="\t", index=False)

# Check if image is grayscale
def is_grayscale(description):
    """Return True when the description mentions "grayscale" (case-insensitive).

    The value is passed through ``str()`` first, so non-string inputs such as
    ``None`` or NaN are accepted and yield False.
    """
    lowered = str(description).lower()
    return lowered.find("grayscale") >= 0

# Log downloaded image
def log_download(photo_id):
    """Append a ``photo_id<TAB>timestamp`` row to DOWNLOADED_LOG.

    If the log file does not exist yet, it is created and the header row
    ``image_id<TAB>downloaded_at`` is written before the first entry.
    """
    needs_header = not os.path.exists(DOWNLOADED_LOG)

    # Append mode creates the file when it is missing, so one open() covers both cases.
    with open(DOWNLOADED_LOG, 'a') as log:
        if needs_header:
            log.write("image_id\tdownloaded_at\n")
        log.write(f"{photo_id}\t{datetime.now()}\n")

# Log failed image
def log_failure(photo_id):
    """Append a ``photo_id<TAB>timestamp`` row to FAILED_LOG.

    If the log file does not exist yet, it is created and the header row
    ``image_id<TAB>failed_at`` is written before the first entry.
    """
    needs_header = not os.path.exists(FAILED_LOG)

    # Append mode creates the file when it is missing, so one open() covers both cases.
    with open(FAILED_LOG, 'a') as log:
        if needs_header:
            log.write("image_id\tfailed_at\n")
        log.write(f"{photo_id}\t{datetime.now()}\n")

# Resume functionality: Find last downloaded ID
def get_last_downloaded_id():
    """Return the image ID stored on the last row of DOWNLOADED_LOG.

    Returns:
        The value of the ``image_id`` column on the final row, or ``None`` when
        there is nothing to resume from: the log file is missing, is a
        zero-byte file, or contains only the header row.
    """
    if not os.path.exists(DOWNLOADED_LOG):
        return None

    try:
        log_df = pd.read_csv(DOWNLOADED_LOG, sep="\t")
    except pd.errors.EmptyDataError:
        # A zero-byte log has no header for pandas to parse; treat it the same
        # as "no downloads yet" instead of crashing the resume logic.
        return None

    if log_df.empty:
        return None
    return log_df['image_id'].iloc[-1]

# Process a single image
def process_image(row):
    """Download one photo and save it as an RGB PNG plus separate L, A and B channel PNGs.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'photo_id', 'photo_image_url' and 'ai_description'.

    Returns:
        None in every case. On success the photo ID is appended to the download
        log. Any exception while downloading, converting or saving is printed
        and the ID is appended to the failure log. Images whose description
        marks them as grayscale are skipped without being logged.
    """
    photo_id = row['photo_id']
    image_url = row['photo_image_url']

    if is_grayscale(row['ai_description']):
        print(f"Skipping grayscale image: {photo_id}")
        return None

    try:
        # Download the image. Without a timeout a single stalled connection
        # would block the whole run indefinitely; (connect, read) seconds.
        # stream=True is not needed because the full body is read right away.
        response = requests.get(f"{image_url}?w=256&h=256&fit=crop", timeout=(10, 30))
        response.raise_for_status()

        # Convert to RGB using PIL
        image = Image.open(BytesIO(response.content)).convert("RGB")
        image_np = np.array(image)  # Convert to NumPy array

        # Convert directly from RGB to LAB
        lab_image = cv2.cvtColor(image_np, cv2.COLOR_RGB2LAB)
        L_channel, A_channel, B_channel = cv2.split(lab_image)

        # OpenCV writes arrays as BGR, so the RGB array is reordered before saving.
        outputs = [
            (RGB_DIR, cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)),
            (L_DIR, L_channel),
            (A_DIR, A_channel),
            (B_DIR, B_channel),
        ]
        for out_dir, pixels in outputs:
            out_path = os.path.join(out_dir, f"{photo_id}.png")
            # cv2.imwrite signals failure by returning False instead of raising;
            # turn that into an error so the image is not logged as downloaded.
            if not cv2.imwrite(out_path, pixels):
                raise IOError(f"could not write {out_path}")

        # Log the download
        log_download(photo_id)
        print(f"Successfully processed image: {photo_id}")

    except Exception as e:
        # Deliberate best effort: log the failure and let the caller continue
        print(f"Failed to process image {photo_id}: {e}")
        log_failure(photo_id)

# Main function
def download_images():
    """Process every image listed in TSV_PATH, resuming after the last logged download.

    The row following the last ID recorded in the download log is used as the
    starting point. If there is no logged ID, or the logged ID cannot be found
    in the link table, processing starts from the first row.
    """
    # Load image links TSV
    image_links_df = pd.read_csv(TSV_PATH, sep="\t")

    # Find the last downloaded ID
    last_id = get_last_downloaded_id()
    start_index = 0

    if last_id is not None:
        # Row positions (not index labels) of matching IDs, because the slice below uses .iloc.
        matches = (image_links_df['photo_id'] == last_id).to_numpy().nonzero()[0]
        if len(matches) > 0:
            print(f"Resuming from last downloaded ID: {last_id}")
            start_index = int(matches[0]) + 1
        else:
            # The logged ID is absent from the table; fall back to a full pass
            # instead of failing with IndexError.
            print(f"Last downloaded ID {last_id} not found in {TSV_PATH}; starting download from the beginning.")
    else:
        print("Starting download from the beginning.")

    # Process images starting from the determined index
    for _, row in image_links_df.iloc[start_index:].iterrows():
        process_image(row)

    print("All images processed successfully!")

# Run the script
if __name__ == "__main__":
    download_images()


Resuming from last downloaded ID: NsvzdXtvyio
Successfully processed image: k2DVsB4hTBQ
Successfully processed image: lhPLeHgox9Q
Successfully processed image: WT_nZUfAmBc
Successfully processed image: byq4tkp1nmg
Successfully processed image: s-4i34Z07To
Successfully processed image: DPiWUyqyUNE
Successfully processed image: HwQlPnLtaN4
Successfully processed image: g0wj801aNio
Successfully processed image: 0EOzs194BXQ
Successfully processed image: 9XzyEzPAHMI
Successfully processed image: NCFDCgj-2mM
Successfully processed image: auPmvdEJQ-4
Successfully processed image: E_H-5Cyxjfc
Successfully processed image: 0yeDc5fOgu4
Successfully processed image: 2Ka0oKSMxVE
Successfully processed image: Oo2f26k1vFY
Successfully processed image: s9u9o-BpxZ0
Successfully processed image: 4UBHkX2rlMY
Successfully processed image: 4D7-9lVUvNY
Successfully processed image: WA1Lpc7iFNw
Successfully processed image: em7gIbjEL0I
Successfully processed image: _ypd6fZlJq4
Successfully processed image: 

### Create augmented versions of the downloaded images

In [None]:
import os
import cv2
import numpy as np
import albumentations as A
from albumentations.core.composition import ReplayCompose

In [None]:
# Paths
original_rgb_path = "../data/Unsplash/Images/RGB"
output_base_path = "../data/Unsplash/Images/Augmented"
log_file_path = os.path.join(output_base_path, "error_log.txt")

# One output folder per saved variant of an augmented image
for channel_name in ("RGB", "L", "A", "B"):
    os.makedirs(os.path.join(output_base_path, channel_name), exist_ok=True)

# Augmentation steps, applied in this order; each random step has its own probability p
augmentation_steps = [
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, border_mode=cv2.BORDER_REFLECT, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
    A.GaussianBlur(blur_limit=(3, 7), p=0.3),
    A.Resize(256, 256),  # Ensure output remains 256x256
]

# Define augmentation pipeline with ReplayCompose
augmentations = ReplayCompose(augmentation_steps)

# Initialize counter
image_counter = 0


def _augmented_output_paths(base_filename, i):
    """Return {folder name: output path} for augmentation number ``i`` of one image."""
    return {
        channel_name: os.path.join(output_base_path, channel_name, f"{base_filename}_aug{i}.png")
        for channel_name in ("RGB", "L", "A", "B")
    }


# Process each image
for filename in os.listdir(original_rgb_path):
    if not filename.endswith(".png"):
        continue

    # Strip only the trailing extension; str.replace would also remove a
    # ".png" occurring elsewhere in the name.
    base_filename = os.path.splitext(filename)[0]

    # Load the original image. Despite the variable names, cv2.imread returns
    # the channels in BGR order, which is why COLOR_BGR2LAB is used below.
    input_path = os.path.join(original_rgb_path, filename)
    rgb_image = cv2.imread(input_path, cv2.IMREAD_COLOR)
    if rgb_image is None:
        with open(log_file_path, "a") as log_file:
            log_file.write(f"Could not load image: {input_path}\n")
        continue

    try:
        for i in range(1, 3):  # Generate 2 augmentations per image
            augmented = augmentations(image=rgb_image)
            augmented_rgb = augmented["image"]

            # Convert the augmented image to LAB and split channels
            augmented_lab = cv2.cvtColor(augmented_rgb, cv2.COLOR_BGR2LAB)
            l_augmented, a_augmented, b_augmented = cv2.split(augmented_lab)

            # Save all augmented versions
            output_paths = _augmented_output_paths(base_filename, i)
            channel_images = {
                "RGB": augmented_rgb,
                "L": l_augmented,
                "A": a_augmented,
                "B": b_augmented,
            }
            for channel_name, channel_image in channel_images.items():
                # cv2.imwrite signals failure by returning False instead of
                # raising; raise so the cleanup branch below actually runs.
                if not cv2.imwrite(output_paths[channel_name], channel_image):
                    raise IOError(f"Could not write image: {output_paths[channel_name]}")

        image_counter += 1
        if image_counter % 10 == 0:
            print(f"Augmented and saved {image_counter} images so far.")

    except Exception as e:
        # Record the error, then remove any partial output for this image
        with open(log_file_path, "a") as log_file:
            log_file.write(f"Error processing {filename}: {e}\n")

        for i in range(1, 3):  # Ensure all generated versions for this file are deleted
            for path in _augmented_output_paths(base_filename, i).values():
                if os.path.exists(path):
                    os.remove(path)

print("All images have been augmented and saved.")


### Train-Validation-Test Split

In [None]:
# Download data, split to train test val

import os
import shutil
import random
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Source folder on Google Drive, working copy on the Colab VM, and split output folder
drive_base_path = "/content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/data/Unsplash/Images"
local_base_path = "/content/Unsplash/Images"
split_base_path = "/content/TrainTestVal"

# Create local folders
for local_dir in (local_base_path, split_base_path):
    os.makedirs(local_dir, exist_ok=True)

# Step 1: Copy L, A, B, and Augmented folders from Google Drive to Colab
folders_to_copy = ["L", "A", "B", "Augmented/L", "Augmented/A", "Augmented/B"]
for folder in folders_to_copy:
    # The same relative folder name is used on both sides of the copy.
    src_path, dest_path = (
        os.path.join(root, folder) for root in (drive_base_path, local_base_path)
    )
    os.makedirs(dest_path, exist_ok=True)
    print(f"Copying {folder} from Google Drive...")
    shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
print("All folders copied successfully!")

# Step 2: Gather all files from L (original and augmented)
# os.listdir returns entries in arbitrary order. The listings are sorted so that the
# seeded, index-based sampling done later in this cell selects the same files on every run.
l_original_dir = os.path.join(local_base_path, "L")
l_augmented_dir = os.path.join(local_base_path, "Augmented/L")

l_original_files = [os.path.join(l_original_dir, f) for f in sorted(os.listdir(l_original_dir)) if f.endswith('.png')]
l_augmented_files = [os.path.join(l_augmented_dir, f) for f in sorted(os.listdir(l_augmented_dir)) if f.endswith('.png')]
all_l_files = l_original_files + l_augmented_files

# Ensure consistency for A and B
def get_corresponding_files(base_files, channel):
    """Map each file path to the same file name inside a sibling channel folder.

    Only the immediate parent directory of each file (the "L" folder for the
    paths built in this notebook) is swapped for ``channel``. The rest of the
    path and the file name are left untouched, so a matching substring
    elsewhere in the path or in the file name cannot be rewritten by accident.

    Args:
        base_files: Iterable of file paths such as ".../L/<name>.png" or
            ".../Augmented/L/<name>.png".
        channel: Name of the sibling folder to point to, e.g. "A" or "B".

    Returns:
        A list of paths in the same order as ``base_files``.
    """
    corresponding_files = []
    for file_path in base_files:
        channel_dir, file_name = os.path.split(file_path)
        # dirname(channel_dir) is the folder that holds all the channel folders.
        corresponding_files.append(os.path.join(os.path.dirname(channel_dir), channel, file_name))
    return corresponding_files

all_a_files = get_corresponding_files(all_l_files, "A")
all_b_files = get_corresponding_files(all_l_files, "B")

# Step 3: Reduce dataset size to 20K files
random.seed(42)
selected_indices = random.sample(range(len(all_l_files)), min(len(all_l_files), 20000))
selected_l_files = [all_l_files[i] for i in selected_indices]
selected_a_files = [all_a_files[i] for i in selected_indices]
selected_b_files = [all_b_files[i] for i in selected_indices]

# Step 4: Train-Test-Validation split
# Augmented files ("<photo_id>_aug<i>.png") are transformed copies of an original
# photo. Splitting individual files at random can put a photo in one split and its
# augmented copies in another, so evaluation images would be near-duplicates of
# training images. The split is therefore done on source photo IDs, and every file
# follows its photo.
def _source_photo_id(l_file_path):
    """Return the ID of the photo a file was derived from (drops a trailing "_aug<i>" for augmented files)."""
    stem = os.path.splitext(os.path.basename(l_file_path))[0]
    grandparent_dir = os.path.basename(os.path.dirname(os.path.dirname(l_file_path)))
    if grandparent_dir == "Augmented":
        original_id, marker, copy_number = stem.rpartition("_aug")
        if marker and copy_number.isdigit():
            return original_id
    return stem

# Sorted so the seeded split does not depend on set iteration order.
photo_ids = sorted({_source_photo_id(path) for path in selected_l_files})
train_val_ids, test_ids = train_test_split(photo_ids, test_size=0.2, random_state=42)
train_ids, val_ids = train_test_split(train_val_ids, test_size=0.2, random_state=42)

split_of_photo = {}
for split_label, ids_in_split in (("Train", train_ids), ("Val", val_ids), ("Test", test_ids)):
    for photo_id in ids_in_split:
        split_of_photo[photo_id] = split_label

# Each split holds three parallel lists: L, A and B paths of the same images.
files_by_split = {split_label: ([], [], []) for split_label in ("Train", "Val", "Test")}
for l_path, a_path, b_path in zip(selected_l_files, selected_a_files, selected_b_files):
    l_bucket, a_bucket, b_bucket = files_by_split[split_of_photo[_source_photo_id(l_path)]]
    l_bucket.append(l_path)
    a_bucket.append(a_path)
    b_bucket.append(b_path)

train_l, train_a, train_b = files_by_split["Train"]
val_l, val_a, val_b = files_by_split["Val"]
test_l, test_a, test_b = files_by_split["Test"]

# Step 5: Copy files into TrainTestVal directories
def copy_split_files(file_list, split_name, channel):
    """Copy every file in ``file_list`` into ``<split_base_path>/<split_name>/<channel>``.

    The destination folder is created if it does not exist, and a tqdm
    progress bar labelled with the split and channel is shown while copying.
    """
    destination = os.path.join(split_base_path, split_name, channel)
    os.makedirs(destination, exist_ok=True)

    progress = tqdm(file_list, desc=f"Copying {split_name} {channel} files")
    for source_file in progress:
        shutil.copy(source_file, destination)

# Each entry: split name followed by its L, A and B file lists (in that order).
splits = [("Train", train_l, train_a, train_b), ("Val", val_l, val_a, val_b), ("Test", test_l, test_a, test_b)]
for split_name, *channel_file_lists in splits:
    # Pair every file list with its channel folder name and copy L, then A, then B.
    for channel, channel_files in zip(("L", "A", "B"), channel_file_lists):
        copy_split_files(channel_files, split_name, channel)

print("Train, Val, and Test splits created successfully!")


In [None]:
# Zip the TrainTestVal and upload to Google Drive

import os
import shutil

# Paths: folder to archive, local archive file, and its destination on Google Drive
local_train_test_val_path = "/content/TrainTestVal"
zip_file_path = "/content/TrainTestVal.zip"
drive_destination_path = "/content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/data/TrainTestVal.zip"

# Step 1: Zip the TrainTestVal folder
# make_archive takes the archive name without its extension.
print("Zipping TrainTestVal folder...")
archive_stem = os.path.splitext(zip_file_path)[0]
shutil.make_archive(archive_stem, "zip", root_dir=local_train_test_val_path)
print("Zipping complete!")

# Step 2: Copy the zipped file to Google Drive
print("Copying TrainTestVal.zip to Google Drive...")
shutil.copy(zip_file_path, drive_destination_path)
print(f"Copied TrainTestVal.zip to {drive_destination_path}")

# Step 3: Confirm the file exists in Google Drive
saved_to_drive = os.path.exists(drive_destination_path)
print(
    "TrainTestVal.zip successfully saved in Google Drive!"
    if saved_to_drive
    else "Failed to copy TrainTestVal.zip to Google Drive."
)


In [None]:
# Zip the whole Unsplash data and upload to drive

import os
import shutil

# Paths: folder to archive, local archive file, and its destination on Google Drive
local_images_path = "/content/Unsplash"
zip_file_path = "/content/Unsplash.zip"
drive_destination_path = "/content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/data/Unsplash.zip"

# Step 1: Zip the Unsplash folder
# make_archive takes the archive name without its extension.
print("Zipping Unsplash folder...")
archive_stem = os.path.splitext(zip_file_path)[0]
shutil.make_archive(archive_stem, "zip", root_dir=local_images_path)
print("Zipping complete!")

# Step 2: Copy the zipped file to Google Drive
print("Copying Unsplash.zip to Google Drive...")
shutil.copy(zip_file_path, drive_destination_path)
print(f"Copied Unsplash.zip to {drive_destination_path}")

# Step 3: Confirm the file exists in Google Drive
saved_to_drive = os.path.exists(drive_destination_path)
print(
    "Unsplash.zip successfully saved in Google Drive!"
    if saved_to_drive
    else "Failed to copy Unsplash.zip to Google Drive."
)


Zipping Unsplash folder...
Zipping complete!
Copying Unsplash.zip to Google Drive...
Copied Unsplash.zip to /content/drive/MyDrive/Okul/Eğitim/Ders/5. Dönem/YZV 303E - Deep Learning/Project/data/Unsplash.zip
Unsplash.zip successfully saved in Google Drive!
