# Enhanced Preprocessing Notebook
This notebook is designed for large-scale data preprocessing with advanced augmentations. It includes options for batch processing to test on smaller subsets of data.

In [14]:
# Import necessary libraries
import os
from PIL import Image, ImageOps, ImageFilter
import requests
from io import BytesIO
import pandas as pd
import numpy as np
import random
from torchvision import models, transforms
from torchvision.transforms import functional as F


In [21]:
# Configure directories and parameters
# Output folders, all relative to the current working directory.
final_folder = 'final_data'
# process_batch saves the padded, size-normalised downloads here.
clothes_folder = 'clothes_data'
# process_batch passes this folder to preprocess_image for the augmented variants.
preprocessed_folder = 'preprocessed_data'
# Target (width, height): process_batch shrinks each image to fit, then pads to this size.
desired_size = (512, 512)
# Fill colour handed to ImageOps.expand for the padding border.
background_color = (0, 0, 0)
# exist_ok=True keeps this cell safe to re-run when the folders already exist.
os.makedirs(final_folder, exist_ok=True)
os.makedirs(clothes_folder, exist_ok=True)
os.makedirs(preprocessed_folder, exist_ok=True)

# Load Google form data
# Later cells read the image link of each row from the '照片 Picture' column.
df = pd.read_csv('google_form_data.csv')


In [22]:
# Helper functions
def convert_drive_url_to_downloadable(url):
    """Turn a Google Drive share link into a direct-download URL.

    Two share-link shapes are recognised: ``...open?id=<ID>`` and
    ``.../file/d/<ID>/...``. Anything else is returned unchanged.

    Args:
        url: Share link taken from the form export. Non-string values
            (e.g. NaN for an empty cell) are returned as-is so that the
            caller's own error handling deals with them.

    Returns:
        ``https://drive.google.com/uc?export=download&id=<ID>`` for a
        recognised link, otherwise ``url`` itself.
    """
    if not isinstance(url, str):
        return url

    if 'open?id=' in url:
        # Drop any query parameters that follow the id (e.g. "&usp=drive_copy").
        file_id = url.split('open?id=')[-1].split('&')[0]
    elif 'file/d/' in url:
        # The id is the single path segment after "file/d/": cut at the next
        # "/" (covers /view, /edit, /preview) and at "?" for links that have
        # a query string directly after the id.
        file_id = url.split('file/d/')[-1].split('/')[0].split('?')[0]
    else:
        return url

    return f"https://drive.google.com/uc?export=download&id={file_id.strip()}"

# Gaussian-blur helper (the filter covers the entire image, not only the background)
def blur_background(img, radius=10):
    """Apply a Gaussian blur to ``img`` and return the filtered result.

    Args:
        img: Image object exposing a ``filter`` method.
        radius: Radius handed to ``ImageFilter.GaussianBlur``.

    Returns:
        Whatever ``img.filter`` produces for the Gaussian kernel.
    """
    gaussian_kernel = ImageFilter.GaussianBlur(radius=radius)
    return img.filter(gaussian_kernel)


In [23]:
# Data augmentation functions
def preprocess_image(img, output_dir, image_name):
    """Write the original image plus six augmented variants as JPEG files.

    Files are named ``<image_name>_<variant>.jpg`` inside ``output_dir``, with
    variants ``original``, ``blurred``, ``rotated``, ``flipped``, ``bright``,
    ``noisy`` and ``cropped``.

    Args:
        img: PIL image to augment. It is not modified; every variant is
            derived from it independently.
        output_dir: Existing directory the files are written to.
        image_name: File-name stem shared by all variants.

    Returns:
        None. The function is used for its file-writing side effect.
    """

    def save_variant(variant_img, suffix):
        # Single place that fixes the "<stem>_<variant>.jpg" naming scheme.
        variant_img.save(os.path.join(output_dir, f"{image_name}_{suffix}.jpg"))

    save_variant(img, "original")

    # Mild Gaussian blur with a fixed radius, applied to the whole image.
    save_variant(blur_background(img, radius=1.5), "blurred")

    # Rotation by a random angle drawn from [-30, 30] degrees.
    save_variant(img.rotate(random.uniform(-30, 30)), "rotated")

    # Horizontal mirror.
    save_variant(img.transpose(Image.FLIP_LEFT_RIGHT), "flipped")

    # Random brightness factor between 0.8 (darker) and 1.5 (brighter).
    brightness = random.uniform(0.8, 1.5)
    save_variant(F.adjust_brightness(img, brightness_factor=brightness), "bright")

    # Additive Gaussian noise (sigma = 15). The sum is a float array, so it is
    # clipped to the 8-bit range before converting back to an image.
    pixels = np.array(img)
    noisy_pixels = pixels + np.random.normal(0, 15, pixels.shape)
    save_variant(Image.fromarray(np.clip(noisy_pixels, 0, 255).astype(np.uint8)), "noisy")

    # Top-left crop of random size, scaled back up to 512x512. The crop size is
    # clamped to the actual image so the box never reaches past the image edge
    # when the input is smaller than the 300-512 px range.
    img_width, img_height = img.size
    crop_width = min(random.randint(300, 512), img_width)
    crop_height = min(random.randint(300, 512), img_height)
    cropped_img = img.crop((0, 0, crop_width, crop_height))
    save_variant(cropped_img.resize((512, 512), Image.LANCZOS), "cropped")

In [24]:
# Main processing loop with batch functionality
def process_batch(df, batch_size=100, timeout=30):
    """Download, normalise and augment the first ``batch_size`` images of ``df``.

    For each row the link in the '照片 Picture' column is converted to a
    download URL and fetched. The image is converted to RGB, shrunk to fit
    ``desired_size`` (aspect ratio kept) and padded with ``background_color``
    to exactly ``desired_size``. The result is saved to ``clothes_folder`` as
    ``image_<index>.jpg`` and then passed to ``preprocess_image``, which writes
    the augmented variants to ``preprocessed_folder``.

    A row that fails at any step (bad link, network error, non-image
    response, ...) is reported with ``print`` and skipped; the batch goes on.

    Args:
        df: DataFrame with a '照片 Picture' column holding image links.
        batch_size: Number of leading rows to process.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            single stalled download cannot hang the whole batch.

    Returns:
        None.
    """
    for index, row in df.head(batch_size).iterrows():
        image_url = row['照片 Picture']
        # Fallback for the error message in case the conversion itself fails.
        download_url = image_url
        try:
            download_url = convert_drive_url_to_downloadable(image_url)
            response = requests.get(download_url, timeout=timeout)
            # Fail here on 4xx/5xx instead of feeding an error page to PIL.
            response.raise_for_status()

            img = Image.open(BytesIO(response.content)).convert('RGB')
            img.thumbnail(desired_size, Image.LANCZOS)

            # Centre the shrunk image: split the missing pixels between both
            # sides, giving the odd leftover pixel to the right/bottom.
            delta_w = desired_size[0] - img.size[0]
            delta_h = desired_size[1] - img.size[1]
            left, top = delta_w // 2, delta_h // 2
            border = (left, top, delta_w - left, delta_h - top)
            padded_img = ImageOps.expand(img, border, background_color)

            final_filename = f"image_{index}.jpg"
            padded_img.save(os.path.join(clothes_folder, final_filename))
            preprocess_image(padded_img, preprocessed_folder, f"image_{index}")
        except Exception as e:
            # Deliberate best-effort: report the row and continue with the next.
            print(f"Error processing {download_url}: {e}")

In [25]:
# Run batch processing
# Only the first `batch_size` rows of df are handled (process_batch uses
# df.head(batch_size)); each of those rows triggers one HTTP download.
batch_size = 30  # Specify the number of samples to process
process_batch(df, batch_size=batch_size)


In [26]:
# Save updated metadata
# One output row per augmented file that actually exists on disk. Only part of
# df is processed per run and individual downloads can fail, so emitting a row
# for every (form row, variant) pair would list images that were never written.
variants = ["original", "blurred", "rotated", "flipped", "bright", "noisy", "cropped"]
new_rows = []
for index, row in df.iterrows():
    image_name = f"image_{index}"
    for variant in variants:
        variant_filename = f"{image_name}_{variant}.jpg"
        if not os.path.exists(os.path.join(preprocessed_folder, variant_filename)):
            continue
        # Copy so the source frame is left untouched; only the picture column
        # is repointed at the augmented file.
        new_row = row.copy()
        new_row['照片 Picture'] = variant_filename
        new_rows.append(new_row)

# Passing the columns explicitly keeps the CSV header even when no file matched.
new_df = pd.DataFrame(new_rows, columns=df.columns)
new_df.to_csv('updated_google_form_data.csv', index=False)