In [1]:
from transformers import AutoTokenizer,CLIPProcessor, CLIPModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms,datasets, transforms
from tqdm import tqdm
import json
import os
import torch
from PIL import Image
from torch.utils.data._utils.collate import default_collate

In [2]:
# Load CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

In [3]:
# Classification head
to_pil_image = transforms.ToPILImage()
def preprocess_images(images): # putting the images together and resizing it (idk if I should change this to make the compatibility on each other rather then all three on the style_label)
    image_tensors = []
    for key in images.keys():
        image = images[key]  # Extract individual image
        processed_image = processor(images=image, return_tensors="pt").pixel_values
        image_tensors.append(processed_image)
    # Concatenate all the preprocessed images if needed
    return torch.cat(image_tensors, dim=0)  # Combine images along batch dimension
class CLIPFineTuner(nn.Module):
    def __init__(self, clip_model, num_styles):
        super(CLIPFineTuner, self).__init__()
        self.clip_model = clip_model
        #self.fc = nn.Linear(clip_model.visual_projection.in_features, 1) #1 linear model 
        self.fc = nn.Linear(512, 1)
        self.style_embeddings = nn.Embedding(num_styles, clip_model.text_projection.in_features)
        
    def forward(self, images, style_labels):
        # Get CLIP image embeddings
        image_tensor = preprocess_images(images)
        
        image_features = self.clip_model.get_image_features(image_tensor)
        
        # Get style embeddings
        #print("style_labels",style_labels)
        inputs = tokenizer(style_labels, padding=True, return_tensors="pt")
        style_embeddings = self.clip_model.get_text_features(**inputs)
        # Repeat each style embedding 5 times to match the 20 images
        expanded_style_embeddings = style_embeddings.repeat_interleave(5, dim=0)
        #print("Image features shape:", image_features.shape)
        #print("Style embeddings shape:", style_embeddings.shape)
        # Cosine similarity for compatibility scoring
        compatibility_scores = torch.cosine_similarity(image_features, expanded_style_embeddings)
        
        # Compatibility prediction (classification)
        classification_scores = self.fc(image_features).squeeze(-1)
        
        return classification_scores, compatibility_scores

In [4]:
# Loss function
def compute_loss(classification_scores, compatibility_scores, labels, num_images_per_style):
    labels_expanded = labels.repeat_interleave(num_images_per_style)
    classification_loss = nn.BCEWithLogitsLoss()(classification_scores, labels_expanded.float()) #change this to use cross-entropy 
    contrastive_loss = nn.MSELoss()(compatibility_scores, labels_expanded.float())
    return classification_loss + contrastive_loss

In [5]:
#Training: might need to change this to take into account the different nature of the dataset
def train(model, dataloader, optimizer, epochs, device):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(dataloader):
            # Get the images and labels from the batch
            images = {category: batch["images"][category].to(device) for category in batch["images"]}
            #print(images)
            descriptions = batch["description"]
            
            match_labels = batch["match"].to(device)

            optimizer.zero_grad()

            # Pass the images and style labels through the model
            classification_scores, compatibility_scores = model(images, descriptions)

            # Compute the loss
            loss = compute_loss(classification_scores, compatibility_scores, match_labels, num_images_per_style = 5)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            #Add the print of the loss curve 

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}")

In [6]:
data_dir = r"C:\Users\Jlngo\Deep Learning in Computer Visions\Project\dataset.json"

In [7]:
with open(data_dir,'r') as file:
    data = json.load(file)

In [8]:
print(data[:10])

[{'Outfit_Gender': 'female', 'Outfit_Occasion': 'Campus', 'Outfit_Style': 'Casual', 'Items': [{'Image': '10423_1356_32896545877.jpg', 'category': ['Top'], 'subcategory': ['jacket']}, {'Image': '10020_5257_30537531528.jpg', 'category': ['Pants'], 'subcategory': ['casual pants']}, {'Image': '10000_6916_30454716460.jpg', 'category': ['Shoes'], 'subcategory': ['platform shoes']}], 'match': 1}, {'Outfit_Gender': 'female', 'Outfit_Occasion': 'Campus', 'Outfit_Style': 'Casual', 'Items': [{'Image': '10001_9712_31942033801.jpg', 'category': ['Top'], 'subcategory': ['jacket']}, {'Image': '10001_9720_31632093861.jpg', 'category': ['Skirt'], 'subcategory': ['skirt']}, {'Image': '10001_6916_29041483915.jpg', 'category': ['Shoes'], 'subcategory': ['casual shoes']}], 'match': 1}, {'Outfit_Gender': 'male', 'Outfit_Occasion': 'Home', 'Outfit_Style': 'Simple', 'Items': [{'Image': '10007_6916_31756680500.jpg', 'category': ['Top'], 'subcategory': ['sweaters']}, {'Image': '10002_9730_32136565407.jpg', 'cat

In [9]:
def find_image_path(image_folder,target_suffix,subfolders):
        # Search for the file within each subfolder
        for subfolder in subfolders:
            folder_path = os.path.join(image_folder, subfolder)
            # Loop through all files in the current subfolder
            for dirpath, dirnames, filenames in os.walk(folder_path):
                for filename in filenames:
                    # Check if the filename ends with the target suffix
                    if filename.endswith(target_suffix):
                        return os.path.join(dirpath, filename)
        return None  # Return None if the image is not found

In [10]:
class ClothingCombinationDataset(Dataset):
    def __init__(self, json_file_path, image_folder, transform=None):
        # Load JSON data
        with open(json_file_path, 'r') as f:
            data = json.load(f)
        # Store data and parameters
        self.data = data
        self.image_folder = image_folder
        self.transform = transform
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        outfit = self.data[idx]
        images = {}
        subfolders = ["JD_MM19_Train", "JD_MM19_Validation", "JD_MM19_Test"]
        # Load each item in the outfit
        for item in outfit['Items']:
            category = item['category'][0]  # Main category, e.g., 'Top', 'Pants', 'Shoes'
            image_filename = item['Image']
            # Find the image path by searching for the target suffix in the subfolders
            target_suffix = image_filename.split('_')[-1]
            image_path = find_image_path(image_folder, target_suffix, subfolders)
            if image_path:
                #print(f"Image found: {image_path}")
                image = Image.open(image_path).convert("RGB")
                if transform:
                    image = transform(image)
                images[category] = image  # Store the transformed image in a dictionary
            else:
                #print('IMAGE:',target_suffix)
                #print("Image not found")
                return None 
        # Ensure all categories are represented, even if empty
        all_categories = ['Bags', 'Pants', 'Shoes', 'Skirt', 'Top']  # Add all possible categories here
        for category in all_categories:
            if category not in images:
                images[category] = torch.zeros(3, 224, 224)  # Placeholder for missing images
        # Convert labels to tensors
        gender_label = outfit['Outfit_Gender']  # e.g., 'female', 'male', 'unisex'
        occasion_label = outfit['Outfit_Occasion']  # e.g., 'Campus', 'Home'
        style_label = outfit['Outfit_Style']  # e.g., 'Casual', 'Simple', 'Artistic'
        description = "A " + gender_label + " outfit for a " + occasion_label + " occasion, featuring a " + style_label +" style."
        match_label = torch.tensor(outfit['match'], dtype=torch.long)
        
        return {
            "images": images,  # Dictionary with images per category (e.g., {"Top": ..., "Pants": ..., "Shoes": ...})
            "description": description,
            "match": match_label
        }

In [11]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images
    transforms.ToTensor(),          # Convert images to tensors
])


In [12]:
json_file_path = r"C:\Users\Jlngo\Deep Learning in Computer Visions\Project\dataset.json"
image_folder = r"C:\Users\Jlngo\Deep Learning in Computer Visions\Project\images"

In [13]:
# Instantiate the dataset
dataset = ClothingCombinationDataset(json_file_path=json_file_path, image_folder=image_folder, transform=transform)

In [14]:
# Instantiate the dataset
print(dataset.__getitem__(1))

{'images': {'Top': tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]]), 'Skirt': tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
       

24836

In [15]:
def count_missing_images(json_file_path, image_folder):
    # Load JSON data
    with open(json_file_path, 'r') as f:
        data = json.load(f)

    subfolders = ["JD_MM19_Train", "JD_MM19_Validation", "JD_MM19_Test"]
    missing_images_count = 0

    # Iterate through each outfit
    for outfit in data:
        for item in outfit['Items']:
            image_filename = item['Image']
            target_suffix = image_filename.split('_')[-1]
            image_path = find_image_path(image_folder, target_suffix, subfolders)
            
            if image_path is None:
                missing_images_count += 1
                print(f"Missing image: {image_filename}")

    print(f"Total missing images: {missing_images_count}")
    return missing_images_count

In [16]:
#missing_images = count_missing_images(json_file_path, image_folder)

In [17]:
# Define the split ratio
train_size = int(0.8 * len(dataset))  # 80% for training
test_size = len(dataset) - train_size  # 20% for testing

# Randomly split the dataset into training and test datasets
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [30]:
train_dataset._getitem_(10)

AttributeError: 'Subset' object has no attribute '_getitem_'

In [20]:
print(train_dataset) 

<torch.utils.data.dataset.Subset object at 0x000001DE0913F230>


In [18]:
def custom_collate(batch):
    batch = [item for item in batch if item is not None]
    return default_collate(batch) if batch else None

In [19]:
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=custom_collate)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True, collate_fn=custom_collate)

  torch.load(train_dataloader,r"C:\Users\Jlngo\Deep Learning in Computer Visions\Project")


AttributeError: 'DataLoader' object has no attribute 'seek'. You can only torch.load from a file that is seekable. Please pre-load the data into a buffer like io.BytesIO and try to load from it instead.

In [26]:
output_file_path = r"C:\Users\Jlngo\Deep Learning in Computer Visions\Project\train_dataset.pth"

In [27]:
torch.save(train_dataset, output_file_path)

In [28]:
output_file_path = r"C:\Users\Jlngo\Deep Learning in Computer Visions\Project\test_dataset.pth"

In [29]:
torch.save(test_dataset, output_file_path)

In [322]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPFineTuner(clip_model=model, num_styles=5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [323]:
train(model, train_dataloader, optimizer, epochs=5 , device = device)

  2%|▍                   | 106/4967 [23:13<17:44:45, 13.14s/it]


KeyboardInterrupt: 