In [None]:
# Install CLIP from the OpenAI GitHub repository, plus torch/torchvision,
# all of which are imported in the next cell.
# NOTE(review): no versions are pinned here, so a fresh install may pick up newer releases.
!pip install git+https://github.com/openai/CLIP.git
!pip install torch torchvision

In [None]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from google.colab import drive
import zipfile
import tarfile
import pandas as pd
import seaborn as sns
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image
import clip
import scipy.io
import h5py
from torchvision import transforms
import random
import time
from tqdm.notebook import tqdm
import nltk
nltk.download('punkt_tab')

In [None]:
# Mount Google Drive at /content/drive so the dataset folder below is reachable.
drive.mount('/content/drive')
# Root folder of the dataset on Drive; the annotation/eval paths below are built from it.
DATASET_PATH = '/content/drive/MyDrive/Fashion Synthesis Benchmark' #update to your directory***

# Local scratch directory; created here if it does not already exist.
LOCAL_EXTRACT_DIR = '/content/extracted'
os.makedirs(LOCAL_EXTRACT_DIR, exist_ok=True)

# MAT files read by the analysis cells below.
anno_path = os.path.join(DATASET_PATH, 'Anno/language_original.mat')
eval_path = os.path.join(DATASET_PATH, 'Eval/ind.mat')

def load_mat_file(file_path):
    """Read a MAT file with scipy and hand back its contents.

    Args:
        file_path: Path of the MAT file to read.

    Returns:
        Whatever ``scipy.io.loadmat`` returns for the file, or None when
        loading raised any exception (the error is printed, not re-raised).
    """
    contents = None
    try:
        contents = scipy.io.loadmat(file_path)
    except Exception as err:
        # Best-effort loader: report the problem and let the caller see None.
        print(f"Error loading {file_path}: {err}")
    return contents

def analyze_fashion_attributes(anno_data):
    """Build a DataFrame of per-image fashion attributes from annotation data.

    Args:
        anno_data: Mapping returned by ``load_mat_file`` for the annotation
            MAT file. The keys ``cate_new``, ``color_``, ``gender_``,
            ``sleeve_`` and ``nameList`` are read when present.

    Returns:
        pandas.DataFrame with one integer column per attribute key found
        (``category``, ``color``, ``gender``, ``sleeve``), a ``filename``
        column when ``nameList`` is present, and whatever columns
        ``add_text_labels`` adds on top.

    Raises:
        ValueError: If ``anno_data`` is None, which is what ``load_mat_file``
            returns when the file could not be read.
    """
    if anno_data is None:
        # Fail with a clear message instead of an opaque
        # "argument of type 'NoneType' is not iterable" further down.
        raise ValueError("anno_data is None; the annotation MAT file could not be loaded")

    # MAT variable name -> DataFrame column name.
    column_names = {
        'cate_new': 'category',
        'color_': 'color',
        'gender_': 'gender',
        'sleeve_': 'sleeve',
    }

    df = pd.DataFrame()
    for attr, column in column_names.items():
        if attr in anno_data:
            # Each entry is indexed with [0] to get the scalar code.
            df[column] = [int(val[0]) for val in anno_data[attr]]

    if 'nameList' in anno_data:
        # Entries may hold bytes or str; decode bytes, stringify anything else.
        df['filename'] = [
            name[0][0].decode('utf-8') if hasattr(name[0][0], 'decode') else str(name[0][0])
            for name in anno_data['nameList']
        ]

    return add_text_labels(df)

def add_text_labels(df):
    """Add text labels for numerical category, color, gender, and sleeve codes.

    For each code column present in ``df`` (``gender``, ``sleeve``,
    ``category``, ``color``) a ``<column>_text`` column is added. Codes that
    are not in the corresponding mapping get a fallback label. Code columns
    that are absent are skipped rather than raising ``KeyError``, which
    matches ``analyze_fashion_attributes`` only creating a column when its
    key exists in the annotation data.

    Args:
        df: DataFrame holding any subset of the integer code columns.
            It is modified in place.

    Returns:
        The same DataFrame object, with the text columns added.
    """
    gender_mapping = {0: 'Female', 1: 'Male'}
    sleeve_mapping = {1: 'Short', 2: 'Medium', 3: 'Long', 4: 'Other'}

    # NOTE: the category and color names are educated guesses based on the
    # dataset description, not an official code table.
    category_mapping = {
        1: 'T-Shirt', 2: 'Blouse', 3: 'Tank Top', 4: 'Sweater',
        5: 'Suit/Blazer', 6: 'Jacket', 7: 'Vest', 8: 'Coat',
        9: 'Dress Shirt', 10: 'Polo Shirt', 11: 'Cardigan', 12: 'Dress',
        13: 'Skirt', 14: 'Shorts', 15: 'Sweatshirt', 16: 'Pants/Trousers',
        17: 'Jeans', 18: 'Hoodie', 19: 'Leggings'
    }

    color_mapping = {
        1: 'Black', 2: 'White', 3: 'Blue', 4: 'Red', 6: 'Grey',
        8: 'Green', 9: 'Yellow', 12: 'Brown', 13: 'Pink', 17: 'Multicolor/Pattern'
    }

    # (code column, code -> label mapping, label used for unmapped codes)
    label_specs = [
        ('gender', gender_mapping, 'Unknown'),
        ('sleeve', sleeve_mapping, 'Unknown'),
        ('category', category_mapping, 'Category-Unknown'),
        ('color', color_mapping, 'Color-Unknown'),
    ]

    for column, mapping, fallback in label_specs:
        if column not in df.columns:
            continue  # attribute was not available in the annotation data
        df[f'{column}_text'] = df[column].map(mapping).fillna(fallback)

    return df

def visualize_dataset(df):
    """Create visualizations of the dataset.

    Shows, in order: a bar chart of the ten most frequent categories, the
    gender distribution, the sleeve-type distribution, the color distribution
    (bars tinted with the color they represent), a category-by-gender heatmap
    and a color-by-category heatmap restricted to the ten most frequent
    categories.

    Args:
        df: DataFrame with the columns ``category_text``, ``gender_text``,
            ``sleeve_text`` and ``color_text``.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    # --- Top 10 categories ---
    plt.figure(figsize=(12, 6))
    category_counts = df['category_text'].value_counts().head(10)
    sns.barplot(x=category_counts.index, y=category_counts.values)
    plt.title('Top 10 Clothing Categories')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # --- Gender distribution ---
    plt.figure(figsize=(8, 5))
    gender_counts = df['gender_text'].value_counts()
    sns.barplot(x=gender_counts.index, y=gender_counts.values)
    plt.title('Gender Distribution')
    plt.tight_layout()
    plt.show()

    # --- Sleeve distribution ---
    plt.figure(figsize=(10, 5))
    sleeve_counts = df['sleeve_text'].value_counts()
    sns.barplot(x=sleeve_counts.index, y=sleeve_counts.values)
    plt.title('Sleeve Type Distribution')
    plt.tight_layout()
    plt.show()

    # --- Color distribution ---
    color_counts = df['color_text'].value_counts()

    # Hex color used to paint the bar of each color label.
    color_map = {
        'Black': '#000000',
        'White': '#FFFFFF',
        'Blue': '#0000FF',
        'Red': '#FF0000',
        'Grey': '#808080',
        'Green': '#008000',
        'Yellow': '#FFFF00',
        'Brown': '#A52A2A',
        'Pink': '#FFC0CB',
        'Multicolor/Pattern': '#7F00FF',
        'Color-Unknown': '#CCCCCC'
    }

    # Labels missing from color_map fall back to light grey.
    bar_colors = [color_map.get(color, '#CCCCCC') for color in color_counts.index]

    # Only one figure is created for this plot; a second plt.figure() call
    # here would leave an extra, empty figure open.
    plt.figure(figsize=(12, 6))
    # NOTE(review): newer seaborn releases may warn when `palette` is passed
    # without `hue` -- confirm against the installed version.
    ax = sns.barplot(x=color_counts.index, y=color_counts.values, palette=bar_colors)
    plt.title('Color Distribution in Fashion Dataset')
    plt.xticks(rotation=45, ha='right')

    # Outline the white bar so it stays visible against the background.
    for i, color in enumerate(color_counts.index):
        if color == 'White':
            bar = ax.patches[i]
            bar.set_edgecolor('black')
            bar.set_linewidth(1)

    plt.tight_layout()
    plt.show()

    # --- Gender share within each category (rows sum to 1) ---
    plt.figure(figsize=(14, 10))
    cat_gender_cross = pd.crosstab(df['category_text'], df['gender_text'], normalize='index')
    sns.heatmap(cat_gender_cross, annot=True, cmap="YlGnBu", fmt='.2f')
    plt.title('Gender Distribution Across Categories')
    plt.tight_layout()
    plt.show()

    # --- Category share within each color, top 10 categories only ---
    plt.figure(figsize=(16, 10))
    color_cat_cross = pd.crosstab(df['color_text'], df['category_text'], normalize='index')

    # Rows are normalized over all categories before the columns are cut
    # down to the ten most frequent ones.
    top_categories = df['category_text'].value_counts().head(10).index
    color_cat_cross = color_cat_cross[top_categories]
    sns.heatmap(color_cat_cross, annot=True, cmap="YlOrRd", fmt='.2f')
    plt.title('Color Distribution Across Top 10 Categories')
    plt.tight_layout()
    plt.show()

def main():
    """Main function to run the analysis pipeline.

    Loads the annotation and eval MAT files, builds the attribute DataFrame,
    shows the dataset plots and writes the frame to
    'deepfashion_processed.csv' (relative path, no index column).

    Returns:
        The DataFrame produced by ``analyze_fashion_attributes``.
    """
    anno_data = load_mat_file(anno_path)
    # NOTE(review): eval_data is loaded but not used anywhere in this function.
    eval_data = load_mat_file(eval_path)
    # load_mat_file returns None on failure; anno_data is passed on unchecked.
    df = analyze_fashion_attributes(anno_data)
    visualize_dataset(df)
    df.to_csv('deepfashion_processed.csv', index=False)
    return df

# Run the pipeline when the cell executes and keep the resulting frame as `df`.
df = main()

In [None]:
def analyze_descriptions_with_nlp(anno_data):
    """Use NLP techniques to extract style and attribute information from descriptions.

    Reads the text descriptions stored under ``anno_data['engJ']``, lowercases
    and tokenizes them with NLTK, drops English stop words and non-alphabetic
    tokens, then prints the 20 most common words and the 15 most common
    adjacent word pairs. A word cloud is shown when the optional ``wordcloud``
    package is installed.

    Args:
        anno_data: Mapping returned by ``load_mat_file`` for the annotation
            MAT file (or None if loading failed).

    Returns:
        Tuple ``(word_counts, bigram_counts)`` of ``collections.Counter``
        objects. Two empty Counters are returned when the analysis cannot run
        (no ``engJ`` field, NLTK missing, or no usable descriptions), so
        callers can always unpack the result into two names.
    """
    from collections import Counter

    if anno_data is None or 'engJ' not in anno_data:
        print("engJ field not found in annotation data")
        return Counter(), Counter()

    try:
        import nltk
        from nltk.tokenize import word_tokenize
        from nltk.corpus import stopwords

        # Download the tokenizer / stop-word data only if it is not present.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
    except ImportError:
        print("NLTK not available. Install with: !pip install nltk")
        return Counter(), Counter()

    descriptions = []
    for item in anno_data['engJ'].flatten():
        if isinstance(item, np.ndarray) and item.size > 0:
            text = item.flatten()[0]
            if hasattr(text, 'decode'):
                try:
                    text = text.decode('utf-8')
                except UnicodeDecodeError:
                    # Skip entries that are not valid UTF-8 instead of
                    # swallowing every possible exception.
                    continue
            descriptions.append(text)
        elif isinstance(item, str):
            descriptions.append(item)

    if not descriptions:
        print("No valid descriptions found")
        return Counter(), Counter()

    print(f"Analyzing {len(descriptions)} text descriptions")

    stop_words = set(stopwords.words('english'))

    # Tokenize and filter each description once; both the word counts and the
    # bigram counts below are derived from these per-description token lists.
    filtered_per_description = [
        [
            token for token in word_tokenize(desc.lower())
            if token.isalpha() and token not in stop_words
        ]
        for desc in descriptions
    ]

    word_counts = Counter(
        token for tokens in filtered_per_description for token in tokens
    )

    print("\nMost common words in descriptions:")
    for word, count in word_counts.most_common(20):
        print(f"  {word}: {count}")

    try:
        from wordcloud import WordCloud

        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Fashion Descriptions')
        plt.show()
    except ImportError:
        print("WordCloud not available. Install with: !pip install wordcloud")

    # Bigrams are pairs of adjacent tokens *after* filtering, and never span
    # two different descriptions.
    bigram_counts = Counter(
        pair
        for tokens in filtered_per_description
        for pair in zip(tokens, tokens[1:])
    )

    print("\nMost common two-word phrases:")
    for bigram, count in bigram_counts.most_common(15):
        print(f"  {bigram[0]} {bigram[1]}: {count}")

    return word_counts, bigram_counts

# Reload both MAT files at notebook scope (main() keeps its copies local).
anno_data = load_mat_file(anno_path)
# NOTE(review): eval_data is not used in the visible cells -- confirm it is needed later.
eval_data = load_mat_file(eval_path)
# Word and bigram frequency counts over the 'engJ' text descriptions.
word_counts, bigram_counts = analyze_descriptions_with_nlp(anno_data)

**CLIP Dataset**: loads and preprocesses the images together with their text descriptions.

In [None]:
def load_mat_file(file_path):
  """Read a MAT file with scipy and hand back its contents.

  Behaves the same as the ``load_mat_file`` defined in the attribute-analysis
  cell earlier in this notebook; it is repeated here so this section can be
  run on its own.

  Args:
    file_path: Path of the MAT file to read.

  Returns:
    Whatever ``scipy.io.loadmat`` returns for the file, or None when loading
    raised any exception (the error is printed, not re-raised).
  """
  contents = None
  try:
    contents = scipy.io.loadmat(file_path)
  except Exception as err:
    # Best-effort loader: report the problem and let the caller see None.
    print(f"Error loading {file_path}: {err}")
  return contents

In [None]:
class CLIPDataset(Dataset):
  """Dataset pairing images from an HDF5 file with text from a MAT annotation file.

  Images are read from the ``"ih"`` dataset of the HDF5 file and texts from
  the ``engJ`` field of the MAT file. Each item is
  ``(transformed_image, text, idx)``.

  Args:
    anno_mat_path: Path to the annotation MAT file containing ``engJ``.
    image_h5_path: Path to the HDF5 file containing the ``"ih"`` image dataset.
    transform: Callable applied to the PIL image. When None, the preprocess
      function returned by ``clip.load("ViT-B/32")`` is used.

  Raises:
    ValueError: If the annotation file cannot be loaded or has no ``engJ``.
  """

  def __init__(self, anno_mat_path, image_h5_path, transform=None):
    # clip.load returns (model, preprocess); both are kept as attributes.
    # NOTE(review): the model is loaded even when a custom transform is
    # supplied -- confirm whether self.model is needed by callers.
    self.model, self.preprocess = clip.load("ViT-B/32")
    self.transform = self.preprocess if transform is None else transform
    self.file_path = image_h5_path
    self.anno_mat_path = anno_mat_path

    anno_data = load_mat_file(self.anno_mat_path)
    if anno_data is None:
      # load_mat_file returns None on failure; fail here with a clear message
      # rather than an AttributeError on None.get below.
      raise ValueError(f"Could not load annotation file: {self.anno_mat_path}")

    # The HDF5 dataset is opened lazily on first __getitem__ call.
    self.images = None
    self.texts = anno_data.get('engJ', None)
    if self.texts is None:
      raise ValueError("No 'engJ' found in .mat file.")
    self.original_texts = [text[0] for text in self.texts]

  def __len__(self):
    """Return the number of text entries."""
    return len(self.texts)

  @staticmethod
  def _to_uint8_hwc(raw_image):
    """Convert a raw image array into a uint8 array suitable for PIL.

    If the first axis has length 3 the axes are reversed with
    ``np.transpose(..., (2, 1, 0))`` (presumably channels-first to
    channels-last -- confirm against the HDF5 layout). Floating-point images
    of any precision whose maximum is <= 1.0 are scaled by 255; any other
    non-uint8 array is cast directly.
    """
    if raw_image.shape[0] == 3:
      raw_image = np.transpose(raw_image, (2, 1, 0))
    if np.issubdtype(raw_image.dtype, np.floating) and raw_image.max() <= 1.0:
      # Covers float16/float32/float64; casting such data straight to uint8
      # would truncate nearly every pixel to 0.
      raw_image = (raw_image * 255).astype(np.uint8)
    elif raw_image.dtype != np.uint8:
      raw_image = raw_image.astype(np.uint8)
    return raw_image

  def __getitem__(self, idx):
    """Return ``(transformed_image, text, idx)`` for the given index."""
    text = self.texts[idx][0][0]
    if self.images is None:
      self.images = h5py.File(self.file_path, 'r')["ih"]
    pil_image = Image.fromarray(self._to_uint8_hwc(self.images[idx]))
    transformed_image = self.transform(pil_image)
    return transformed_image, text, idx

In [None]:
if __name__ == "__main__":
  # Seed Python's and torch's RNGs so the random subset below is repeatable.
  random.seed(42)
  torch.manual_seed(42)
  # NOTE(review): only MPS and CPU are considered here; CUDA is never selected
  # even if available -- confirm this is intended, given the Colab-style paths below.
  device = "mps" if torch.backends.mps.is_available() else "cpu"
  print(f"Using device: {device}")

  # NOTE(review): CLIPDataset.__init__ also calls clip.load("ViT-B/32"),
  # so the model ends up being loaded twice.
  model, _ = clip.load("ViT-B/32", device=device)


  # NOTE(review): these hardcoded paths differ from DATASET_PATH defined earlier in the notebook.
  full_dataset = CLIPDataset("/content/drive/MyDrive/DeepLearning/TeamProject/Fashion Synthesis Benchmark/Anno/language_original.mat", "/content/drive/MyDrive/DeepLearning/TeamProject/G2.h5")
  total_len = len(full_dataset)
  subset_size = int(total_len * 0.30)

  print(f"Full dataset size: {total_len}")
  print(f"Evaluating on a 30% subset: {subset_size} samples")

  # Draw subset_size distinct indices (random.sample samples without replacement).
  subset_indices = random.sample(range(total_len), subset_size)
  subset_dataset = Subset(full_dataset, subset_indices)

  batch_size = 16
  # The custom collate_fn stacks the image tensors into one batch tensor and
  # keeps the texts and dataset indices as plain Python lists.
  # NOTE(review): a lambda collate_fn combined with num_workers > 0 may fail to
  # pickle on platforms that start workers with "spawn" -- verify on the target OS.
  dataloader = DataLoader(
      subset_dataset,
      batch_size=batch_size,
      shuffle=False,
      num_workers=2,
      collate_fn=lambda batch: (
      torch.stack([item[0] for item in batch]),
      [item[1] for item in batch],
      [item[2] for item in batch]
    )
  )
