In [1]:
import pandas as pd
from PIL import Image, ImageOps

# Read the tab-separated train and test splits of the exercise dataset.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../data/exercise_train.tsv', '../data/exercise_test.tsv')
)

Let's have a look at our images. First, I'd like to check the image sizes and shapes and whether they are 
already uniform and standardized. If they are not, we'll have to decide how to handle that, e.g. by cropping, 
resizing, or downscaling the resolution. 

In [2]:
# Preview the first five rows to see which columns are available.
train.head(n=5)

Unnamed: 0,productId,gender,description,imageURL1,imageURL2,imageURL3,imageURL4,name,productType,pattern,productIdentifier
0,1026288,Women,Lightweight dress by ASOS CURVE;Embroidery and...,images.asos-media.com/products/asos-curve-embe...,images.asos-media.com/products/asos-curve-embe...,images.asos-media.com/products/asos-curve-embe...,images.asos-media.com/products/asos-curve-embe...,ASOS CURVE Embellished Neck Dress,Dresses,Embellished,23b469431f1ef587a931d7811a6aa914
1,7601668,Women,Lightly-textured stretch fabric;Leopard print;...,images.asos-media.com/products/asos-mini-dress...,images.asos-media.com/products/asos-mini-dress...,images.asos-media.com/products/asos-mini-dress...,images.asos-media.com/products/asos-mini-dress...,ASOS Mini Dress in Leopard Print With Frill Wa...,Dresses,Animal,133353816a459ef91d87fdee335f3c36
2,7550376,Women,Cotton-rich fabric;Contains stretch for comfor...,images.asos-media.com/products/h-by-henry-holl...,images.asos-media.com/products/h-by-henry-holl...,images.asos-media.com/products/h-by-henry-holl...,images.asos-media.com/products/h-by-henry-holl...,H! By Henry Holland Zebra Print Ruffle Dress,Dresses,Animal,676004bfc416bcb360702352b0f34c70
3,4519370,Women,"Lightweight, woven fabric;Boat neckline;Box pl...",images.asos-media.com/products/asos-reclaimed-...,images.asos-media.com/products/asos-reclaimed-...,images.asos-media.com/products/asos-reclaimed-...,images.asos-media.com/products/asos-reclaimed-...,ASOS Reclaimed Vintage Smudge Floral Midi Dress,Dresses,Floral,f9db7e1b620236c58ef0d8cb92f14abb
4,6436058,Women,Woven fabric;Bandeau neckline;Off-shoulder des...,images.asos-media.com/products/asos-off-the-sh...,images.asos-media.com/products/asos-off-the-sh...,images.asos-media.com/products/asos-off-the-sh...,images.asos-media.com/products/asos-off-the-sh...,ASOS Off The Shoulder Midi Prom Dress In Brig...,Dresses,Floral,cbde47b8c6b84803516b3285bfcbd4f3


In [16]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3049 entries, 0 to 3048
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   productId          3049 non-null   int64 
 1   gender             3049 non-null   object
 2   description        3049 non-null   object
 3   imageURL1          3049 non-null   object
 4   imageURL2          3049 non-null   object
 5   imageURL3          3049 non-null   object
 6   imageURL4          3049 non-null   object
 7   name               3049 non-null   object
 8   productType        3049 non-null   object
 9   pattern            3049 non-null   object
 10  productIdentifier  3049 non-null   object
dtypes: int64(1), object(10)
memory usage: 262.1+ KB


In [15]:
import requests
from io import BytesIO

# Collect the distinct (width, height) pairs of the primary product images to
# check whether the dataset is already uniformly sized.
sizes = set()

# A single Session reuses the HTTP connection across the per-row requests
# instead of opening a fresh connection for every image.
with requests.Session() as session:
    for image_url in train['imageURL1'].values:
        try:
            # requests applies no timeout by default; bound the wait (seconds)
            # so one stalled connection cannot hang the whole cell.
            response = session.get(f'https://{image_url}', timeout=10)
        except requests.RequestException as exc:
            # Report and move on so one network error does not throw away
            # the sizes gathered so far.
            print("Failed to fetch the image. Error:", exc, "URL:", image_url)
            continue

        if response.status_code == 200:
            # Close each image as soon as its size has been recorded.
            with Image.open(BytesIO(response.content)) as image:
                sizes.add(image.size)
        else:
            # Include the URL so the failing row can be identified.
            print("Failed to fetch the image. Status code:", response.status_code, "URL:", image_url)
print(sizes)

{(314, 400)}


In [13]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image

# Feature extractor: a pre-trained ResNet-50 with its last child module (the
# fully connected layer at the top) dropped, switched to evaluation mode.
resnet50 = nn.Sequential(
    *list(models.resnet50(pretrained=True).children())[:-1]
).eval()

# Input pipeline applied to every PIL image before it is fed to the model:
# resize -> centre crop -> tensor conversion -> per-channel normalisation.
preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

def extract_image_embedding(image_url, timeout=10):
    """Download an image and return its embedding from the headless ResNet-50.

    Args:
        image_url: Full URL (including scheme) of the image to embed.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        numpy.ndarray: The model output for the image with all singleton
        dimensions squeezed out (shape ``(2048,)`` for the module-level
        ``resnet50`` feature extractor).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail loudly on a bad response; previously a non-200 status left `image`
    # unassigned and surfaced as an unrelated UnboundLocalError.
    response.raise_for_status()

    # Force three channels: the normalisation in `preprocess` is defined for
    # RGB, so grayscale / RGBA / palette images must be converted first.
    image = Image.open(BytesIO(response.content)).convert('RGB')
    batch = preprocess(image).unsqueeze(0)  # Add batch dimension

    # Forward pass without gradient tracking; only the features are needed.
    with torch.no_grad():
        features = resnet50(batch)

    # Drop the singleton dimensions and convert to a numpy array.
    return features.squeeze().numpy()

# Smoke test: embed the first training image and report the vector's shape.
image_url = 'https://{}'.format(train['imageURL1'].iloc[0])
embedding = extract_image_embedding(image_url)
print("Image embedding shape:", embedding.shape)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/achilleasatha/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
24.0%IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

56.0%IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

88.1%IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rat

Image embedding shape: (2048,)
