In [530]:
import os
import random
import time
from io import BytesIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


## 1- Data Scraping 

In [74]:
def soup_with_selenium(url, scroll=False, scroll_pause=30.0, settle_pause=5.0, max_scrolls=None):
    """Load ``url`` in a fresh Chrome session and return the rendered page as soup.

    Parameters
    ----------
    url : str
        Page to open.
    scroll : bool, default False
        When True, keep scrolling to the bottom of the page until
        ``document.body.scrollHeight`` stops growing, so that lazily loaded
        content ends up in the page source.
    scroll_pause : float, default 30.0
        Seconds to wait after every scroll before measuring the height again.
    settle_pause : float, default 5.0
        Seconds to wait before the page source is read.
    max_scrolls : int or None, default None
        Optional upper bound on the number of scroll steps. ``None`` keeps
        scrolling until the height is stable.

    Returns
    -------
    bs4.BeautifulSoup
        The page source parsed with ``html.parser``.
    """
    # Start a new instance of Chrome WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    # The browser must be closed even when navigation or a script call fails,
    # otherwise every failed attempt leaves a Chrome process behind.
    try:
        driver.get(url)

        if scroll:
            last_height = driver.execute_script("return document.body.scrollHeight")
            scrolls_done = 0

            while max_scrolls is None or scrolls_done < max_scrolls:
                # Jump to the bottom to trigger loading of additional content
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                scrolls_done += 1

                # Give the page time to append the next batch
                time.sleep(scroll_pause)

                new_height = driver.execute_script("return document.body.scrollHeight")

                # An unchanged height means nothing new was loaded
                if new_height == last_height:
                    break

                last_height = new_height

        time.sleep(settle_pause)

        page_source = driver.page_source
    finally:
        driver.quit()

    # Parse the page source with BeautifulSoup
    return BeautifulSoup(page_source, 'html.parser')

In [75]:
def fetch_product_and_combi(url):
    """Scrape a catalogue page into a frame of (product, combination) image pairs.

    The page is loaded through ``soup_with_selenium`` with scrolling enabled.
    Every ``<li>`` inside the ``product-groups`` container that holds at least
    two ``<img>`` tags contributes one row: the second-to-last image is taken
    as the product and the last image as its combination.

    Parameters
    ----------
    url : str
        Catalogue page to scrape.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_link``, ``type_color``, ``combi_image_link`` and
        ``combi_type_color`` (``src`` and ``alt`` of the two images). The
        columns are present even when no row was found.

    Raises
    ------
    ValueError
        If the expected layout / ``product-groups`` containers are not in the
        page, instead of failing with an ``AttributeError`` on ``None``.
    """
    columns = ['image_link', 'type_color', 'combi_image_link', 'combi_type_color']

    # Call soup_with_selenium with scroll=True to ensure all content is loaded
    soup = soup_with_selenium(url, scroll=True)

    layout = soup.find('div', attrs={'class': 'layout-content layout-catalog-content--full'})
    groups = layout.find('div', attrs={'class': 'product-groups'}) if layout is not None else None
    if groups is None:
        raise ValueError(f"Product list container not found in page: {url}")

    data = []

    for product in groups.find_all('li'):
        image_box = product.find_all('img')
        # Entries with fewer than two images cannot form a pair
        if len(image_box) < 2:
            continue

        product_img, combi_img = image_box[-2], image_box[-1]
        data.append({'image_link': product_img.get('src'),
                     'type_color': product_img.get('alt'),
                     'combi_image_link': combi_img.get('src'),
                     'combi_type_color': combi_img.get('alt')})

    return pd.DataFrame(data, columns=columns)


In [22]:
pd.set_option('display.max_colwidth', None)

In [76]:
url = 'https://www.zara.com/nl/en/woman-co-ords-l1061.html?v1=2353302'
data = fetch_product_and_combi(url)


In [84]:
data.to_csv('Zara data/zara_data.csv')

## 2- Data cleaning

In [434]:
zara_data = pd.read_csv('Zara data/zara_data.csv')

In [435]:
zara_data.head()

Unnamed: 0.1,Unnamed: 0,image_link,type_color,combi_image_link,combi_type_color
0,0,https://static.zara.net/assets/public/a2ec/f631/60974c5ea7b9/7c64b92ad2b0/04786046515-e1/04786046515-e1.jpg?ts=1709907074921&w=400,ZW JACQUARD OVERSIZE SHIRT - Olive green by Zara,https://static.zara.net/assets/public/6e2f/19f4/78ae49238427/60a14ac7fa35/04786060515-e1/04786060515-e1.jpg?ts=1709907072415&w=400,ZW COLLECTION JACQUARD TROUSERS - Olive green by Zara
1,1,https://static.zara.net/assets/public/6fb2/5120/918e4cb3845c/5f0bdf866a67/02902066400-e1/02902066400-e1.jpg?ts=1709907073646&w=400,ZW COLLECTION PRINTED SHIRT - Blue by Zara,https://static.zara.net/assets/public/b7a5/cec3/0ea446bfb2ae/a57112fdb027/02701066400-e1/02701066400-e1.jpg?ts=1709907073049&w=400,ZW COLLECTION PRINTED PYJAMA-STYLE TROUSERS - Blue by Zara
2,2,https://static.zara.net/assets/public/dcdc/5a19/ca9945d68bf5/e4a441057fd7/02328125800-e1/02328125800-e1.jpg?ts=1709736490024&w=600,ZW COLLECTION POLKA DOT PRINT SHIRT - Black by Zara,https://static.zara.net/assets/public/0720/4bf1/01a443d6a012/48447187386f/02942125800-e1/02942125800-e1.jpg?ts=1709736484873&w=600,ZW COLLECTION HIGH-WAIST POLKA DOT SHORTS - Black by Zara
3,3,https://static.zara.net/assets/public/51e4/7936/50504c12b185/eeba48ec818d/04437048400-e1/04437048400-e1.jpg?ts=1709736491463&w=600,ZW COLLECTION LONG SHIRT - Blue by Zara,https://static.zara.net/assets/public/47b3/8f3a/26924f498860/49ce9d4b9542/04437050400-e1/04437050400-e1.jpg?ts=1709736488715&w=600,ZW COLLECTION STRAIGHT-FIT TROUSERS - Blue by Zara
4,4,https://static.zara.net/assets/public/c33d/90d4/c9094f628b1f/e7a2d7aa68b3/02183044051-e1/02183044051-e1.jpg?ts=1709736496249&w=600,ZW COLLECTION ANIMAL PRINT SHIRT - Leopard by Zara,https://static.zara.net/assets/public/9dfa/eeba/0d664b459b89/8c639521d238/02183045051-e1/02183045051-e1.jpg?ts=1709736494141&w=600,ZW COLLECTION ANIMAL PRINT TROUSERS - Leopard by Zara


In [436]:
zara_data.drop(columns='Unnamed: 0', inplace=True)

Cleaning the `type` column

In [437]:
zara_data['type']=zara_data['type_color']
zara_data['color']=zara_data['type_color']

In [438]:
def find_type(zara_data, types, column='type'):
    """Collapse free-text cells of ``column`` to the first garment label they contain.

    The frame is modified in place. Labels are tried in the order given; a cell
    that contains a label (case-insensitive, literal substring) is overwritten
    with that label, so later labels are tested against the already shortened
    value.

    Parameters
    ----------
    zara_data : pandas.DataFrame
        Frame holding the text column.
    types : iterable of str
        Labels to look for, in priority order.
    column : str, default 'type'
        Name of the column to rewrite.
    """
    for type_ in types:
        # na=False keeps missing alt texts out of the mask; without it the mask
        # contains NaN and .loc refuses to index with it. regex=False treats the
        # label as plain text rather than as a pattern.
        matches = zara_data[column].str.contains(type_, case=False, regex=False, na=False)
        zara_data.loc[matches, column] = type_

# Garment labels searched for in the alt text, in priority order.
# NOTE: find_type overwrites a cell with the first label it contains, so a
# label that contains an earlier label can never be assigned: 'T-SHIRT' and
# 'SWEATSHIRT' are caught by 'SHIRT', and 'WAISTCOAT' by 'COAT'.
types = ['SKIRT', 'CARDIGAN', 'SWEATER', 'TOP', 'BODYSUIT', 'SHIRT',
         'T-SHIRT', 'JEAN', 'TROUSER', 'SHOE', 'KNITWEAR', 'SWEATSHIRT',
         'TRENCH', 'COAT', 'WAISTCOAT', 'SHORT', 'SKORT', 'BAG', 'BEACHWEAR', 'BLAZER', 'HOODIE','JACKET', 'BLOUSE']

# Rewrites zara_data['type'] in place.
find_type(zara_data, types)

In [501]:
zara_data['type'].value_counts()

Shirts       51
Tops         46
Sweaters     21
Hoodie       14
Blazers      12
Coats        11
Jackets      10
Skirts        5
Trousers      3
Cardigans     3
Blouses       2
Shorts        1
Name: type, dtype: int64

In [443]:
# Order matters here: Series.map turns every value that is not a key of the
# mapping into NaN, so the rows to discard and the one hand-labelled product
# have to be dealt with while `type` still holds the raw alt text.

# 1) Drop the entries that are excluded from the dataset.
excluded_alt_texts = ['LOOK - 600 by Zara',
                      'LOOK - 180 by Zara',
                      'WOOL BRA WITH LACE TRIM - Grey marl by Zara']
zara_data.drop(zara_data[zara_data['type'].isin(excluded_alt_texts)].index, inplace=True)

# 2) Label the product that none of the search terms matched. The result is
#    assigned back to the column: calling replace(..., inplace=True) on
#    zara_data['type'] is chained assignment and does not reach the frame
#    when pandas Copy-on-Write is active.
zara_data['type'] = zara_data['type'].replace({'CONTRAST SATIN BUSTIER - Black by Zara': 'TOP'})

# 3) Translate the search labels into the final category names.
zara_data['type'] = zara_data['type'].map({'TROUSER':'Trousers', 'SKIRT':'Skirts', 'JOGGERS': 'Joggers',
                                                 'LEGGINGS': 'Trousers', 'SKORT':'Shorts', 'SHORT':'Shorts', 
                                                 'TOP':'Tops', 'SHIRT':'Shirts', 'COAT':'Coats',
                                                 'JEAN':'Jeans', 'HOODIE':'Hoodie','SWEATER':'Sweaters',
                                                 'BLAZER':'Blazers','JACKET':'Jackets','BODYSUIT':'Tops',
                                                 'CARDIGAN':'Cardigans','BLOUSE':'Blouses'})

Cleaning the `color` column

In [445]:
def combine_colors(zara_data, colors_to_combine, column='color'):
    """Collapse free-text cells of ``column`` to the first base colour they contain.

    The frame is modified in place. Colours are tried in the order given and a
    matching cell is overwritten with the colour name as spelled in the list.

    NOTE: the test is a case-insensitive substring match over the whole cell,
    product name included. A short colour name can therefore match inside an
    unrelated word (``'Red'`` is a substring of ``'EMBROIDERED'``), and the
    earlier entry of the list wins.

    Parameters
    ----------
    zara_data : pandas.DataFrame
        Frame holding the text column.
    colors_to_combine : iterable of str
        Base colour names, in priority order.
    column : str, default 'color'
        Name of the column to rewrite.
    """
    for color in colors_to_combine:
        # na=False: cells with a missing alt text simply do not match, instead
        # of producing a NaN mask that .loc rejects.
        matches = zara_data[column].str.contains(color, case=False, regex=False, na=False)
        zara_data.loc[matches, column] = color
        
colors_to_combine = ['Blue', 'Red', 'Green', 'Yellow', 'White', 'Grey', 'Brown', 'Orange', 'Pink', 'Black', 'Purple','Navy', 'Ecru', 'Silver',
                     'Khaki', 'Golden', 'camel', 'Sand']
combine_colors(zara_data, colors_to_combine)

In [500]:
zara_data['color'].value_counts()

Black     37
Grey      33
Red       22
Ecru      21
Blue      16
White     14
Brown     11
Green      9
Pink       4
Khaki      3
Silver     3
Sand       3
Gold       1
Purple     1
Beige      1
Name: color, dtype: int64

In [447]:
def process_color(color, known_colors=None):
    """Return the text after the first hyphen of ``color`` when it is not a known colour.

    Cells without a hyphen, and cells whose second hyphen-separated part is
    already one of the known colours, are returned unchanged. Non-string cells
    (e.g. NaN for a missing alt text) are passed through untouched.

    NOTE: the split is on every ``'-'`` and only the second part is kept, so
    for a hyphenated product name the result is a fragment of the name rather
    than the colour (``'CO-ORD TOP - Ice by Zara'`` gives ``'ORD TOP'``). The
    cells that follow patch such fragments by hand.

    Parameters
    ----------
    color : object
        Cell value, normally the remaining alt text.
    known_colors : container of str, optional
        Colours that count as already clean. Defaults to the notebook-level
        ``colors_to_combine`` list.

    Returns
    -------
    object
        The extracted fragment, or ``color`` itself.
    """
    if not isinstance(color, str):
        return color

    parts = color.split('-')
    if len(parts) < 2:
        return color

    candidate = parts[1].strip()
    if known_colors is None:
        known_colors = colors_to_combine
    return color if candidate in known_colors else candidate

zara_data['color'] = zara_data['color'].apply(process_color)


In [448]:
# Hand-made corrections for the `color` values that the automatic steps left
# behind. Each one was decided by looking at the product image of the rows,
# e.g. zara_data[zara_data['color'] == 'Ice by Zara']['image_link'].
#
# The corrected Series is assigned back to the column: calling
# replace(..., inplace=True) on zara_data['color'] is chained assignment and
# does not reach the frame when pandas Copy-on-Write is active.
color_fixes = {
    'Ice by Zara': 'Ecru',
    'DYE EFFECT': 'Brown',
    'camel': 'Ecru',
    'ORD': 'White',
    'LEG DARTED TROUSERS CO': 'Black',
    'Stone by Zara': 'Ecru',
    'Beige by Zara': 'Beige',
    'Indigo by Zara': 'Blue',
    'Tan marl by Zara': 'Brown',
    '807 by Zara': 'Black',
    '803 by Zara': 'Grey',
    '922 by Zara': 'Black',
    'Leopard by Zara': 'Brown',
    '222 by Zara': 'Silver',
    'Golden': 'Gold',
}
zara_data['color'] = zara_data['color'].replace(color_fixes)

# 'Mink by Zara' (rows 83, 127, 182) and 'LEG TROUSERS CO' (rows 160, 162) do
# not map to a single colour, so those rows are set one by one. The keys are
# index labels of zara_data and depend on the scraped data set.
row_color_fixes = {83: 'Ecru', 127: 'Ecru', 182: 'Brown', 160: 'Black', 162: 'Brown'}
for row_label, fixed_color in row_color_fixes.items():
    zara_data.at[row_label, 'color'] = fixed_color

In [466]:
zara_data.drop(columns='type_color', inplace=True)

Cleaning the `combi_type` column

In [470]:
zara_data['combi_type']=zara_data['combi_type_color']
zara_data['combi_color']=zara_data['combi_type_color']

In [471]:
def find_combi_type(zara_data, types, column='combi_type'):
    """Collapse free-text cells of ``column`` to the first garment label they contain.

    The frame is modified in place. Labels are tried in the order given; a cell
    that contains a label (case-insensitive, literal substring) is overwritten
    with that label, so later labels are tested against the already shortened
    value.

    Parameters
    ----------
    zara_data : pandas.DataFrame
        Frame holding the text column.
    types : iterable of str
        Labels to look for, in priority order.
    column : str, default 'combi_type'
        Name of the column to rewrite.
    """
    for type_ in types:
        # na=False keeps missing alt texts out of the mask; without it the mask
        # contains NaN and .loc refuses to index with it. regex=False treats the
        # label as plain text rather than as a pattern.
        matches = zara_data[column].str.contains(type_, case=False, regex=False, na=False)
        zara_data.loc[matches, column] = type_

# Garment labels searched for in the combination alt text, in priority order
# (the list used for `type`, plus 'JOGGERS' and 'LEGGINGS').
# NOTE: find_combi_type overwrites a cell with the first label it contains, so
# a label that contains an earlier label can never be assigned: 'T-SHIRT' and
# 'SWEATSHIRT' are caught by 'SHIRT', and 'WAISTCOAT' by 'COAT'.
types = ['SKIRT', 'CARDIGAN', 'SWEATER', 'TOP', 'BODYSUIT', 'SHIRT',
         'T-SHIRT', 'JEAN', 'TROUSER', 'SHOE', 'KNITWEAR', 'SWEATSHIRT',
         'TRENCH', 'COAT', 'WAISTCOAT', 'SHORT', 'SKORT', 'BAG', 'BEACHWEAR',
         'BLAZER', 'HOODIE','JACKET', 'BLOUSE', 'JOGGERS','LEGGINGS']

# Rewrites zara_data['combi_type'] in place.
find_combi_type(zara_data, types)

In [472]:
zara_data[zara_data['combi_type']=='KNIT CULOTTES - Grey by Zara']['combi_image_link']
# Assign the result back: replace(..., inplace=True) on zara_data['combi_type']
# is chained assignment and does not reach the frame under pandas Copy-on-Write.
zara_data['combi_type'] = zara_data['combi_type'].replace({'KNIT CULOTTES - Grey by Zara': 'LEGGINGS'})

In [473]:
zara_data.drop(zara_data[zara_data['combi_type'] == 'POLYAMIDE STRETCH BODYCON BRIEFS - Light grey by Zara'].index, inplace=True)
zara_data.drop(zara_data[zara_data['combi_type'] == 'TEXTURED SHIMMERY BRIEFS - Light green by Zara'].index, inplace=True)

In [499]:
zara_data['combi_type'].value_counts()

Trousers              89
Skirts                58
Joggers               13
Shorts                 9
Tops                   3
Blouses and shirts     3
Coats                  2
Jeans                  1
Hoodie                 1
Name: combi_type, dtype: int64

In [475]:
zara_data['combi_type'] = zara_data['combi_type'].map({'TROUSER':'Trousers', 'SKIRT':'Skirts', 'JOGGERS': 'Joggers',
                                                 'LEGGINGS': 'Trousers', 'SKORT':'Shorts', 'SHORT':'Shorts', 
                                                 'TOP':'Tops', 'SHIRT':'Blouses and shirts', 'COAT':'Coats',
                                                 'JEAN':'Jeans', 'HOODIE':'Hoodie'})


Cleaning the `combi_color` column

In [477]:
def combine_combi_colors(zara_data, colors_to_combine, column='combi_color'):
    """Collapse free-text cells of ``column`` to the first base colour they contain.

    The frame is modified in place. Colours are tried in the order given and a
    matching cell is overwritten with the colour name as spelled in the list.

    NOTE: the test is a case-insensitive substring match over the whole cell,
    product name included. A short colour name can therefore match inside an
    unrelated word (``'Red'`` is a substring of ``'EMBROIDERED'``), and the
    earlier entry of the list wins.

    Parameters
    ----------
    zara_data : pandas.DataFrame
        Frame holding the text column.
    colors_to_combine : iterable of str
        Base colour names, in priority order.
    column : str, default 'combi_color'
        Name of the column to rewrite.
    """
    for color in colors_to_combine:
        # na=False: cells with a missing alt text simply do not match, instead
        # of producing a NaN mask that .loc rejects.
        matches = zara_data[column].str.contains(color, case=False, regex=False, na=False)
        zara_data.loc[matches, column] = color
        
colors_to_combine = ['Blue', 'Red', 'Green', 'Yellow', 'White', 'Grey', 'Brown', 'Orange', 'Pink', 'Black', 'Purple','Navy', 'Ecru', 'Silver',
                     'Khaki', 'Golden', 'camel', 'Sand']
combine_combi_colors(zara_data, colors_to_combine)

In [478]:
def process_combi_color(color, known_colors=None):
    """Return the text after the first hyphen of ``color`` when it is not a known colour.

    Cells without a hyphen, and cells whose second hyphen-separated part is
    already one of the known colours, are returned unchanged. Non-string cells
    (e.g. NaN for a missing alt text) are passed through untouched.

    NOTE: the split is on every ``'-'`` and only the second part is kept, so
    for a hyphenated product name the result is a fragment of the name rather
    than the colour (``'WIDE-LEG TROUSERS - Ice by Zara'`` gives
    ``'LEG TROUSERS'``). The cells that follow patch such fragments by hand.

    Parameters
    ----------
    color : object
        Cell value, normally the remaining alt text.
    known_colors : container of str, optional
        Colours that count as already clean. Defaults to the notebook-level
        ``colors_to_combine`` list.

    Returns
    -------
    object
        The extracted fragment, or ``color`` itself.
    """
    if not isinstance(color, str):
        return color

    parts = color.split('-')
    if len(parts) < 2:
        return color

    candidate = parts[1].strip()
    if known_colors is None:
        known_colors = colors_to_combine
    return color if candidate in known_colors else candidate

zara_data['combi_color'] = zara_data['combi_color'].apply(process_combi_color)


In [498]:
zara_data['combi_color'].value_counts()

Grey      34
Black     33
Ecru      24
Red       23
Blue      17
White     12
Brown      9
Green      8
Silver     5
Khaki      4
Pink       4
Sand       3
Gold       1
Purple     1
Beige      1
Name: combi_color, dtype: int64

In [480]:
# Hand-made corrections for the `combi_color` values that the automatic steps
# left behind. Each one was decided by looking at the combination image of the
# rows, e.g. zara_data[zara_data['combi_color'] == 'Beige by Zara']['combi_image_link'].
#
# The corrected Series is assigned back to the column: calling
# replace(..., inplace=True) on zara_data['combi_color'] is chained assignment
# and does not reach the frame when pandas Copy-on-Write is active.
combi_color_fixes = {
    'Beige by Zara': 'Beige',
    'Golden': 'Gold',
    'camel': 'Ecru',
    'Tan marl by Zara': 'Brown',
    'LEG JOGGER TROUSERS': 'Ecru',
    'Leopard by Zara': 'Brown',
    'DYE SKIRT': 'Brown',
    '222 by Zara': 'Silver',
    'Stone by Zara': 'Ecru',
    'Indigo by Zara': 'Blue',
    'LEG TROUSERS': 'Brown',
}
zara_data['combi_color'] = zara_data['combi_color'].replace(combi_color_fixes)

# 'Mink by Zara' does not map to a single colour, so rows 127 and 182 are set
# one by one. The keys are index labels of zara_data and depend on the scraped
# data set.
row_combi_color_fixes = {127: 'Ecru', 182: 'Brown'}
for row_label, fixed_color in row_combi_color_fixes.items():
    zara_data.at[row_label, 'combi_color'] = fixed_color

In [494]:
zara_data.drop(columns='combi_type_color', inplace=True)

In [496]:
desired_columns_order = ['image_link', 'type', 'color', 'combi_image_link', 'combi_type', 'combi_color']

zara_data = zara_data[desired_columns_order]


In [502]:
zara_data.reset_index(drop=True, inplace=True)
zara_data.to_csv('Zara data/cleaned_Zara_data.csv')

In [507]:
zara_data.head()

Unnamed: 0,image_link,type,color,combi_image_link,combi_type,combi_color
0,https://static.zara.net/assets/public/a2ec/f631/60974c5ea7b9/7c64b92ad2b0/04786046515-e1/04786046515-e1.jpg?ts=1709907074921&w=400,Shirts,Green,https://static.zara.net/assets/public/6e2f/19f4/78ae49238427/60a14ac7fa35/04786060515-e1/04786060515-e1.jpg?ts=1709907072415&w=400,Trousers,Green
1,https://static.zara.net/assets/public/6fb2/5120/918e4cb3845c/5f0bdf866a67/02902066400-e1/02902066400-e1.jpg?ts=1709907073646&w=400,Shirts,Blue,https://static.zara.net/assets/public/b7a5/cec3/0ea446bfb2ae/a57112fdb027/02701066400-e1/02701066400-e1.jpg?ts=1709907073049&w=400,Trousers,Blue
2,https://static.zara.net/assets/public/dcdc/5a19/ca9945d68bf5/e4a441057fd7/02328125800-e1/02328125800-e1.jpg?ts=1709736490024&w=600,Shirts,Black,https://static.zara.net/assets/public/0720/4bf1/01a443d6a012/48447187386f/02942125800-e1/02942125800-e1.jpg?ts=1709736484873&w=600,Shorts,Black
3,https://static.zara.net/assets/public/51e4/7936/50504c12b185/eeba48ec818d/04437048400-e1/04437048400-e1.jpg?ts=1709736491463&w=600,Shirts,Blue,https://static.zara.net/assets/public/47b3/8f3a/26924f498860/49ce9d4b9542/04437050400-e1/04437050400-e1.jpg?ts=1709736488715&w=600,Trousers,Blue
4,https://static.zara.net/assets/public/c33d/90d4/c9094f628b1f/e7a2d7aa68b3/02183044051-e1/02183044051-e1.jpg?ts=1709736496249&w=600,Shirts,Brown,https://static.zara.net/assets/public/9dfa/eeba/0d664b459b89/8c639521d238/02183045051-e1/02183045051-e1.jpg?ts=1709736494141&w=600,Trousers,Brown


## 3- Generating synthetic data

In [513]:
# Same spelling as the file written by the cleaning section
# ('cleaned_Zara_data.csv'); the lower-case name is not found on a
# case-sensitive filesystem. index_col=0 reads the saved index back as the
# index instead of as an 'Unnamed: 0' column.
zara_syn_data = pd.read_csv('Zara data/cleaned_Zara_data.csv', index_col=0)

def generate_synthetic_data(data, num_samples, seed=None):
    """Build random (product, combination) pairs from two different rows of ``data``.

    For every sample two distinct row positions are drawn: the product columns
    (``image_link``, ``type``, ``color``) come from the first row and the
    combination columns (``combi_image_link``, ``combi_type``, ``combi_color``)
    from the second.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the six columns listed above.
    num_samples : int
        Number of pairs to generate.
    seed : int, optional
        Seed for a private random generator, making the output reproducible.
        When omitted the module-level ``random`` state is used.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` rows with the same six columns.

    Raises
    ------
    ValueError
        If samples are requested from fewer than two rows; two different rows
        cannot be drawn in that case.
    """
    product_columns = ['image_link', 'type', 'color']
    combi_columns = ['combi_image_link', 'combi_type', 'combi_color']

    data_len = len(data)
    if num_samples > 0 and data_len < 2:
        raise ValueError("At least two rows are needed to pair different products")

    rng = random if seed is None else random.Random(seed)

    synthetic_data = []
    for _ in range(num_samples):
        # sample() draws without replacement, so the two positions always differ
        row_index_1, row_index_2 = rng.sample(range(data_len), 2)
        product_row = data.iloc[row_index_1]
        combi_row = data.iloc[row_index_2]

        # Create synthetic combination
        pair = {column: product_row[column] for column in product_columns}
        pair.update({column: combi_row[column] for column in combi_columns})
        synthetic_data.append(pair)

    return pd.DataFrame(synthetic_data, columns=product_columns + combi_columns)

# Generate 3000 synthetic data samples
synthetic_data = generate_synthetic_data(zara_syn_data, 3000)

# Save the synthetic data to a new CSV file
synthetic_data.to_csv("Zara data/synthetic_Zara_data.csv", index=False)


In [560]:
synthetic_data['match'] = 0

In [561]:
synthetic_data.to_csv("Zara data/synthetic_Zara_data.csv", index=False)

In [562]:
synthetic_data = pd.read_csv('Zara data/synthetic_Zara_data.csv')

In [563]:
synthetic_data.head()

Unnamed: 0,image_link,type,color,combi_image_link,combi_type,combi_color,match
0,https://static.zara.net/assets/public/7aa1/1577/639142eea9dc/1b59f79e844a/03199305712-e1/03199305712-e1.jpg?ts=1708505012502&w=400,Shirts,Ecru,https://static.zara.net/assets/public/3b5b/4f64/09824726b943/fda95d155c48/09464444401-e1/09464444401-e1.jpg?ts=1706716246324&w=400,Trousers,Blue,0
1,https://static.zara.net/assets/public/6fb5/d90b/b14c488ebf20/93ecaf0da992/04083228407-e1/04083228407-e1.jpg?ts=1707930382083&w=400,Shirts,Blue,https://static.zara.net/assets/public/68c2/4816/b54e4715a674/8062beda6ec2/01879025427-e1/01879025427-e1.jpg?ts=1707150703955&w=400,Skirts,Blue,0
2,https://static.zara.net/assets/public/3a34/cd42/ed3440c1ab86/742ab121027d/02132425801-e1/02132425801-e1.jpg?ts=1704970847582&w=400,Tops,Grey,https://static.zara.net/assets/public/81ff/15ed/c2754628ae70/6f6a048b7bcb/02615747800-e1/02615747800-e1.jpg?ts=1707997867807&w=400,Skirts,Black,0
3,https://static.zara.net/assets/public/0dfa/b72c/7325428a954f/c67014a53285/02893164922-e1/02893164922-e1.jpg?ts=1705652687939&w=400,Cardigans,Grey,https://static.zara.net/assets/public/772f/e5ab/012c48839dd7/b193a9551f47/03199052800-e1/03199052800-e1.jpg?ts=1706184194971&w=400,Trousers,Black,0
4,https://static.zara.net/assets/public/53ab/0bb3/a81142039f0c/b5ef3ce8c7a7/02520624251-e1/02520624251-e1.jpg?ts=1707299731557&w=400,Tops,White,https://static.zara.net/assets/public/25ed/b8d6/1809438e9cf6/2c0ee9f6c60b/08417804712-e1/4000.jpg?ts=1697793472602&w=400,Trousers,Ecru,0


## 4- Image download

In [538]:
def modified_url(url, width='1280'):
    """Return ``url`` with its ``w`` query parameter set to ``width``.

    Existing parameters keep their order and their exact text; ``w`` is
    replaced in place when present and appended otherwise.

    Parameters
    ----------
    url : str
        Image URL, with or without a query string.
    width : str or int, default '1280'
        Value written to the ``w`` parameter.

    Returns
    -------
    str
        The rebuilt URL.
    """
    # partition() copes with a URL that has no '?' at all
    base_url, _, query = url.partition('?')

    params_dict = {}
    for param in query.split('&'):
        if not param:
            # empty query string or a stray '&'
            continue
        # Only the first '=' separates key and value, so values such as
        # 'ab==' survive; a bare flag without '=' is stored as None.
        key, separator, value = param.partition('=')
        params_dict[key] = value if separator else None

    # Modify the parameters
    params_dict['w'] = str(width)

    # Reconstruct the modified URL
    rebuilt_query = '&'.join(key if value is None else f"{key}={value}"
                             for key, value in params_dict.items())
    return base_url + '?' + rebuilt_query

In [549]:
def download_image(url, folder, name, timeout=30.0):
    """Download one image, resize it to 768x1074 RGB and save it as ``<folder>/<name>.jpg``.

    The URL is first rewritten by ``modified_url``. A failed request, whether a
    network error or a non-200 status, is reported with ``print`` and the
    function returns without saving anything, so a long download loop keeps
    going.

    Parameters
    ----------
    url : str
        Source image URL.
    folder : str
        Target directory; created if it does not exist.
    name : str
        File name without extension.
    timeout : float, default 30.0
        Seconds to wait for the server before giving up on the request.
    """
    url = modified_url(url)

    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Referer': 'https://www.google.com/'
    }

    # Without a timeout a stalled connection blocks the loop indefinitely
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        response = None
        print("Failed to download image url:", url, name, error)

    # Pause between requests
    time.sleep(1.0)

    if response is None:
        return

    # Check if the request was successful (status code 200)
    if response.status_code != 200:
        print("Failed to download image url:", url, name)
        return

    img = Image.open(BytesIO(response.content))

    new_size = (768,1074)

    # Paste the resized original onto an RGB canvas of the desired size
    new_img = Image.new("RGB", new_size)
    new_img.paste(img.resize(new_size), (0, 0))

    os.makedirs(folder, exist_ok=True)

    # The resolution is passed to save(): the JPEG writer reads it from the
    # save options, not from img.info, so setting img.info['dpi'] had no effect.
    new_img.save(os.path.join(folder, name + ".jpg"), dpi=(150, 150))

In [548]:
# Download the product image of every row. The earlier `.iloc[2250:]` slice was
# left over from resuming an interrupted session; on a fresh run it skipped
# A0..A2249, which the later path columns still refer to.
for index, row in synthetic_data.iterrows():
    image_url = row['image_link']
    image_name = f"A{index}"  # Creating a unique name for each image
    print(f'Progress: {index} / {synthetic_data.shape[0]}', end='\r')
    download_image(image_url, 'A-Zara-images', image_name)

Progress: 2999 / 3000

In [550]:
# Download the combination image of every row; files are named B<index>.
total_rows = synthetic_data.shape[0]
for index, image_url in synthetic_data['combi_image_link'].items():
    print(f'Progress: {index} / {total_rows}', end='\r')
    download_image(image_url, 'B-Zara-images', f"B{index}")

Progress: 2999 / 3000

## 5- Concat Mango and Zara

In [579]:
mango = pd.read_csv('Mango data/cleaned_Mango_data.csv',index_col=0)
zara = pd.read_csv('Zara data/synthetic_Zara_data.csv')


In [585]:
mango.head()

Unnamed: 0,image_link,type,color,combi_image_link,combi_type,combi_color,match
0,A-Mango-images/A0,Shirts,Ecru,B-Mango-images/B0,Skirts,Ecru,1
1,A-Mango-images/A1,Shirts,Ecru,B-Mango-images/B1,Bags,Brown,1
2,A-Mango-images/A2,Shirts,Ecru,B-Mango-images/B2,Jewellery,Gold,1
3,A-Mango-images/A3,Shirts,Pink,B-Mango-images/B3,Trousers,Pink,1
4,A-Mango-images/A4,Shirts,Pink,B-Mango-images/B4,Shoes,Silver,1


In [584]:
mango['image_link']='A-Mango-images/A'+mango.index.astype(str)
mango['combi_image_link']='B-Mango-images/B'+mango.index.astype(str)

In [588]:
zara.head()

Unnamed: 0,image_link,type,color,combi_image_link,combi_type,combi_color,match
0,A-Zara-images/A0,Shirts,Ecru,B-Zara-images/B0,Trousers,Blue,0
1,A-Zara-images/A1,Shirts,Blue,B-Zara-images/B1,Skirts,Blue,0
2,A-Zara-images/A2,Tops,Grey,B-Zara-images/B2,Skirts,Black,0
3,A-Zara-images/A3,Cardigans,Grey,B-Zara-images/B3,Trousers,Black,0
4,A-Zara-images/A4,Tops,White,B-Zara-images/B4,Trousers,Ecru,0


In [587]:
zara['image_link']='A-Zara-images/A'+zara.index.astype(str)
zara['combi_image_link']='B-Zara-images/B'+zara.index.astype(str)

In [589]:
data = pd.concat([mango,zara], axis=0)
data.reset_index(drop=True, inplace=True)

In [590]:
data.head()

Unnamed: 0,image_link,type,color,combi_image_link,combi_type,combi_color,match
0,A-Mango-images/A0,Shirts,Ecru,B-Mango-images/B0,Skirts,Ecru,1
1,A-Mango-images/A1,Shirts,Ecru,B-Mango-images/B1,Bags,Brown,1
2,A-Mango-images/A2,Shirts,Ecru,B-Mango-images/B2,Jewellery,Gold,1
3,A-Mango-images/A3,Shirts,Pink,B-Mango-images/B3,Trousers,Pink,1
4,A-Mango-images/A4,Shirts,Pink,B-Mango-images/B4,Shoes,Silver,1


In [591]:
data.shape

(6793, 7)

In [596]:
data.to_csv('all_data.csv')