In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd


def extract_product_info(html_file):
    """Parse a saved product-listing page into a table of product fields.

    The file is read as UTF-8 and parsed with BeautifulSoup's ``html.parser``.
    Three lists are collected independently, each in document order:

    * ``image_urls``: the ``src`` of the first ``<img>`` inside every
      ``<div data-component="ProductCardImageContainer">``; ``None`` when the
      container has no ``<img>`` or the ``<img>`` has no ``src`` attribute.
    * ``brand_names``: the text of every
      ``<p data-component="ProductCardBrandName">``.
    * ``descriptions``: the text of every
      ``<p data-component="ProductCardDescription">``.

    Rows are matched purely by position, so the three lists must have the
    same length.

    Args:
        html_file: Path of the HTML file to parse.

    Returns:
        pandas.DataFrame with the columns ``image_urls``, ``brand_names`` and
        ``descriptions``.

    Raises:
        ValueError: If the number of image containers, brand names and
            descriptions found in the page differ.
    """
    with open(html_file, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file.read(), "html.parser")

    # <div> containers that wrap each product image.
    image_urls = []
    for container in soup.find_all(
        "div", {"data-component": "ProductCardImageContainer"}
    ):
        img = container.find("img")
        # .get() yields None instead of raising KeyError when src is absent.
        image_urls.append(img.get("src") if img is not None else None)

    # <p> elements holding the brand name of each product.
    brand_names = [
        tag.get_text()
        for tag in soup.find_all("p", {"data-component": "ProductCardBrandName"})
    ]

    # <p> elements holding the description of each product.
    descriptions = [
        tag.get_text()
        for tag in soup.find_all("p", {"data-component": "ProductCardDescription"})
    ]

    # The lists are zipped by position; fail loudly (with counts) rather than
    # relying on pandas' generic "arrays must be of the same length" message.
    if not (len(image_urls) == len(brand_names) == len(descriptions)):
        raise ValueError(
            f"Mismatched product fields in {html_file}: "
            f"{len(image_urls)} images, {len(brand_names)} brand names, "
            f"{len(descriptions)} descriptions"
        )

    return pd.DataFrame(
        {
            "image_urls": image_urls,
            "brand_names": brand_names,
            "descriptions": descriptions,
        }
    )


def download_and_save_html(url, output_file, timeout=30, cookie=None):
    """Download ``url`` with browser-like headers and save the body to disk.

    On an HTTP 200 response the raw response body is written to
    ``output_file`` in binary mode and a confirmation is printed. Any other
    status code, a network error, or a file-system error is reported with
    ``print`` and the function returns normally (best-effort behaviour).

    Args:
        url: Page to download.
        output_file: Path the response body is written to.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``.
        cookie: Value for the ``Cookie`` request header. When ``None`` the
            ``FARFETCH_COOKIE`` environment variable is used; when neither is
            set no cookie is sent.

    Returns:
        None.
    """
    import os  # local import so the function does not depend on a later cell

    # NOTE: HTTP/2 pseudo-headers (authority/method/path/scheme) are set by
    # the transport, not by the caller, so they are intentionally not listed.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        # "br" is left out: the body is only brotli-decoded when an optional
        # brotli package is installed, otherwise compressed bytes get saved.
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "Cache-Control": "max-age=0",
        "Referer": "https://www.farfetch.com/sets/womenswear-gift-list.aspx?page=1&view=96&sort=3&scale=280&category=136326",
        "Sec-Ch-Ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
        "Sec-Ch-Ua-Arch": "x86",
        "Sec-Ch-Ua-Full-Version-List": '"Chromium";v="118.0.5993.70", "Google Chrome";v="118.0.5993.70", "Not=A?Brand";v="99.0.0.0"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Model": '""',
        "Sec-Ch-Ua-Platform": "macOS",
        "Sec-Ch-Ua-Platform-Version": "13.6.0",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    }

    # Session cookies carry live tokens and must not be stored in the
    # notebook; they are supplied by the caller or the environment instead.
    if cookie is None:
        cookie = os.environ.get("FARFETCH_COOKIE")
    if cookie:
        headers["Cookie"] = cookie

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            with open(output_file, "wb") as file:
                file.write(response.content)
            print(f"HTML page downloaded from {url} and saved as {output_file}")
        else:
            print(f"Failed to download HTML page. Status code: {response.status_code}")
    except (requests.RequestException, OSError) as e:
        # Network failures (including timeouts) and file-write failures are
        # reported rather than raised, matching the best-effort contract.
        print(f"An error occurred: {str(e)}")

In [91]:
import pandas as pd
import os


def save_complete_dataset(folder_paths=None, output_file="complete_dataset.csv"):
    """Concatenate every CSV in the given folders into a single CSV file.

    Each ``*.csv`` file directly inside each folder is read with
    ``pandas.read_csv``; the frames are concatenated with a fresh index. The
    ``image_urls`` column is normalised so that every value has surrounding
    whitespace stripped and exactly one trailing space appended. The result
    is written to ``output_file`` without the index column.

    Args:
        folder_paths: Iterable of folder paths to scan. Defaults to the six
            per-event folders used by this notebook.
        output_file: Destination CSV path.

    Returns:
        None.

    Raises:
        FileNotFoundError: If one of the folders does not exist.
        ValueError: If no CSV file is found in any folder.
    """
    if folder_paths is None:
        folder_paths = [
            "causal",
            "formal-event",
            "meeting",
            "night-out",
            "others",
            "workout",
        ]

    frames = []
    for folder_path in folder_paths:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # row order of the combined dataset reproducible across runs.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith(".csv"):
                frames.append(pd.read_csv(os.path.join(folder_path, filename)))

    if not frames:
        raise ValueError(f"No CSV files found in: {list(folder_paths)}")

    combined_df = pd.concat(frames, ignore_index=True)
    # Strip first so the single trailing space is not duplicated when the
    # source files already carry one.
    combined_df["image_urls"] = combined_df["image_urls"].str.strip() + " "
    combined_df.to_csv(output_file, index=False)


# Runs on cell execution: merges the per-event CSV folders and writes
# complete_dataset.csv into the current working directory.
save_complete_dataset()


In [72]:
# Reload the combined dataset and lower-case the garment "type" labels.
# Built as a single expression so the cell has no stray, undisplayed
# statements and always derives `df` from the file on disk.
df = pd.read_csv("complete_dataset.csv").assign(
    type=lambda frame: frame["type"].str.lower()
)

In [73]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

import os

# The connection string (which embeds the database user and password) is read
# from the environment so credentials never live in the notebook. A KeyError
# here means MONGODB_URI has not been set.
# NOTE(review): a credential was previously stored in this cell in plain
# text; it should be rotated.
uri = os.environ["MONGODB_URI"]

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi("1"))

try:
    # Send a ping to confirm a successful connection
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")

    db = client["Wardrobe-Wizard"]
    collection = db["clothes"]

    # One document per DataFrame row.
    data = df.to_dict(orient="records")

    if data:
        # Replace the whole collection with the current dataset.
        collection.delete_many({})
        collection.insert_many(data)
    else:
        # insert_many rejects an empty list, so deleting first would leave the
        # collection wiped with nothing to put back.
        print("DataFrame is empty; existing collection left untouched.")

except Exception as e:
    print(e)


Pinged your deployment. You successfully connected to MongoDB!


There are six garment types in total:
* whole
* top
* bottom
* shoes (optional)
* accessory (optional)
* jacket (optional)

`whole` and the `top` + `bottom` pair are mutually exclusive: an outfit has either one `whole` item or a `top` and a `bottom`, never both.


* Active Wear
  * Top
  * Trousers
  * Shorts
* Formal Event
  * Dress
  * Pump 
* Meeting
  * Whole Skirt
  * Jacket
  * Shirt
  * Pencil Skirt
  * Trousers
* Night Out
  * Dresses
  * Tops
  * Pants
  * Skirt
  * Jackets
* Casual 
  * T-shirt

In [83]:
# Parse the saved listing page into a DataFrame with the columns
# image_urls, brand_names and descriptions.
df = extract_product_info("sample.html")


In [86]:
# Destination CSV for this batch of products.
file = "causal/causal_shorts.csv"

# To re-label an existing CSV instead of freshly scraped data, load it into
# `df` first with pd.read_csv(file).

# Label the batch. Stripping before appending the single trailing space keeps
# this cell idempotent (re-running it does not accumulate spaces) and leaves
# missing URLs as NaN instead of raising on None + " ".
df = df.assign(
    image_urls=df["image_urls"].str.strip() + " ",
    event="causal",
    type="bottom",
)
df.head()


Unnamed: 0,image_urls,brand_names,descriptions,event,type
0,https://cdn-images.farfetch-contents.com/19/89...,Lygia & Nanny,Lee elasticated-waist mini shorts,causal,bottom
1,https://cdn-images.farfetch-contents.com/18/22...,Levi's,raw-edge denim shorts,causal,bottom
2,https://cdn-images.farfetch-contents.com/19/08...,adidas by Stella McCartney,TruePurpose training shorts,causal,bottom
3,https://cdn-images.farfetch-contents.com/19/89...,The Upside,Zinnia Soho terry shorts,causal,bottom
4,https://cdn-images.farfetch-contents.com/20/92...,Lygia & Nanny,Lee elasticated-waist mini shorts,causal,bottom


In [87]:
# Write the labelled frame to the path held in `file`; index=False keeps the
# row index out of the CSV.
df.to_csv(file, index=False)
