## The goal: First we should scrape size.charts.com and find the sizes of each of these brands. 

### Note: The original Playwright + BeautifulSoup method failed due to an
### HTTPS certificate mismatch on SizeCharter’s domain. AI was used to help work around this. 

In [1]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

nest_asyncio.apply()

async def scrape():
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--ignore-certificate-errors"]
        )

        page = await browser.new_page()

        await page.goto(
            "https://www.sizecharter.com/brands/",
            wait_until="networkidle"
        )

        html = await page.content()
        await browser.close()

        return html

html = asyncio.run(scrape())

soup_doc = BeautifulSoup(html, "html.parser")
soup_doc

<!DOCTYPE html>
<html><head>
<meta charset="utf-8"/>
<title>Brands for Clothing Size Charts</title>
<meta content="See size charts for women and men for all your favorite brands." name="description"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="http://www.sizecharter.com/images/tape.png" property="og:image"/>
<link href="/assets/application-facb66ce8c6c11008e13b48392b85721.css" media="all" rel="stylesheet"/>
<script async="" src="https://www.google-analytics.com/plugins/ua/linkid.js" type="text/javascript"></script><script async="" src="//www.google-analytics.com/analytics.js"></script><script src="/assets/application-28d93dca1c61c20f2d5ca27562bf137b.js"></script>
<meta content="authenticity_token" name="csrf-param"/>
<meta content="Q90Dy6uJ8mSEGGeFxThWFDEZqeqRkKQrJNAohFq2cmY=" name="csrf-token"/>
<script async="" src="//www.googletagservices.com/tag/js/gpt.js"></script>
<script>
        var googletag = googletag || {}, cw = document.documentEle

## Awesome! We were able to scrape. Now, let's just look at the brands. 

In [2]:
all_brands=soup_doc.find_all('ul', id="list")
all_brands

[<ul id="list">
 <li>
 <a href="/brands/7-for-all-mankind/womens">7 for all mankind</a>
 </li>
 <li>
 <a href="/brands/abercrombie-fitch/womens">Abercrombie &amp; Fitch</a>
 </li>
 <li>
 <a href="/brands/adidas/womens">Adidas</a>
 </li>
 <li>
 <a href="/brands/adriannpapell/womens">Adrianna Papell</a>
 </li>
 <li>
 <a href="/brands/aeropostale/womens">Aeropostale</a>
 </li>
 <li>
 <a href="/brands/affliction/womens">Affliction</a>
 </li>
 <li>
 <a href="/brands/gaci/womens">A'Gaci</a>
 </li>
 <li>
 <a href="/brands/ag-adriano-goldschmied/womens">AG Adriano Goldschmied </a>
 </li>
 <li>
 <a href="/brands/alloy/womens">Alloy</a>
 </li>
 <li>
 <a href="/brands/americapparel/womens">American Apparel</a>
 </li>
 <li>
 <a href="/brands/americeagle-outfitters/womens">American Eagle Outfitters</a>
 </li>
 <li>
 <a href="/brands/anne-klein/womens">Anne Klein</a>
 <a href="/brands/anne-klein-petites/womens">Anne Klein Petites</a>
 </li>
 <li>
 <a href="/brands/ann-taylor/womens">Ann Taylor</a>
 

## Now, let's make a list of the Brand name and the hyperlink to the brand's size guide. 
### We will multipage scrape. 

In [3]:
brands_list=[]
base_url = "https://www.sizecharter.com"

for clothes in all_brands:
    a_tags=clothes.find_all('a')
    for a in a_tags:
         href = a.get('href')
         brand_name = a.get_text(strip=True) 
         full_url = base_url + href 
         brands_list.append({'Brand': brand_name, 'URL': full_url})
brands_list

[{'Brand': '7 for all mankind',
  'URL': 'https://www.sizecharter.com/brands/7-for-all-mankind/womens'},
 {'Brand': 'Abercrombie & Fitch',
  'URL': 'https://www.sizecharter.com/brands/abercrombie-fitch/womens'},
 {'Brand': 'Adidas',
  'URL': 'https://www.sizecharter.com/brands/adidas/womens'},
 {'Brand': 'Adrianna Papell',
  'URL': 'https://www.sizecharter.com/brands/adriannpapell/womens'},
 {'Brand': 'Aeropostale',
  'URL': 'https://www.sizecharter.com/brands/aeropostale/womens'},
 {'Brand': 'Affliction',
  'URL': 'https://www.sizecharter.com/brands/affliction/womens'},
 {'Brand': "A'Gaci", 'URL': 'https://www.sizecharter.com/brands/gaci/womens'},
 {'Brand': 'AG Adriano Goldschmied',
  'URL': 'https://www.sizecharter.com/brands/ag-adriano-goldschmied/womens'},
 {'Brand': 'Alloy', 'URL': 'https://www.sizecharter.com/brands/alloy/womens'},
 {'Brand': 'American Apparel',
  'URL': 'https://www.sizecharter.com/brands/americapparel/womens'},
 {'Brand': 'American Eagle Outfitters',
  'URL': 

## Awesome. Now we will begin the multipage scrape. 
### But before we do that, let's scrape of of the brand's links. From there, we will have the framework to multipage scrape. 

In [4]:
import requests
from bs4 import BeautifulSoup

my_second_url = "https://www.sizecharter.com/brands/james-jeans/womens"

response = requests.get(my_second_url, verify=False)
second_raw_html = response.content

soup_doc_1 = BeautifulSoup(second_raw_html, "html.parser")
soup_doc_1



<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<title>James Jeans - Women's Size Charts</title>
<meta content="James Jeans size charts including Women's Tops sizing and Maternity Bottoms sizing." name="description"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="http://www.sizecharter.com/images/tape.png" property="og:image"/>
<link href="/assets/application-facb66ce8c6c11008e13b48392b85721.css" media="all" rel="stylesheet"/>
<script src="/assets/application-28d93dca1c61c20f2d5ca27562bf137b.js"></script>
<meta content="authenticity_token" name="csrf-param">
<meta content="Q90Dy6uJ8mSEGGeFxThWFDEZqeqRkKQrJNAohFq2cmY=" name="csrf-token">
<script async="" src="//www.googletagservices.com/tag/js/gpt.js"></script>
<script>
        var googletag = googletag || {}, cw = document.documentElement.clientWidth,
            ad300 = {show:true,sizes: [[300, 600], [300, 250]]},
            admid = {sizes: [[728, 90], [970, 90]]},
            adtop = {si

In [5]:
soup_doc_1.find('h1')
all_charts=soup_doc_1.find_all(class_="chart")
all_charts

[<section class="chart" id="style-1458">
 <h2> Shirts - Alpha</h2>
 <table>
 <thead>
 <tr>
 <th>Size</th>
 <th>Bust</th>
 <th>Waist</th>
 </tr>
 </thead>
 <tr>
 <td>P</td>
 <td data-cm="82½" data-in="32½">32½</td>
 <td data-cm="61" data-in="24">24</td>
 </tr>
 <tr>
 <td>S</td>
 <td data-cm="85 - 87½" data-in="33½ - 34½">33½ - 34½</td>
 <td data-cm="63½ - 66" data-in="25 - 26">25 - 26</td>
 </tr>
 <tr>
 <td>M</td>
 <td data-cm="90 - 92½" data-in="35½ - 36½">35½ - 36½</td>
 <td data-cm="68½ - 71" data-in="27 - 28">27 - 28</td>
 </tr>
 <tr>
 <td>L</td>
 <td data-cm="96½ - 100½" data-in="38 - 39½">38 - 39½</td>
 <td data-cm="75 - 78½" data-in="29½ - 31">29½ - 31</td>
 </tr>
 <tr>
 <td>XL</td>
 <td data-cm="104" data-in="41">41</td>
 <td data-cm="82½" data-in="32½">32½</td>
 </tr>
 </table>
 </section>,
 <section class="chart" id="style-1459">
 <h2> Shirts - Numeric</h2>
 <table>
 <thead>
 <tr>
 <th>Size</th>
 <th>Bust</th>
 <th>Waist</th>
 </tr>
 </thead>
 <tr>
 <td>2</td>
 <td data-cm="82

In [6]:
all_charts=soup_doc_1.find_all(class_="chart")
all_charts

[<section class="chart" id="style-1458">
 <h2> Shirts - Alpha</h2>
 <table>
 <thead>
 <tr>
 <th>Size</th>
 <th>Bust</th>
 <th>Waist</th>
 </tr>
 </thead>
 <tr>
 <td>P</td>
 <td data-cm="82½" data-in="32½">32½</td>
 <td data-cm="61" data-in="24">24</td>
 </tr>
 <tr>
 <td>S</td>
 <td data-cm="85 - 87½" data-in="33½ - 34½">33½ - 34½</td>
 <td data-cm="63½ - 66" data-in="25 - 26">25 - 26</td>
 </tr>
 <tr>
 <td>M</td>
 <td data-cm="90 - 92½" data-in="35½ - 36½">35½ - 36½</td>
 <td data-cm="68½ - 71" data-in="27 - 28">27 - 28</td>
 </tr>
 <tr>
 <td>L</td>
 <td data-cm="96½ - 100½" data-in="38 - 39½">38 - 39½</td>
 <td data-cm="75 - 78½" data-in="29½ - 31">29½ - 31</td>
 </tr>
 <tr>
 <td>XL</td>
 <td data-cm="104" data-in="41">41</td>
 <td data-cm="82½" data-in="32½">32½</td>
 </tr>
 </table>
 </section>,
 <section class="chart" id="style-1459">
 <h2> Shirts - Numeric</h2>
 <table>
 <thead>
 <tr>
 <th>Size</th>
 <th>Bust</th>
 <th>Waist</th>
 </tr>
 </thead>
 <tr>
 <td>2</td>
 <td data-cm="82

In [7]:
for clothing_type in all_charts:
    print(clothing_type.find_all('h2'))

[<h2> Shirts - Alpha</h2>]
[<h2> Shirts - Numeric</h2>]
[<h2> Bottoms</h2>]
[<h2>Plus Jeans and Pants - Numeric Sizes</h2>]
[<h2>Plus Jeans and Pants - X Sizes</h2>]
[<h2> Dresses - Alpha</h2>]
[<h2> Dresses - Numeric</h2>]
[<h2> Jackets - Alpha</h2>]
[<h2> Jackets - Numeric</h2>]


In [8]:
tables = soup_doc_1.find_all("table")
category_headers = soup_doc_1.find_all("h2")

brand_name = soup_doc_1.find("h1").get_text(strip=True)  

name_of_brand = {} 

size_chart = {} 

for i, table in enumerate(tables):
    category = category_headers[i].get_text(strip=True) if i < len(category_headers) else f"Category {i+1}"

    headers = [th.get_text(strip=True) for th in table.find_all("th")]

    data = []
    for row in table.find_all("tr")[1:]: 
        cells = row.find_all("td")
        data.append([cell.get_text(strip=True) for cell in cells])

    size_chart[category] = {headers[j]: [row[j] for row in data] for j in range(len(headers))}
name_of_brand[brand_name] = size_chart

name_of_brand

{'James Jeans Size Charts': {'Shirts - Alpha': {'Size': ['P',
    'S',
    'M',
    'L',
    'XL'],
   'Bust': ['32½', '33½ - 34½', '35½ - 36½', '38 - 39½', '41'],
   'Waist': ['24', '25 - 26', '27 - 28', '29½ - 31', '32½']},
  'Shirts - Numeric': {'Size': ['2', '4', '6', '8', '10', '12', '14', '16'],
   'Bust': ['32½', '33½', '34½', '35½', '36½', '38', '39½', '41'],
   'Waist': ['24', '25', '26', '27', '28', '29½', '31', '32½']},
  'Bottoms': {'Size': ['23',
    '24',
    '25',
    '26',
    '27',
    '28',
    '29',
    '30',
    '31',
    '32'],
   'Waist': ['23', '24', '25', '26', '27', '28', '29', '30', '31', '32']},
  'Plus Jeans and Pants - Numeric Sizes': {'Length': ['Crop', 'Regular'],
   'Inseam': ['16 - 27', '29 - 34']},
  'Plus Jeans and Pants - X Sizes': {'Size': ['14',
    '16',
    '18',
    '20',
    '22',
    '24'],
   'Waist': ['34 - 35', '36 - 37', '38 - 39', '40 - 41', '42 - 43', '44 - 45'],
   'Hips': ['44', '45 - 46', '47 - 48', '49 - 50', '51 - 52', '53 - 54']},


## Great. We were successful in scraping for one brand.
## Now let's do it for ALL brands. 
### Disclaimer: AI assistance was used to resolve SSL certificate verification errors in the original scraping implementation.

In [9]:
import requests
from bs4 import BeautifulSoup
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def getSizeChart(href):
    try:
        raw_html = requests.get(href, verify=False).content  # Add verify=False
        soup = BeautifulSoup(raw_html, "html.parser")
        brand_name = soup.find("h1").get_text(strip=True)
        tables = soup.find_all("table")
        category_headers = soup.find_all("h2")
        
        size_chart = {}
        for i, table in enumerate(tables):
            category = category_headers[i].get_text(strip=True) if i < len(category_headers) else f"Category {i+1}"
            headers = [th.get_text(strip=True) for th in table.find_all("th")]
            data = []
            for row in table.find_all("tr")[1:]:
                cells = row.find_all("td")
                data.append([cell.get_text(strip=True) for cell in cells])
            
            size_chart[category] = {headers[j]: [row[j] for row in data] for j in range(len(headers))}
        
        return {brand_name: size_chart}
    except Exception as e:
        print(f"Error scraping {href}: {e}")
        return None

base_url = "https://www.sizecharter.com"
brand_list_url = f"{base_url}/brands/"

brand_page = requests.get(brand_list_url, verify=False).content  
soup = BeautifulSoup(brand_page, "html.parser")
brand_links = [a['href'] for a in soup.find_all("a", href=True) if "/brands/" in a['href'] and "/womens" in a['href']]

all_brands_size_chart = []
for href in brand_links:
    full_url = f"{base_url}{href}"
    size_chart = getSizeChart(full_url)
    if size_chart:
        all_brands_size_chart.append(size_chart)

all_brands_size_chart

[{'7 for all mankind Size Charts': {'Shirts and Sweaters': {'Size': ['XS',
     'S',
     'M',
     'L'],
    'Bust': ['32 - 33', '33 - 34', '34 - 35', '35 - 36'],
    'Waist': ['25 - 26', '26 - 27', '27 - 28', '28 - 29'],
    'Hips': ['35 - 36', '36 - 37', '37 - 38', '38 - 39']},
   'Jeans': {'Size': ['23',
     '24',
     '25',
     '26',
     '27',
     '28',
     '29',
     '30',
     '31',
     '32'],
    'Waist': ['22¼',
     '23¼',
     '24¼',
     '25¼',
     '26¼',
     '27¼',
     '28¼',
     '29¼',
     '30¼',
     '31¼'],
    'Hips': ['33½',
     '34¼',
     '35¼',
     '36¼',
     '37¼',
     '38¼',
     '39¼',
     '40¼',
     '41¼',
     '42¼']},
   'Jackets': {'Size': ['XS', 'S', 'M', 'L'],
    'Bust': ['32 - 33', '33 - 34', '34 - 35', '35 - 36'],
    'Waist': ['25 - 26', '26 - 27', '27 - 28', '28 - 29'],
    'Hips': ['35 - 36', '36 - 37', '37 - 38', '38 - 39']}}},
 {'Abercrombie & Fitch Size Charts': {'Tops': {'Size': ['XS', 'S', 'M', 'L'],
    'Bust': ['32 - 33', '34 

## Let's print out each record. 

In [46]:
for brand in all_brands_size_chart:
    for k, v in brand.items():
        print(k)
        #brand = k.replace(
    print('\n')

{'Shirts and Sweaters': {'Size': ['XS', 'S', 'M', 'L'], 'Bust': ['32 - 33', '33 - 34', '34 - 35', '35 - 36'], 'Waist': ['25 - 26', '26 - 27', '27 - 28', '28 - 29'], 'Hips': ['35 - 36', '36 - 37', '37 - 38', '38 - 39']}, 'Jeans': {'Size': ['23', '24', '25', '26', '27', '28', '29', '30', '31', '32'], 'Waist': ['22¼', '23¼', '24¼', '25¼', '26¼', '27¼', '28¼', '29¼', '30¼', '31¼'], 'Hips': ['33½', '34¼', '35¼', '36¼', '37¼', '38¼', '39¼', '40¼', '41¼', '42¼']}, 'Jackets': {'Size': ['XS', 'S', 'M', 'L'], 'Bust': ['32 - 33', '33 - 34', '34 - 35', '35 - 36'], 'Waist': ['25 - 26', '26 - 27', '27 - 28', '28 - 29'], 'Hips': ['35 - 36', '36 - 37', '37 - 38', '38 - 39']}}


{'Tops': {'Size': ['XS', 'S', 'M', 'L'], 'Bust': ['32 - 33', '34 - 35', '36 - 37', '38']}, 'Bottoms - Numeric Sizes': {'Size': ['000', '00', '0', '2', '4', '6', '8', '10', '12'], 'Waist': ['23', '24', '25', '26', '27', '28', '29', '30', '31']}, 'Bottoms - Alpha Sizes': {'Size': ['XS', 'S', 'M', 'L'], 'Waist': ['23 - 25', '26 - 

## Let's now move it to Pandas to make a clean csv. 

In [17]:
import pandas as pd
df = pd.DataFrame.from_dict(all_brands_size_chart)
df

Unnamed: 0,7 for all mankind Size Charts,Abercrombie & Fitch Size Charts,Adidas Size Charts,Adrianna Papell Size Charts,Aeropostale Size Charts,Affliction Size Charts,A'Gaci Size Charts,AG Adriano Goldschmied Size Charts,Alloy Size Charts,American Apparel Size Charts,...,Under Armour Size Charts,Uniqlo Size Charts,Urban Outfitters Size Charts,Victoria's Secret Size Charts,Pink Size Charts,Vigoss Size Charts,Wet Seal Size Charts,White House/Black Market Size Charts,Woman Within Size Charts,Zara Size Charts
0,"{'Shirts and Sweaters': {'Size': ['XS', 'S', '...",,,,,,,,,,...,,,,,,,,,,
1,,"{'Tops': {'Size': ['XS', 'S', 'M', 'L'], 'Bust...",,,,,,,,,...,,,,,,,,,,
2,,,{'Tops and Jackets - Alpha Sizes': {'Size': ['...,,,,,,,,...,,,,,,,,,,
3,,,,"{'Dresses - Alpha Sizes': {'Size': ['S', 'M', ...",,,,,,,...,,,,,,,,,,
4,,,,,"{'Tops': {'Size': ['XXS', 'XS', 'S', 'M', 'L',...",,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,,,,,,,,,,,...,,,,,,"{'Jeans - Straight': {'Size': ['0', '1 - 2', '...",,,,
153,,,,,,,,,,,...,,,,,,,"{'Junior Tops - Alpha Sizes': {'Size': ['XS', ...",,,
154,,,,,,,,,,,...,,,,,,,,"{'Shirts, T-shirts, Sweaters': {'Size': ['XXS'...",,
155,,,,,,,,,,,...,,,,,,,,,"{'Plus Tops - Alpha Sizes': {'Size': ['S', 'M'...",


## Ok this is NOT what we want. But, it can be fixed.

### First, let's use the melt function to move the brand names under a single column called variable. 

In [18]:
df = df.melt(value_name='Type of clothing')
df

Unnamed: 0,variable,Type of clothing
0,7 for all mankind Size Charts,"{'Shirts and Sweaters': {'Size': ['XS', 'S', '..."
1,7 for all mankind Size Charts,
2,7 for all mankind Size Charts,
3,7 for all mankind Size Charts,
4,7 for all mankind Size Charts,
...,...,...
24644,Zara Size Charts,
24645,Zara Size Charts,
24646,Zara Size Charts,
24647,Zara Size Charts,


## Drop the NaN values. 

In [19]:
df.dropna(inplace=True)
df

Unnamed: 0,variable,Type of clothing
0,7 for all mankind Size Charts,"{'Shirts and Sweaters': {'Size': ['XS', 'S', '..."
158,Abercrombie & Fitch Size Charts,"{'Tops': {'Size': ['XS', 'S', 'M', 'L'], 'Bust..."
316,Adidas Size Charts,{'Tops and Jackets - Alpha Sizes': {'Size': ['...
474,Adrianna Papell Size Charts,"{'Dresses - Alpha Sizes': {'Size': ['S', 'M', ..."
632,Aeropostale Size Charts,"{'Tops': {'Size': ['XXS', 'XS', 'S', 'M', 'L',..."
...,...,...
24016,Vigoss Size Charts,"{'Jeans - Straight': {'Size': ['0', '1 - 2', '..."
24174,Wet Seal Size Charts,"{'Junior Tops - Alpha Sizes': {'Size': ['XS', ..."
24332,White House/Black Market Size Charts,"{'Shirts, T-shirts, Sweaters': {'Size': ['XXS'..."
24490,Woman Within Size Charts,"{'Plus Tops - Alpha Sizes': {'Size': ['S', 'M'..."


## Let's bring Clothing Type as a column. I will do this by pulling out clothing type from the column Type of clothing. 

In [20]:
df["Type of clothing"] = df["Type of clothing"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df["Clothing Type"] = df["Type of clothing"].apply(lambda x: list(x.keys()) if isinstance(x, dict) else None)
df = df.explode("Clothing Type")
df

Unnamed: 0,variable,Type of clothing,Clothing Type
0,7 for all mankind Size Charts,"{'Shirts and Sweaters': {'Size': ['XS', 'S', '...",Shirts and Sweaters
0,7 for all mankind Size Charts,"{'Shirts and Sweaters': {'Size': ['XS', 'S', '...",Jeans
0,7 for all mankind Size Charts,"{'Shirts and Sweaters': {'Size': ['XS', 'S', '...",Jackets
158,Abercrombie & Fitch Size Charts,"{'Tops': {'Size': ['XS', 'S', 'M', 'L'], 'Bust...",Tops
158,Abercrombie & Fitch Size Charts,"{'Tops': {'Size': ['XS', 'S', 'M', 'L'], 'Bust...",Bottoms - Numeric Sizes
...,...,...,...
24648,Zara Size Charts,"{'Blouses, Shirts, T-Shirts, Sweaters - Numeri...","Pants, Skirts - Alpha sizes"
24648,Zara Size Charts,"{'Blouses, Shirts, T-Shirts, Sweaters - Numeri...",Dresses - Numeric Sizes
24648,Zara Size Charts,"{'Blouses, Shirts, T-Shirts, Sweaters - Numeri...",Dresses - Alpha sizes
24648,Zara Size Charts,"{'Blouses, Shirts, T-Shirts, Sweaters - Numeri...",Jackets - Numeric Sizes


## Let's add the Size Chart column. We will get this information from the Type of Clothing column. 

In [21]:
df["Size Chart"] = df.apply(lambda row: row["Type of clothing"].get(row["Clothing Type"], {}), axis=1)
df

Unnamed: 0,variable,Type of clothing,Clothing Type,Size Chart
0,7 for all mankind Size Charts,"{'Shirts and Sweaters': {'Size': ['XS', 'S', '...",Shirts and Sweaters,"{'Size': ['XS', 'S', 'M', 'L'], 'Bust': ['32 -..."
0,7 for all mankind Size Charts,"{'Shirts and Sweaters': {'Size': ['XS', 'S', '...",Jeans,"{'Size': ['23', '24', '25', '26', '27', '28', ..."
0,7 for all mankind Size Charts,"{'Shirts and Sweaters': {'Size': ['XS', 'S', '...",Jackets,"{'Size': ['XS', 'S', 'M', 'L'], 'Bust': ['32 -..."
158,Abercrombie & Fitch Size Charts,"{'Tops': {'Size': ['XS', 'S', 'M', 'L'], 'Bust...",Tops,"{'Size': ['XS', 'S', 'M', 'L'], 'Bust': ['32 -..."
158,Abercrombie & Fitch Size Charts,"{'Tops': {'Size': ['XS', 'S', 'M', 'L'], 'Bust...",Bottoms - Numeric Sizes,"{'Size': ['000', '00', '0', '2', '4', '6', '8'..."
...,...,...,...,...
24648,Zara Size Charts,"{'Blouses, Shirts, T-Shirts, Sweaters - Numeri...","Pants, Skirts - Alpha sizes","{'Size': ['XXS', 'XS', 'S', 'M', 'L', 'XL'], '..."
24648,Zara Size Charts,"{'Blouses, Shirts, T-Shirts, Sweaters - Numeri...",Dresses - Numeric Sizes,"{'Size': ['2', '4', '6', '8', '10', '12', '14'..."
24648,Zara Size Charts,"{'Blouses, Shirts, T-Shirts, Sweaters - Numeri...",Dresses - Alpha sizes,"{'Size': ['XXS', 'XS', 'S', 'M', 'L', 'XL'], '..."
24648,Zara Size Charts,"{'Blouses, Shirts, T-Shirts, Sweaters - Numeri...",Jackets - Numeric Sizes,"{'Size': ['2', '4', '6', '8', '10', '12', '14'..."


## Same thing here. We must add columns for waist, bust and hips. Let's get this information from the Size Chart Column. 
### After we do that, we may drop the Size Chart and Type of Clothing Column. 

In [22]:
df["Sizes"] = df["Size Chart"].apply(lambda x: x.get("Size", []))
df["Bust"] = df["Size Chart"].apply(lambda x: x.get("Bust", []))
df["Waist"] = df["Size Chart"].apply(lambda x: x.get("Waist", []))
df["Hips"] = df["Size Chart"].apply(lambda x: x.get("Hips", []))
df = df.drop(['Size Chart', 'Type of clothing'], axis=1)
df = df.rename(columns={'variable': 'Brands'})
df

Unnamed: 0,Brands,Clothing Type,Sizes,Bust,Waist,Hips
0,7 for all mankind Size Charts,Shirts and Sweaters,"[XS, S, M, L]","[32 - 33, 33 - 34, 34 - 35, 35 - 36]","[25 - 26, 26 - 27, 27 - 28, 28 - 29]","[35 - 36, 36 - 37, 37 - 38, 38 - 39]"
0,7 for all mankind Size Charts,Jeans,"[23, 24, 25, 26, 27, 28, 29, 30, 31, 32]",[],"[22¼, 23¼, 24¼, 25¼, 26¼, 27¼, 28¼, 29¼, 30¼, ...","[33½, 34¼, 35¼, 36¼, 37¼, 38¼, 39¼, 40¼, 41¼, ..."
0,7 for all mankind Size Charts,Jackets,"[XS, S, M, L]","[32 - 33, 33 - 34, 34 - 35, 35 - 36]","[25 - 26, 26 - 27, 27 - 28, 28 - 29]","[35 - 36, 36 - 37, 37 - 38, 38 - 39]"
158,Abercrombie & Fitch Size Charts,Tops,"[XS, S, M, L]","[32 - 33, 34 - 35, 36 - 37, 38]",[],[]
158,Abercrombie & Fitch Size Charts,Bottoms - Numeric Sizes,"[000, 00, 0, 2, 4, 6, 8, 10, 12]",[],"[23, 24, 25, 26, 27, 28, 29, 30, 31]",[]
...,...,...,...,...,...,...
24648,Zara Size Charts,"Pants, Skirts - Alpha sizes","[XXS, XS, S, M, L, XL]",[],"[22¾, 24½, 26, 27½, 30, 32¼]","[33¾, 35½, 37, 38½, 41, 43¼]"
24648,Zara Size Charts,Dresses - Numeric Sizes,"[2, 4, 6, 8, 10, 12, 14]","[32¼, 33¾, 35½, 37, 38½, 40¼, 41¾]","[25¼, 26, 27½, 29, 30¾, 32¼, 33¾]","[35½, 37, 38½, 40¼, 41¾, 43¼, 45]"
24648,Zara Size Charts,Dresses - Alpha sizes,"[XXS, XS, S, M, L, XL]","[31½, 32¼, 34, 35½, 37¾, 40¼]","[22¾, 24½, 26, 27½, 30, 32¼]","[33¾, 35½, 37, 38½, 41, 43¼]"
24648,Zara Size Charts,Jackets - Numeric Sizes,"[2, 4, 6, 8, 10, 12, 14]","[32¼, 33¾, 35½, 37, 38½, 40¼, 41¾]","[25¼, 26, 27½, 29, 30¾, 32¼, 33¾]","[35½, 37, 38½, 40¼, 41¾, 43¼, 45]"


In [108]:
# 35½ and 38½ - 40 are strings
# nan is flo 
from unicodedata import numeric

def handleFraction(fraction):
    if fraction == '½':
        return 0.5
    elif fraction == '¼':
        return 0.25
    elif fraction == '¾':
        return 0.75

def handleMeasurement(value):
    if isinstance(value, float):
        value = 'N/A'
    elif isinstance(value, str):
        #case 1: is a range. Identifyer is a dash (-)
        if '-' in value:
            value_range = [x.strip() for x in value.split('-')]
            if len(value_range[0]) == 2:
                value_range[0] = float(value_range[0])
            elif len(value_range[0]) == 3: #case 2: is a range with a fraction. Identifyer ¼, ¾, ½
                fraction = handleFraction(value_range[0][2])
                measurement = float(value_range[0][:-1])
                measurement += fraction
                value_range[0] = measurement
            if len(value_range[1]) == 2:
                value_range[1] = float(value_range[1])
            elif len(value_range[1]) == 3: #case 2: is a range with a fraction. Identifyer ¼, ¾, ½
                    fraction = handleFraction(value_range[1][2])
                    measurement = float(value_range[1][:-1])
                    measurement += fraction
                    value_range[1] = measurement
            value = (float(value_range[0]) + float(value_range[1])) / 2
        #case 3: has a fraction. Identifyer  ¼, ¾, ½
        elif len(value) == 3:
            fraction = handleFraction(value[2])
            measurement = float(value[:-1])
            measurement += fraction
            value = measurement
        #case 4: has two characters.
        else:
            value = float(value)
    return value

In [96]:
df['Waist'] = df['Waist'].apply(handleMeasurement)
df

Unnamed: 0,Brands,Clothing Type,Sizes,Bust,Waist,Hips
0,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.5,25.5,35 - 36
1,7 for all mankind Size Charts,Shirts and Sweaters,S,33.5,26.5,36 - 37
2,7 for all mankind Size Charts,Shirts and Sweaters,M,34.5,27.5,37 - 38
3,7 for all mankind Size Charts,Shirts and Sweaters,L,35.5,28.5,38 - 39
4,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.5,25.5,35 - 36
...,...,...,...,...,...,...
83151,Zara Size Charts,Jackets - Alpha sizes,XS,32.25,24.5,35½
83152,Zara Size Charts,Jackets - Alpha sizes,S,34.0,26.0,37
83153,Zara Size Charts,Jackets - Alpha sizes,M,35.5,27.5,38½
83154,Zara Size Charts,Jackets - Alpha sizes,L,37.75,30.0,41


In [110]:
df

Unnamed: 0,Brands,Clothing Type,Sizes,Bust,Waist,Hips
0,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.5,25.5,35.5
1,7 for all mankind Size Charts,Shirts and Sweaters,S,33.5,26.5,36.5
2,7 for all mankind Size Charts,Shirts and Sweaters,M,34.5,27.5,37.5
3,7 for all mankind Size Charts,Shirts and Sweaters,L,35.5,28.5,38.5
4,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.5,25.5,35.5
...,...,...,...,...,...,...
83151,Zara Size Charts,Jackets - Alpha sizes,XS,32.25,24.5,35.5
83152,Zara Size Charts,Jackets - Alpha sizes,S,34.0,26.0,37.0
83153,Zara Size Charts,Jackets - Alpha sizes,M,35.5,27.5,38.5
83154,Zara Size Charts,Jackets - Alpha sizes,L,37.75,30.0,41.0


In [113]:
df.to_csv('1.13.csv')

In [14]:
df=pd.read_csv('1.13.csv')
df

Unnamed: 0.1,Unnamed: 0,Brands,Clothing Type,Sizes,Bust,Waist,Hips
0,0,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.50,25.50,35.50
1,1,7 for all mankind Size Charts,Shirts and Sweaters,S,33.50,26.50,36.50
2,2,7 for all mankind Size Charts,Shirts and Sweaters,M,34.50,27.50,37.50
3,3,7 for all mankind Size Charts,Shirts and Sweaters,L,35.50,28.50,38.50
4,4,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.50,25.50,35.50
...,...,...,...,...,...,...,...
83151,83151,Zara Size Charts,Jackets - Alpha sizes,XS,32.25,24.50,35.50
83152,83152,Zara Size Charts,Jackets - Alpha sizes,S,34.00,26.00,37.00
83153,83153,Zara Size Charts,Jackets - Alpha sizes,M,35.50,27.50,38.50
83154,83154,Zara Size Charts,Jackets - Alpha sizes,L,37.75,30.00,41.00


In [15]:
#Dropping the first column.
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,Brands,Clothing Type,Sizes,Bust,Waist,Hips
0,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.50,25.50,35.50
1,7 for all mankind Size Charts,Shirts and Sweaters,S,33.50,26.50,36.50
2,7 for all mankind Size Charts,Shirts and Sweaters,M,34.50,27.50,37.50
3,7 for all mankind Size Charts,Shirts and Sweaters,L,35.50,28.50,38.50
4,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.50,25.50,35.50
...,...,...,...,...,...,...
83151,Zara Size Charts,Jackets - Alpha sizes,XS,32.25,24.50,35.50
83152,Zara Size Charts,Jackets - Alpha sizes,S,34.00,26.00,37.00
83153,Zara Size Charts,Jackets - Alpha sizes,M,35.50,27.50,38.50
83154,Zara Size Charts,Jackets - Alpha sizes,L,37.75,30.00,41.00


In [30]:
df = df[~df['Clothing Type'].str.contains('Category', na=False)]
df

Unnamed: 0,Brands,Clothing Type,Sizes,Bust,Waist,Hips
0,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.50,25.50,35.50
1,7 for all mankind Size Charts,Shirts and Sweaters,S,33.50,26.50,36.50
2,7 for all mankind Size Charts,Shirts and Sweaters,M,34.50,27.50,37.50
3,7 for all mankind Size Charts,Shirts and Sweaters,L,35.50,28.50,38.50
4,7 for all mankind Size Charts,Shirts and Sweaters,XS,32.50,25.50,35.50
...,...,...,...,...,...,...
83151,Zara Size Charts,Jackets - Alpha sizes,XS,32.25,24.50,35.50
83152,Zara Size Charts,Jackets - Alpha sizes,S,34.00,26.00,37.00
83153,Zara Size Charts,Jackets - Alpha sizes,M,35.50,27.50,38.50
83154,Zara Size Charts,Jackets - Alpha sizes,L,37.75,30.00,41.00
