In [99]:
# this html page was downloaded on Monday, August 26th, 2024 at 9:05pm EST. 

import pandas as pd
from bs4 import BeautifulSoup

In [100]:
# Read the saved women's listing page from disk.
# The encoding is passed explicitly so that non-ASCII characters in the page are decoded
# the same way on every platform instead of depending on the locale default.
# NOTE(review): assumes the saved page is UTF-8 encoded — confirm against the downloaded file.
with open('/Users/anikamisra/Desktop/personal-projects/pink-tax/lululemon/lululemon_products_womens.html', 'r', encoding='utf-8') as html_file:
    content = html_file.read()

# Parse the markup with Python's built-in HTML parser.
soup = BeautifulSoup(content, 'html.parser')

In [101]:
# Collect every <div> that carries the 'product-tile' class.
product_tiles = soup.find_all('div', class_='product-tile')

# Parallel lists: one product name and one raw price string per tile that is kept.
product_names, prices = [], []

for tile in product_tiles:
    # Ignore anything that is not tagged as a product tile.
    if 'product-tile' not in tile.get('class', []):
        continue

    # The product name is the text of the link inside the tile's <h3> heading.
    name_tag = tile.find('h3', class_='product-tile__product-name')
    if not (name_tag and name_tag.a):
        continue

    # The price is the text of the tile's <span class="price">.
    price_tag = tile.find('span', class_='price')
    if not price_tag:
        continue

    # Only tiles that have both a name and a price reach this point.
    product_names.append(name_tag.a.get_text(strip=True))
    prices.append(price_tag.get_text(strip=True))

# One row per product; 'Price' still holds the raw text scraped from the page.
df = pd.DataFrame({'Product Name': product_names, 'Price': prices})

In [102]:
# (rows, columns) of the scraped women's table: one row per product, columns 'Product Name' and 'Price'.
df.shape

(846, 2)

In [103]:
# Flag rows whose price text mentions "price" in any letter case,
# e.g. "Sale Price$89Regular Price$148"; missing values count as no match.
mentions_price = df['Price'].str.contains('price', case=False, na=False)
price_string_df = df.loc[mentions_price]
price_string_df.head()

Unnamed: 0,Product Name,Price
474,Scuba Oversized Hoodie \\nVelvet Cord,Sale Price$89Regular Price$148
475,"Fast and Free High-Rise Tight 25""",Sale Price$69Regular Price$128
476,"Hotty Hot High-Rise Lined Short 4""",Sale Price$49Regular Price$68
477,"Speed Up Low-Rise Lined Short 2.5""",Sale Price$39-$59Regular Price$68
478,"lululemon Align\'99 High-Rise Short 6""",Sale Price$39-$59Regular Price$64


Note that the product at index 478 has two problems: its price contains the "Sale Price" text AND a range. In the final dataset, this product's price should be 49, the average of the two ends of the sale-price range ($39 and $59).

In [105]:
# Flag rows whose price text contains a dash, i.e. a "low-high" price range;
# missing values count as no match.
has_range = df['Price'].str.contains('-', na=False)
range_df = df.loc[has_range]
range_df.tail()

Unnamed: 0,Product Name,Price
747,It's Rulu Long-Sleeve Hoodie,Sale Price$74-$84Regular Price$118
783,Wundermost Ultra-Soft Nulu Long-Sleeve One-Sho...,Sale Price$49-$64Regular Price$88
785,Wundermost Ultra-Soft Nulu Turtleneck Bodysuit,Sale Price$34-$59Regular Price$88
815,"Wunder Train Aerobic High-Rise Tight 25"" \\nTwill",Sale Price$69-$79Regular Price$118
821,Power Pivot Everlux Tank Top,Sale Price$29-$49Regular Price$68


We also need to fix the products whose price is listed as a range, and we need to make sure that the sale price is used rather than the regular price. The fairest approach is to take the average of the two ends of the range, whether it is a sale-price range or a regular-price range.

In [106]:
# (rows, columns) of the subset whose price text contains a dash; the row count is the number of range-priced products.
range_df.shape

(75, 2)

Luckily, only 75 products have a price that is listed as a range.

In [107]:
# extract all sales prices and sales ranges 
df['Price'] = df['Price'].apply(lambda item: item.replace("Sale Price", "").split('R')[0].strip())

# now handle the ranges 
df['Price'] = df['Price'].apply(lambda item: 
                                f"${(float(item.split('-')[0].replace('$', '')) + float(item.split('-')[1].replace('$', ''))) / 2:.2f}" 
                                if '-' in item
                                else item)

df['Price'] = df['Price'].str.replace('$', '', regex=False).str.replace(',', '').astype(float)

# test 
#a = df[df['Product Name'].str.contains("'99 High-Rise Short 6", case=False, na=False)]

Now for the final test. Does the product at index 478 have a new price of 49? 

In [108]:
# Spot-check by position: row 478 read "Sale Price$39-$59Regular Price$64" before cleaning,
# so its cleaned price should be 49.0 (the average of 39 and 59).
a = df.iloc[478]
print(a)

Product Name    lululemon Align\'99 High-Rise Short 6"
Price                                             49.0
Name: 478, dtype: object


Great! Now we can repeat the same steps for the men's products and then clean all the product labels.

In [124]:
# Write the cleaned women's table to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv("lululemon_womens_data.csv", index=False)

In [117]:
# Read the saved men's listing page from disk.
# The encoding is passed explicitly so that non-ASCII characters in the page are decoded
# the same way on every platform instead of depending on the locale default.
# NOTE(review): assumes the saved page is UTF-8 encoded — confirm against the downloaded file.
with open('/Users/anikamisra/Desktop/personal-projects/pink-tax/lululemon/lululemon_products_mens.html', 'r', encoding='utf-8') as html_file:
    content = html_file.read()

# Parse the markup with Python's built-in HTML parser.
soup = BeautifulSoup(content, 'html.parser')

# Collect every <div> that carries the 'product-tile' class. No second class check is
# needed inside the loop: find_all(class_=...) only returns tags that have that class.
product_tiles = soup.find_all('div', class_='product-tile')

# Parallel lists: one product name and one raw price string per tile that is kept.
product_names = []
prices = []

for tile in product_tiles:
    # The product name is the text of the link inside the tile's <h3> heading.
    name_tag = tile.find('h3', class_='product-tile__product-name')
    if not (name_tag and name_tag.a):
        continue

    # The price is the text of the tile's <span class="price">.
    price_tag = tile.find('span', class_='price')
    if not price_tag:
        continue

    # Only tiles that have both a name and a price reach this point.
    product_names.append(name_tag.a.get_text(strip=True))
    prices.append(price_tag.get_text(strip=True))

# One row per product; 'Price' still holds the raw text scraped from the page.
df_m = pd.DataFrame({
    'Product Name': product_names,
    'Price': prices
})

In [118]:
# (rows, columns) of the scraped men's table: one row per product, columns 'Product Name' and 'Price'.
df_m.shape

(537, 2)

In [119]:
# Flag rows whose price text contains a dash, i.e. a "low-high" price range;
# missing values count as no match.
has_range_m = df_m['Price'].str.contains('-', na=False)
range_df_m = df_m.loc[has_range_m]
print("These are the number of products that have prices as ranges: ", len(range_df_m))
range_df_m.head()

These are the number of products that have prices as ranges:  50


Unnamed: 0,Product Name,Price
289,Never Lost Keychain,$20-$24
292,The Mat 5mm \\nMade With FSC\'99 Certified Rubber,$94-$98
294,The Mat 3mm \\nMade With FSC\'99 Certified Rubber,$78-$88
346,"Pace Breaker Linerless Short 7""",Sale Price$39-$59Regular Price$68
347,License to Train Short-Sleeve Shirt,Sale Price$39-$54Regular Price$78


In [120]:
# extract all sales prices and sales ranges 
df_m['Price'] = df_m['Price'].apply(lambda item: item.replace("Sale Price", "").split('R')[0].strip())

# now handle the ranges 
df_m['Price'] = df_m['Price'].apply(lambda item: 
                                f"${(float(item.split('-')[0].replace('$', '')) + float(item.split('-')[1].replace('$', ''))) / 2:.2f}" 
                                if '-' in item
                                else item)

df_m['Price'] = df_m['Price'].str.replace('$', '', regex=False).str.replace(',', '').astype(float)

In [122]:
# final test 
a = df_m.iloc[442]
print(a)

Product Name    Commission Long-Sleeve Shirt \\nOxford
Price                                             54.0
Name: 442, dtype: object


That is indeed the average of 49 and 59. Great.

In [125]:
# Write the cleaned men's table to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
df_m.to_csv("lululemon_mens_data.csv", index=False)