# 1 Collect Products
This notebook collects product text data from esker cycles website. The ultimate goal is to match articles to the products based on there provided data.

In [12]:
import pandas as pd
import numpy as np 
import os 
import requests
from bs4 import BeautifulSoup
import json       

In [13]:
# Load the product URLs from the file
with open('./data/esker-cycles-product-links.csv', 'r') as f:
    product_lines = [line.strip() for line in f if line.strip()]

print(f"Loaded {len(product_lines)} product entries.")

product_data = []

for i, line in enumerate(product_lines):
    try:
        name, url = line.split(",", 1)  # Split only on the first comma
        print(f"Scraping {i+1}/{len(product_lines)}: {name} ({url})")
        
        response = requests.get(url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.get_text(separator="\n", strip=True)

        product_data.append({
            'name': name,
            'url': url,
            'text': content
        })

    except Exception as e:
        print(f"Error scraping {line}: {e}")
    

Loaded 14 product entries.
Scraping 1/14: Product (Link)
Error scraping Product,Link: Invalid URL 'Link': No scheme supplied. Perhaps you meant https://Link?
Scraping 2/14: Hayduke (https://eskercycles.com/pages/hayduke)
Scraping 3/14: Hayduke Ti (https://eskercycles.com/pages/hayduke-ti)
Scraping 4/14: Howdy (https://eskercycles.com/pages/howdy)
Scraping 5/14: Howdy Ti (https://eskercycles.com/pages/howdy-ti)
Scraping 6/14: Japhy (https://eskercycles.com/pages/japhy)
Scraping 7/14: Japhy Ti (https://eskercycles.com/pages/japhy-ti)
Scraping 8/14: Lorax (https://eskercycles.com/pages/lorax)
Error scraping Lorax,https://eskercycles.com/pages/lorax: 404 Client Error: Not Found for url: https://eskercycles.com/pages/lorax
Scraping 9/14: Lorax Ti (https://eskercycles.com/pages/lorax-ti)
Scraping 10/14: Smokey (https://eskercycles.com/pages/smokey)
Scraping 11/14: Smokey Ti (https://eskercycles.com/pages/smokey-ti)
Scraping 12/14: Hayduke Ivs (https://eskercycles.com/pages/hayduke-lvs)
Scrap

## Save Dataset
We save the collected product data as a json file for the next steps.

In [14]:
# Save to JSON
os.makedirs('./intermediate_data', exist_ok=True)
with open('./intermediate_data/product-web-scarping.json', 'w', encoding='utf-8') as f:
    json.dump(product_data, f, indent=2, ensure_ascii=False)


## Next step
After you saved the dataset here, run the next step in the workflow [02-01-CleanArticles.ipynb](./02-01-CleanArticles.ipynb) or go back to [00-Workflow.ipynb](./00-Workflow.ipynb).

---

**Authors:**
[Salah Mohamoud](mailto:salah.mohamoud.dev@gmail.com),
[Sai Keertana Lakku](mailto:saikeertana005@gmail.com),
[Zhen Zhuang](mailto:zhuangzhen17cs@gmail.com),
[Nick Capaldini](mailto:nick.capaldini@ridethenextwave.com), Ride The Next Wave, May 19, 2025

---