In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Headers to simulate a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
}

# Function to extract product details from a given URL
def extract_product_info(url, index, timeout=30):
    """Fetch one product page and pull out its descriptive fields.

    Parameters
    ----------
    url : str
        Address of the product page to download.
    index : int
        Position of the URL in the caller's list; only used for the
        progress message printed before the request.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection raises
        ``requests.exceptions.Timeout`` instead of blocking forever.
        Defaults to 30 seconds.

    Returns
    -------
    dict
        Keys ``subcategory1``..``subcategory3``, ``product_name``,
        ``brand_name``, ``rating``, ``review_count``, ``price``,
        ``ingredients`` and ``product_url`` (in that order). A field that is
        not found on the page is ``"N/A"``. If the HTTP request fails, every
        field except ``product_url`` is ``"Error"``.
    """
    try:
        print(f"Extracting info for index: {index}")  # Display the index number
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        def first_text(tag, **attrs):
            # Look the element up once and fall back to "N/A" when it is absent.
            node = soup.find(tag, **attrs)
            return node.get_text(strip=True) if node else "N/A"

        # The review count lives in a <meta> attribute rather than in text.
        # Tag.get() avoids a KeyError when the tag exists without "content".
        review_meta = soup.find("meta", itemprop="reviewCount")
        review_count = review_meta.get("content", "N/A") if review_meta else "N/A"

        # Breadcrumb trail: keep the first three levels, padding with "N/A"
        # when the page exposes fewer than three.
        categories = [
            element.get_text(strip=True)
            for element in soup.find_all("div", class_="breadcrumb-element")
        ]
        subcategories = (categories + ["N/A"] * 3)[:3]

        return {
            "subcategory1": subcategories[0],
            "subcategory2": subcategories[1],
            "subcategory3": subcategories[2],
            "product_name": first_text("span", class_="product-name product-name-bold"),
            "brand_name": first_text("span", class_="brand-name", itemprop="name"),
            "rating": first_text("span", itemprop="ratingValue"),
            "review_count": review_count,
            "price": first_text("span", class_="price-sales price-sales-standard"),
            "ingredients": first_text("div", class_="ingredients-content"),
            "product_url": url  # URL is kept as the last column
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        # Same columns as a successful record so the resulting rows line up.
        failed = dict.fromkeys(
            (
                "subcategory1",
                "subcategory2",
                "subcategory3",
                "product_name",
                "brand_name",
                "rating",
                "review_count",
                "price",
                "ingredients",
            ),
            "Error",
        )
        failed["product_url"] = url  # Ensure the URL is present even on error
        return failed

# Function to scrape product information for all URLs in a DataFrame
def scrape_sephora_products(urls_df):
    """Scrape every address in the 'Product URL' column of ``urls_df``.

    Each URL is handed to ``extract_product_info`` together with its
    zero-based position, and the resulting dictionaries are assembled into
    a single DataFrame (one row per URL, in input order).
    """
    records = [
        extract_product_info(product_url, position)
        for position, product_url in enumerate(urls_df['Product URL'])
    ]
    return pd.DataFrame(records)

# Read the per-category URL lists (make-up, treatment, body care) from CSV
product_urls_make_up = pd.read_csv(
    r"C:\Users\celia\Downloads\product_urls_makeup.csv"
)
product_urls_treatment = pd.read_csv(
    r"C:\Users\celia\Downloads\product_urls_treatment.csv"
)
product_urls_body = pd.read_csv(
    r"C:\Users\celia\Downloads\product_urls_bodycare.csv"
)

# Scrape each category in turn; every call returns one DataFrame of product rows
product_data_df_make_up = scrape_sephora_products(product_urls_make_up)
product_data_df_treatment = scrape_sephora_products(product_urls_treatment)
product_data_df_body = scrape_sephora_products(product_urls_body)

# Stack the three category frames, renumbering the rows from 0
personal_care_combined_product_data_df = pd.concat(
    [
        product_data_df_make_up,
        product_data_df_treatment,
        product_data_df_body,
    ],
    ignore_index=True,
)

# Persist the combined table (without the index column)
personal_care_combined_product_data_df.to_csv(
    r"C:\Users\celia\Documents\IronhackDataAnalysis\MidProject\combined_product_data.csv",
    index=False,
)

# Preview the first and last rows of the combined table
print(personal_care_combined_product_data_df.head())
print(personal_care_combined_product_data_df.tail())

Extracting info for index: 0
Extracting info for index: 1
Extracting info for index: 2
Extracting info for index: 3
Extracting info for index: 4
Extracting info for index: 5
Extracting info for index: 6
Extracting info for index: 7
Extracting info for index: 8
Extracting info for index: 9
Extracting info for index: 10
Extracting info for index: 11
Extracting info for index: 12
Extracting info for index: 13
Extracting info for index: 14
Extracting info for index: 15
Extracting info for index: 16
Extracting info for index: 17
Extracting info for index: 18
Extracting info for index: 19
Extracting info for index: 20
Extracting info for index: 21
Extracting info for index: 22
Extracting info for index: 23
Extracting info for index: 24
Extracting info for index: 25
Extracting info for index: 26
Extracting info for index: 27
Extracting info for index: 28
Extracting info for index: 29
Extracting info for index: 30
Extracting info for index: 31
Extracting info for index: 32
Extracting info for 

In [10]:
# Display only the head (the first five rows by default) of the combined DataFrame;
# the tail is not printed in this cell.
print(personal_care_combined_product_data_df.head())

       subcategory1 subcategory2         subcategory3  \
0  Página de inicio   Maquillaje               Labios   
1  Página de inicio   Maquillaje               Rostro   
2  Página de inicio   Maquillaje  Estuches maquillaje   
3  Página de inicio   Maquillaje  Estuches maquillaje   
4  Página de inicio   Maquillaje               Rostro   

                                        product_name          brand_name  \
0  Dior Addict - Barra de labios brillante 90 % d...                Dior   
1  CC Red Correct - Tratamiento iluminador correc...            Erborian   
2     Moving Lights - Calendario de Adviento Premium  Sephora Collection   
3             Moving Lights - Calendario de Adviento  Sephora Collection   
4  Pillow Talk Iconic Blush and Glow Kit - Estuch...   Charlotte Tilbury   

  rating review_count        price  \
0    5.0            1  50,99  €(1)   
1    4.7          268  42,99  €(1)   
2    4.2            5  79,99  €(1)   
3    3.7            3  49,99  €(1)   
4    5.0  