In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

# Load the sales records that the rest of the script operates on.
df = pd.read_csv(filepath_or_buffer='Sales.csv')

def is_url_valid(url):
    """Return True if *url* looks like an HTTP(S) URL (simplified check).

    Only the scheme prefix is inspected; the rest of the URL is not parsed
    or validated.

    Args:
        url: A cell value from the data, which may be a string, a missing
            value (None / NaN) or any other type.

    Returns:
        True when *url* is a string starting with ``http://`` or
        ``https://``; False otherwise.
    """
    # Missing values (None, NaN) and stray non-text cells such as numbers are
    # not strings; rejecting them here avoids calling .startswith on them.
    if not isinstance(url, str):
        return False
    # Require the full scheme separator so strings like "httpfoo" are rejected.
    return url.startswith(("http://", "https://"))

def is_online_store(url):
    """Heuristically decide whether the page at *url* is an online store.

    The page is fetched and parsed, then searched for shopping vocabulary:
    any text node mentioning a cart/basket term, or a ``<button>``/``<a>``
    whose text mentions a purchase term. Matching is a case-insensitive
    substring test.

    Args:
        url: The URL to probe.

    Returns:
        True if any store indicator is found, False if none is found, or the
        string ``"Error"`` when the URL is invalid, the request fails, the
        server answers with an HTTP error status, or parsing raises.
    """
    if not is_url_valid(url):
        return "Error"

    cart_terms = ("cart", "basket")
    purchase_terms = ("buy", "purchase", "add to cart", "shop")

    def mentions_any(terms):
        """Build a predicate that is true for text containing any of *terms*."""
        def predicate(text):
            # A tag without a single string child yields None here.
            if not text:
                return False
            lowered = text.lower()
            return any(term in lowered for term in terms)
        return predicate

    try:
        response = requests.get(url, timeout=10)
        # An error page (404, 500, ...) is not the site's real content, so
        # report it as an error rather than classifying it.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Only existence matters, so stop at the first match instead of
        # collecting every matching node.
        if soup.find(string=mentions_any(cart_terms)) is not None:
            return True
        purchase_link = soup.find(['button', 'a'], string=mentions_any(purchase_terms))
        return purchase_link is not None
    except Exception as e:
        # Best-effort batch job: log the failure and keep processing other URLs.
        print(f"Error processing {url}: {e}")
        return "Error"

# Probe each distinct URL only once. The original row indices are discarded
# here; results are re-attached to every row by URL value further down.
unique_urls = df['Website'].drop_duplicates().reset_index(drop=True)

# Apply the function only to unique URLs, showing a progress bar
tqdm.pandas(desc="Processing unique websites")
unique_results = unique_urls.progress_apply(is_online_store)

# Map the results back to the original DataFrame
df['Is_Online_Store'] = df['Website'].map(dict(zip(unique_urls, unique_results)))

# Count the results. The column mixes booleans with the string "Error", so
# each outcome is compared explicitly rather than relying on truthiness
# (the string "Error" is itself truthy).
total = len(df)
valid_stores = int((df['Is_Online_Store'] == True).sum())  # noqa: E712
invalid_stores = int((df['Is_Online_Store'] == False).sum())  # noqa: E712
errors = int((df['Is_Online_Store'] == "Error").sum())


def _percentage(count, whole):
    """Return *count* as a percentage of *whole*, or 0.0 when *whole* is zero."""
    return (count / whole) * 100 if whole else 0.0


# Calculate percentages (an empty input file would otherwise divide by zero)
percentage_valid = _percentage(valid_stores, total)
percentage_invalid = _percentage(invalid_stores, total)
percentage_errors = _percentage(errors, total)

# Print results
print(f"Total sites: {total}")
print(f"Valid stores: {valid_stores} ({percentage_valid:.2f}%)")
print(f"Invalid stores: {invalid_stores} ({percentage_invalid:.2f}%)")
print(f"Errors: {errors} ({percentage_errors:.2f}%)")

# Save the results
df.to_csv('Results_Ecom.csv', index=False)