In [2]:
import os
import csv
import time
import shutil
import re
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry

# Path of the CSV that is read here and overwritten with results at the end.
INPUT_CSV = "email.csv"  # change if your file is named differently

# Stop immediately when the input file is absent, before any network setup.
if not os.path.exists(INPUT_CSV):
    raise FileNotFoundError(f"CSV file not found: {INPUT_CSV}")

# Retry policy: up to 3 attempts with exponential backoff for rate-limit and
# server-side error statuses.
retries = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
)

# One shared session; each scheme gets its own adapter carrying the policy.
session = requests.Session()
for scheme in ("http://", "https://"):
    session.mount(scheme, HTTPAdapter(max_retries=retries))

# Sent with every request so the scraper identifies itself.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; Bot/1.0)"}

# Accumulates the processed rows that are written back at the end.
rows = []

# Read the CSV, trying several encodings so non-UTF-8 files still load.
# "utf-8-sig" goes first: it decodes plain UTF-8 as well and additionally
# strips a leading byte-order mark. Trying plain "utf-8" first would succeed
# on BOM-prefixed files and leave "\ufeff" glued to the first header name,
# which breaks exact column-name matching. "latin-1" maps every byte to a
# character, so it serves as the last resort.
encodings_to_try = ["utf-8-sig", "cp1252", "latin-1"]
read_success = False
for enc in encodings_to_try:
    try:
        with open(INPUT_CSV, newline='', encoding=enc) as f:
            reader = csv.DictReader(f)
            # fieldnames is None for an empty file; normalise to a list.
            fieldnames = reader.fieldnames or []
            all_rows = list(reader)
    except UnicodeDecodeError:
        print(f"Encoding {enc} failed, trying next...")
        continue
    print(f"Opened CSV with encoding: {enc}")
    read_success = True
    break

if not read_success:
    # UnicodeDecodeError takes (encoding, object, start, end, reason): the
    # human-readable message belongs in `reason`, not in `encoding`.
    raise UnicodeDecodeError(
        ", ".join(encodings_to_try),
        b"",
        0,
        1,
        "Unable to decode CSV file with tested encodings",
    )

# Clean up None headers and row keys (some CSVs have empty headers)
if None in fieldnames:
    print("Found None in CSV header, removing it")
    fieldnames = [fn for fn in fieldnames if fn is not None]

# csv.DictReader stores surplus cells of an over-long row under the key None;
# drop that key so DictWriter does not reject the row later.
for row in all_rows:
    row.pop(None, None)

total = len(all_rows)

# Detect URL-like column (case-insensitive) and email column name
url_field = next((fn for fn in fieldnames if fn and re.search(r'url|website|site|link', fn, re.I)), None)
if not url_field:
    raise ValueError("No URL-like column found. Ensure your CSV has a column named 'url', 'URL', 'website', etc.")

# Reuse an existing email-like column (e.g. "Email", "E-mail") when present.
# A new "email" column is added only when no such column exists; appending it
# before this detection would create a second, always-empty column whenever
# the existing header differs from "email" only in case or spelling.
email_field = next((fn for fn in fieldnames if fn and re.search(r'email|e-mail', fn, re.I)), None)
if not email_field:
    email_field = 'email'
    fieldnames.append(email_field)

# Compiled once up front; the loop applies it to every fetched page.
EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

# Visit each row's URL, record the first email address found on the page in
# the row's email column, and collect the row for the final write-back.
for i, row in enumerate(all_rows, start=1):
    # csv.DictReader fills cells missing from a short row with None, so the
    # value must be guarded before calling .strip() on it.
    url = (row.get(url_field) or "").strip()
    print(f"[{i}/{total}] Fetching: {url}")

    if not url:
        row[email_field] = "No URL"
        rows.append(row)
        continue

    # Bare hostnames get a scheme so requests can handle them.
    if not url.startswith(("http://", "https://")):
        url = "http://" + url

    try:
        r = session.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(r.text, "html.parser")

        # Search both the visible text and link targets (mailto: hrefs).
        text = soup.get_text(separator=" ")
        hrefs = " ".join(a.get("href", "") for a in soup.find_all("a"))
        search_space = text + " " + hrefs

        # Only the first address is kept, so a single search suffices.
        first_match = EMAIL_PATTERN.search(search_space)
        row[email_field] = first_match.group(0) if first_match else "Not Found"

    except Exception as e:
        # Best effort: one failing site must not abort the whole run.
        print(f"Error fetching {url}: {e}")
        row[email_field] = "Error"

    rows.append(row)
    if i < total:
        time.sleep(1)  # polite delay between requests; not needed after the last

# Write results back to CSV (overwrite).
# The data is first written to a sibling temp file and then swapped into
# place with os.replace, so a failure while writing (full disk, unexpected
# row key, interruption) cannot leave the original CSV truncated.
tmp_path = INPUT_CSV + ".tmp"
try:
    with open(tmp_path, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    os.replace(tmp_path, INPUT_CSV)
finally:
    # After a successful replace the temp file no longer exists; this only
    # cleans up after a failed write.
    if os.path.exists(tmp_path):
        os.remove(tmp_path)

print("✅ Done — CSV updated")

Opened CSV with encoding: utf-8
[1/3] Fetching: https://sanasafinaz.com/
[2/3] Fetching: https://nishathotels.com/
[3/3] Fetching: https://www.royaltoninnhotel.com/
✅ Done — CSV updated
