In [None]:
# Jupyter Notebook script that:
# - Loads a domain list from CSV
# - Checks each domain's HTTP status and whether it redirects
# - Fetches website content
# - Uses ChatGPT to describe the site and identify its industry
# - Saves the results to a new CSV

# Step 1: Install required libraries
!pip install requests beautifulsoup4 openai pandas --quiet

# Step 2: Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import openai
from openai import OpenAI
from time import sleep

# Step 3: Set OpenAI API Key
# Read the key from the environment so the secret never has to be pasted into
# (and saved with) the notebook; falls back to "" when the variable is unset.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Step 4: Load domain list
# The processing loop further down reads the "DOMAIN" column of this CSV.
df = pd.read_csv("/Users/amish.desai/AIML/Unmatched_Domains_10.csv")

# Step 5: Helper functions
def check_domain_status(domain):
    """Probe ``http://<domain>`` once and report what came back.

    Redirects are deliberately not followed, so a 3xx answer is reported as
    such instead of being replaced by the status of the final page.

    Args:
        domain: Host name without a scheme.

    Returns:
        A ``(status, http_code, is_redirect)`` tuple:
        ``("Reachable", <int status code>, <bool>)`` when a response was
        received, or ``("Unreachable", None, False)`` when ``requests``
        raised a ``RequestException`` (this includes the 5-second timeout).
    """
    target = f"http://{domain}"
    try:
        reply = requests.get(target, timeout=5, allow_redirects=False)
    except requests.exceptions.RequestException:
        return "Unreachable", None, False

    http_code = reply.status_code
    redirected = 300 <= http_code <= 399
    return "Reachable", http_code, redirected

def get_website_description_and_industry(domain):
    """Fetch a site's page text and ask an OpenAI chat model to summarise it.

    The page at ``http://<domain>`` is downloaded (following redirects), its
    visible text is truncated to 3000 characters, and the model is asked for
    a short description plus the industry the site operates in.

    Args:
        domain: Host name without a scheme.

    Returns:
        A ``(description, industry)`` tuple of strings. ``industry`` is
        ``"Unknown"`` when the model's reply has no ``Industry:`` section.
        On any failure (network error, non-2xx response, API error) the
        tuple is ``("Description failed: <error>", "Unknown")`` so the
        caller's batch loop keeps going.
    """
    try:
        url = f"http://{domain}"
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        # Drop inline JavaScript/CSS so the 3000-character budget below is
        # spent on visible page content rather than on code.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        text = soup.get_text(separator=' ', strip=True)
        text = text[:3000]  # truncate for token limit

        prompt = (
            f"This is the content of a website:\n{text}\n\n"
            "Give me a short description of what this website is about, "
            "and identify the industry it operates in (e.g., healthcare, finance, retail, technology, education, etc.). "
            "Output in this format: Description: <...> Industry: <...>"
        )

        # openai>=1.0 removed `openai.ChatCompletion`; requests go through a
        # client object instead. An empty module-level key is passed as None
        # so the client falls back to the OPENAI_API_KEY environment variable.
        client = OpenAI(api_key=openai.api_key or None)
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        # `content` may be None (e.g. a refusal), so normalise before parsing.
        output = (completion.choices[0].message.content or "").strip()

        # Split on the first "Industry:" marker. If the model ignored the
        # requested format, keep the whole reply as the description rather
        # than duplicating it into the industry column.
        desc_part, marker, industry_part = output.partition("Industry:")
        description = desc_part.replace("Description:", "").strip()
        industry = industry_part.strip() if marker else ""
        return description, industry or "Unknown"

    except Exception as e:
        # Best-effort boundary: one bad domain must not abort the whole batch.
        return f"Description failed: {e}", "Unknown"


# Step 6: Process domains
results = []
for domain in df["DOMAIN"]:
    print(f"Processing: {domain}")
    status, code, is_redirect = check_domain_status(domain)
    # Call the description helper exactly once per domain: every call fetches
    # the page and makes a billed OpenAI request, and two separate calls could
    # pair a description with an industry taken from a different completion.
    description, industry = get_website_description_and_industry(domain)
    results.append((domain, status, code, is_redirect, description, industry))
    sleep(0.1)  # polite delay

# Step 7: Save results to CSV
output_path = "/Users/amish.desai/AIML/domain_description_industry_results.csv"
results_df = pd.DataFrame(
    results,
    columns=["DOMAIN", "STATUS", "HTTP_CODE", "IS_REDIRECT", "DESCRIPTION", "INDUSTRY"],
)
results_df.to_csv(output_path, index=True)
# Report the path that was actually written (the old message named a
# different file than the one saved).
print(f"✅ Done! Results saved to '{output_path}'.")
