In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def get_about_us_text(url):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        links = soup.find_all('a', href=True)
        about_links = [link['href'] for link in links if 'about' in link['href'].lower()]

        if not about_links:
            print(f"[{url}] ❌ No About link found.")
            return None

        about_url = about_links[0]
        if not about_url.startswith("http"):
            if about_url.startswith("/"):
                about_url = url.rstrip("/") + about_url
            else:
                about_url = url.rstrip("/") + "/" + about_url

        print(f"Scraping About Us from: {about_url}")
        about_page = requests.get(about_url, timeout=10)
        about_soup = BeautifulSoup(about_page.text, 'html.parser')

        paragraphs = about_soup.find_all('p')
        text = " ".join(p.get_text(strip=True) for p in paragraphs)
        return text[:1000]  # Limit for now

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

def extract_keywords(text, top_n=10):
    text = re.sub(r'[^\w\s]', '', text.lower())
    cv = CountVectorizer(stop_words='english', max_features=1000)
    word_count_vector = cv.fit_transform([text])
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(word_count_vector)
    scores = zip(cv.get_feature_names_out(), tfidf.toarray()[0])
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    return [word for word, score in sorted_scores[:top_n]]

def generate_sample_content(keywords):
    return f"We specialize in {', '.join(keywords[:-1])}, and {keywords[-1]} to serve our customers better."

# ---------- MAIN EXECUTION ----------
# Step 1: Read Excel
df = pd.read_excel("C:\\Users\\91765\\Downloads\\Apify_Scrapped_Data.xlsx")

# Step 2: Process only the first 2 websites
websites = df['website'].dropna().head(2).tolist()

for i, site in enumerate(websites, 1):
    print(f"\n🔹 Brand {i} — {site}")
    about_text = get_about_us_text(site)
    
    if about_text:
        print("✅ Extracted About Us Text:\n", about_text[:1000], "...\n")
        keywords = extract_keywords(about_text)
        print("🔑 Niche Keywords:", keywords)
        content = generate_sample_content(keywords)
        print("📝 Generated Content:", content)
    else:
        print("⚠️ Could not extract content.\n")






🔹 Brand 1 — http://oneclickcommunication.com/
Scraping About Us from: http://oneclickcommunication.com/about.html
✅ Extracted About Us Text:
 With a mission focused on strategic marketing, we're here to be your trusted partner, helping you navigate the complex marketing landscape.
                     Choose us for our expertise, experience, and unwavering dedication to your success.                            Our experienced team is passionate about helping your businessthrive in the ever-evolving digital landscape. 
                            Contact us to discoverhow we can elevate your brand and drive your marketing efforts. Expert Members Marketing Tactics Industry Experience Client Satisfaction Industries Served Whether you're an experienced entrepreneur or just starting out, our team of experts 
                            is here to guide you every step of the way, from concept to execution. Weâre a team of passionate professionals who are committed to helping you unlock yo