<a href="https://colab.research.google.com/github/ahteshamsalamatansari/CSVMERGE/blob/main/sifted_articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🕷️ Sifted.eu Article Scraper - Colab, FULL SMART VERSION
!pip install -q selenium undetected-chromedriver cloudscraper beautifulsoup4
!pip install -q pandas matplotlib ipywidgets tqdm fake-useragent nltk

import os, re, time, random, requests, cloudscraper, pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from google.colab import files
from tqdm.notebook import tqdm
import ipywidgets as widgets
from IPython.display import display, clear_output
import nltk
nltk.download('punkt', quiet=True)

class TextCleaner:
    """Regex-based scrubber that strips boilerplate from scraped text.

    The three public pattern lists (``social``, ``subscr``, ``figure``) hold
    regex *strings* that are applied case-insensitively by ``clean_text``;
    callers may extend them after construction.
    """

    # Fixed helper patterns, compiled once for every instance.
    _TAG_RE = re.compile(r'<[^>]+>')
    # The TLD class used to be ``[A-Z|a-z]``, which also accepted a literal "|".
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    _URL_RE = re.compile(r'https?://\S+')
    _WWW_RE = re.compile(r'www\.\S+')
    _WHITESPACE_RE = re.compile(r'\s+')
    # NOTE(review): these are removed wherever they occur as whole words, so
    # ordinary prose words such as "about" or "help" vanish from article text
    # too. Kept as-is to preserve existing output; confirm this is intended.
    _NAV_RE = re.compile(
        r'\b(?:home|about|contact|privacy|terms|menu|search|login|register'
        r'|logout|profile|settings|help)\b',
        re.IGNORECASE,
    )

    def __init__(self):
        # Social-network links, @handles and #hashtags.
        self.social = [r'https?://(?:www\.)?(?:facebook|fb|twitter|instagram|linkedin|youtube|tiktok|snapchat|pinterest)\.com/\S+', r'@\w+', r'#\w+']
        # Newsletter / paywall / call-to-action phrases.
        self.subscr = [
            r'subscribe\s+(?:to|now|here)?', r'sign\s+up\s+(?:for|to)?', r'get\s+(?:our|the)?\s*newsletter',
            r'join\s+(?:our|the)?\s*community', r'follow\s+us\s+on', r'connect\s+with\s+us', r'stay\s+updated',
            r"don't\s+miss\s+out", r'be\s+the\s+first\s+to\s+know', r'premium\s+(?:content|access|subscription)',
            r'upgrade\s+(?:to|your)\s+(?:account|plan)', r'unlock\s+(?:full|premium)\s+(?:content|access)',
            r'become\s+a\s+(?:member|subscriber)', r'limited\s+(?:time|access)', r'free\s+trial', r'pay\s*wall', r'subscription\s+required'
        ]
        # Figure captions and image-credit lines (some run to end of line).
        self.figure = [
            r'figure\s+\d+', r'fig\.\s*\d+', r'image\s+\d+', r'chart\s+\d+', r'graph\s+\d+', r'source:\s*[^\n]+',
            r'credit:\s*[^\n]+', r'photo\s+(?:by|credit):?\s*[^\n]+', r'image\s+(?:by|credit):?\s*[^\n]+',
            r'getty\s+images?', r'shutterstock', r'unsplash', r'reuters', r'ap\s+photo', r'bloomberg'
        ]

    def clean_text(self, text):
        """Return ``text`` with markup, links, e-mails and boilerplate removed.

        Args:
            text: Raw string (may contain HTML tags). ``None`` and ``""`` are
                accepted.

        Returns:
            The cleaned text with all whitespace runs collapsed to single
            spaces and no leading/trailing whitespace; ``""`` for falsy input.
        """
        if not text:
            return ""
        text = self._TAG_RE.sub('', text)
        # E-mails must go before the ``@\w+`` handle pattern below; otherwise
        # "john@example.com" is mangled into "john.com" and never matched.
        text = self._EMAIL_RE.sub('', text)
        # The caption patterns rely on "\n", so they run before whitespace is
        # collapsed.
        for pattern in self.social + self.subscr + self.figure:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)
        text = self._URL_RE.sub('', text)
        text = self._WWW_RE.sub('', text)
        text = self._NAV_RE.sub('', text)
        # Collapse whitespace last so that removals never leave double spaces.
        return self._WHITESPACE_RE.sub(' ', text).strip()

class SiftedScraper:
    """Download article pages and reduce them to cleaned text plus metadata.

    Each URL is first fetched through a cloudscraper session; a headless
    Chrome driver (when one could be started) is used as a fallback.
    """

    # Case-sensitive substrings: a paragraph containing any of them is treated
    # as header/meta material and dropped from the body.
    HEADER_MARKERS = ("Analysis", "Pro", "Recommended", "minute read", "By ", "Updated",
                      "# ", "Read more", "Sifted", "Kai Nicol-Schwarz")
    # Heuristic for the first "real" paragraph: it must open with at least
    # three capitalised runs of lowercase letters, spaces, commas, ' or -.
    _STORY_START_RE = re.compile(r'([A-Z][a-z ,\'\-]+){3,}')

    def __init__(self):
        """Start headless Chrome if possible and create the HTTP session."""
        self.driver = None
        try:
            opts = Options()
            for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage',
                         '--disable-blink-features=AutomationControlled'):
                opts.add_argument(flag)
            self.driver = webdriver.Chrome(options=opts)
        except Exception as exc:
            # Best effort: without a browser only the cloudscraper path is used.
            print(f"⚠️ Selenium driver unavailable ({type(exc).__name__}); using cloudscraper only.")
        self.cloudscraper_session = cloudscraper.create_scraper()
        self.cleaner = TextCleaner()

    def extract_with_cloudscraper(self, url):
        """Fetch ``url`` over HTTP and parse it; return ``None`` on any failure."""
        try:
            response = self.cloudscraper_session.get(url, timeout=30)
            # Treat 4xx/5xx (block pages, 404s) as failures instead of parsing
            # the error page as if it were an article.
            response.raise_for_status()
            return self.parse_article(response.text, url)
        except Exception:
            return None

    def extract_with_selenium(self, url):
        """Load ``url`` in the browser and parse it; ``None`` if unavailable or failing."""
        if not self.driver:
            return None
        try:
            self.driver.get(url)
            # Randomised pause so client-side rendering can finish.
            time.sleep(random.uniform(4, 6))
            # parse_article builds its own soup, so the raw page source is enough.
            return self.parse_article(self.driver.page_source, url)
        except Exception:
            return None

    def parse_article(self, html, url):
        """Extract title, body, author and date from an article page.

        Args:
            html: Page markup as a string.
            url: Address the markup came from; copied into the result.

        Returns:
            Dict with keys ``url``, ``title``, ``content``, ``author`` and
            ``date``. Fields that cannot be located are empty strings.
        """
        soup = BeautifulSoup(html, 'html.parser')

        title_elem = soup.select_one('h1, h1.title, .article-title, [data-testid="headline"]')
        title = self.cleaner.clean_text(title_elem.text) if title_elem else ""

        # Prefer paragraphs inside the main article container; otherwise use
        # every <p> on the page.
        container = soup.select_one('article, .article-content, .post-content, main, [class*=content]')
        scope = soup if container is None else container
        full_paragraphs = [
            self.cleaner.clean_text(p.get_text())
            for p in scope.find_all('p')
            if p.get_text(strip=True)
        ]

        # Drop short lines and anything that looks like header/meta material.
        body_candidates = [
            p for p in full_paragraphs
            if len(p) > 40 and not any(marker in p for marker in self.HEADER_MARKERS)
        ]

        # Skip leading paragraphs until one looks like the start of the story;
        # when none matches, everything is kept.
        story_start = 0
        for i, para in enumerate(body_candidates):
            if self._STORY_START_RE.match(para) or para.startswith('Very few companies'):
                story_start = i
                break
        content = ' '.join(body_candidates[story_start:]).strip()

        author_elem = soup.select_one('.author, .byline, [class*=author]')
        author = self.cleaner.clean_text(author_elem.text) if author_elem else ""

        date_elem = soup.select_one('time, .date, .published, [datetime]')
        if date_elem is None:
            date = ""
        elif date_elem.has_attr('datetime'):
            date = date_elem.get('datetime')
        else:
            # Visible text fallback; trimmed so layout whitespace is not stored.
            date = date_elem.text.strip()

        return {"url": url, "title": title, "content": content, "author": author, "date": date}

    def _extract_article(self, url):
        """Return the first extraction for ``url`` that has body text, else ``None``.

        A parsed result with empty ``content`` is still a non-empty dict, so a
        plain ``a or b`` would never reach the Selenium fallback; the content
        is checked explicitly instead.
        """
        for extractor in (self.extract_with_cloudscraper, self.extract_with_selenium):
            article = extractor(url)
            if article and article.get("content"):
                return article
        return None

    def close(self):
        """Quit the browser, if any. Safe to call more than once."""
        driver, self.driver = self.driver, None
        if driver:
            driver.quit()

    def scrape(self, urls):
        """Scrape every URL and return the articles that yielded body text.

        Each returned dict is the ``parse_article`` result plus ``word_count``
        and ``reading_time`` (whole minutes at 200 words per minute, at least 1).
        The browser is shut down afterwards, even if the loop raises.
        """
        results = []
        try:
            for url in tqdm(urls, desc="Scraping articles"):
                article = self._extract_article(url)
                if article is None:
                    continue
                word_count = len(article["content"].split())
                article["word_count"] = word_count
                article["reading_time"] = max(1, word_count // 200)
                results.append(article)
        finally:
            self.close()
        return results

# First http(s) URL on a line. Stops at whitespace, closing brackets, quotes
# and commas so markdown links and neighbouring CSV cells are not glued on.
URL_PATTERN = re.compile(r"""https?://[^\s)\]"',]+""")


def extract_urls(lines):
    """Return the first URL of each line, de-duplicated in first-seen order."""
    matches = (URL_PATTERN.search(line) for line in lines)
    return list(dict.fromkeys(m.group() for m in matches if m))


def run_interface():
    """Display the upload/paste UI and scrape whenever input is provided.

    URLs come from an uploaded .txt/.csv file (takes priority) or from the
    text box. Results are written to ``sifted_articles.csv``, offered for
    download, and summarised as a word-count bar chart.
    """
    file_upload = widgets.FileUpload(accept='.txt,.csv', multiple=False)
    url_box = widgets.Textarea(
        value='',
        placeholder='Paste URLs here (one per line)',
        layout=widgets.Layout(width='100%', height='90px'),
        # Sync the value on commit rather than per keystroke, so the auto-run
        # below does not start scraping half-typed URLs.
        continuous_update=False,
    )
    run_btn = widgets.Button(description="🚀 Start Scraping", button_style='success')
    out = widgets.Output()
    status = widgets.HTML()

    def read_uploaded_text():
        """Decode the uploaded file's bytes as text."""
        uploads = file_upload.value
        # ipywidgets 7 exposes {filename: info}; ipywidgets 8 a tuple of info dicts.
        entries = list(uploads.values()) if isinstance(uploads, dict) else list(uploads)
        # bytes() accepts both the bytes (v7) and memoryview (v8) payloads.
        return bytes(entries[0]['content']).decode('utf-8', errors='replace')

    def on_start(b):
        run_btn.disabled = True
        try:
            with out:
                clear_output()
                lines = []
                if file_upload.value:
                    lines = read_uploaded_text().strip().splitlines()
                elif url_box.value:
                    lines = url_box.value.strip().splitlines()
                urls = extract_urls(lines)
                if not urls:
                    print("❌ No URLs found.")
                    return
                status.value = f"<b>Scraping {len(urls)} articles...</b>"
                scraper = SiftedScraper()
                data = scraper.scrape(urls)
                if not data:
                    print("❌ No articles scraped.")
                    return
                df = pd.DataFrame(data)[["url", "title", "content", "author", "date", "word_count", "reading_time"]]
                df.to_csv('sifted_articles.csv', index=False)
                files.download('sifted_articles.csv')
                print("\n✅ Scraping done! CSV downloaded.\n")
                try:
                    plt.figure(figsize=(7, 4))
                    df.word_count.plot(kind='bar')
                    plt.title("Article Word Counts")
                    plt.ylabel("Words")
                    plt.xlabel("Article")
                    plt.tight_layout()
                    plt.show()
                except Exception as exc:
                    # The chart is optional; report the problem but keep the results.
                    print(f"⚠️ Could not draw the word-count chart: {exc}")
        finally:
            # Never leave a stale "Scraping..." banner or a locked button behind.
            status.value = ""
            run_btn.disabled = False

    def check_action(change):
        # Auto-click the run button once an upload or URL text is present.
        if file_upload.value or url_box.value.strip():
            run_btn.click()

    file_upload.observe(check_action, names='value')
    url_box.observe(check_action, names='value')
    run_btn.on_click(on_start)
    ui = widgets.VBox([
        widgets.HTML("<b>Upload .txt/.csv file or paste Sifted URLs — scraping starts as soon as input is given:</b>"),
        file_upload, url_box, run_btn, status, out
    ])
    display(ui)

# Build and display the widget UI when this cell is executed.
run_interface()
