<a href="https://colab.research.google.com/github/ahteshamsalamatansari/colabcodes/blob/main/techfunding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================
# 🚀 One-Click Tech Funding News Scraper (Colab)
# ==============================================

# Install dependencies
!pip install requests beautifulsoup4 tqdm

# Imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import re
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# --------------------------
# SCRAPER CLASS
# --------------------------
class TechNewsContentScraper:
    """Download article pages concurrently and extract their title and body text.

    Pages are fetched through one shared ``requests.Session`` by a thread
    pool. Every URL produces a flat dict with the keys ``url``, ``title``,
    ``content``, ``status`` and ``content_length``.
    """

    # CSS selectors tried in order; the first one that matches wins.
    TITLE_SELECTORS = ('h1', '.entry-title', '.post-title')
    BODY_SELECTORS = ('.entry-content', '.post-content', 'article')
    # Elements removed from the article body before its text is collected.
    NOISE_SELECTOR = 'script,style,aside,nav,.share,.ads'
    # Text fragments of this many characters or fewer are dropped.
    MIN_FRAGMENT_LENGTH = 20

    def __init__(self, max_workers=20, delay_between_requests=0.1):
        """Store the pool settings and create the shared HTTP session.

        Args:
            max_workers: Thread count handed to ``ThreadPoolExecutor``.
            delay_between_requests: Seconds each worker sleeps before
                issuing a request.
        """
        self.max_workers = max_workers
        self.delay_between_requests = delay_between_requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        })
        # Not used by the methods below; kept so code that reads these
        # attributes keeps working.
        self.results = []
        self.failed_urls = []
        self.lock = threading.Lock()

    def clean_text(self, text):
        """Collapse each whitespace run in *text* to one space and trim the ends.

        Returns an empty string for ``None`` or empty input.
        """
        if not text:
            return ""
        return re.sub(r'\s+', ' ', text).strip()

    def extract_article_content(self, soup, url):
        """Pull the headline and body text out of a parsed page.

        Args:
            soup: Parsed document exposing ``select_one`` (a
                ``BeautifulSoup`` object in normal use).
            url: Source URL. Currently unused; kept for interface
                compatibility.

        Returns:
            A ``(title, content)`` tuple. Either part is ``""`` when nothing
            matched, and both are ``""`` if extraction raised.
        """
        try:
            title = ""
            for selector in self.TITLE_SELECTORS:
                heading = soup.select_one(selector)
                if heading:
                    title = self.clean_text(heading.get_text())
                    break

            container = None
            for selector in self.BODY_SELECTORS:
                container = soup.select_one(selector)
                if container:
                    break

            content = ""
            if container:
                for junk in container.select(self.NOISE_SELECTOR):
                    junk.decompose()
                # Clean each fragment once and apply the length filter to the
                # cleaned text, so whitespace padding cannot let an empty
                # fragment through.
                fragments = (
                    self.clean_text(node.get_text())
                    for node in container.find_all(['p', 'h2', 'h3'])
                )
                content = " ".join(
                    fragment for fragment in fragments
                    if len(fragment) > self.MIN_FRAGMENT_LENGTH
                )

            return title, content
        except Exception:
            # Best effort: an unparseable page yields empty fields rather
            # than aborting the run. KeyboardInterrupt and SystemExit are
            # deliberately not caught here.
            return "", ""

    def scrape_single_url(self, url):
        """Fetch *url* and return a result record; never raises ``Exception``.

        On success ``status`` is ``'success'``. On any failure ``status``
        holds the exception text and the text fields are empty.
        """
        try:
            time.sleep(self.delay_between_requests)
            r = self.session.get(url, timeout=30)
            r.raise_for_status()
            soup = BeautifulSoup(r.content, 'html.parser')
            title, content = self.extract_article_content(soup, url)
            return {'url': url, 'title': title, 'content': content,
                    'status': 'success', 'content_length': len(content)}
        except Exception as e:
            return {'url': url, 'title': '', 'content': '',
                    'status': str(e), 'content_length': 0}

    def scrape_all_urls(self, urls):
        """Scrape every URL in *urls* on the thread pool with a progress bar.

        Returns:
            List of result dicts in completion order, which is not
            necessarily the input order.
        """
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            futures = [ex.submit(self.scrape_single_url, u) for u in urls]
            with tqdm(total=len(urls), desc="Scraping") as bar:
                for f in as_completed(futures):
                    results.append(f.result())
                    bar.update(1)
        return results

# --------------------------
# MAIN RUN FUNCTION
# --------------------------
def run_complete_scraper(urls_file_path, output_file='scraped_articles.csv', max_workers=20):
    """Scrape every URL listed in a text file and save the results as CSV.

    Args:
        urls_file_path: Path to a text file with one URL per line; blank
            lines are skipped.
        output_file: Path of the CSV file to write.
        max_workers: Number of scraper threads.

    Returns:
        DataFrame with one row per URL and the columns ``url``, ``title``,
        ``content``, ``status`` and ``content_length``.
    """
    scraper = TechNewsContentScraper(max_workers=max_workers, delay_between_requests=0.05)
    # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise be glued to the first URL and make that request fail.
    with open(urls_file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        urls = [url for url in stripped_lines if url]
    print(f"📂 Loaded {len(urls)} URLs")
    results = scraper.scrape_all_urls(urls)
    # Explicit columns keep the schema stable even when no URLs were loaded.
    df = pd.DataFrame(results, columns=['url', 'title', 'content', 'status', 'content_length'])
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"💾 Saved {len(df)} records to {output_file}")
    return df

# --------------------------
# 🚀 AUTO EXECUTION
# --------------------------
# Ask for the URL list (e.g. urls.txt) through Colab's upload widget.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded: fail with a clear message instead of an IndexError.
    raise ValueError("No file uploaded - please upload a text file with one URL per line.")
filename = next(iter(uploaded))  # first uploaded file

# Run scraper
df = run_complete_scraper(filename, 'scraped_articles.csv', max_workers=20)

# Show preview. Printed explicitly because a bare `df.head()` is only
# rendered when it is the last expression of a notebook cell.
print(df.head())

# --------------------------
# 📊 Visualization
# --------------------------
import matplotlib.pyplot as plt

if df.empty:
    print("No results to plot.")
else:
    # Success vs Failed
    df['success'] = df['status'] == 'success'
    counts = df['success'].value_counts()
    # value_counts() sorts by frequency and omits outcomes that never
    # occurred, so build the labels from its index instead of assuming a
    # fixed [Success, Failed] order and length.
    outcome_labels = ['Success' if ok else 'Failed' for ok in counts.index]
    plt.pie(counts, labels=outcome_labels, autopct='%1.1f%%')
    plt.title("Scraping Success Rate")
    plt.show()

    # Content length distribution (successful scrapes only)
    success_lengths = df.loc[df['success'], 'content_length']
    if not success_lengths.empty:
        success_lengths.hist(bins=40, figsize=(8,4))
        plt.title("Article Length Distribution")
        plt.xlabel("Characters")
        plt.ylabel("Frequency")
        plt.show()
