In [15]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

In [16]:
def scrape(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        The response body as a string, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status). The failure is
        printed rather than raised so callers can skip the page and continue.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they take the same
        # error path as connection failures.
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None

In [17]:
def get_ai_articles(soup, keywords=('ai', 'artificial intelligence')):
    """Collect the articles on a parsed page whose title mentions a keyword.

    Args:
        soup: Parsed page exposing ``find_all`` (e.g. a ``BeautifulSoup``
            object). Articles are looked up as ``<article class="post-block">``.
        keywords: Words or phrases to look for in each title. Matching is
            case-insensitive and whole-word only, so ``"ai"`` matches
            "AI" and "AI-powered" but not "said" or "email".

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'`` and
        ``'excerpt'``, one per matching article that has both a title link
        with an ``href`` and an excerpt element. Articles missing any of
        those parts are skipped.
    """
    # Drop empty keywords: an empty alternative would match every title.
    keywords = [keyword for keyword in keywords if keyword]
    keyword_pattern = None
    if keywords:
        alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
        # Lookarounds (rather than a substring test) keep short keywords such
        # as "ai" from matching inside longer words like "said" or "raised".
        keyword_pattern = re.compile(
            rf'(?<!\w)(?:{alternatives})(?!\w)', re.IGNORECASE
        )

    articles = soup.find_all('article', class_='post-block')
    print(f"Found {len(articles)} articles in total")

    ai_articles = []
    for article in articles:
        title_elem = article.find('h2', class_='post-block__title')
        if not title_elem:
            continue
        title = title_elem.text.strip()
        if keyword_pattern is None or not keyword_pattern.search(title):
            continue

        link_elem = article.find('a', class_='post-block__title__link')
        excerpt_elem = article.find('div', class_='post-block__content')
        # .get() instead of ['href'] so an anchor without an href is skipped
        # rather than raising KeyError and aborting the whole scan.
        link = link_elem.get('href') if link_elem else None
        if link and excerpt_elem:
            ai_articles.append({
                'title': title,
                'link': link,
                'excerpt': excerpt_elem.text.strip(),
            })

    print(f"Found {len(ai_articles)} AI-related articles")
    return ai_articles

In [18]:
# summarize_text is called in the loop below but is not defined in any
# visible cell of this notebook, so it is defined here to keep the cell
# runnable on a fresh kernel.
def summarize_text(text, summarizer, max_length=130, min_length=30):
    """Summarize ``text`` with a Hugging Face summarization pipeline.

    Args:
        text: Article body to condense.
        summarizer: Callable created by ``pipeline("summarization", ...)``.
        max_length: Upper bound on the generated summary length (in tokens).
        min_length: Lower bound on the generated summary length (in tokens).

    Returns:
        The ``summary_text`` of the first pipeline result, or ``None`` when
        ``text`` is empty or the pipeline returns no results.
    """
    if not text:
        return None
    # truncation=True clips the input to the model's maximum input length;
    # full articles can be longer than the model accepts.
    results = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    return results[0]['summary_text'] if results else None


# Scrape AI-related news from TechCrunch
# NOTE(review): the recorded run of this cell reported "Found 0 articles in
# total", so the 'post-block' selectors probably no longer match the
# homepage markup -- confirm against the live page.
url_scrape = "https://techcrunch.com/"
html_doc = scrape(url_scrape)
if html_doc:
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Get AI-related articles
    ai_articles = get_ai_articles(soup)

    if ai_articles:
        # Initialize summarizer with bart-large-cnn for summarization.
        # Created only when there is something to summarize, because loading
        # the model is the expensive step of this cell.
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

        # Summarize articles
        summaries = []
        for article in ai_articles:
            print(f"Processing article: {article['title']}")
            article_html = scrape(article['link'])
            if article_html:
                article_soup = BeautifulSoup(article_html, 'html.parser')
                article_content = article_soup.find('div', class_='article-content')
                if article_content:
                    article_text = article_content.text.strip()
                    summary = summarize_text(article_text, summarizer)
                    # Articles with an empty summary are left out of the table.
                    if summary:
                        summaries.append({
                            'title': article['title'],
                            'link': article['link'],
                            'summary': summary
                        })
                else:
                    print(f"Couldn't find article content for {article['title']}")
            else:
                print(f"Couldn't scrape article page for {article['title']}")

        # Create a DataFrame with the results (built once from the list of
        # records rather than grown row by row).
        df = pd.DataFrame(summaries)

        # Display the results
        if not df.empty:
            print(df.to_string(index=False))
        else:
            print("No summaries were generated.")

    else:
        print("No AI-related articles found.")
else:
    print("Failed to scrape the TechCrunch homepage.")

Found 0 articles in total
Found 0 AI-related articles
No AI-related articles found.


In [19]:
# TODO: Widen the search; consider adding a UI?