In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import logging
import json

In [2]:
# Configure logging once for the whole notebook: the root logger is shared by
# every cell, and records are emitted at INFO level with a timestamp prefix.
logger = logging.getLogger()
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [3]:
def fetch_techcrunch_data(start_date, end_date, timeout=10):
    """Collect the TechCrunch daily archive URLs that respond successfully.

    One URL of the form ``https://techcrunch.com/YYYY/MM/DD/`` is requested for
    every day in the inclusive range ``start_date``..``end_date``. A URL is
    kept only when the request completes without an HTTP error status.

    Args:
        start_date (str): First day of the range, formatted ``YYYY-MM-DD``.
        end_date (str): Last day of the range (inclusive), formatted
            ``YYYY-MM-DD``.
        timeout (float | tuple): Timeout in seconds passed to
            ``requests.get``. Without one, a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        list[str]: The archive URLs that were fetched successfully, in
        chronological order. Empty when ``start_date`` is after ``end_date``.

    Raises:
        ValueError: If either date string does not match ``YYYY-MM-DD``.
    """
    # Work on local datetime objects instead of rebinding the parameters
    current_day = datetime.strptime(start_date, "%Y-%m-%d")
    last_day = datetime.strptime(end_date, "%Y-%m-%d")

    techcrunch_urls = []

    while current_day <= last_day:
        # Build the archive URL for this day
        url = f"https://techcrunch.com/{current_day:%Y/%m/%d}/"

        try:
            # Send a GET request to the URL
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors
            logger.info(f"Successfully fetched data for {url}")
            techcrunch_urls.append(url)  # Append the URL to the list
        except requests.HTTPError as errh:
            logger.error(f"HTTP Error for {url}: {errh}")
        except requests.Timeout as errt:
            # Checked before ConnectionError: a connect timeout is both a
            # Timeout and a ConnectionError and should be reported as a timeout.
            logger.error(f"Timeout for {url}: {errt}")
        except requests.ConnectionError as errc:
            logger.error(f"Error connecting for {url}: {errc}")
        except requests.RequestException as err:
            logger.error(f"Something went wrong for {url}: {err}")

        # Move on to the next day
        current_day += timedelta(days=1)

    logger.info(f"Collected {len(techcrunch_urls)} URLs:")
    for url in techcrunch_urls:
        logger.info(url)

    return techcrunch_urls



In [29]:
# Collect the day-wise archive URLs for the scraping window
techcrunch_urls = fetch_techcrunch_data("2024-07-26", "2024-10-24")
logger.info(f"Collected {len(techcrunch_urls)} URLs:")

2024-10-24 12:21:30,858 - INFO - Successfully fetched data for https://techcrunch.com/2024/07/26/
2024-10-24 12:21:31,313 - INFO - Successfully fetched data for https://techcrunch.com/2024/07/27/
2024-10-24 12:21:31,765 - INFO - Successfully fetched data for https://techcrunch.com/2024/07/28/
2024-10-24 12:21:32,257 - INFO - Successfully fetched data for https://techcrunch.com/2024/07/29/
2024-10-24 12:21:32,622 - INFO - Successfully fetched data for https://techcrunch.com/2024/07/30/
2024-10-24 12:21:33,107 - INFO - Successfully fetched data for https://techcrunch.com/2024/07/31/
2024-10-24 12:21:33,641 - INFO - Successfully fetched data for https://techcrunch.com/2024/08/01/
2024-10-24 12:21:34,122 - INFO - Successfully fetched data for https://techcrunch.com/2024/08/02/
2024-10-24 12:21:34,590 - INFO - Successfully fetched data for https://techcrunch.com/2024/08/03/
2024-10-24 12:21:35,011 - INFO - Successfully fetched data for https://techcrunch.com/2024/08/04/
2024-10-24 12:21:35,

In [24]:
def extract_article_links(url, timeout=10):
    """
    Extracts article links from the given URL.

    The page is fetched, its ``wp-block-column`` layout divs that carry no
    ``style`` attribute are selected, and the ``href`` of every
    ``loop-card__title-link`` anchor inside them is collected.

    This function is best-effort: any failure is logged and results in an
    empty list rather than an exception.

    Args:
        url (str): URL of the webpage containing articles.
        timeout (float | tuple): Timeout in seconds passed to
            ``requests.get`` so a stalled connection cannot hang the crawl.

    Returns:
        list: Article link URLs found on the webpage, in page order, without
        duplicates and without anchors that lack an ``href``. Empty on any
        error or non-200 response.
    """

    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=timeout)
        logger.info(f"Request sent to: {url}")

        # Check if the request was successful
        if response.status_code != 200:
            logger.error(f"Failed to retrieve the webpage: {url} (Status code: {response.status_code})")
            return []

        logger.info(f"Webpage retrieved successfully: {url}")

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        logger.debug(f"HTML content parsed: {url}")

        # Find all the specified div elements on the webpage
        div_elements = soup.find_all('div', class_='wp-block-column is-layout-flow wp-block-column-is-layout-flow')
        logger.debug(f"Div elements found: {len(div_elements)}")

        # Exclude div elements with the style attribute
        div_elements = [div for div in div_elements if not div.has_attr('style')]
        logger.debug(f"Div elements filtered: {len(div_elements)}")

        # Find all article links within the div elements
        article_links = [link for div in div_elements for link in div.find_all('a', class_='loop-card__title-link')]
        logger.debug(f"Article links found: {len(article_links)}")

        # Keep only anchors that actually have an href, and drop repeats while
        # preserving page order (a link inside nested matching divs would
        # otherwise be returned once per enclosing div).
        hrefs = (link.get('href') for link in article_links)
        links = list(dict.fromkeys(href for href in hrefs if href))
        logger.info(f"Article links extracted: {len(links)}")

        return links

    except requests.RequestException as e:
        # Network-level failure (connection error, timeout, ...)
        logger.error(f"Request failed for {url}: {e}")
        return []
    except Exception as e:
        # Deliberate catch-all so one bad page does not stop the whole run
        logger.error(f"An error occurred while processing {url}: {e}")
        return []

In [31]:
# Alias (not a copy) of the daily archive URLs collected above; the
# link-extraction loop iterates over this name.
url_list = techcrunch_urls

In [32]:
# Walk every daily archive page and pool the article links it exposes
all_links = []
for url in url_list:
    logger.info(f"Extracting links from: {url}")
    links = extract_article_links(url)
    all_links += links

logger.info("All links extraction completed")
logger.info(f"Total links extracted: {len(all_links)}")


2024-10-24 12:28:59,328 - INFO - Extracting links from: https://techcrunch.com/2024/07/26/
2024-10-24 12:28:59,863 - INFO - Request sent to: https://techcrunch.com/2024/07/26/
2024-10-24 12:28:59,870 - INFO - Webpage retrieved successfully: https://techcrunch.com/2024/07/26/
2024-10-24 12:28:59,942 - INFO - Article links extracted: 15
2024-10-24 12:28:59,944 - INFO - Extracting links from: https://techcrunch.com/2024/07/27/
2024-10-24 12:29:00,471 - INFO - Request sent to: https://techcrunch.com/2024/07/27/
2024-10-24 12:29:00,471 - INFO - Webpage retrieved successfully: https://techcrunch.com/2024/07/27/
2024-10-24 12:29:00,552 - INFO - Article links extracted: 8
2024-10-24 12:29:00,556 - INFO - Extracting links from: https://techcrunch.com/2024/07/28/
2024-10-24 12:29:00,986 - INFO - Request sent to: https://techcrunch.com/2024/07/28/
2024-10-24 12:29:00,987 - INFO - Webpage retrieved successfully: https://techcrunch.com/2024/07/28/
2024-10-24 12:29:01,024 - INFO - Article links extr

In [26]:
def scrape_article(url, timeout=10):
    """
    Scrapes an article from the provided URL.

    The title is read from the article's ``h1`` heading and the passage is the
    text of every ``<p>`` inside the post-content div, excluding the paragraph
    whose id is ``speakable-summary``.

    Network failures are handled the same way as a non-200 response: they are
    logged and a placeholder record is returned, so a single bad URL does not
    abort a long scraping loop.

    Args:
        url (str): The URL of the article.
        timeout (float | tuple): Timeout in seconds passed to
            ``requests.get`` so a stalled connection cannot hang the run.

    Returns:
        dict: ``{"title": str, "passage": str}``. When the page cannot be
        retrieved the title is ``"Failed to retrieve"`` and the passage is
        empty; when the heading is missing the title is
        ``"Failed to find title"``.
    """
    logger.info(f"Scraping article from {url}...")

    # Send a GET request to the URL
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        logger.warning(f"Failed to retrieve the article {url}: {err}")
        return {"title": "Failed to retrieve", "passage": ""}

    # Check if the request was successful
    if response.status_code != 200:
        logger.warning(f"Failed to retrieve the article. Status code: {response.status_code}")
        return {"title": "Failed to retrieve", "passage": ""}

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the title of the article (two heading variants are tried in order)
    title_tag = soup.find('h1', class_='article-hero__title wp-block-post-title') or soup.find('h1', class_='wp-block-post-title has-body-1-font-size')
    if title_tag is not None:
        title = title_tag.text
    else:
        logger.warning(f"Failed to find title tag in {url}")
        title = 'Failed to find title'

    # Find the passage of the article
    passage = ''
    article_content = soup.find('div', class_='entry-content wp-block-post-content is-layout-constrained wp-block-post-content-is-layout-constrained')
    if article_content is not None:
        # Join once instead of growing a string paragraph by paragraph
        passage = '\n'.join(
            paragraph.text
            for paragraph in article_content.find_all('p')
            if paragraph.get('id') != 'speakable-summary'
        ).strip()
    else:
        logger.warning(f"Failed to find article content tag in {url}")

    if not passage:
        logger.warning(f"Failed to find any paragraph tags in {url}")

    # Save the article
    article = {
        "title": title.strip(),
        "passage": passage
    }
    logger.info(f"Scraped article from {url}\n")
    return article

In [33]:

# Initialize empty list to hold scraped articles (one dict per article, as
# returned by scrape_article). It lives in its own cell, so re-running the
# scraping loop cell does not reset it -- presumably intentional; TODO confirm.
scraped_articles = []


In [34]:
# Position of the next entry of all_links to scrape. Re-running this cell
# restarts the scraping loop from the first link.
url_index = 0

In [35]:
# Scrape every collected link in order. url_index is advanced only after the
# article has been appended, and both url_index and scraped_articles are
# defined in other cells, so re-running just this cell continues from the
# first link that has not been stored yet.
# NOTE(review): looks like a deliberate resume-after-interruption pattern --
# confirm before replacing it with a plain for loop.
while url_index < len(all_links):
    article = scrape_article(all_links[url_index])
    scraped_articles.append(article)
    url_index += 1


2024-10-24 12:30:06,283 - INFO - Scraping article from https://techcrunch.com/2024/07/26/stripe-acquires-payment-processing-startup-lemon-squeezy/...
2024-10-24 12:30:06,849 - INFO - Scraped article from https://techcrunch.com/2024/07/26/stripe-acquires-payment-processing-startup-lemon-squeezy/

2024-10-24 12:30:06,853 - INFO - Scraping article from https://techcrunch.com/2024/07/26/apples-icloud-private-relay-hit-by-outages-for-some-users/...
2024-10-24 12:30:07,360 - INFO - Scraped article from https://techcrunch.com/2024/07/26/apples-icloud-private-relay-hit-by-outages-for-some-users/

2024-10-24 12:30:07,360 - INFO - Scraping article from https://techcrunch.com/2024/07/26/legal-tech-vc-brawls-and-saying-no-to-big-offers/...
2024-10-24 12:30:07,823 - INFO - Scraped article from https://techcrunch.com/2024/07/26/legal-tech-vc-brawls-and-saying-no-to-big-offers/

2024-10-24 12:30:07,827 - INFO - Scraping article from https://techcrunch.com/2024/07/26/apple-signs-the-white-houses-commi

In [38]:
# Serialize the scraped articles as indented JSON and write them to disk
with open('data2.json', 'w') as json_file:
    json_file.write(json.dumps(scraped_articles, indent=4))

print('JSON data saved to data2.json')

JSON data saved to data2.json


In [40]:
# Open and load the JSON file. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding.
# NOTE(review): this file name differs from the 'data2.json' written earlier
# in this notebook -- presumably a renamed or merged export; TODO confirm
# where it comes from so the notebook can be re-run from a fresh kernel.
with open('Techcrunch_2023-10-24-to-2024-10-24.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Check the length of the JSON object (number of entities)
length = len(data)

print(f'The length of the JSON object is: {length}')

The length of the JSON object is: 7263
