## Project 7.1 - Scraping

The aim of this task is to obtain news headers from the financial times about BTC, so that they can be connected to the BTC price data obtained via AlphaVantage's API to derive insights into what may have influenced price fluctuations.

This notebook will only be used to retrieve the data from the FT via scrapping as an exercise.

In [2]:
# relevant modules
import pandas as pd
import requests
import bs4
from bs4 import *
import os
import lxml
from datetime import datetime
from dateutil.relativedelta import relativedelta


In [13]:
def scrape_ft_bitcoin_articles():
    # Base URL of the Financial Times Bitcoin page
    base_url = 'https://www.ft.com/bitcoin'
    
    # Initialize the page counter
    page_number = 1
    
    # Prepare lists to store the extracted data
    article_dates = []
    article_titles = []
    
    # Define the cutoff date (exactly 6 months ago from today)
    cutoff_date = datetime.now().date() - relativedelta(months=6)

    while True:
        # Construct the URL for the current page
        url = f'{base_url}?page={page_number}'
        
        # Make the HTTP request
        result = requests.get(url)
        
        # Parse the HTML using BeautifulSoup
        soup = bs4.BeautifulSoup(result.text, 'html.parser')
        
        # Extract dates and titles
        dates = soup.select('time.o-date')
        titles = soup.select('a.js-teaser-heading-link')
        
        # Extract date and title text
        for date, title in zip(dates, titles):
            # Parse the date to keep only the date component
            parsed_date = datetime.fromisoformat(date.get('datetime')).date()
            
            # Check if the article date is within the last 6 months
            if parsed_date < cutoff_date:
                df = pd.DataFrame({
                    'Date': article_dates,
                    'Title': article_titles
                })
                df['Date'] = pd.to_datetime(df['Date'])
                return df
            
            article_dates.append(parsed_date)
            article_titles.append(title.get_text(strip=True))
        
        # Check for the presence of the "next page" link
        next_page_link = soup.find('a', class_='o-buttons-icon--arrow-right')
        
        # If there is no link to the next page, break out of the loop
        if not next_page_link:
            break
        
        # Increment the page number for the next iteration
        page_number += 1
    
    # Create a DataFrame
    df = pd.DataFrame({
        'Date': article_dates,
        'Title': article_titles
    })
    
    # Ensure the Date column is in datetime dtype
    df['Date'] = pd.to_datetime(df['Date'])
    
    return df

# Call the function and print the DataFrame
bitcoin_articles_df = scrape_ft_bitcoin_articles()
print(bitcoin_articles_df)


          Date                                              Title
0   2024-05-30     European bitcoin ETPs suffer mounting outflows
1   2024-05-24  British-Chinese bitcoin money launderer jailed...
2   2024-05-24                      Cryptofinance: into the ether
3   2024-05-23  SEC paves way for ethereum ETFs in boost for c...
4   2024-05-22           First UK crypto ETPs to launch on May 28
..         ...                                                ...
100 2023-12-07                               The return of crypto
101 2023-12-05                       Bitcoin’s bounceback déjà vu
102 2023-12-05                     The jobs market is still tight
103 2023-12-05  Buying frenzy puts some Grayscale crypto funds...
104 2023-12-04  Bitcoin price surges above $42,000 as rate cut...

[105 rows x 2 columns]


In [14]:
bitcoin_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    105 non-null    datetime64[ns]
 1   Title   105 non-null    object        
dtypes: datetime64[ns](1), object(1)
memory usage: 1.8+ KB


In [5]:
bitcoin_articles_df.head()

Unnamed: 0,Date,Title
0,2024-05-30,European bitcoin ETPs suffer mounting outflows
1,2024-05-24,British-Chinese bitcoin money launderer jailed...
2,2024-05-24,Cryptofinance: into the ether
3,2024-05-23,SEC paves way for ethereum ETFs in boost for c...
4,2024-05-22,First UK crypto ETPs to launch on May 28


In [6]:
import requests
import bs4
import pandas as pd
from datetime import datetime, timedelta

def scrape_ft_bitcoin_articles():
    # Base URL of the Financial Times Bitcoin page
    base_url = 'https://www.ft.com/bitcoin'
    
    # Initialize the page counter
    page_number = 1
    
    # Prepare lists to store the extracted data
    article_dates = []
    article_titles = []
    
    # Define the cutoff date (6 months ago from today)
    cutoff_date = datetime.now().date() - timedelta(days=6*30)  # Approximation of 6 months

    while True:
        # Construct the URL for the current page
        url = f'{base_url}?page={page_number}'
        
        # Make the HTTP request
        result = requests.get(url)
        
        # Parse the HTML using BeautifulSoup
        soup = bs4.BeautifulSoup(result.text, 'html.parser')
        
        # Extract dates and titles
        dates = soup.select('time.o-date')
        titles = soup.select('a.js-teaser-heading-link')
        
        # Extract date and title text
        for date, title in zip(dates, titles):
            # Parse the date to keep only the date component
            parsed_date = datetime.fromisoformat(date.get('datetime')).date()
            
            # Check if the article date is within the last 6 months
            if parsed_date < cutoff_date:
                return pd.DataFrame({
                    'Date': article_dates,
                    'Title': article_titles
                })
            
            article_dates.append(parsed_date)
            article_titles.append(title.get_text(strip=True))
        
        # Check for the presence of the "next page" link
        next_page_link = soup.find('a', class_='o-buttons-icon--arrow-right')
        
        # If there is no link to the next page, break out of the loop
        if not next_page_link:
            break
        
        # Increment the page number for the next iteration
        page_number += 1
    
    # Create a DataFrame
    df = pd.DataFrame({
        'Date': article_dates,
        'Title': article_titles
    })
    
    return df

# Call the function and print the DataFrame
bitcoin_articles_df = scrape_ft_bitcoin_articles()
print(bitcoin_articles_df)


           Date                                              Title
0    2024-05-30     European bitcoin ETPs suffer mounting outflows
1    2024-05-24  British-Chinese bitcoin money launderer jailed...
2    2024-05-24                      Cryptofinance: into the ether
3    2024-05-23  SEC paves way for ethereum ETFs in boost for c...
4    2024-05-22           First UK crypto ETPs to launch on May 28
..          ...                                                ...
100  2023-12-07                               The return of crypto
101  2023-12-05                       Bitcoin’s bounceback déjà vu
102  2023-12-05                     The jobs market is still tight
103  2023-12-05  Buying frenzy puts some Grayscale crypto funds...
104  2023-12-04  Bitcoin price surges above $42,000 as rate cut...

[105 rows x 2 columns]


In [7]:
def scrape_ft_bitcoin_articles():
    # Base URL of the Financial Times Bitcoin page
    base_url = 'https://www.ft.com/bitcoin'
    
    # Initialize the page counter
    page_number = 1
    
    # Prepare lists to store the extracted data
    article_dates = []
    article_titles = []
    
    # Define the cutoff date (exactly 6 months ago from today)
    cutoff_date = datetime.now().date() - relativedelta(months=6)

    while True:
        # Construct the URL for the current page
        url = f'{base_url}?page={page_number}'
        
        # Make the HTTP request
        result = requests.get(url)
        
        # Parse the HTML using BeautifulSoup
        soup = bs4.BeautifulSoup(result.text, 'html.parser')
        
        # Extract dates and titles
        dates = soup.select('time.o-date')
        titles = soup.select('a.js-teaser-heading-link')
        
        # Extract date and title text
        for date, title in zip(dates, titles):
            # Parse the date to keep only the date component
            parsed_date = datetime.fromisoformat(date.get('datetime')).date()
            
            # Check if the article date is within the last 6 months
            if parsed_date < cutoff_date:
                return pd.DataFrame({
                    'Date': article_dates,
                    'Title': article_titles
                })
            
            article_dates.append(parsed_date)
            article_titles.append(title.get_text(strip=True))
        
        # Check for the presence of the "next page" link
        next_page_link = soup.find('a', class_='o-buttons-icon--arrow-right')
        
        # If there is no link to the next page, break out of the loop
        if not next_page_link:
            break
        
        # Increment the page number for the next iteration
        page_number += 1
    
    # Create a DataFrame
    df = pd.DataFrame({
        'Date': article_dates,
        'Title': article_titles
    })
    
    return df

# Call the function and print the DataFrame
bitcoin_articles_df = scrape_ft_bitcoin_articles()
print(bitcoin_articles_df)


           Date                                              Title
0    2024-05-30     European bitcoin ETPs suffer mounting outflows
1    2024-05-24  British-Chinese bitcoin money launderer jailed...
2    2024-05-24                      Cryptofinance: into the ether
3    2024-05-23  SEC paves way for ethereum ETFs in boost for c...
4    2024-05-22           First UK crypto ETPs to launch on May 28
..          ...                                                ...
100  2023-12-07                               The return of crypto
101  2023-12-05                       Bitcoin’s bounceback déjà vu
102  2023-12-05                     The jobs market is still tight
103  2023-12-05  Buying frenzy puts some Grayscale crypto funds...
104  2023-12-04  Bitcoin price surges above $42,000 as rate cut...

[105 rows x 2 columns]


In [8]:
bitcoin_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    105 non-null    object
 1   Title   105 non-null    object
dtypes: object(2)
memory usage: 1.8+ KB
