In [0]:
# Check which Python version the notebook kernel is running
import sys
print(sys.version)

# 1. Data Collection

The data sources used in this project are divided into three main categories:
- Asset Data: Historical OHLCV (Open, High, Low, Close, Volume) data for multiple financial assets were obtained through the Yahoo Finance API.
- Hashrate Data: The hashrate, a blockchain computational metric, was collected via the Blockchain.com API.
- Reddit Data: Submission data from a curated list of the top Bitcoin- or cryptocurrency-related subreddits (e.g., r/Bitcoin, r/Cryptocurrency, r/BitcoinMarkets) was collected using the PRAW Reddit API, supplemented by large-scale datasets from Academic Torrents.

In [0]:
# Data collection libraries
import yfinance as yf
import requests
from pyspark.sql import SparkSession
import pandas as pd
import datetime as dt

## 1.1 BaseCollection --- Parent Class 

BaseCollection serves as the parent class for all types of data collection. It defines the common attributes and core functionality that every data collection process must have.

Instance attributes:
  - name (str): Name of the data collection instance.
  - save_path (str): Path where the collected data will be saved, e.g. a path in your local file system or a cloud-based storage location. In my case, a Databricks environment is used and the collected data is stored in the underlying GCS.

Instance functions:
  - save_data(data): Saves the collected data to the specified path in Parquet format. Parquet is chosen because it efficiently compresses large datasets, significantly reducing storage costs. This is particularly important for large-scale data sources such as Reddit textual data. 

In [0]:

# Parent Class
class BaseCollection():
  """Common base for the data collectors.

  Attributes:
    name: Name of the collection; also used as the last path component
      when the data is saved.
    save_path: Root location under which the collection is written.
    spark: SparkSession obtained via SparkSession.builder.getOrCreate().
  """

  def __init__(self, name, save_path):
    self.name = name
    self.save_path = save_path
    self.spark = SparkSession.builder.getOrCreate()

  def save_data(self, data):
    """Write `data` to f"{save_path}/{name}" as Parquet, overwriting existing output.

    Args:
      data: A Spark DataFrame (anything exposing a `write` attribute), or an
        object accepted by `spark.createDataFrame` (e.g. a pandas DataFrame).
    """
    # Anything without a `write` attribute is treated as non-Spark and converted first.
    df = data if hasattr(data, "write") else self.spark.createDataFrame(data)
    target = f"{self.save_path}/{self.name}"
    df.write.mode("overwrite").parquet(target)

    print(f'{self.name} has been sucessfully saved')

  def normalize_to_utc(self, data):
    """Make the datetime index of `data` timezone-aware in UTC.

    A tz-naive index is assumed to already hold UTC wall-clock times and is
    localized; a tz-aware, non-UTC index is converted. The index is replaced
    on `data` itself.

    Args:
      data: Object with a datetime index exposing `.tz` (e.g. a pandas
        DataFrame indexed by a DatetimeIndex).

    Returns:
      The same `data` object, in every case (including when the index was
      already UTC).
    """
    index_tz = data.index.tz

    if index_tz is None:
      print(f'Datetime of {self.name} is not utc, it is converting...')
      data.index = data.index.tz_localize("UTC")
    elif index_tz.utcoffset(None) == dt.timedelta(0):
      # Fixed zero offset: recognises UTC whichever tzinfo implementation
      # (datetime, pytz, zoneinfo, ...) the index carries.
      print(f'Datetime of {self.name} is utc')
    else:
      # tz_localize raises on tz-aware data, so aware indexes must be converted.
      print(f'Datetime of {self.name} is not utc, it is converting...')
      data.index = data.index.tz_convert("UTC")

    return data

## 1.2 AssetCollection 
AssetCollection is a subclass of BaseCollection designed to collect financial asset data, specifically OHLCV (Open, High, Low, Close, Volume) information, using the yfinance library.


- Inherits instance attributes (**self.name, self.save_path**) and the **save_data** method
- Adds functionality specific to financial data collection. 
- Adds functionality to check the number of rows in the fetched data.
  


In [0]:
# Collecting asset OHLCV data via yfinance API
class AssetCollection(BaseCollection):
  """Collects OHLCV data for one asset through yfinance.

  Attributes:
    data: Result of the last successful fetch_data call, or None if nothing
      has been fetched yet or the last fetch failed.
  """

  def __init__(self, name, save_path):
    super().__init__(name, save_path)
    self.data = None

  def fetch_data(self, ticker, start_date = '2017-01-01', end_date = '2025-06-16', interval = '1d'):
    """Download price data for `ticker` and return it with the date as a column.

    Args:
      ticker: Symbol passed to yf.download (e.g. 'BTC-USD').
      start_date: Passed to yf.download as `start`.
      end_date: Passed to yf.download as `end`.
      interval: Passed to yf.download as `interval`.

    Returns:
      The downloaded frame after UTC normalization and reset_index(), or None
      if any step raised (the error is printed and self.data is reset to None).
    """
    try:
      # `interval` is forwarded so the parameter actually takes effect.
      self.data = yf.download(ticker, start=start_date, end=end_date, interval=interval)

      # Make the datetime index UTC-aware. A None result is ignored so the
      # downloaded frame is never discarded by the normalization step.
      normalized = self.normalize_to_utc(self.data)
      if normalized is not None:
        self.data = normalized

      # Convert the index to a column, otherwise it is lost when the data is loaded with Spark.
      # NOTE(review): depending on the yfinance version the columns may be a
      # MultiIndex; confirm the column layout before saving via Spark.
      self.data = self.data.reset_index()

      return self.data

    except Exception as e:
      print(f"Error fetching data for {ticker}: {e}")
      self.data = None
      return None

  def __len__(self):
    """Return the number of fetched rows (0 when nothing is loaded)."""
    if self.data is None:
      return 0
    try:
      # pandas DataFrames and lists support len() directly. They also expose a
      # `.count` attribute, so probing for `.count` cannot identify Spark.
      return len(self.data)
    except TypeError:
      # Objects without __len__ (e.g. a Spark DataFrame) report rows via count().
      return self.data.count()

## 1.3 HashrateCollection

Explanation (from https://www.blockchain.com/explorer/charts/hash-rate)
- Mining hashrate is a key security metric. The more hashing (computing) power in the network, the greater its security and its overall resistance to attack. Although Bitcoin's exact hashing power is unknown, it is possible to estimate it from the number of blocks being mined and the current block difficulty.

Notes (from https://www.blockchain.com/explorer/charts/hash-rate)
- Daily numbers (raw values) may periodically rise or drop as a result of the randomness of block discovery : even with a hashing power constant, the number of blocks mined can vary in day. Our analysts have found that looking at a 7 day average is a better representation of the underlying power.


In [0]:
# Collecting hash rate data via Blockchain API
class HashrateCollection(BaseCollection):
  """Collects the hash-rate chart from the Blockchain.com charts API.

  Attributes:
    data: Raw JSON payload of the last successful request, or None.
    df: DataFrame with 'Date' and 'HashRate' columns built from the payload,
      or None if nothing has been fetched or the last fetch failed.
  """

  # Seconds to wait for the API before giving up, so a stalled connection
  # cannot hang the notebook indefinitely.
  _REQUEST_TIMEOUT_S = 30

  def __init__(self, name, save_path):
    super().__init__(name, save_path)
    self.data = None
    self.df = None

  def fetch_data(self, sampled = False, start_date = '2017-01-01', end_date = '2025-06-15'):
    """Fetch the full hash-rate history and keep rows within [start_date, end_date].

    Args:
      sampled: Sent to the API as the lowercase string 'true'/'false' in the
        `sampled` query parameter.
      start_date: Lower bound (inclusive) applied to the Date index.
      end_date: Upper bound (inclusive) applied to the Date index.

    Returns:
      DataFrame with 'Date' (UTC) and 'HashRate' columns, or None if any step
      raised (the error is printed and self.data / self.df are reset to None).
    """
    url = "https://api.blockchain.info/charts/hash-rate"
    params = {
      'timespan': 'all',  # Since the Blockchain.com API does not support start and end parameters, the fetched data have to be filtered on the client side to select the desired date range.
      'format': 'json',
      'sampled': str(sampled).lower()
    }
    try:
      response = requests.get(url, params=params, timeout=self._REQUEST_TIMEOUT_S)
      response.raise_for_status()  # raise on an HTTP error status
      self.data = response.json()  # payload is collected as JSON

      # Build the frame from the 'values' entries: x -> Date (epoch seconds), y -> HashRate
      df = pd.DataFrame(data=self.data['values'])
      df = df.rename(columns={'x': 'Date', 'y': 'HashRate'})
      df['Date'] = pd.to_datetime(df['Date'], unit='s')
      df = df.set_index('Date')

      # Make the datetime index UTC-aware
      df = self.normalize_to_utc(df)

      # Filter by start_date and end_date (both bounds inclusive)
      df = df[(df.index >= start_date) & (df.index <= end_date)]

      # Convert the index to a column, otherwise it is lost when the data is loaded with Spark
      self.df = df.reset_index()

      return self.df

    except Exception as e:
      print(f"Error fetching data: {e}")
      # Reset both attributes so a failed call never leaves a stale frame behind.
      self.data = None
      self.df = None
      return None


## 1.4 RedditCollection


## 1.5 Example

In [0]:
# Define save path: root location under which each collection is written as <save_path>/<name>
save_path = '/mnt/demo'

In [0]:
# Tickers to collect: btc, eth, sp500, xrp, ltc
btc_ticker, eth_ticker, sp500_ticker, xrp_ticker, ltc_ticker = (
  'BTC-USD', 'ETH-USD', '^GSPC', 'XRP-USD', 'LTC-USD'
)

tickers = [btc_ticker, eth_ticker, sp500_ticker, xrp_ticker, ltc_ticker]

# Fetch every asset in turn; each collection is named after its ticker
for ticker in tickers:
  asset = AssetCollection(name=ticker, save_path=save_path)
  data = asset.fetch_data(ticker=ticker)
  print(data)
  # asset.save_data(data)


In [0]:
# Fetch the hash-rate series with the default sampling and date range
hashrate = HashrateCollection(name='hashrate', save_path=save_path)
data = hashrate.fetch_data()
print(data)
# To persist the result under the defined path:
# hashrate.save_data(data)