In [130]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import json
from tqdm import tqdm

# Fetch a single historical snapshot page so its HTML can be inspected by hand.
url = 'https://coinmarketcap.com/historical/20191103/'
# Browser-like User-Agent string sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/115.0.0.0 Safari/537.36'
}

# requests has no default timeout; without one this cell can hang forever
# if the server accepts the connection but never answers.
response = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

In [135]:
def scrape_data(start_date, end_date, tickers):
    """
    Collect weekly market-cap snapshots and pivot them into a date x symbol table.

    One snapshot is requested (via ``scrape_market_cap``) for every Sunday
    between ``start_date`` and ``end_date`` inclusive.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Range bounds, in any form accepted by ``pandas.date_range``.
    tickers : list of str
        Ticker symbols to keep; passed through to ``scrape_market_cap``.

    Returns
    -------
    pandas.DataFrame
        Indexed by snapshot date (``DatetimeIndex`` named 'date'), with one
        column per symbol (columns named 'symbol') holding 'Market Cap (USD)'.
        Snapshots that fail to scrape are reported and skipped. If nothing
        could be scraped, an empty frame with the same index/column names is
        returned. When several listings in one snapshot share a symbol, the
        one with the largest market cap is kept.
    """
    dates = pd.date_range(start=start_date, end=end_date, freq='W-SUN').strftime('%Y%m%d').tolist()

    # Accumulate per-date frames and concatenate once; concatenating inside
    # the loop copies the growing frame on every iteration.
    frames = []
    for date in tqdm(dates):
        try:
            snapshot = scrape_market_cap(date, tickers)
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C still interrupts the
            # loop; include the date and cause so failures can be diagnosed.
            print(f'Error trying to scrape data for {date}: {exc!r}')
            continue
        if not snapshot.empty:
            frames.append(snapshot)

    if not frames:
        # Nothing scraped: return an empty table shaped like a normal result
        # instead of failing on the missing 'date' column.
        return pd.DataFrame(
            index=pd.DatetimeIndex([], name='date'),
            columns=pd.Index([], name='symbol'),
        )

    all_data = pd.concat(frames, ignore_index=True)
    all_data['date'] = pd.to_datetime(all_data['date'])

    # pivot() raises on duplicate (date, symbol) pairs, which occur when two
    # listings share a ticker symbol. Keep the largest-cap listing per pair.
    all_data = (
        all_data
        .sort_values('Market Cap (USD)', ascending=False)
        .drop_duplicates(subset=['date', 'symbol'], keep='first')
    )

    return all_data.pivot(index='date', columns='symbol', values='Market Cap (USD)')

def scrape_market_cap(date, tickers):
    """
    Scrape market cap data for a given snapshot date from CoinMarketCap.

    Parameters
    ----------
    date : str
        Snapshot date in 'YYYYMMDD' format; it is interpolated into the URL.
    tickers : list of str
        Ticker symbols to keep.

    Returns
    -------
    pandas.DataFrame
        Columns 'symbol', 'Market Cap (USD)' and 'date' (the ``date`` argument
        as given), one row per listing whose symbol is in ``tickers``. Rows
        keep their position in the page's listing as index.

    Raises
    ------
    RuntimeError
        If the page is not served with HTTP status 200.
    ValueError
        If the embedded JSON payload cannot be located or decoded.
    """
    url = f'https://coinmarketcap.com/historical/{date}/'
    headers = {'User-Agent': 'Mozilla/5.0'}  # Use a proper user-agent to avoid request issues
    # Explicit timeout: requests waits indefinitely by default.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        # Fail fast instead of trying to parse an error page.
        raise RuntimeError(f"Failed to retrieve data for {date} (HTTP {response.status_code})")

    page_text = str(BeautifulSoup(response.content, 'html.parser'))
    listings = _extract_listings(page_text)

    result_columns = ['symbol', 'Market Cap (USD)', 'date']
    if not listings:
        return pd.DataFrame(columns=result_columns)

    df = pd.DataFrame(listings)
    df['Market Cap (USD)'] = pd.to_numeric(df['quote'].map(_usd_market_cap), errors='coerce')

    wanted = df.loc[df['symbol'].isin(tickers), ['symbol', 'Market Cap (USD)']]
    # assign() builds a new frame, so no column is written onto a slice of `df`.
    return wanted.assign(date=date)


def _extract_listings(page_text):
    """Return the list of listing dicts from the escaped JSON embedded in the page."""
    # The payload sits inside a quoted script string, so its quotes appear
    # backslash-escaped in the page text (adjust the markers if the page changes).
    start_marker = '{\\"data\\":['
    end_marker = '}}}]'

    start_index = page_text.find(start_marker)
    if start_index == -1:
        raise ValueError('Could not find the start of the listings payload in the page')
    end_index = page_text.find(end_marker, start_index)
    if end_index == -1:
        raise ValueError('Could not find the end of the listings payload in the page')

    # Cut the object off right after the "data" array and close it by hand.
    escaped = page_text[start_index:end_index + len(end_marker)] + '}'
    # Undo the string escaping by parsing the slice as the body of a JSON
    # string literal. The 'unicode_escape' codec would decode the UTF-8 bytes
    # as Latin-1 and mangle every non-ASCII character.
    unescaped = json.loads(f'"{escaped}"')
    return json.loads(unescaped)['data']


def _usd_market_cap(quote):
    """Return quote['USD']['marketCap'], or None when that path is missing."""
    try:
        return quote['USD']['marketCap']
    except (KeyError, TypeError):
        return None
    

In [136]:
# Sample snapshot dates; the scrape_data call below does not read this list.
dates = [
    '20191103',
    '20191110',
]
# Symbols to keep from every snapshot.
tickers = [
    'BTC',
    'ETH',
    'XRP',
    'BAT',
    'DOGE',
]

# One row per Sunday in the range, one column per ticker.
all_data = scrape_data(start_date='20191103', end_date='2019-12-24', tickers=tickers)
all_data

100%|██████████| 8/8 [00:01<00:00,  7.99it/s]


symbol,BAT,BTC,DOGE,ETH,XRP
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-11-03,314035300.0,166495300000.0,321384900.0,19777350000.0,12596460000.0
2019-11-10,334550100.0,163364400000.0,338534500.0,20559120000.0,12132390000.0
2019-11-17,376897100.0,154861800000.0,323291500.0,20102650000.0,11485050000.0
2019-11-24,282446600.0,127327400000.0,289036400.0,15523460000.0,9663858000.0
2019-12-01,267550900.0,134215100000.0,280260600.0,16443650000.0,9756884000.0
2019-12-08,258701600.0,136847400000.0,273855000.0,16464730000.0,9974529000.0
2019-12-15,261469300.0,129481400000.0,263792800.0,15589120000.0,9467680000.0
2019-12-22,245105200.0,136088000000.0,259272000.0,14428650000.0,8532973000.0
