### Libraries

In [3]:
# importing libraries

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import random
import time
from bs4 import BeautifulSoup
import bs4
from selenium.common.exceptions import TimeoutException
import pandas as pd
import os

### Setting URL and Webdriver

In [4]:
# Set the URL and create the webdriver

url = 'https://opensea.io/collection/boredapeyachtclub/activity?search[eventTypes][0]=OFFER_ENTERED&search[eventTypes][1]=AUCTION_CREATED'
driver = webdriver.Chrome()
driver.get(url)

### Loop 1: to load elements and apend it to html_list

In [5]:
# create empty hrml list

html_list = []

# initialize the number of elements loaded to be zero

num_elements_loaded = 0

# intialising the loop

while True:
    timeout = random.randint(2, 10) # seting a random value for the timeout for every iteration

    # The num_elements variable is used to count the number of elements that have been loaded.

    num_elements = len(driver.find_elements(by=By.CSS_SELECTOR, value=".sc-fe5f9c83-0.iFPiFs"))
    num_elements_loaded += num_elements # add the len of num_elemment to number of elements loaded in every iteraion
    print(f'number of elements found: {num_elements_loaded}')

    # If 500 elements are loaded, break the loop

    if num_elements_loaded >= 2000:
        print(f'Elements loaded {num_elements_loaded}, breaking the loop.')
        break

    # # Wait for a random amount of time before reloading more elements.

    time.sleep(timeout) 
    
    # Scroll down to load more elements

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for more elements to load

    try:
        WebDriverWait(driver, 1000).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".sc-fe5f9c83-0.iFPiFs")))
    except TimeoutException:
        # If timeout occurs, increase the timeout and continue scrolling

        continue

    # Store the HTML of the current scroll in a list

    html_list.append(driver.page_source)

# quit the driver

driver.quit()

number of elements found: 9
number of elements found: 9
number of elements found: 20
number of elements found: 31
number of elements found: 42
number of elements found: 53
number of elements found: 64
number of elements found: 75
number of elements found: 86
number of elements found: 97
number of elements found: 108
number of elements found: 119
number of elements found: 130
number of elements found: 141
number of elements found: 141
number of elements found: 152
number of elements found: 163
number of elements found: 174
number of elements found: 185
number of elements found: 196
number of elements found: 207
number of elements found: 218
number of elements found: 229
number of elements found: 240
number of elements found: 251
number of elements found: 262
number of elements found: 273
number of elements found: 284
number of elements found: 295
number of elements found: 306
number of elements found: 317
number of elements found: 328
number of elements found: 339
number of elements fou

In [None]:
html_list

### Loop 2: to parse html string and write html_list to html file

In [6]:
for i, html in enumerate(html_list):
    # Parse the HTML string using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    # Write the prettified HTML to a file with a unique filename
    with open(f'output{i}.html', 'w', encoding='utf-8') as file:
        file.write(soup.prettify())

### Extraction functions

In [7]:
# Define the extraction functions

def text_extractor(node):
    return node.text
def string_extractor(node):
    if type(node.contents[0]) is bs4.element.NavigableString:
        return node.contents[0]
    else:
        return None

### Classes dictionary: contains the child elements info.

In [8]:
# create classes dictionary

classes = {
    'list_offer': {
        'element': 'h6',
        'cls': 'sc-29427738-0 sc-bdnxRM figDpC iPAlIP',
        'extr': text_extractor
    },
    'id': {
        'element': 'div',
        'cls': 'sc-fe5f9c83-0 iFPiFs',
        'extr': text_extractor
    },
    'collection': {
        'element': 'span',
        'cls': 'sc-29427738-0 dVNeWL',
        'extr': string_extractor
    },
    'crypto_price': {
        'element': 'div',
        'cls': 'sc-fe5f9c83-0 mGAUR Price--amount',
        'extr': text_extractor
    },
    'usd_price': {
        'element': 'div',
        'cls': 'sc-fe5f9c83-0 mGAUR Price--fiat-amount',
        'extr': text_extractor
    },
    'rarity': {
        'element': 'p',
        'cls': 'sc-29427738-0 sc-bdnxRM sc-d400cbf1-3 debcQ bCCtkE',
        'extr': string_extractor
    },
    'qty': {
        'element': 'div',
        'cls': 'sc-fe5f9c83-0 mGAUR',
        'extr': string_extractor
    },
    'from': {
        'element': 'a',
        'cls': 'sc-1f719d57-0 hoTuIF sc-29427738-0 ikrGyo AccountLink--ellipsis-overflow',
        'extr': text_extractor
    },
    'to': {
        'element': 'p',
        'cls': 'sc-29427738-0 sc-bdnxRM sc-d400cbf1-3 kOfPAj lmxNOf',
        'extr': text_extractor
    },
    'time': {
        'element': 'div',
        'cls': 'sc-29427738-0 sc-c3ae1b73-0 ikrGyo hDtZvT',
        'extr': text_extractor
    }
}

### Loop 3: to read html file and create a dataframe of all required elements.

In [12]:
# Create an empty list to store the extracted data

data = []

# Loop through the html file and extract data from each file

for i in range(185):
    filename = f'output{i}.html'
    with open(filename, 'r', encoding='utf-8') as file:
        html = file.read()
    soup = BeautifulSoup(html, 'html.parser')
    result = {}

    # Loop through the classes dictionary and extract data for each class
    
    for key in classes:
        tmp_data = []
        collection = soup.find_all(classes[key]['element'], class_=classes[key]['cls'])

        # Loop through the matched elements and extract the data using the extraction function defined in the classes dictionary

        for c in collection:
            value = classes[key]['extr'](c)

            # If the extracted value is not None, append it to the temporary list

            if value is not None:
                tmp_data.append(value)

        # If the result dictionary is empty or the length of the temporary list is equal to the length of the list for the first key in the result dictionary, add the temporary list to the result dictionary for the current key

        if not result or len(tmp_data) == len(list(result.values())[0]):
            result[key] = tmp_data
    data.append(result) # Append the result dictionary to the data list

# Concatenate the data from all files into a single DataFrame
df = pd.concat([pd.DataFrame(d) for d in data], ignore_index=True)

In [13]:
df

Unnamed: 0,list_offer,id,crypto_price,usd_price,from,to,time,collection,rarity,qty
0,\n Offer\n ...,\n 85\n ...,\n 61 WETH\n ...,"\n $106,992.17\n ...",\n\n NFTinitcom_Digital-F...,\n ---\n,\n\n 17m ago\n ...,,,
1,\n List\n,\n 2819\n ...,\n 61.6969 ETH\n ...,"\n $108,214.51\n ...",\n\n franklinisbored\n ...,\n ---\n,\n\n 41m ago\n ...,,,
2,\n List\n,\n 3055\n ...,\n 61.4999 ETH\n ...,"\n $107,868.98\n ...",\n\n franklinisbored\n ...,\n ---\n,\n\n 41m ago\n ...,,,
3,\n List\n,\n 7992\n ...,\n 62.6262 ETH\n ...,"\n $109,844.48\n ...",\n\n franklinisbored\n ...,\n ---\n,\n\n 41m ago\n ...,,,
4,\n List\n,\n 6774\n ...,\n 61.4999 ETH\n ...,"\n $107,868.98\n ...",\n\n franklinisbored\n ...,\n ---\n,\n\n 41m ago\n ...,,,
...,...,...,...,...,...,...,...,...,...,...
1986,\n Offer\n ...,\n 1396\n ...,\n 0.004 WETH\n ...,\n $7.02\n ...,\n\n NFTinitcom_KRAKEN1\n...,\n ---\n,\n\n 4d ago\n ...,,,
1987,\n Offer\n ...,\n 2728\n ...,\n 0.004 WETH\n ...,\n $7.02\n ...,\n\n NFTinitcom_KRAKEN1\n...,\n ---\n,\n\n 4d ago\n ...,,,
1988,\n Offer\n ...,\n 64\n ...,\n 0.004 WETH\n ...,\n $7.02\n ...,\n\n NFTinitcom_KRAKEN1\n...,\n ---\n,\n\n 4d ago\n ...,,,
1989,\n Offer\n ...,\n 1840\n ...,\n 0.004 WETH\n ...,\n $7.02\n ...,\n\n NFTinitcom_KRAKEN1\n...,\n ---\n,\n\n 4d ago\n ...,,,


In [14]:
# replace \n from the dataframe

df = df.replace('\n', '', regex=True)
df.reset_index()

Unnamed: 0,index,list_offer,id,crypto_price,usd_price,from,to,time,collection,rarity,qty
0,0,Offer,85 ...,61 WETH ...,"$106,992.17 ...",NFTinitcom_Digital-Fa......,---,17m ago,,,
1,1,List,2819 ...,61.6969 ETH ...,"$108,214.51 ...",franklinisbored ...,---,41m ago,,,
2,2,List,3055 ...,61.4999 ETH ...,"$107,868.98 ...",franklinisbored ...,---,41m ago,,,
3,3,List,7992 ...,62.6262 ETH ...,"$109,844.48 ...",franklinisbored ...,---,41m ago,,,
4,4,List,6774 ...,61.4999 ETH ...,"$107,868.98 ...",franklinisbored ...,---,41m ago,,,
...,...,...,...,...,...,...,...,...,...,...,...
1986,1986,Offer,1396 ...,0.004 WETH ...,$7.02 ...,NFTinitcom_KRAKEN1 ...,---,4d ago,,,
1987,1987,Offer,2728 ...,0.004 WETH ...,$7.02 ...,NFTinitcom_KRAKEN1 ...,---,4d ago,,,
1988,1988,Offer,64 ...,0.004 WETH ...,$7.02 ...,NFTinitcom_KRAKEN1 ...,---,4d ago,,,
1989,1989,Offer,1840 ...,0.004 WETH ...,$7.02 ...,NFTinitcom_KRAKEN1 ...,---,4d ago,,,


In [None]:
# Write the DataFrame to a CSV file
df.to_csv('bored_ape_yacht_club.csv', index=False)