In [30]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.chrome.service import Service

In [15]:
base_url = 'https://www.metal-archives.com'
letters_url = 'https://www.metal-archives.com/lists'
headers = {'User-Agent': 'Mozilla/5.0'}

In [36]:
def get_band_info(band_url):
    response = requests.get(band_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    band_data = {}

    country_tag = soup.find('dt', text='Country of origin:')
    if country_tag:
        band_data['Country of origin'] = country_tag.find_next('dd').text.strip()

    location_tag = soup.find('dt', text='Location:')
    if location_tag:
        band_data['Location'] = location_tag.find_next('dd').text.strip()

    status_tag = soup.find('dt', text='Status:')
    if status_tag:
        band_data['Status'] = status_tag.find_next('dd').text.strip()

    formed_tag = soup.find('dt', text='Formed in:')
    if formed_tag:
        band_data['Formed in'] = formed_tag.find_next('dd').text.strip()

    genre_tag = soup.find('dt', text='Genre:')
    if genre_tag:
        band_data['Genre'] = genre_tag.find_next('dd').text.strip()

    themes_tag = soup.find('dt', text='Themes:')
    if themes_tag:
        band_data['Themes'] = themes_tag.find_next('dd').text.strip()

    last_label_tag = soup.find('dt', text='Last label:')
    if last_label_tag:
        band_data['Last label'] = last_label_tag.find_next('dd').text.strip()

    years_active_tag = soup.find('dt', text='Years active:')
    if years_active_tag:
        band_data['Years active'] = years_active_tag.find_next('dd').text.strip()

    return band_data

def get_discography_url(band_url):
    band_id = band_url.split('/')[-1]
    return f'{base_url}/band/discography/id/{band_id}/tab/all'

def get_discography(discography_url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(discography_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    discography = []

    discography_table = soup.find('table', class_='display discog')
    if discography_table:
        rows = discography_table.find_all('tr')[1:]  # Skip header row
        for row in rows:
            columns = row.find_all('td')
            name = columns[0].text.strip()
            disc_type = columns[1].text.strip()
            year = columns[2].text.strip()
            
            # Extract the value of reviews for each album
            reviews_column = columns[3].find('span', class_='num_votes')
            reviews_value = reviews_column.text.strip() if reviews_column else 'N/A'
            
            discography.append({'Name': name, 'Type': disc_type, 'Year': year, 'Reviews': reviews_value})

    return discography

def scrape_band_data_for_letter(letter):
    letter_url = f'{letters_url}{letter}'
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(letter_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    band_page_links = []

    band_table = soup.find('table', class_='bands')
    if band_table:
        band_links = band_table.find_all('a', class_='link')
        band_page_links = [f'{base_url}{link["href"]}' for link in band_links]

    return band_page_links

def scrape_all_bands_data():
    all_bands_data = {}
    for letter in range(ord('A'), ord('Z') + 1):
        letter = chr(letter)
        all_bands_data[letter] = scrape_band_data_for_letter(letter)
    return all_bands_data

def get_band_page_links(letter):
    base_url = f'https://www.metal-archives.com/lists/{letter}'
    driver_path = "chromedriver-win64/chromedriver.exe"
    
    service = Service(executable_path=driver_path)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # To run the browser in headless mode

    driver = webdriver.Chrome(service=service, options=options)
    driver.get(base_url)

    band_page_links = []

    while True:
        wait = WebDriverWait(driver, 10)
        try:
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'display')))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            band_table = soup.find('table', class_='display')
            band_links = band_table.find_all('a', class_='link')
            band_page_links.extend([link['href'] for link in band_links])

            next_button = driver.find_element_by_class_name('paginate_button.next')
            if 'disabled' in next_button.get_attribute('class'):
                break

            next_button.click()
        except Exception as e:
            print(f"Error occurred: {e}")
            break

In [37]:
get_band_page_links('A')

Error occurred: 'WebDriver' object has no attribute 'find_element_by_class_name'


In [13]:
if __name__ == '__main__':
    letter = 'A'
    bands_data_for_letter = scrape_band_data_for_letter(letter)

    print(f'Letter: {letter}')
    for band in bands_data_for_letter:
        print(f'Name: {band["Name"]}')
        print('Band Info:')
        for key, value in band["Band Info"].items():
            print(f'{key}: {value}')
        print('Discography:')
        for album in band["Discography"]:
            print(f'Album Name: {album["Name"]}, Type: {album["Type"]}, Year: {album["Year"]}')
        print('-' * 50)


Letter: A


In [None]:
# if __name__ == '__main__':
#     all_bands_data = scrape_all_bands_data()
#     for letter, bands in all_bands_data.items():
#         print(f'Letter: {letter}')
#         for band in bands:
#             print(f'Name: {band["Name"]}')
#             print('Band Info:')
#             for key, value in band["Band Info"].items():
#                 print(f'{key}: {value}')
#             print('Discography:')
#             for album in band["Discography"]:
#                 print(f'Album Name: {album["Name"]}, Type: {album["Type"]}, Year: {album["Year"]}')
#             print('-' * 50)