In [4]:
# Was repräsentiert das heutige Deutschland?

import requests 
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options

In [53]:
all_artists = []
all_songs = []
all_pairs = []  # Store artist-song pairs to ensure consistency
max_back_clicks = 6
click_count = 0

try:
    service = Service(ChromeDriverManager().install())
    print("Setting up Chrome options")
    
    driver = webdriver.Chrome(service=service)
    print("Initializing webdriver")
    print("Navigating to URL...")
    url = "https://www.offiziellecharts.de/charts/single-jahr"
    driver.get(url)
    print("Page loaded successfully")
    
    while click_count < max_back_clicks:
        print(f"Processing page {click_count + 1} of {max_back_clicks}")
        
        try:
            wait = WebDriverWait(driver, 60)
            items = wait.until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "wrap"))
            )
            print(f"Found {len(items)} chart items on this page")
            
            for item in items:
                # Get artist and song from the same chart item to ensure they match
                try:
                    artist_elem = item.find_element(By.CLASS_NAME, 'info-artist')
                    song_elem = item.find_element(By.CLASS_NAME, 'info-title')
                    
                    artist_text = artist_elem.text.strip()
                    song_text = song_elem.text.strip()
                    
                    if artist_text and song_text:
                        all_artists.append(artist_text)
                        all_songs.append(song_text)
                        all_pairs.append((artist_text, song_text))
                    else:
                        print(f"Found incomplete entry: Artist: '{artist_text}', Song: '{song_text}'")
                except Exception as e:
                    print(f"Error processing an item: {e}")
            
            # Check if we need to click "Back" button
            if click_count < max_back_clicks - 1:  # Don't try on last iteration
                try:
                    back_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'period-navi')))
                    print(f"Clicking 'Back' button to navigate to the previous page...")
                    back_button.click()
                    time.sleep(2)  # Give page time to load
                    click_count += 1
                except TimeoutException:
                    print("No 'Back' button found or not clickable. Stopping pagination.")
                    break
            else:
                click_count += 1
        
        except TimeoutException:
            print("Timeout: Could not find chart items within 60 seconds")
            break
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    driver.quit()
    print("Webdriver closed")

# After collecting all data
print(f"Collected {len(all_pairs)} complete artist-song pairs")
print(f"Total artists: {len(all_artists)}, Total songs: {len(all_songs)}")

# Save the data to CSV
import csv
with open('charts_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Position', 'Song', 'Artist'])
    for i, (artist, song) in enumerate(all_pairs):
        writer.writerow([i+1, song, artist])
print("Data saved to charts_data.csv")

Setting up Chrome options
Initializing webdriver
Navigating to URL...
Page loaded successfully
Processing page 1 of 6
Found 100 chart items on this page
Clicking 'Back' button to navigate to the previous page...
Processing page 2 of 6
Found 100 chart items on this page
Clicking 'Back' button to navigate to the previous page...
Processing page 3 of 6
Found 100 chart items on this page
Clicking 'Back' button to navigate to the previous page...
Processing page 4 of 6
Found 100 chart items on this page
Clicking 'Back' button to navigate to the previous page...
Processing page 5 of 6
Found 100 chart items on this page
Clicking 'Back' button to navigate to the previous page...
Processing page 6 of 6
Found 100 chart items on this page
Webdriver closed
Collected 600 complete artist-song pairs
Total artists: 600, Total songs: 600
Data saved to charts_data.csv


In [5]:
df_music = pd.read_csv("charts_data.csv")

In [6]:
df_music.head()

Unnamed: 0.1,Unnamed: 0,Position,Song,Artist,Feature
0,0,1,I Like The Way You Kiss Me,Artemas,Nein
1,1,2,Wunder,Ayliva x Apache 207,Ja
2,2,3,Beautiful Things,Benson Boone,Nein
3,3,4,Stumblin' In,Cyril,Nein
4,4,5,Vois sur ton chemin,Bennett,Nein


In [64]:
df_music["Artist"].dtypes

dtype('O')

In [62]:
df_music["Artist"] == df_music["Artist"].astype(str)

0      True
1      True
2      True
3      True
4      True
       ... 
595    True
596    True
597    True
598    True
599    True
Name: Artist, Length: 600, dtype: bool

In [65]:
# Jetzt will ich die Gemeinsamkeiten zwischen all diesen Liedern herausfinden
# Erst möchte ich ermitteln, wie viele der Songs mehr als einen Artist haben (also ein Feature)

df_music["Feature"] = df_music["Artist"].apply(
    lambda artist: "Ja" if any(keyword in artist for keyword in ["&", "feat.", "x"]) else "Nein"
)


In [66]:
df_music.head()

Unnamed: 0,Position,Song,Artist,Feature
0,1,I Like The Way You Kiss Me,Artemas,Nein
1,2,Wunder,Ayliva x Apache 207,Ja
2,3,Beautiful Things,Benson Boone,Nein
3,4,Stumblin' In,Cyril,Nein
4,5,Vois sur ton chemin,Bennett,Nein


In [67]:
#Wie viele Features in 5 Jahren Top 100 Single-Charts?

df_music["Feature"].value_counts()

Feature
Nein    359
Ja      241
Name: count, dtype: int64

In [3]:
# Genre
# Sprache des Songs
# Durchschnittliche Songlänge
# Feature?
# Geschlecht der Artists?

#Now I want to loop through all the pages

base_url = "https://www.last.fm/tag/german/artists?page="

artist_fm = []

for page in range(1, 478):
    url = base_url + str(page)
    req = requests.get(url)
    print(f"Scraping url{url}")

    if req.status_code != 200:
        print(f"Fehler beim Abrufen der Seite {page}: Statuscode {req.status_code}")
        continue
    
    soup = BeautifulSoup(req.text, 'html.parser')
    artists = soup.find_all(class_ = "big-artist-list-title")
    print(f"Seite {page}: {len(artists)} Künstler gefunden")  # Debugging-Ausgabe
    
    for i in range(20):
        artist_fm.append(artists[i].text)
        # if page>1:
        #     print(f"{(i)+page*20}" + artists[i].text)
        # else:
        print(f"{i}" + artists[i].text)

Scraping urlhttps://www.last.fm/tag/german/artists?page=1
Seite 1: 21 Künstler gefunden
0PASHANIM
1Bonez MC
2Ski Aggu
3Nena
4Die drei !!!
5Apache 207
6Clueso
7Peter Fox
8Reezy
9Wir sind Helden
10Simone Sommerland
11OG Keemo
12T-Low
13Die Fantastischen Vier
14Nina Chuba
15AYLIVA
16Besomorph
17Dardan
18SDP
19Yasumu
Scraping urlhttps://www.last.fm/tag/german/artists?page=2
Seite 2: 21 Künstler gefunden
0Herbert Grönemeyer
1Nikolai Tal
2Beyazz
3Kalim
4Bosse
5Silbermond
6Miksu / Macloud
7Michael Schulte
8Element of Crime
9Jennifer Rostock
10Juli
11Blumentopf
12Edwin Rosen
13Sierra Kidd
14Mark Forster
15Olexesh
16Cheriimoya
17Revolverheld
18Lizot
192raumwohnung
Scraping urlhttps://www.last.fm/tag/german/artists?page=3
Seite 3: 21 Künstler gefunden
0HOODBLAQ
1Rosenstolz
2Schmyt
3Xavier Naidoo
4Annett Louisan
5Lugatti & 9ine
6Neptunica
7Die Prinzen
8SXID
9BERQ
10Ennio
11Die Schule der magischen Tiere
12Yakary
13Freundeskreis
14Max Herre
15Tim Bendzko
16Musso
17MIA.
18Christina Stürmer
19badmóm

IndexError: list index out of range

In [7]:
artist_fm

NameError: name 'artist_fm' is not defined

In [48]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException, TimeoutException

def setup_driver():
    chrome_options = Options()
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.maximize_window()
    return driver

def handle_privacy_popup(driver, wait):
    try:
        print("Checking for privacy notice...")
        
        # First check if there's an iframe
        iframes = driver.find_elements(By.TAG_NAME, 'iframe')
        if iframes:
            for iframe in iframes:
                try:
                    driver.switch_to.frame(iframe)
                    
                    # Try different button selectors
                    button_selectors = [
                        "button.sp_choice_type_11",
                        "//button[contains(text(), 'Zustimmen')]",
                        "//button[contains(text(), 'Accept all')]",
                        "//button[contains(@title, 'Accept all')]",
                        "#notice button.sp_choice_type_ACCEPT_ALL"
                    ]
                    
                    for selector in button_selectors:
                        try:
                            if selector.startswith("//"):
                                button = driver.find_element(By.XPATH, selector)
                            else:
                                button = driver.find_element(By.CSS_SELECTOR, selector)
                            button.click()
                            print("Successfully clicked privacy button")
                            driver.switch_to.default_content()
                            return True
                        except:
                            continue
                    
                    # If no button was found in this iframe, switch back and try the next one
                    driver.switch_to.default_content()
                except:
                    # If there was an error with this iframe, switch back and try the next one
                    driver.switch_to.default_content()
                    continue
        
        # If we couldn't find a button in any iframe, or if there are no iframes,
        # the popup might not be present or might be already handled
        print("No privacy popup found or it was already handled")
        return True
            
    except Exception as e:
        print(f"Error handling privacy popup: {str(e)}")
        driver.switch_to.default_content()
        return False

def extract_genres(driver, wait, song):
    """Extract genres using the provided XPath and iterate through all genre tags"""
    try:
        # Wait for the spotify-tags section to load
        spotify_tags_container = wait.until(
            EC.presence_of_element_located((By.ID, "spotify-tags"))
        )
        
        # First, check if we can find any genre tags using your specific XPath
        base_xpath = "//*[@id='spotify-tags']/div/div[2]/a"
        genres = []
        
        # Try to find the first genre tag
        try:
            # Find all genre tags dynamically
            genre_elements = driver.find_elements(By.XPATH, base_xpath)
            
            if genre_elements:
                for element in genre_elements:
                    genre_text = element.text.strip()
                    if genre_text:
                        genres.append(genre_text)
                
                print(f"Found {len(genres)} genres for '{song}': {genres}")
                return genres
            else:
                print(f"No genre elements found for '{song}' using XPath")
        except NoSuchElementException:
            print(f"No genre elements found for '{song}' using XPath")
            
        # If the specific XPath failed, try alternative methods to find genres
        alternative_selectors = [
            ".tag.is-info",
            "//div[@id='spotify-tags']//a",
            "//div[contains(@class, 'tags')]//a",
            "//a[contains(@href, '/genre/')]"
        ]
        
        for selector in alternative_selectors:
            try:
                if selector.startswith("//"):
                    elements = driver.find_elements(By.XPATH, selector)
                else:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                
                if elements:
                    for element in elements:
                        genre_text = element.text.strip()
                        if genre_text:
                            genres.append(genre_text)
                    
                    if genres:
                        print(f"Found {len(genres)} genres using alternative selector: {selector}")
                        return genres
            except:
                continue
        
        # If we couldn't find any genres with any method
        print(f"No genres found for '{song}' after trying all methods")
        return []
        
    except Exception as e:
        print(f"Error extracting genres for '{song}': {str(e)}")
        return []

def scrape_genres(df_music, output_file='genres_results.csv', save_partial=True):
    driver = setup_driver()
    wait = WebDriverWait(driver, 15)
    short_wait = WebDriverWait(driver, 5)
    genres_list = []
    processed_songs = []
    
    try:
        # Navigate to the website
        url = "https://www.chosic.com/music-genre-finder"
        print("Navigating to URL...")
        driver.get(url)
        
        # Handle privacy popup
        handle_privacy_popup(driver, wait)
        
        # Wait for the page to be fully loaded
        time.sleep(2)
        
        # Process each song
        for idx, song in enumerate(df_music["Song"]):
            try:
                print(f"\nProcessing song {idx+1}/{len(df_music)}: '{song}'")
                processed_songs.append(song)
                
                # Try multiple selectors for the search input
                search_selectors = [
                    "input.search-input",
                    "#search",
                    "//input[@placeholder='Search for a song...']",
                    "//input[@type='text' and contains(@class, 'search')]",
                    "#search-wrap input",
                    ".search-container input"
                ]
                
                search_input = None
                for selector in search_selectors:
                    try:
                        if selector.startswith("//"):
                            search_input = short_wait.until(EC.presence_of_element_located((By.XPATH, selector)))
                        else:
                            search_input = short_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                        if search_input:
                            print(f"Found search input using selector: {selector}")
                            break
                    except:
                        continue
                
                if not search_input:
                    raise Exception("Could not find search input with any selector")
                
                # Scroll element into view
                driver.execute_script("arguments[0].scrollIntoView(true);", search_input)
                time.sleep(2)
                
                # Clear the field
                search_input.clear()
                driver.execute_script("arguments[0].value = '';", search_input)
                
                # Click and send keys directly
                search_input.click()
                search_input.send_keys(song)
                time.sleep(1)
                
                # Check for search results
                try:
                    result_selectors = [
                        '//*[@id="form-suggestions"]/span[1]',
                        '#form-suggestions > span:nth-child(1)'
                    ]
                    
                    result_element = None
                    for selector in result_selectors:
                        try:
                            if selector.startswith("//"):
                                result_element = short_wait.until(EC.element_to_be_clickable((By.XPATH, selector)))
                            else:
                                result_element = short_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
                            if result_element:
                                print(f"Found search result using selector: {selector}")
                                break
                        except:
                            continue
                    
                    if result_element:
                        # Click the first search result
                        print("Clicking on first search result")
                        result_element.click()
                    else:
                        # If no results are found, just press Enter
                        print("No search results found, pressing Enter")
                        search_input.send_keys(Keys.ENTER)
                        
                except Exception as e:
                    print(f"Could not find search results, using Enter key instead: {str(e)}")
                    search_input.send_keys(Keys.ENTER)
                
                # Wait for results page to load
                time.sleep(3)
                
                # Extract genres using our dedicated function
                song_genres = extract_genres(driver, wait, song)
                genres_list.append(song_genres)
                
                # Save partial results after each song to avoid losing data
                if save_partial:
                    partial_results_df = pd.DataFrame({
                        'Song': processed_songs,
                        'Genres': genres_list
                    })
                    partial_results_df.to_csv(f'partial_{output_file}', index=False)
                
                # Navigate back to the search page
                driver.back()
                time.sleep(2)
                
            except Exception as e:
                print(f"Error processing song '{song}': {str(e)}")
                genres_list.append([])
                # Return to the main page if there's an error
                try:
                    driver.get(url)
                    time.sleep(2)
                except:
                    pass
                
                # Save partial results after each error to ensure we don't lose data
                if save_partial:
                    partial_results_df = pd.DataFrame({
                        'Song': processed_songs,
                        'Genres': genres_list
                    })
                    partial_results_df.to_csv(f'partial_{output_file}', index=False)
                
    except Exception as e:
        print(f"Fatal error: {str(e)}")
    finally:
        driver.quit()
        
    # Create a DataFrame with results
    results_df = pd.DataFrame({
        'Song': processed_songs,
        'Genres': genres_list
    })
    
    # Save final results
    results_df.to_csv(output_file, index=False)
        
    return results_df



results = scrape_genres(df_music, output_file='my_genres.csv')

TypeError: scrape_genres() missing 1 required positional argument: 'df_music'

In [35]:
len(df_music)

600

In [8]:
results_df = pd.read_csv('partial_my_genres.csv')

In [9]:
results_df

Unnamed: 0,Song,Genres
0,I Like The Way You Kiss Me,"['pop', 'hyperpop', 'sped up', 'dark r&b', 'si..."
1,Wunder,"['german hip hop', 'german pop']"
2,Beautiful Things,['singer-songwriter pop']
3,Stumblin' In,"['classic uk pop', 'glam rock']"
4,Vois sur ton chemin,['techno']
...,...,...
466,Colt,['comedy rap']
467,Death Bed,"['sad rap', 'sad lo-fi', 'bedroom pop']"
468,Memories,['pop']
469,2012,['r&b']


In [10]:
def setup_driver():
    chrome_options = Options()
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.maximize_window()
    return driver

def handle_privacy_popup(driver, wait):
    try:
        print("Checking for privacy notice...")
        
        # First check if there's an iframe
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        if iframes:
            for iframe in iframes:
                try:
                    driver.switch_to.frame(iframe)
                    
                    # Try different button selectors
                    button_selectors = [
                        '//*[@id="notice"]/div[3]/div[2]/button',
                        '//button[contains(text(), "Zustimmen")]',
                        '//button[contains(text(), "Accept all")]',
                        '//button[contains(@title, "Accept all")]',
                        '#notice button.sp_choice_type_ACCEPT_ALL'
                    ]
                    
                    for selector in button_selectors:
                        try:
                            if selector.startswith("//"):
                                button = driver.find_elements(By.XPATH, selector)
                            else:
                                button = driver.find_elements(By.CSS_SELECTOR, selector)
                            button.click()
                            print("Successfully clicked privacy button")
                            driver.switch_to.default_content()
                            return True
                        except:
                            continue
                    
                    # If no button was found in this iframe, switch back and try the next one
                    driver.switch_to.default_content()
                except:
                    # If there was an error with this iframe, switch back and try the next one
                    driver.switch_to.default_content()
                    continue
        
        # If we couldn't find a button in any iframe, or if there are no iframes,
        # the popup might not be present or might be already handled
        print("No privacy popup found or it was already handled")
        return True
            
    except Exception as e:
        print(f"Error handling privacy popup: {str(e)}")
        driver.switch_to.default_content()
        return False

def extract_genres(driver, wait, song):
    """Extract genres using the provided XPath and iterate through all genre tags"""
    try:
        # Wait for the spotify-tags section to load
        spotify_tags_container = wait.until(
            EC.presence_of_element_located((By.ID, "spotify-tags"))
        )
        
        base_xpath = "//*[@id='spotify-tags']/div/div[2]/a"
        genres_2 = []
        
        # Try to find the first genre tag
        try:
            # Find all genre tags dynamically
            genre_elements_2 = driver.find_elements(By.XPATH, base_xpath)
            
            if genre_elements_2:
                for element in genre_elements_2:
                    genre_text = element.text.strip()
                    if genre_text:
                        genres_2.append(genre_text)
                
                print(f"Found {len(genres_2)} genres for '{song}': {genres_2}")
                return genres_2
            else:
                print(f"No genre elements found for '{song}' using XPath")
        except NoSuchElementException:
            print(f"No genre elements found for '{song}' using XPath")
            
        # If the specific XPath failed, try alternative methods to find genres
        alternative_selectors = [
            ".tag.is-info",
            "//div[@id='spotify-tags']//a",
            "//div[contains(@class, 'tags')]//a",
            "//a[contains(@href, '/genre/')]"
        ]
        
        for selector in alternative_selectors:
            try:
                if selector.startswith("//"):
                    elements = driver.find_elements(By.XPATH, selector)
                else:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                
                if elements:
                    for element in elements:
                        genre_text = element.text.strip()
                        if genre_text:
                            genres_2.append(genre_text)
                    
                    if genres_2:
                        print(f"Found {len(genres_2)} genres using alternative selector: {selector}")
                        return genres
            except:
                continue
        
        # If we couldn't find any genres with any method
        print(f"No genres found for '{song}' after trying all methods")
        return []
        
    except Exception as e:
        print(f"Error extracting genres for '{song}': {str(e)}")
        return []

def scrape_genres(df_music, output_file='genres_results_2.csv', save_partial=True):
    driver = setup_driver()
    wait = WebDriverWait(driver, 15)
    short_wait = WebDriverWait(driver, 5)
    genres_list_2 = []
    processed_songs_2 = []
    
    try:
        # Navigate to the website
        url = "https://www.chosic.com/music-genre-finder"
        print("Navigating to URL...")
        driver.get(url)
        
        # Handle privacy popup
        handle_privacy_popup(driver, wait)
        
        # Wait for the page to be fully loaded
        time.sleep(2)
        
        # Process each song
        for idx, song in enumerate(df_music["Song"][472:]):
            try:
                print(f"\nProcessing song {idx+1}/{len(df_music)}: '{song}'")
                processed_songs_2.append(song)
                
                # Try multiple selectors for the search input
                search_selectors = [
                    "input.search-input",
                    "#search",
                    "//input[@placeholder='Search for a song...']",
                    "//input[@type='text' and contains(@class, 'search')]",
                    "#search-wrap input",
                    ".search-container input"
                ]
                
                search_input = None
                for selector in search_selectors:
                    try:
                        if selector.startswith("//"):
                            search_input = short_wait.until(EC.presence_of_element_located((By.XPATH, selector)))
                        else:
                            search_input = short_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                        if search_input:
                            print(f"Found search input using selector: {selector}")
                            break
                    except:
                        continue
                
                if not search_input:
                    raise Exception("Could not find search input with any selector")
                
                # Scroll element into view
                driver.execute_script("arguments[0].scrollIntoView(true);", search_input)
                time.sleep(4)
                
                # Clear the field
                search_input.clear()
                driver.execute_script("arguments[0].value = '';", search_input)
                
                # Click and send keys directly
                search_input.click()
                search_input.send_keys(song)
                time.sleep(1)
                
                # Check for search results
                try:
                    result_selectors = [
                        '//*[@id="form-suggestions"]/span[1]',
                        '#form-suggestions > span:nth-child(1)'
                    ]
                    
                    result_element = None
                    for selector in result_selectors:
                        try:
                            if selector.startswith("//"):
                                result_element = short_wait.until(EC.element_to_be_clickable((By.XPATH, selector)))
                            else:
                                result_element = short_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
                            if result_element:
                                print(f"Found search result using selector: {selector}")
                                break
                        except:
                            continue
                    
                    if result_element:
                        # Click the first search result
                        print("Clicking on first search result")
                        result_element.click()
                    else:
                        # If no results are found, just press Enter
                        print("No search results found, pressing Enter")
                        search_input.send_keys(Keys.ENTER)
                        
                except Exception as e:
                    print(f"Could not find search results, using Enter key instead: {str(e)}")
                    search_input.send_keys(Keys.ENTER)
                
                # Wait for results page to load
                time.sleep(3)
                
                # Extract genres using our dedicated function
                song_genres = extract_genres(driver, wait, song)
                genres_list_2.append(song_genres)
                
                # Save partial results after each song to avoid losing data
                if save_partial:
                    partial_results_df_2 = pd.DataFrame({
                        'Song': processed_songs_2,
                        'Genres': genres_list_2
                    })
                    partial_results_df_2.to_csv(f'partial_2_{output_file}', index=False)
                
                # Navigate back to the search page
                driver.back()
                time.sleep(2)
                
            except Exception as e:
                print(f"Error processing song '{song}': {str(e)}")
                genres_list_2.append([])
                # Return to the main page if there's an error
                try:
                    driver.get(url)
                    time.sleep(2)
                except:
                    pass
                
                # Save partial results after each error to ensure we don't lose data
                if save_partial:
                    partial_results_df_2 = pd.DataFrame({
                        'Song': processed_songs_2,
                        'Genres': genres_list_2
                    })
                    partial_results_df_2.to_csv(f'partial_2_{output_file}', index=False)
                
    except Exception as e:
        print(f"Fatal error: {str(e)}")
    finally:
        driver.quit()
        
    # Create a DataFrame with results
    results_df_2 = pd.DataFrame({
        'Song': processed_songs_2,
        'Genres': genres_list_2
    })
    
    # Save final results
    results_df.to_csv(output_file, index=False)
        
    return results_df



results_2 = scrape_genres(df_music, output_file='my_genres_2.csv')

Navigating to URL...
Checking for privacy notice...
No privacy popup found or it was already handled

Processing song 1/600: 'Keine Liebe'
Found search input using selector: #search-wrap input
Found search result using selector: //*[@id="form-suggestions"]/span[1]
Clicking on first search result
Found 5 genres for 'Keine Liebe': ['medieval metal', 'industrial rock', 'industrial metal', 'industrial', 'ebm']

Processing song 2/600: 'Ayy Macarena'
Found search input using selector: #search-wrap input
Found search result using selector: //*[@id="form-suggestions"]/span[1]
Clicking on first search result
Found 5 genres for 'Ayy Macarena': ['trap', 'southern hip hop', 'rap', 'pop rap', 'hip hop']

Processing song 3/600: 'Angeklagt'
Found search input using selector: #search-wrap input
Found search result using selector: //*[@id="form-suggestions"]/span[1]
Clicking on first search result
Found 1 genres for 'Angeklagt': ['german hip hop']

Processing song 4/600: 'Ritmo (Bad Boys For Life)'
Fou

In [13]:
df_2 = pd.read_csv("partial_2_my_genres_2.csv")

In [14]:
df_2

Unnamed: 0,Song,Genres
0,Keine Liebe,"['medieval metal', 'industrial rock', 'industr..."
1,Ayy Macarena,"['trap', 'southern hip hop', 'rap', 'pop rap',..."
2,Angeklagt,['german hip hop']
3,Ritmo (Bad Boys For Life),"['pop rap', 'pop', 'dance pop', 'reggaeton', '..."
4,Zu Ende,"['german hip hop', 'german pop']"
...,...,...
123,So Am I,['pop']
124,Blei,"['rock', 'pop', 'modern rock']"
125,Nummer 1,['german hip hop']
126,Shot,"['neo-singer-songwriter', 'folk-pop']"


In [17]:
vertical_stack = pd.concat([results_df, df_2], ignore_index=True)

In [18]:
vertical_stack

Unnamed: 0,Song,Genres
0,I Like The Way You Kiss Me,"['pop', 'hyperpop', 'sped up', 'dark r&b', 'si..."
1,Wunder,"['german hip hop', 'german pop']"
2,Beautiful Things,['singer-songwriter pop']
3,Stumblin' In,"['classic uk pop', 'glam rock']"
4,Vois sur ton chemin,['techno']
...,...,...
594,So Am I,['pop']
595,Blei,"['rock', 'pop', 'modern rock']"
596,Nummer 1,['german hip hop']
597,Shot,"['neo-singer-songwriter', 'folk-pop']"


In [21]:
# Beim Scrapen von den Genres 2019 haben sich mehrere Fehler eingeschlichen (weil die Charts Seite meist neuere Songs
# ausgewählt hat), weshalb ich aus Praktikabilitätsgründen nur 2020-24 mache und es immer noch ein großer Datensatz ist
# Hier wird aber auch Stichprobenartig überprüft

genre_20_24 = vertical_stack[:499]
music_20_24 = df_music[:499]

In [59]:
df_20_24 = genre_20_24.merge(music_20_24, left_on=['Song', genre_20_24.groupby('Song').cumcount()],
               right_on=['Song', music_20_24.groupby('Song').cumcount()])

In [60]:
df_20_24 = df_20_24.drop(columns=["Unnamed: 0"])
df_20_24 = df_20_24.drop(columns=["Position"])

In [61]:
df_20_24

Unnamed: 0,Song,key_1,Genres,Artist,Feature
0,I Like The Way You Kiss Me,0,"['pop', 'hyperpop', 'sped up', 'dark r&b', 'si...",Artemas,Nein
1,Wunder,0,"['german hip hop', 'german pop']",Ayliva x Apache 207,Ja
2,Beautiful Things,0,['singer-songwriter pop'],Benson Boone,Nein
3,Stumblin' In,0,"['classic uk pop', 'glam rock']",Cyril,Nein
4,Vois sur ton chemin,0,['techno'],Bennett,Nein
...,...,...,...,...,...
493,Fuckst mich nur ab,0,['german hip hop'],Bonez MC,Nein
494,WAP,0,"['rap', 'pop', 'trap queen', 'houston rap']",Cardi B feat. Megan Thee Stallion,Ja
495,Circles,0,"['rap', 'pop', 'melodic rap', 'dfw rap']",Post Malone,Nein
496,Doch in der Nacht,0,['german hip hop'],Apache 207,Nein


In [62]:
df_20_24['Combined'] = df_20_24['Song'].astype(str) + " " + df_20_24['Artist']
df_20_24 = df_20_24.drop(columns=["key_1"])

In [63]:
df_20_24.head()

Unnamed: 0,Song,Genres,Artist,Feature,Combined
0,I Like The Way You Kiss Me,"['pop', 'hyperpop', 'sped up', 'dark r&b', 'si...",Artemas,Nein,I Like The Way You Kiss Me Artemas
1,Wunder,"['german hip hop', 'german pop']",Ayliva x Apache 207,Ja,Wunder Ayliva x Apache 207
2,Beautiful Things,['singer-songwriter pop'],Benson Boone,Nein,Beautiful Things Benson Boone
3,Stumblin' In,"['classic uk pop', 'glam rock']",Cyril,Nein,Stumblin' In Cyril
4,Vois sur ton chemin,['techno'],Bennett,Nein,Vois sur ton chemin Bennett


In [64]:
# Now I want to scrape the song duration
# Hätte ich auch in einem Scrape machen können, aber hab nicht gesehen, dass die Info
# auf derselben Seite vorhanden ist

def setup_driver():
    chrome_options = Options()
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.maximize_window()
    return driver

def handle_privacy_popup(driver, wait):
    try:
        print("Checking for privacy notice...")
        
        # First check if there's an iframe
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        if iframes:
            for iframe in iframes:
                try:
                    driver.switch_to.frame(iframe)
                    
                    # Try different button selectors
                    button_selectors = [
                        '//*[@id="notice"]/div[3]/div[2]/button',
                        '//button[contains(text(), "Zustimmen")]',
                        '//button[contains(text(), "Accept all")]',
                        '//button[contains(@title, "Accept all")]',
                        '#notice button.sp_choice_type_ACCEPT_ALL'
                    ]
                    
                    for selector in button_selectors:
                        try:
                            if selector.startswith("//"):
                                button = driver.find_elements(By.XPATH, selector)
                            else:
                                button = driver.find_elements(By.CSS_SELECTOR, selector)
                            button.click()
                            print("Successfully clicked privacy button")
                            driver.switch_to.default_content()
                            return True
                        except:
                            continue
                    
                    # If no button was found in this iframe, switch back and try the next one
                    driver.switch_to.default_content()
                except:
                    # If there was an error with this iframe, switch back and try the next one
                    driver.switch_to.default_content()
                    continue
        
        # If we couldn't find a button in any iframe, or if there are no iframes,
        # the popup might not be present or might be already handled
        print("No privacy popup found or it was already handled")
        return True
            
    except Exception as e:
        print(f"Error handling privacy popup: {str(e)}")
        driver.switch_to.default_content()
        return False

def extract_duration(driver, wait, song):
    """Extract genres using the provided XPath and iterate through all genre tags"""
    try:
        # Wait for the spotify-tags section to load
        duration_tag = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "tempo-duration-first"))
        )
        song_duration = []
        tempo = []
        
        try:
            # Find all genre tags dynamically
            dauer = driver.find_elements(By.XPATH, '//*[@id="result-analyzer"]/div[1]/div[2]/div[1]/div[1]/div[1]/span')
            bpm = driver.find_elements(By.XPATH, '//*[@id="result-analyzer"]/div[1]/div[2]/div[1]/div[1]/div[2]/span')
            
            if dauer and bpm:
                laenge = dauer[0].text.strip()
                song_duration.append(laenge)
                rhythm = bpm[0].text.strip()
                tempo.append(rhythm)
                print(f"Found the duration for '{song}': {song_duration}")
                print(f"Found the tempo for '{song}': {tempo}")
                return song_duration
                return tempo
            else:
                print(f"No duration or tempo elements found for '{song}' using XPath")
        except NoSuchElementException:
            print(f"No duration or tempo elements found for '{song}' using XPath")

    except Exception as e:
        print(f"Error extracting genres for '{song}': {str(e)}")
        return []

        return song_duration if song_duration else tempo if tempo else []
            

def scrape_genres(df_music, output_file='song_info.csv', save_partial=True):
    driver = setup_driver()
    wait = WebDriverWait(driver, 15)
    short_wait = WebDriverWait(driver, 5)
    song_infos = []
    processed_songs_3 = []
    
    try:
        # Navigate to the website
        url = "https://www.chosic.com/music-genre-finder"
        print("Navigating to URL...")
        driver.get(url)
        
        # Handle privacy popup
        handle_privacy_popup(driver, wait)
        
        # Wait for the page to be fully loaded
        time.sleep(2)
        
        # Process each song
        for idx, song in enumerate(df_20_24["Combined"]):
            try:
                print(f"\nProcessing song {idx+1}/{len(df_20_24)}: '{song}'")
                processed_songs_3.append(song)
                
                # Try multiple selectors for the search input
                search_selectors = [
                    "input.search-input",
                    "#search",
                    "//input[@placeholder='Search for a song...']",
                    "//input[@type='text' and contains(@class, 'search')]",
                    "#search-wrap input",
                    ".search-container input"
                ]
                
                search_input = None
                for selector in search_selectors:
                    try:
                        if selector.startswith("//"):
                            search_input = short_wait.until(EC.presence_of_element_located((By.XPATH, selector)))
                        else:
                            search_input = short_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                        if search_input:
                            print(f"Found search input using selector: {selector}")
                            break
                    except:
                        continue
                
                if not search_input:
                    raise Exception("Could not find search input with any selector")
                
                # Scroll element into view
                driver.execute_script("arguments[0].scrollIntoView(true);", search_input)
                time.sleep(4)
                
                # Clear the field
                search_input.clear()
                driver.execute_script("arguments[0].value = '';", search_input)
                
                # Click and send keys directly
                search_input.click()
                search_input.send_keys(song)
                time.sleep(1)
                
                # Check for search results
                try:
                    result_selectors = [
                        '//*[@id="form-suggestions"]/span[1]',
                        '#form-suggestions > span:nth-child(1)'
                    ]
                    
                    result_element = None
                    for selector in result_selectors:
                        try:
                            if selector.startswith("//"):
                                result_element = short_wait.until(EC.element_to_be_clickable((By.XPATH, selector)))
                            else:
                                result_element = short_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
                            if result_element:
                                print(f"Found search result using selector: {selector}")
                                break
                        except:
                            continue
                    
                    if result_element:
                        # Click the first search result
                        print("Clicking on first search result")
                        result_element.click()
                    else:
                        # If no results are found, just press Enter
                        print("No search results found, pressing Enter")
                        search_input.send_keys(Keys.ENTER)
                        
                except Exception as e:
                    print(f"Could not find search results, using Enter key instead: {str(e)}")
                    search_input.send_keys(Keys.ENTER)
                
                # Wait for results page to load
                time.sleep(3)
                
                # Extract genres using our dedicated function
                song_info = extract_duration(driver, wait, song)
                song_infos.append(song_info)
                
                # Save partial results after each song to avoid losing data
                if save_partial:
                    partial_results_df_3 = pd.DataFrame({
                        'Song': processed_songs_3,
                        'Song Speed': song_infos
                    })
                    partial_results_df_3.to_csv(f'partial_3_{output_file}', index=False)
                
                # Navigate back to the search page
                driver.back()
                time.sleep(2)
                
            except Exception as e:
                print(f"Error processing song '{song}': {str(e)}")
                song_infos.append([])
                # Return to the main page if there's an error
                try:
                    driver.get(url)
                    time.sleep(2)
                except:
                    pass
                
    except Exception as e:
        print(f"Fatal error: {str(e)}")
    finally:
        driver.quit()
        
    # Create a DataFrame with results
    results_df_3 = pd.DataFrame({
        'Song': processed_songs_3,
        'Song Infos': song_infos
    })
    
    # Save final results
    results_df_3.to_csv(output_file, index=False)
        
    return results_df_3

results_3 = scrape_genres(df_music, output_file='my_speed.csv')

Navigating to URL...
Checking for privacy notice...
No privacy popup found or it was already handled

Processing song 1/498: 'I Like The Way You Kiss Me Artemas'
Found search input using selector: #search-wrap input
Found search result using selector: //*[@id="form-suggestions"]/span[1]
Clicking on first search result
Found the duration for 'I Like The Way You Kiss Me Artemas': ['Length: 2:22']
Found the tempo for 'I Like The Way You Kiss Me Artemas': ['Tempo: 152 bpm']

Processing song 2/498: 'Wunder Ayliva x Apache 207'
Found search input using selector: #search-wrap input
Found search result using selector: //*[@id="form-suggestions"]/span[1]
Clicking on first search result
Found the duration for 'Wunder Ayliva x Apache 207': ['Length: 2:56']
Found the tempo for 'Wunder Ayliva x Apache 207': ['Tempo: 100 bpm']

Processing song 3/498: 'Beautiful Things Benson Boone'
Found search input using selector: #search-wrap input
Found search result using selector: //*[@id="form-suggestions"]/s

In [68]:
pd.read_csv("partial_3_my_speed.csv")

Unnamed: 0,Song,Song Speed
0,I Like The Way You Kiss Me Artemas,['Length: 2:22']
1,Wunder Ayliva x Apache 207,['Length: 2:56']
2,Beautiful Things Benson Boone,['Length: 3:00']
3,Stumblin' In Cyril,['Length: 3:33']
4,Vois sur ton chemin Bennett,['Length: 2:58']
...,...,...
493,Fuckst mich nur ab Bonez MC,['Length: 3:02']
494,WAP Cardi B feat. Megan Thee Stallion,['Length: 3:07']
495,Circles Post Malone,['Length: 3:35']
496,Doch in der Nacht Apache 207,['Length: 3:11']
