# MixesDb Web Scraping

In [44]:

import requests
import bs4
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

import pandas as pd
import numpy as np

from tqdm import tqdm_notebook
import tqdm

import time
import pickle
import re

### The Idea is to get a broader genre spectrum by scraping specifically by genre

### Second Idea: connect with favorite soundcloud mixes if possible

In [45]:
url = "https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=&year=&tlC=1&tlI=1&hasPl=1&so=sc&tmatch1=&tmatch2=Live+PA&jnTm=not&usesFile=&minHotnessLevel=&count=25&order=hotness&sort=asc"


    Generate mixdB queries for genre searches

In [46]:
style_parameters = {
    "H": "House",
    "DH": "Deep House",
    "HOS": "Old School House",
    "MH": "Minimal House",
    "PM": "Pure Minimal",
    "MT": "Minimal Tech House",
    "TH": "Tech House / Electro",
    "TA": "Techno / Acid",
    "TC": "Hard Techno / Hardcore",
    "PH": "Progressive House",
    "PT": "Progressive / Trance",
    "HH": "Hip Hop / R&B",
    "ST": "Dubstep / Breakbeat",
    "DB": "Drum & Bass / Jungle",
    "CA": "Chill Out / Ambient",
    "DI": "Disco / Pop",
    "VA": "Various",
    "NT": "No Progressive / Trance"
}


def generate_genre_queries(style_parameters):
    queries = dict()
    base_url = "https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style={}&year=&tlC=1&tlI=1&hasPl=1&so=sc&tmatch1=&tmatch2=Live+PA&jnTm=not&usesFile=&minHotnessLevel=&count=25&hidePL=1&order=hotness&sort=desc"

    for style in style_parameters:
        genre = style_parameters[style]
        query = base_url.format(style)
        queries[genre] = query

    return queries


In [47]:
genre_searches = generate_genre_queries(style_parameters)

In [48]:
genre_searches

{'House': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=H&year=&tlC=1&tlI=1&hasPl=1&so=sc&tmatch1=&tmatch2=Live+PA&jnTm=not&usesFile=&minHotnessLevel=&count=25&hidePL=1&order=hotness&sort=desc',
 'Deep House': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=DH&year=&tlC=1&tlI=1&hasPl=1&so=sc&tmatch1=&tmatch2=Live+PA&jnTm=not&usesFile=&minHotnessLevel=&count=25&hidePL=1&order=hotness&sort=desc',
 'Old School House': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=HOS&year=&tlC=1&tlI=1&hasPl=1&so=sc&tmatch1=&tmatch2=Live+PA&jnTm=not&usesFile=&minHotnessLevel=&count=25&hidePL=1&order=hotness&sort=desc',
 'Minimal House': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=MH&year=&tlC=1&tlI=1&hasPl=1&so=sc&tmatch1=&tmatch2=Live+PA&jnTm=not&usesFile=&minHotnessLevel=&count=25&hidePL=1&order=hotness&sort=desc',
 'Pure Minimal': 'https://www.mixesd

    List of Users I am following copied my own soundcloud:

In [49]:
with open('soundcloud.txt') as html:
    soup = BeautifulSoup(html, 'html.parser')
    user_names = [name.text.strip() for name in soup.find_all(class_='userBadgeListItem__heading')]


user_names = list(set(user_names))

    Generate mixdB queries for all artists I follow on soundcloud

In [50]:
def generate_artist_queries(artist_list):
    queries = dict()
    base_url = "https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=&year=&so=&tmatch1={}&tmatch2=&jnTm=&usesFile=&minHotnessLevel=&count=25&order=hotness&sort=desc"

    for name in artist_list:
        query_name = name.replace(' ', '+')
        query = base_url.format(query_name)
        queries[name] = query

    return queries

In [51]:
artist_searches = generate_artist_queries(user_names)

In [52]:
artist_searches

{'gainz': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=&year=&so=&tmatch1=gainz&tmatch2=&jnTm=&usesFile=&minHotnessLevel=&count=25&order=hotness&sort=desc',
 'voodoohop': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=&year=&so=&tmatch1=voodoohop&tmatch2=&jnTm=&usesFile=&minHotnessLevel=&count=25&order=hotness&sort=desc',
 'ton töpferei': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=&year=&so=&tmatch1=ton+töpferei&tmatch2=&jnTm=&usesFile=&minHotnessLevel=&count=25&order=hotness&sort=desc',
 'Pauli Pocket': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=&year=&so=&tmatch1=Pauli+Pocket&tmatch2=&jnTm=&usesFile=&minHotnessLevel=&count=25&order=hotness&sort=desc',
 'AYER FIJEN': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes?do=mx&mode=&cat1=&cat2=&jnC=&style=&year=&so=&tmatch1=AYER+FIJEN&tmatch2=&jnTm=&usesFile=&minHotnessLevel=&coun

In [53]:
base_url = 'https://www.mixesdb.com'
mixdb_categories = dict()

with open('mixesdb.txt') as html_code:
    soup = BeautifulSoup(html_code, 'html.parser')
    list_objects = soup.find_all('li')

    for li_tag in list_objects:
        a_tag = li_tag.find('a')

        if a_tag is not None:
            category = a_tag.get_text()
            href_value = a_tag['href']
            mixdb_categories[category] = base_url + href_value

In [54]:
mixdb_categories

{'2 Step': 'https://www.mixesdb.com/w/Category:2_Step',
 'Acid': 'https://www.mixesdb.com/w/Category:Acid',
 'Acid House': 'https://www.mixesdb.com/w/Category:Acid_House',
 'Acid Techno': 'https://www.mixesdb.com/w/Category:Acid_Techno',
 'Ambient': 'https://www.mixesdb.com/w/Category:Ambient',
 'Ballroom': 'https://www.mixesdb.com/w/Category:Ballroom',
 'Bassline': 'https://www.mixesdb.com/w/Category:Bassline',
 'Big Beat': 'https://www.mixesdb.com/w/Category:Big_Beat',
 'Bigroom': 'https://www.mixesdb.com/w/Category:Bigroom',
 'Boogie': 'https://www.mixesdb.com/w/Category:Boogie',
 'Booty': 'https://www.mixesdb.com/w/Category:Booty',
 'Bossa Nova': 'https://www.mixesdb.com/w/Category:Bossa_Nova',
 'Brasil': 'https://www.mixesdb.com/w/Category:Brasil',
 'Breakbeat': 'https://www.mixesdb.com/w/Category:Breakbeat',
 'Breakbeat Hardcore': 'https://www.mixesdb.com/w/Category:Breakbeat_Hardcore',
 'Breakcore': 'https://www.mixesdb.com/w/Category:Breakcore',
 'Breaks': 'https://www.mixesdb.

In [55]:
drop_keys = { 
    'Category': 'https://www.mixesdb.com/w/Category:Style',
    'Discussion': 'https://www.mixesdb.com/w/Category_talk:Style',
    'History': 'https://www.mixesdb.com/db/index.php?title=Category:Style&action=history',
    '': 'https://www.mixesdb.com/db/index.php?title=Category:Style&action=purge',
    'Create account': 'https://www.mixesdb.com/db/index.php?title=Special:UserLogin&returnto=Category%3AStyle&type=signup',
    'Log in': 'https://www.mixesdb.com/db/index.php?title=Special:UserLogin&returnto=Category%3AStyle',
    '2023': 'https://www.mixesdb.com/w/Category:2023',
    'Event': 'https://www.mixesdb.com/w/Category:Event',
    'Show': 'https://www.mixesdb.com/w/Category:Show',
    'Style': 'https://www.mixesdb.com/w/Category:Style',
    'Artist': 'https://www.mixesdb.com/w/Category:Artist',
    'Venue': 'https://www.mixesdb.com/w/Category:Venue',
    'Podcast': 'https://www.mixesdb.com/w/Category:Podcast',
    'Artist content': 'https://www.mixesdb.com/list-artist-content/?id=all&from=menu',
    'Explore mixes': 'https://www.mixesdb.com/w/MixesDB:Explorer/Mixes',
    'Help section': 'https://www.mixesdb.com/w/Help:General',
    'New mix': 'https://www.mixesdb.com/w/MixesDB:Add_a_new_mix',
    'Recent changesRC': 'https://www.mixesdb.com/w/Special:RecentChanges',
    'Special pages': 'https://www.mixesdb.com/w/MixesDB:Special_pages',
    'Maintenance': 'https://www.mixesdb.com/w/MixesDB:Maintenance',
    'Special characters': 'https://www.mixesdb.com/w/Help:Special_characters',
    'Calendar': 'https://www.mixesdb.comhttp://www.timeanddate.com/calendar/custom.html?year=&ctf=1&cdt=31&holm=1&df=1&cdt=31#calarea',
    'Short URL': 'https://www.mixesdb.comhttps://mixesdb.com/?6289',
    'Page info': 'https://www.mixesdb.com/db/index.php?title=Category:Style&action=info',
    'This site is not for downloading sets.': 'https://www.mixesdb.com/w/Help:General#How_can_I_download.3F',
    'About': 'https://www.mixesdb.com/w/MixesDB:About',
    'Privacy policy': 'https://www.mixesdb.com/w/MixesDB:Privacy_policy',
    'Cookies': 'https://www.mixesdb.com/w/MixesDB:Privacy_policy#Cookies',
    'Legal stuff': 'https://www.mixesdb.com/w/MixesDB:Legal_stuff',
    'DMCA': 'https://www.mixesdb.com/w/MixesDB:DMCA',
    'Powered by MediaWiki': 'https://www.mixesdb.comhttp://www.mediawiki.org/wiki/MediaWiki',
    'enable Javascript in your browser': 'https://www.mixesdb.comhttp://activatejavascript.org/en/'
    }

In [56]:
for key in drop_keys.keys():
    if key in mixdb_categories:
        del mixdb_categories[key]

In [57]:
mixdb_categories

{'2 Step': 'https://www.mixesdb.com/w/Category:2_Step',
 'Acid': 'https://www.mixesdb.com/w/Category:Acid',
 'Acid House': 'https://www.mixesdb.com/w/Category:Acid_House',
 'Acid Techno': 'https://www.mixesdb.com/w/Category:Acid_Techno',
 'Ambient': 'https://www.mixesdb.com/w/Category:Ambient',
 'Ballroom': 'https://www.mixesdb.com/w/Category:Ballroom',
 'Bassline': 'https://www.mixesdb.com/w/Category:Bassline',
 'Big Beat': 'https://www.mixesdb.com/w/Category:Big_Beat',
 'Bigroom': 'https://www.mixesdb.com/w/Category:Bigroom',
 'Boogie': 'https://www.mixesdb.com/w/Category:Boogie',
 'Booty': 'https://www.mixesdb.com/w/Category:Booty',
 'Bossa Nova': 'https://www.mixesdb.com/w/Category:Bossa_Nova',
 'Brasil': 'https://www.mixesdb.com/w/Category:Brasil',
 'Breakbeat': 'https://www.mixesdb.com/w/Category:Breakbeat',
 'Breakbeat Hardcore': 'https://www.mixesdb.com/w/Category:Breakbeat_Hardcore',
 'Breakcore': 'https://www.mixesdb.com/w/Category:Breakcore',
 'Breaks': 'https://www.mixesdb.

In [63]:
genre_url = 'https://www.mixesdb.com/w/Category:Detroit_Techno'

r = requests.get(genre_url, headers={'User-agent': 'surfacer 1.3'})
content = r.content

soup = BeautifulSoup(content, 'html.parser')
mixes_list = soup.find('h2', {'id': 'Mixes'}).find_next('ul', {'id': 'catMixesList'})
list_objects = mixes_list.find_all('li')

genre_mixes = {}

for li_tag in list_objects:
    a_tag = li_tag.find('a')
    href_value = a_tag['href']
    title = a_tag.text
    genre_mixes[href_value] = title

genre_mixes

{'/w/1987_-_The_Electrifying_Mojo_%26_Juan_Atkins,_Derrick_May,_Kevin_Saunderson_-_WHYT_96.3_FM,_Detroit': '1987 - The Electrifying Mojo & Juan Atkins, Derrick May, Kevin Saunderson - WHYT 96.3 FM, Detroit',
 '/w/198X_-_Derrick_May_@_Music_Institute': '198X - Derrick May @ Music Institute',
 '/w/1990-05_-_Derrick_May_@_Unknown_Gig,_Miami': '1990-05 - Derrick May @ Unknown Gig, Miami',
 '/w/1990-09-08_-_Derrick_May_@_Slam_In_The_Park_II,_Strathclyde_Park,_Bristol': '1990-09-08 - Derrick May @ Slam In The Park II, Strathclyde Park, Bristol',
 '/w/1991-07-05_-_Carl_Craig_@_Detroit_Technology,_Caf%C3%A9_d%27Anvers,_Belgium': "1991-07-05 - Carl Craig @ Detroit Technology, Café d'Anvers, Belgium",
 '/w/1992_-_Aubrey_@_InterDance_Sterns,_Worthing,_England': '1992 - Aubrey @ InterDance Sterns, Worthing, England',
 '/w/1992_-_Underground_Resistance_-_Dreaming_Daisies,_Three_D_Radio_93.7FM,_Adelaide': '1992 - Underground Resistance - Dreaming Daisies, Three D Radio 93.7FM, Adelaide',
 '/w/1992-0

In [59]:
mixdb_mix_urls = dict()

In [65]:
options = Options()
options.headless = True
driver = webdriver.Chrome('../archive/chromedriver/chromedriver', options=options)

def get_tracklist(URL):
    driver.get(URL)
    
    try:
        mix = driver.title.split(" | ")[0]
        tracklist = driver.find_elements(By.XPATH, '//ol/li/span')
        youtube_ids = {}
        
        for track in tracklist:
            artist = track.get_attribute("data-keywordsartist")
            title = track.get_attribute("data-keywordstitle")
            youtube_id = track.get_attribute("data-youtubeid")
            youtube_ids[youtube_id] = [artist, title]
            print(youtube_id, ' --- ', artist, ' - ', title)
        
        mix_tracklist = {mix: youtube_ids}
        return mix_tracklist
    
    except AttributeError:
        print('No YouTube IDs found on the page:', URL)
        return None
    finally:
        driver.quit()






mix_url = 'https://www.mixesdb.com/w/2020-05_-_Bollek_-_Outro_Podcast_027'
idlist = get_tracklist(mix_url)





  options.headless = True
  driver = webdriver.Chrome('../archive/chromedriver/chromedriver', options=options)


uNYS-FibMn8  ---  Rupert Clervaux  -  Damper And Drum
_5E3ggezzww  ---  Sugi.Wa  -  Changing Lanes
6RjfWye_Ihc  ---  Saiko  -  Powder
7ythk60VILM  ---  Kupla  -  Search For Tomorrow
u005Fx-YBSI  ---  Chuckee  -  Birthday Beats
  ---  Payfone  -  Subconscient Lamentation
bxPiEolWDZY  ---  AzudemSK  -  Lauf
8WG0rjNApFk  ---  Björk  -  I Miss You (Dobie Rub Part One 'Sunshine' Mix)
zhhHG2AWu80  ---  Stieber Twins  -  Einmal Macco, Zweimal Stieber
y-N9iNr3wcU  ---  Desmond Cheese  -  Polyfizzal Drizzal
Z-I8HLaIOIY  ---  The Dynamics  -  Move On Up
LKKgi6rDWJA  ---  Cora E  -  Lügen... Ihr Kriegt Mich Nie
0Z_319O2GhI  ---  ABS  -  Schlagzeilen
f8cHxydDb7o  ---  Queen Latifah  -  U.N.I.T.Y.
zXSzJB_0yjk  ---  Nina  -  Doppel X Chromosom
HTJxoLy25Xo  ---  The Conscious Daughter  -  Somethin' To Ride To (Fonky Expedition)
YFLt1lhbtKM  ---  Da Brat  -  Give It 2 You
-1  ---  Deichkind  -  ...Und Andere Prioritäten
s3dNTgA1eyo  ---  Dirty Art Club  -  What If
ffePfl-Ew_c  ---  Cold  -  Existence
