# Find Musicians

The purpose of this code is to get a very long list of musicians. This list is later uploaded to SPIKE, so that we can find sentences that contain known musicians. This will be the basis for our test set. 

In [7]:
import pandas
import requests
from bs4 import BeautifulSoup
from random import shuffle

Find a wikipedia page with a relevant list

In [2]:
wiki_url = "https://en.wikipedia.org/wiki/Lists_of_musicians"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.content)

Note that the following code is tailored for the above wikipedia page. Error handling here is relaxed, since we are not trying to get an exhaustive list, but a very big sample.

In [5]:
musicians_by_category = dict()
general_list_of_musicians = []
for item in soup.find_all("li"):
    link = item.find("a")
    try:
        if link["href"].startswith("/wiki/List_of_"):
            musicians_by_category[link.text] = []
            sublist_url = f"https://en.wikipedia.org{link['href']}"
            sublist_response = requests.get(sublist_url)
            subsoup = BeautifulSoup(sublist_response.content)
            for artist in subsoup.find_all("li"):
                sublink = artist.find("a")
                if sublink:
                    if not sublink["href"].startswith("/wiki/List_of_"):
                        if not sublink.parent.get("class"):
                            if len(sublink.text) > 4:
                                musicians_by_category[link.text].append(sublink.text)
                                if all(x not in link.text for x in ["bands", "groups"] ):
                                    general_list_of_musicians.append(sublink.text)
                            if sublink.text == "^":
                                break
    except Exception as e:
        if link:
            print(link, e)
            
general_list_of_musicians[:10]

<a href="/wiki/List_of_anarcho-punk_bands" title="List of anarcho-punk bands">List of anarcho-punk bands</a> 'href'
<a href="/wiki/List_of_bebop_musicians" title="List of bebop musicians">List of bebop musicians</a> 'href'
<a href="/wiki/List_of_Christian_worship_music_artists" title="List of Christian worship music artists">List of Christian worship music artists</a> 'href'
<a href="/wiki/List_of_cool_jazz_and_West_Coast_jazz_musicians" title="List of cool jazz and West Coast jazz musicians">List of cool jazz and West Coast jazz musicians</a> 'href'
<a href="/wiki/List_of_dub_artists" title="List of dub artists">List of dub artists</a> 'href'
<a href="/wiki/List_of_free_improvising_musicians_and_groups" title="List of free improvising musicians and groups">List of free improvising musicians and groups</a> 'href'
<a href="/wiki/List_of_gospel_musicians" title="List of gospel musicians">List of gospel musicians</a> 'href'
<a href="/wiki/List_of_punk_rock_bands,_L%E2%80%93Z" title="List 

<a href="/wiki/List_of_French_singers" title="List of French singers">List of French singers</a> 'href'
<a href="/wiki/List_of_German_musicians" title="List of German musicians">List of German musicians</a> 'href'
<a href="/wiki/List_of_Greek_composers" title="List of Greek composers">List of Greek composers</a> 'href'
<a href="/wiki/List_of_Greek_composers" title="List of Greek composers">List of Greek composers</a> 'href'
<a href="/wiki/List_of_Polish_musicians_and_musical_groups" title="List of Polish musicians and musical groups">List of Polish musicians and musical groups</a> 'href'
<a href="/wiki/List_of_Polish_musicians_and_musical_groups" title="List of Polish musicians and musical groups">List of Polish musicians and musical groups</a> 'href'
<a href="/wiki/List_of_Portuguese_musicians" title="List of Portuguese musicians">List of Portuguese musicians</a> 'href'
<a href="/wiki/List_of_Portuguese_musicians" title="List of Portuguese musicians">List of Portuguese musicians</a> '

['2nd Chapter of Acts',
 'After the Fire',
 'Dennis Agajanian',
 'All Saved Freak Band',
 'The Alpha Band',
 'The Archers',
 'a band called David',
 'Brown Bannister',
 'Bash-n-the-Code',
 'Bob Bennett']

In [10]:
# Create a file with one item per line. This should later be manually cleaned, or take a clean sample from this list.
with open("../data/lists/musicians_dirty.txt", "w") as f:
    shuffle(general_list_of_musicians) # avoid having all musicians from related genres. 
    for x in general_list_of_musicians:
        f.write(f"{x}\n")

### scrape for comedians

Another example. This can, but doesn't have to, be a part of collecting negative examples.

In [11]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_comedians"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.content)

In [14]:
with open("../data/lists/comedians_dirty.txt", "w") as f:
    for item in soup.find_all("li"):
        try:
            link = item.find("a")
            if link["href"].startswith("/wiki/"):
                if len(link.text.split()) > 1:
                    f.write(link.text + "\n")
        except:
            print(link)

None
None
