In [1]:
import numpy as np
import pandas as pd
import requests
import bs4
import re
import itertools
from time import sleep
from spellchecker import SpellChecker

In [30]:
# Starting out with just the list of jazz pianists, but this will be somehow expanded to get data from the lists of 
# drummers, bassists, saxophonists, trumpeters, floutists, bonists, COMPOSERS, etc. (?)
result = requests.get("https://en.wikipedia.org/wiki/List_of_jazz_trombonists")

In [31]:
soup = bs4.BeautifulSoup(result.text, "html.parser")

In [32]:
# Gets the names and hrefs of all the people on the page by exploiting <a> tag and reads them into uBassistsLinks, 
# a dictionary
uTrombonistsLinks = dict()

# The page of the list of jazz bassists was a little more tricky to figure out... That's why this scraper is 
# much more convoluted than the pianist one, which it was based off of. One major issue was that there are 
# pictures "within" the lists so that some of the picture tags get read into the scraper. 
# I figure this could use some review, but for the time being I mostly only care about collecting the data. 

# Finding the first h2 tag in order to use the .next_siblings iterator that's part of the bs4 library
body = soup.find('div', {'class' : 'mw-parser-output'})
first_h2 = body.find('h2')
for h2 in body.find_all('h2'):
    if h2.find('span', {'class' : 'mw-headline'}):
        first_h2 = h2
        break

for sib in first_h2.next_siblings:
    if type(sib) == bs4.element.Tag:
        for li in sib.find_all('li'):
            for artist_tag in li.find_all('a'):
                # I get an error here but it doesn't affect the outcome of my scraping because the error 
                # occurs after the names and links have already been scraped. 
                uTrombonistsLinks.update( {artist_tag.text : artist_tag.attrs["href"] } )
                #print(f'{artist_tag.text}, {artist_tag.attrs["href"]}')

KeyError: 'href'

In [52]:
#print(uTrombonistsLinks)

In [53]:
newTrombonistsLinks = dict(list(uTrombonistsLinks.items())[0:159])
#print(newTrombonistsLinks)

In [36]:
# Scraper to get the year dates that come after most of the names. Similar structure to the link scraper. 
uTrombonistsYears = dict()

body = soup.find('div', {'class' : 'mw-parser-output'})
first_h2 = body.find('h2')
for h2 in body.find_all('h2'):
    if h2.find('span', {'class' : 'mw-headline'}):
        first_h2 = h2
        break

for sib in first_h2.next_siblings:
    if type(sib) == bs4.element.Tag:
        for li in sib.find_all('li'):
            s = li.text
            #print(s)
            
            noParens = False
            yearDates = ''
            fullName = ''
            
            # Extracting the dates from s
            # Using regular expression to extract everything within the parentheses and assign it to variable years
            if re.search('\(([^)]+)', s):
                years = re.search('\(([^)]+)', s).group(1)
            # Testing if the format of the dates is "born XXXX" or "XXXX-XXXX" by simply determining if there 
            # is a whitespace in the string (I should think about making this process a little more robust)
                if len(years.split()) > 1:
                # Don't want the word "born" in the year data, so only taking the year using .split()
                    yearDates = years.split()[1].strip()
                else:
                    yearDates = years.strip()
            else:
                yearDates = 'NaN'
            # If there is no year data given, then there are no parentheses, which allows the scripts to 
            # properly extract the artist's name
                noParens = True
            
            # Extracting the artist's name from s
            # If there are parentheses, simply take a substring of s that ends at the first '('
            if noParens == False:
                name = s[0:s.find('(')]
                fullName = name.strip()
            # If there aren't any parentheses, then just take s as it is (stripped of whitespace of course)
            else:
                fullName = s.strip()
            
            #print(fullName, yearDates, '\n')
        
            # Now append the name and year data to the uBassistsYears dictionary
            uTrombonistsYears.update( {fullName : yearDates})


In [54]:
newTrombonistsYears = dict(list(uTrombonistsYears.items())[0:159])
#print(newTrombonistsYears)

In [39]:
# To combine the two dictionaries so that for each artist's name, there is a wiki and years listed
combinedList = [newTrombonistsLinks, newTrombonistsYears]
fullTrombonistsList = dict()

for artist in newTrombonistsLinks.keys():
    fullTrombonistsList[artist] = tuple(fullTrombonistsList[artist] for fullTrombonistsList in combinedList)

In [58]:
#print(fullTrombonistsList)

#for key, value in fullTrombonistsList.items():
#    print(key, value)

In [44]:
# Creating a custom dictionary with pyspellchecker that contains most, if not all, instruments used to play jazz
spell = SpellChecker(language = None, case_sensitive = True)
a = spell.word_frequency.load_words(['Piano', 'Vocals', 'Keyboards', 'Double bass', 'Upright bass', 'Electric bass', 
                                     'Saxophone', 'Clarinet', 'Trumpet', 
                                     'Trombone', 'Drums', 'Guitar', 'Flute', 'Vibraphone', 'Banjo', 'Violin', 'Viola', 
                                     'Cello', 'Synthesizer'])

In [59]:
# Creating DataFrame to store the information on the pianists
pDF = pd.DataFrame(columns = ['artist', 'instrument', 'birthdate', 'deathdate', 'birth_year', 'death_year'])

# Main script that will take the wikis in the fullPianistsList dictionary and scrape the data in the biographical
# table for each artist. I have done much research into the legality and ethicality of this operation, and I've 
# determined that it's okay to run a script like this as long as I have at least a one-second delay inbetween 
# each iteration of the below for loop, so as to not create too much traffic. 

# pianist = pianist's name; linkYear = a tuple where the first element is the wiki link and the second element 
# is the string containing the artist's dates: ( /wiki/ARTIST_NAME, (XXXX OR XXXX-XXXX) )

# For testing the DataFrame:
runs = 0

for bonist, linkYear in fullTrombonistsList.items():
    
    # For testing purposes
    if runs < 30:
        #print(bonist, linkYear[1])
    
        bday = ''
        dday = ''
        
        bYear = ''
        dYear = ''
        
        # instrumentList is important for a later operation, although it can most likely be simplified
        instrumentList = []
    
        # Create the link by appending the wikipedia link for the artist onto the end of the general wikipedia link
        wikiLink = 'https://en.wikipedia.org' + linkYear[0]
    
        # Following the basic requests and bs4 process to read in a webpage
        pInfo = requests.get(wikiLink)
        pSoup = bs4.BeautifulSoup(pInfo.text, "html.parser")
    
        # Finding the biographical information table, which is always in a table with the class = infobox vcard plainlist
        # However, some pages don't have the table, which is why there is a conditional first. 
        if pSoup.find('table', {'class' : 'infobox vcard plainlist'}):
            
    
            # aTable is the tag in which the script will find all of the necessary information
            aTable = pSoup.find('table', {'class' : 'infobox vcard plainlist'})
    
            # All of the necessary information is within the <tr> row element 
            for table in aTable.find_all('tr'):
    
                # First, finding the primary instrument of the artist (which is all that this project will
                # be concerned with for the moment, sorry multi-instrumentalists). The instrument data is listed in
                # the <td> tag with the class = note. However, because the script must account for the fact that
                # there are multi-instrumentalists, the script only returns the first instrument that is found. 
                # In lieu of a more efficient way to do this, there is an index that will only allow the instrumentList
                # to append the first instrument it finds. This should probably be fixed too. 
                if table.find('td', {'class' : 'note'}):
                    index = 0
                    for string in table.find('td', {'class' : 'note'}).stripped_strings:
                        if index == 0:
                            string = string.split(',')[0]
                            string = spell.correction(string)
                            instrumentList.append(string.split(',')[0])
                            index += 1
        
                # Finding the birthday of the artist. If there is one, it is always contained within the span class = 
                # bday tag, and the date can be accessed with the .string bs4 method
                if table.find('span', {'class' : 'bday'}):
                    #print(table.find('span', {'class' : 'bday'}).string)
                    bday = table.find('span', {'class' : 'bday'}).string
                    bYear = bday[0:4]
        
                # Finding the deathdate of the artist. If there is one, it is usually contained within the span style = 
                # display:none tag. However, there are multiple pieces of information that can be contained within a 
                # style = diplay:none tag. Fortunately, the death date is the only piece of information that is 
                # accessible via the .string method, so the script checks for the possibility of the .string method
                # and then removes the parentheses which always surround it. 
                if table.find('span', {'style' : 'display:none'}):
                    for row in table.find_all('span', {'style' : 'display:none'}):
                        if row.string:
                            #print(row.string[1:-1])
                            dday = row.string[1:-1]
                            dYear = dday[0:4]
                    
     
            # Lines 85-102 and lines 113-129 use the year data from the "List of jazz pianists" page to fill in 
            # the DataFrame when information is not found in the individual artist's vtable
            if not bday:
                bday = linkYear[1][0:4]
                bYear = bday
                
            if not dday:
                if linkYear[1] == 'NaN':
                    dday = 'NaN'
                    dYear = 'NaN'
                else:
                    if len(linkYear[1]) > 4:
                        dday = linkYear[1][5:9]
                        dYear = dday
                    else:
                        dday = 'NaN' # Make these 'Present' if other solution doesn't work
                        dYear = 'NaN'
            
            if not instrumentList:
                instrumentList.append('Trombone')
        
            #print('\n')
        
            # Time delay on for loop
            #sleep(1.0)
            
            
    
        else:
            #print('\n')
            if len(linkYear[1]) > 4:
                bday = linkYear[1][0:4]
                bYear = bday
                dday = linkYear[1][5:9]
                dYear = dday
            else:
                bday = linkYear[1]
                bYear = bday
                if bday == 'NaN':
                    dday = 'NaN'
                    dYear = 'NaN'
                else:
                    dday = 'NaN'  # Make these 'Present' if other solution doesn't work
                    dYear = 'NaN'
                    
            if not instrumentList:
                instrumentList.append('Trombone')
                
                
        # Appending the values to the DataFrame        
        pDF = pDF.append( {'artist' : bonist, 'instrument' : instrumentList[0], 'birthdate' : str(bday),
                           'deathdate' : str(dday), 'birth_year' : bYear, 'death_year' : dYear}, ignore_index = True)
    
        runs += 1
        #print(runs)



In [60]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
# To make it easier to loop through DataFrame? 
# pDF.set_index('Artist', inplace = True)

# Deleting Birth Year and Death Year columns for now because I don't think they're that necessary given the solution
# I've decided to pursue. However, I will leave this like so for now because I don't want to commit to this solution 
# yet. 
pDF.drop(columns = ['birth_year', 'death_year'], inplace = True)

#print(pDF)

In [61]:
# Adding 'alive' column to the DataFrame using np.select() 

conditions = [ (pDF['birthdate'] == 'NaN') & (pDF['deathdate'] == 'NaN'), (pDF['deathdate'] == 'NaN'), 
             (pDF['deathdate'] != 'NaN') ]

values = ['NaN', True, False]

pDF['alive'] = np.select(conditions, values)
#print(pDF)

In [62]:
# Forcing all of the dates into datetime objects 

pDF['bday_dt'] = pd.to_datetime(pDF['birthdate'], errors = 'coerce')
pDF['dday_dt'] = pd.to_datetime(pDF['deathdate'], errors = 'coerce')

# Account for the age of people who are alive today as well
conditions = [ (pDF['alive'] == 'True'), (pDF['alive'] == 'False'), (pDF['alive'] == 'NaN') ]

values = [ (pd.to_datetime('today') - pDF['bday_dt']).astype('timedelta64[Y]').astype(float),
         (pDF['dday_dt'] - pDF['bday_dt']).astype('timedelta64[Y]').astype(float), 'NaN' ]

pDF['age_yrs'] = np.select(conditions, values)

#display(pDF)

In [51]:
# Outputs the .csv file that was then cleaned in Excel and put back into the jazz-pianist-files folder as 
# jazz_pianists_cleaned1.csv
pDF.to_csv('jazz_trombonists1.csv', index = False)