In [1]:
# A project to build a basic database of jazz musicians using wikipedia data, then do analysis on it. Keep in mind
# that all of this information is from Wikipedia, so some of it may be inaccurate. 

In [1]:
import numpy as np
# numpy is used directly (np.select builds the derived columns below); pandas also depends on it
import pandas as pd
# For dataframes
import requests
# For fetching data from Wikipedia
import bs4
# For html navigation and web scraping
import re
# For regex operations
from time import sleep
# For time delay on for loop
from spellchecker import SpellChecker
# For spellchecking words
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Starting out with just the list of jazz pianists; the same approach could later be extended to the lists of
# drummers, bassists, saxophonists, trumpeters, flautists, trombonists, composers, etc.
# The timeout keeps this cell from hanging forever on a stalled connection, and raise_for_status() stops the
# notebook here on an HTTP error instead of letting the later cells parse an error page as if it were the list.
result = requests.get("https://en.wikipedia.org/wiki/List_of_jazz_pianists", timeout = 30)
result.raise_for_status()

In [3]:
# Parse the downloaded HTML of the list page into a bs4 tree called 'soup'
soup = bs4.BeautifulSoup(markup = result.text, features = "html.parser")

In [4]:
# Builds uPianistsLinks, a dictionary mapping the text of every link on the page to its href.
# Each letter category on the page is a div with class 'div-col'; every <a> tag inside one contributes
# anchor.text (the artist's name) -> anchor.attrs["href"] (the wiki link extension for that artist).
# If the same link text appears more than once, the last href seen is the one that is kept.
uPianistsLinks = {
    anchor.text : anchor.attrs["href"]
    for letter_section in soup.find_all('div', {'class' : 'div-col'})
    for anchor in letter_section.find_all('a')
}
            


In [5]:
# Gets each artist's name and years from the list page and stores them in uPianistsYears, to be combined
# with the uPianistsLinks dictionary. This matters because some artists' individual wiki pages don't list
# their dates, or don't list them in a table that is easy to scrape, so the list page is the fallback source
# unless more specific dates are found on the artist's own page.
#
# Values stored per artist name:
#   'XXXX'       - only one year was found in the parentheses (e.g. "born 1950")
#   'XXXX-XXXX'  - two years were found (birth, then death)
#   'NaN'        - no parentheses, or parentheses that contain no four-digit year

# Text between the first '(' and the next ')'. Raw string so that '\(' is a regex escape rather than an
# invalid Python string escape.
parenthetical_re = re.compile(r'\(([^)]+)')
# A standalone four-digit year (not part of a longer run of digits)
year_re = re.compile(r'(?<!\d)\d{4}(?!\d)')

uPianistsYears = dict()

# Again using div-col tags to get the letter categories
for div_col in soup.find_all('div', {'class' : 'div-col'}):
    # This time, using the <li> tag, whose text looks like "Bill Evans (1929-1980)"
    for info in div_col.find_all('li'):
        s = info.text

        parenthetical = parenthetical_re.search(s)
        if parenthetical:
            # Pick the years out explicitly instead of guessing the format from whitespace, so that
            # variants such as "c. 1900-1960" or "1929 - 1980" are still read correctly.
            found_years = year_re.findall(parenthetical.group(1))
            if len(found_years) >= 2:
                # Single separator character keeps the death year at positions 5-8, which is where the
                # scraping cell slices it from.
                yearDates = found_years[0] + '-' + found_years[1]
            elif found_years:
                yearDates = found_years[0]
            else:
                # Parentheses without a year (not a date at all)
                yearDates = 'NaN'
            # The name is everything before the first '('
            fullName = s[0:s.find('(')].strip()
        else:
            # No parentheses means no year data; the whole entry is the name
            yearDates = 'NaN'
            fullName = s.strip()

        # Now add the name and year data to the uPianistsYears dictionary
        uPianistsYears.update( {fullName : yearDates} )
            

In [6]:
#print(uPianistsLinks)

In [7]:
#print(uPianistsYears)

In [8]:
# Combine the two dictionaries so that every artist name in uPianistsLinks maps to a tuple of
# (wiki link, years string).
# The years are looked up with .get(): a link whose text has no matching entry in uPianistsYears gets 'NaN'
# (the same marker used for "no year data") instead of aborting the whole cell with a KeyError.
# combinedList is kept as-is in case other cells refer to it.
combinedList = [uPianistsLinks, uPianistsYears]
fullPianistsList = dict()

for artist, wiki_link in uPianistsLinks.items():
    fullPianistsList[artist] = (wiki_link, uPianistsYears.get(artist, 'NaN'))

In [9]:
#print(fullPianistsList)

In [10]:
# Spell checker backed by a custom word list of instruments used to play jazz; it is applied to the
# instrument names scraped from the artists' pages.
# 'a' only captures whatever load_words() returns and is not used anywhere else in the visible notebook.
spell = SpellChecker(case_sensitive = True, language = None)
a = spell.word_frequency.load_words([
    'Piano', 'Vocals', 'Keyboards', 'Double bass', 'Saxophone', 'Clarinet',
    'Trumpet', 'Trombone', 'Drums', 'Guitar', 'Flute', 'Vibraphone',
    'Banjo', 'Violin', 'Viola', 'Cello', 'Synthesizer',
])

In [15]:
# Main scraping cell: visits the wiki page of every artist in fullPianistsList, reads the biographical
# infobox when the page has one, and builds pDF with one row per artist.
#
# pianist = pianist's name; linkYear = a tuple where the first element is the wiki link and the second
# element is the years string from the list page: ( /wiki/ARTIST_NAME, 'XXXX' or 'XXXX-XXXX' or 'NaN' )
#
# To avoid creating too much traffic there is a one-second delay after EVERY page request, including
# requests for pages that turn out to have no infobox.
#
# Rows are collected in a list and turned into a DataFrame once at the end; DataFrame.append() no longer
# exists in pandas 2.x and copied the whole frame on every iteration.

pDFColumns = ['artist', 'instrument', 'birthdate', 'deathdate', 'birth_year', 'death_year']


def _datesFromListEntry(listYears):
    """Split the list-page years string into fallback (birth, death) values.

    A string longer than four characters is read positionally as birth year, one separator character,
    death year. Anything else is returned unchanged as the birth value with 'NaN' as the death value,
    so 'NaN' maps to ('NaN', 'NaN').
    """
    if len(listYears) > 4:
        return listYears[0:4], listYears[5:9]
    return listYears, 'NaN'


def _scrapeInfobox(aTable):
    """Read (instrument, bday, dday) from an infobox table.

    instrument is None and the dates are '' when the infobox does not provide them.
    Uses the notebook-level 'spell' checker to normalise the instrument name.
    """
    instrument = None
    bday = ''
    dday = ''

    # All of the necessary information is within the <tr> row elements
    for row in aTable.find_all('tr'):

        # Primary instrument only (sorry multi-instrumentalists): the first comma-separated entry of the
        # first piece of text in the first <td class="note"> cell that has any text.
        noteCell = row.find('td', {'class' : 'note'})
        if instrument is None and noteCell:
            for firstString in noteCell.stripped_strings:
                candidate = firstString.split(',')[0]
                # correction() can return None when it has no suggestion; keep the scraped text then
                instrument = (spell.correction(candidate) or candidate).split(',')[0]
                break

        # Birthday: contained in <span class="bday">. .string is None when the span has nested markup,
        # so fall back to the span's full text in that case.
        bdaySpan = row.find('span', {'class' : 'bday'})
        if bdaySpan:
            bday = str(bdaySpan.string or bdaySpan.get_text())

        # Death date: usually in a <span style="display:none">. Several things can live in such spans;
        # only the ones with a plain .string are used, and the surrounding parentheses are cut off.
        for hiddenSpan in row.find_all('span', {'style' : 'display:none'}):
            if hiddenSpan.string:
                dday = str(hiddenSpan.string[1:-1])

    return instrument, bday, dday


records = []

# For testing the DataFrame:
runs = 0

for pianist, linkYear in fullPianistsList.items():

    # Create the link by appending the wikipedia link for the artist onto the end of the general wikipedia link
    wikiLink = 'https://en.wikipedia.org' + linkYear[0]

    # Following the basic requests and bs4 process to read in a webpage
    pInfo = requests.get(wikiLink, timeout = 30)
    # Time delay after every request, whichever branch is taken below
    sleep(1.0)
    pSoup = bs4.BeautifulSoup(pInfo.text, "html.parser")

    instrument = None
    bday = ''
    dday = ''

    # The biographical information table has class = infobox vcard plainlist; some pages don't have it
    aTable = pSoup.find('table', {'class' : 'infobox vcard plainlist'})
    if aTable:
        instrument, bday, dday = _scrapeInfobox(aTable)

    # Anything the artist's own page did not provide falls back to the years from the list page
    listBirth, listDeath = _datesFromListEntry(linkYear[1])
    if not bday:
        bday = listBirth
    if not dday:
        dday = listDeath  # 'NaN' here could become 'Present' if a different solution is needed

    # Every artist comes from the list of jazz pianists, so Piano is the default instrument
    if instrument is None:
        instrument = 'Piano'

    records.append( {'artist' : pianist, 'instrument' : instrument, 'birthdate' : str(bday),
                     'deathdate' : str(dday), 'birth_year' : bday[0:4], 'death_year' : dday[0:4]} )

    runs += 1

# DataFrame storing the information on the pianists
pDF = pd.DataFrame(records, columns = pDFColumns)



In [16]:
# Lift pandas' display limits so that frames are shown without truncated rows, columns or cell contents
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Deleting the birth_year and death_year columns for now because they aren't necessary given the solution
# being pursued. Reassigning (instead of inplace = True) together with errors = 'ignore' means this cell can
# be run again without failing on the columns that are already gone.
pDF = pDF.drop(columns = ['birth_year', 'death_year'], errors = 'ignore')

#print(pDF)

In [17]:
# Adding 'alive' column to the DataFrame using np.select()
# https://www.dataquest.io/blog/tutorial-add-column-pandas-dataframe-based-on-if-else-condition/
#
# The column holds the STRINGS 'NaN', 'True' and 'False'; the age calculation further down compares against
# exactly these strings. The choices and the default are all written as strings so that the result does not
# depend on NumPy silently converting booleans/integers to text (recent NumPy versions reject mixing them).
#   no birth and no death date -> 'NaN'   (nothing known)
#   no death date              -> 'True'
#   a death date               -> 'False'
conditions = [ (pDF['birthdate'] == 'NaN') & (pDF['deathdate'] == 'NaN'),
               (pDF['deathdate'] == 'NaN'),
               (pDF['deathdate'] != 'NaN') ]

values = ['NaN', 'True', 'False']

pDF['alive'] = np.select(conditions, values, default = 'NaN')
#print(pDF)

In [18]:
# Forcing all of the dates into datetime objects and deriving each artist's age in whole years

# Average Gregorian year (365.2425 days) expressed in seconds
SECONDS_PER_YEAR = 31556952


def _parseDates(column):
    """Convert a column of date strings to datetimes, one entry at a time.

    The column mixes bare years ('1895') with full dates ('1974-12-17'). Parsing entry by entry avoids
    pandas inferring a single format for the whole column and turning every other format into NaT.
    Anything unparseable (including the 'NaN' marker) becomes NaT.
    """
    parsed = column.map(lambda value: pd.to_datetime(value, errors = 'coerce'))
    return pd.to_datetime(parsed, errors = 'coerce')


def _wholeYears(delta):
    """Return a timedelta Series as a float count of completed years (NaN where the delta is NaT)."""
    return np.floor(delta.dt.total_seconds() / SECONDS_PER_YEAR)


pDF['bday_dt'] = _parseDates(pDF['birthdate'])
pDF['dday_dt'] = _parseDates(pDF['deathdate'])

# Account for the age of people who are alive today as well:
#   alive == 'True'  -> years from birth until today
#   alive == 'False' -> years from birth until death
#   anything else    -> NaN (a real missing value, so age_yrs stays a numeric column)
conditions = [ (pDF['alive'] == 'True'), (pDF['alive'] == 'False') ]

values = [ _wholeYears(pd.to_datetime('today') - pDF['bday_dt']),
           _wholeYears(pDF['dday_dt'] - pDF['bday_dt']) ]

pDF['age_yrs'] = np.select(conditions, values, default = np.nan)

display(pDF)

Unnamed: 0,artist,instrument,birthdate,deathdate,alive,bday_dt,dday_dt,age_yrs
0,Irving Aaronson,Piano,1895,1963-03-10,False,1895-01-01,1963-03-10,68.0
1,Anders Aarum,Piano,1974-12-17,,True,1974-12-17,NaT,45.0
2,Mike Abene,Piano,1942,,True,1942-01-01,NaT,78.0
3,Don Abney,Piano,1923,2000-01-20,False,1923-01-01,2000-01-20,77.0
4,Chris Abrahams,Piano,1961,,True,1961-01-01,NaT,59.0
5,Muhal Richard Abrams,Piano,1930-09-19,2017-10-29,False,1930-09-19,2017-10-29,87.0
6,John Adriano Acea,Piano,1917,1963,False,1917-01-01,1963-01-01,45.0
7,Beegie Adair,Piano,1937-12-11,,True,1937-12-11,NaT,82.0
8,Kei Akagi,Piano,1953-03-16,,True,1953-03-16,NaT,67.0
9,Toshiko Akiyoshi,Piano,1929-12-12,,True,1929-12-12,NaT,90.0


In [15]:
# Save the finished table as CSV (path is relative to the working directory), leaving out the row index
pDF.to_csv(path_or_buf = 'jazz_pianists1.csv', index = False)