In [1]:
import numpy as np
import pandas as pd
import requests
import bs4
import re
from time import sleep
from spellchecker import SpellChecker

In [11]:
# Download the Wikipedia "List of jazz drummers" page. The same approach is meant to be extended later to the
# lists of pianists, bassists, saxophonists, trumpeters, flautists, trombonists, COMPOSERS, etc. (?)
# The timeout keeps the cell from hanging indefinitely, and raise_for_status() stops the notebook on an
# HTTP error instead of letting the cells below silently parse an error page.
result = requests.get("https://en.wikipedia.org/wiki/List_of_jazz_drummers", timeout = 30)
result.raise_for_status()

In [12]:
# Parse the downloaded HTML with Python's built-in "html.parser" backend
soup = bs4.BeautifulSoup(markup = result.text, features = "html.parser")

In [13]:
# Collects the display name and href of every linked person on the page into uDrummersLinks,
# a dictionary mapping artist name -> wiki link extension (e.g. "/wiki/Espen_Aalberg")
uDrummersLinks = dict()

# For each letter category on the page (which are marked by div class: div-col)
for div_col in soup.find_all('div', {'class' : 'div-col'}):
    # For each tag containing the artist's name and hyperlink (marked by the <a> tag).
    # href=True skips anchors that carry no href attribute, which would otherwise raise a KeyError.
    for artist_tag in div_col.find_all('a', href = True):
        artist_name = artist_tag.text
        artist_href = artist_tag['href']
        print(f'{artist_name}, {artist_href}')
        # If the same link text appears twice, the later href wins
        uDrummersLinks[artist_name] = artist_href
            


Espen Aalberg, /wiki/Espen_Aalberg
Knut Aalefjær, /wiki/Knut_Aalefj%C3%A6r
Brian Abrahams, /wiki/Brian_Abrahams
Clarence Acox Jr., /wiki/Clarence_Acox_Jr.
Airto Moreira, /wiki/Airto_Moreira
Alex Acuña, /wiki/Alex_Acu%C3%B1a
Annette A. Aguilar, /wiki/Annette_A._Aguilar
Rashied Ali, /wiki/Rashied_Ali
Don Alias, /wiki/Don_Alias
Carl Allen, /wiki/Carl_Allen_(drummer)
Barry Altschul, /wiki/Barry_Altschul
Leon Anderson, /wiki/Leon_Anderson
Charly Antolini, /wiki/Charly_Antolini
Steve Argüelles, /wiki/Steve_Arg%C3%BCelles
John Armatage, /wiki/John_Armatage
Joe Ascione, /wiki/Joe_Ascione
Jon Audun Baar, /wiki/Jon_Audun_Baar
Colin Bailey, /wiki/Colin_Bailey_(drummer)
Dave Bailey, /wiki/Dave_Bailey_(musician)
Donald Bailey, /wiki/Donald_Bailey_(musician)
Ginger Baker, /wiki/Ginger_Baker
Newman Taylor Baker, /wiki/Newman_Taylor_Baker
Butch Ballard, /wiki/Butch_Ballard
Tom Bancroft, /wiki/Tom_Bancroft
Paul Barbarin, /wiki/Paul_Barbarin
Danny Barcelona, /wiki/Danny_Barcelona
Joey Baron, /wiki/Joey_

In [14]:
uDrummersYears = dict()

# Captures everything after the first "(" up to (but not including) the next ")"
_PARENS_RE = re.compile(r'\(([^)]+)')
# A stand-alone four-digit year
_YEAR_RE = re.compile(r'\b\d{4}\b')


def _split_name_and_years(text):
    """Split a list entry such as "Bill Evans (1929–1980)" into (name, years).

    Returns a tuple (fullName, yearDates):
      * fullName  - the text before the first "(", stripped; the whole stripped text if no parenthesised
                    part is found
      * yearDates - "XXXX" for "born XXXX", "XXXX–XXXX" for a life span, or the string 'NaN' when the
                    entry has no parenthesised part at all
    """
    match = _PARENS_RE.search(text)
    if match is None:
        # No parentheses means no year data; the entry is just the name
        return text.strip(), 'NaN'

    name = text[:text.find('(')].strip()
    years = match.group(1).strip()
    tokens = years.split()
    if len(tokens) <= 1:
        # Already a single token such as "1975" or "1929–1980"
        return name, years

    # Several tokens: "born 1975", "born c. 1940", "1933 – 2009", ... Pick out the year(s) explicitly
    # instead of blindly taking the second token. Two years are re-joined with an en dash (\u2013) so the
    # result has the same "XXXX–XXXX" layout as the single-token case.
    found_years = _YEAR_RE.findall(years)
    if found_years:
        return name, '\u2013'.join(found_years[:2])
    # No recognisable year: keep the previous behaviour of using the second token
    return name, tokens[1].strip()


# Again using div-col tags to get the letter categories
for div_col in soup.find_all('div', {'class' : 'div-col'}):
    # This time, using the <li> tag to get the year data
    for info in div_col.find_all('li'):
        # info.text will be a string like "Bill Evans (1929-1980)"
        fullName, yearDates = _split_name_and_years(info.text)

        print(fullName, yearDates, '\n')

        # Now store the name and year data in the uDrummersYears dictionary
        uDrummersYears[fullName] = yearDates
            

Espen Aalberg 1975 

Knut Aalefjær 1974 

Brian Abrahams 1947 

Clarence Acox Jr. 1952 

Airto Moreira NaN 

Alex Acuña 1944 

Annette A. Aguilar 1957 

Rashied Ali 1933–2009 

Don Alias 1939–2006 

Carl Allen 1961 

Barry Altschul 1943 

Leon Anderson NaN 

Charly Antolini 1937 

Steve Argüelles 1963 

John Armatage 1929 

Joe Ascione 1961–2016 

Jon Audun Baar 1986 

Colin Bailey 1934 

Dave Bailey 1926 

Donald Bailey 1933–2013 

Ginger Baker 1939–2019 

Newman Taylor Baker 1943 

Butch Ballard 1918–2011 

Tom Bancroft 1967 

Paul Barbarin 1899–1969 

Danny Barcelona 1929–2007 

Joey Baron 1955 

Ray Barretto 1929–2006 

Ray Bauduc 1906–1988 

Louie Bellson 1924–2009 

Warren Benbow 1954 

Gregg Bendian 1963 

Tommy Benford 1905–1994 

Han Bennink 1942 

Black Benny 1890–1924 

Chief Bey 1913–2004 

Big Bill Bissonnette 1937-2018 

Snorre Bjerck 1962 

Ivar Loe Bjørnstad 1981 

James N. Black 1940–1988 

Jim Black 1967 

Cindy Blackman 1959 

Ed Blackwell 1929–1992 

Brian Blade 197

In [15]:
# Sanity check: print the whole artist name -> wiki link extension dictionary
print(uDrummersLinks)

{'Espen Aalberg': '/wiki/Espen_Aalberg', 'Knut Aalefjær': '/wiki/Knut_Aalefj%C3%A6r', 'Brian Abrahams': '/wiki/Brian_Abrahams', 'Clarence Acox Jr.': '/wiki/Clarence_Acox_Jr.', 'Airto Moreira': '/wiki/Airto_Moreira', 'Alex Acuña': '/wiki/Alex_Acu%C3%B1a', 'Annette A. Aguilar': '/wiki/Annette_A._Aguilar', 'Rashied Ali': '/wiki/Rashied_Ali', 'Don Alias': '/wiki/Don_Alias', 'Carl Allen': '/wiki/Carl_Allen_(drummer)', 'Barry Altschul': '/wiki/Barry_Altschul', 'Leon Anderson': '/wiki/Leon_Anderson', 'Charly Antolini': '/wiki/Charly_Antolini', 'Steve Argüelles': '/wiki/Steve_Arg%C3%BCelles', 'John Armatage': '/wiki/John_Armatage', 'Joe Ascione': '/wiki/Joe_Ascione', 'Jon Audun Baar': '/wiki/Jon_Audun_Baar', 'Colin Bailey': '/wiki/Colin_Bailey_(drummer)', 'Dave Bailey': '/wiki/Dave_Bailey_(musician)', 'Donald Bailey': '/wiki/Donald_Bailey_(musician)', 'Ginger Baker': '/wiki/Ginger_Baker', 'Newman Taylor Baker': '/wiki/Newman_Taylor_Baker', 'Butch Ballard': '/wiki/Butch_Ballard', 'Tom Bancroft'

In [16]:
# Sanity check: print the whole artist name -> year string dictionary ('NaN' marks entries without year data)
print(uDrummersYears)

{'Espen Aalberg': '1975', 'Knut Aalefjær': '1974', 'Brian Abrahams': '1947', 'Clarence Acox Jr.': '1952', 'Airto Moreira': '1941', 'Alex Acuña': '1944', 'Annette A. Aguilar': '1957', 'Rashied Ali': '1933–2009', 'Don Alias': '1939–2006', 'Carl Allen': '1961', 'Barry Altschul': '1943', 'Leon Anderson': 'NaN', 'Charly Antolini': '1937', 'Steve Argüelles': '1963', 'John Armatage': '1929', 'Joe Ascione': '1961–2016', 'Jon Audun Baar': '1986', 'Colin Bailey': '1934', 'Dave Bailey': '1926', 'Donald Bailey': '1933–2013', 'Ginger Baker': '1939–2019', 'Newman Taylor Baker': '1943', 'Butch Ballard': '1918–2011', 'Tom Bancroft': '1967', 'Paul Barbarin': '1899–1969', 'Danny Barcelona': '1929–2007', 'Joey Baron': '1955', 'Ray Barretto': '1929–2006', 'Ray Bauduc': '1906–1988', 'Louie Bellson': '1924–2009', 'Warren Benbow': '1954', 'Gregg Bendian': '1963', 'Tommy Benford': '1905–1994', 'Han Bennink': '1942', 'Black Benny': '1890–1924', 'Chief Bey': '1913–2004', 'Big Bill Bissonnette': '1937-2018', 'Sn

In [17]:
# Combine the two dictionaries so that each artist's name maps to a (wiki link, years) tuple
combinedList = [uDrummersLinks, uDrummersYears]
fullDrummersList = dict()

for artist in uDrummersLinks:
    # .get(artist, 'NaN') rather than [artist]: a linked name with no matching entry in uDrummersYears
    # gets the same 'NaN' marker used for "no year data" instead of raising a KeyError.
    # The generator variable is deliberately not called fullDrummersList, to avoid shadowing the result.
    fullDrummersList[artist] = tuple(source.get(artist, 'NaN') for source in combinedList)

In [19]:
# Sanity check: print the combined artist name -> (wiki link extension, years) dictionary
print(fullDrummersList)

{'Espen Aalberg': ('/wiki/Espen_Aalberg', '1975'), 'Knut Aalefjær': ('/wiki/Knut_Aalefj%C3%A6r', '1974'), 'Brian Abrahams': ('/wiki/Brian_Abrahams', '1947'), 'Clarence Acox Jr.': ('/wiki/Clarence_Acox_Jr.', '1952'), 'Airto Moreira': ('/wiki/Airto_Moreira', '1941'), 'Alex Acuña': ('/wiki/Alex_Acu%C3%B1a', '1944'), 'Annette A. Aguilar': ('/wiki/Annette_A._Aguilar', '1957'), 'Rashied Ali': ('/wiki/Rashied_Ali', '1933–2009'), 'Don Alias': ('/wiki/Don_Alias', '1939–2006'), 'Carl Allen': ('/wiki/Carl_Allen_(drummer)', '1961'), 'Barry Altschul': ('/wiki/Barry_Altschul', '1943'), 'Leon Anderson': ('/wiki/Leon_Anderson', 'NaN'), 'Charly Antolini': ('/wiki/Charly_Antolini', '1937'), 'Steve Argüelles': ('/wiki/Steve_Arg%C3%BCelles', '1963'), 'John Armatage': ('/wiki/John_Armatage', '1929'), 'Joe Ascione': ('/wiki/Joe_Ascione', '1961–2016'), 'Jon Audun Baar': ('/wiki/Jon_Audun_Baar', '1986'), 'Colin Bailey': ('/wiki/Colin_Bailey_(drummer)', '1934'), 'Dave Bailey': ('/wiki/Dave_Bailey_(musician)', 

In [21]:
# Instrument names that make up the custom pyspellchecker dictionary (intended to cover most, if not all,
# instruments used to play jazz)
jazzInstruments = [
    'Piano', 'Vocals', 'Keyboards', 'Double bass', 'Saxophone', 'Clarinet', 'Trumpet',
    'Trombone', 'Drums', 'Guitar', 'Flute', 'Vibraphone', 'Banjo', 'Violin', 'Viola',
    'Cello', 'Synthesizer',
]

# No built-in language is loaded, so the spell checker only knows the words added here; matching is case sensitive
spell = SpellChecker(case_sensitive = True, language = None)
# NOTE(review): `a` is not read anywhere in the visible notebook; kept so the cell binds the same names as before
a = spell.word_frequency.load_words(jazzInstruments)

In [27]:
# Build pDF, one row per drummer, from each artist's own Wikipedia page, falling back to the year data
# scraped from the list page when the artist page has no usable infobox data.
PDF_COLUMNS = ['artist', 'instrument', 'birthdate', 'deathdate', 'birth_year', 'death_year']

# The biographical table is looked up as a <table> whose class attribute is exactly this string
INFOBOX_ATTRS = {'class' : 'infobox vcard plainlist'}

# Rows are collected in a plain list and converted to a DataFrame once after the loop. Growing a DataFrame
# row by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
records = []

# Progress counter, printed after every artist
runs = 0

# One session lets requests reuse the connection to en.wikipedia.org across all artist pages
session = requests.Session()

for drummer, linkYear in fullDrummersList.items():
    print(drummer, linkYear[1])

    # linkYear = (wiki link extension, years string from the list page)
    listYears = linkYear[1]

    bday = ''
    dday = ''

    bYear = ''
    dYear = ''

    # Only the first instrument found is kept (sorry multi-instrumentalists)
    instrumentList = []

    # Create the link by appending the wikipedia link for the artist onto the end of the general wikipedia link
    wikiLink = 'https://en.wikipedia.org' + linkYear[0]

    # A failed download must not abort the whole scrape: report it and treat the page as having no infobox,
    # so the list-page fallback below is used for this artist.
    try:
        pInfo = session.get(wikiLink, timeout = 30)
        pageHtml = pInfo.text
    except requests.RequestException as fetchError:
        print(f'Could not fetch {wikiLink}: {fetchError}')
        pageHtml = ''
    pSoup = bs4.BeautifulSoup(pageHtml, "html.parser")

    # Some pages don't have the table, in which case aTable is None
    aTable = pSoup.find('table', INFOBOX_ATTRS)
    if aTable is not None:

        # All of the necessary information is within the <tr> row elements
        for table in aTable.find_all('tr'):

            # The instrument data is listed in the <td> tag with the class = note. Only the first text
            # fragment of that cell, cut at the first comma, is used.
            noteCell = table.find('td', {'class' : 'note'})
            if noteCell is not None:
                firstString = next(noteCell.stripped_strings, None)
                if firstString is not None:
                    candidate = firstString.split(',')[0]
                    # Depending on the pyspellchecker version, correction() may return None when it has no
                    # suggestion; keep the scraped text in that case
                    instrumentList.append(spell.correction(candidate) or candidate)

            # Finding the birthday of the artist: the text of the span with class = bday.
            # get_text() is used because .string is None when the span has nested markup.
            # THERE ARE ISSUES HERE BECAUSE OF WIKIPEDIA SHODDY FORMATTING - FIGURE THIS OUT
            bdaySpan = table.find('span', {'class' : 'bday'})
            if bdaySpan is not None:
                bdayText = bdaySpan.get_text(strip = True)
                if bdayText:
                    bday = bdayText
                    bYear = bday[0:4]

            # Finding the deathdate of the artist. It is usually contained within a span with
            # style = display:none. Several hidden spans may exist; only those exposing a single string
            # via .string are used, and the first and last characters (the surrounding parentheses) are removed.
            for hiddenSpan in table.find_all('span', {'style' : 'display:none'}):
                if hiddenSpan.string:
                    dday = hiddenSpan.string[1:-1]
                    dYear = dday[0:4]

    # Anything the artist page did not provide is filled in from the list-page year string, which is
    # 'NaN', "XXXX" or "XXXX–XXXX" (a single separator character between the two years)
    if not bday:
        bday = listYears[0:4]
        bYear = bday

    if not dday:
        if len(listYears) > 4:
            dday = listYears[5:9]
        else:
            # Covers both 'NaN' and a lone birth year
            dday = 'NaN' # Make these 'Present' if other solution doesn't work
        dYear = dday

    # Default instrument for this list when the page names none
    if not instrumentList:
        instrumentList.append('Drums')

    # Time delay on for loop
    #sleep(1.0)

    records.append( {'artist' : drummer, 'instrument' : instrumentList[0], 'birthdate' : str(bday),
                     'deathdate' : str(dday), 'birth_year' : bYear, 'death_year' : dYear} )

    runs += 1
    print(runs)

# Creating the DataFrame in one go, with the columns in a fixed order
pDF = pd.DataFrame(records, columns = PDF_COLUMNS)



Espen Aalberg 1975
1
Knut Aalefjær 1974
2
Brian Abrahams 1947
3
Clarence Acox Jr. 1952
4
Airto Moreira 1941
5
Alex Acuña 1944
6
Annette A. Aguilar 1957
7
Rashied Ali 1933–2009
8
Don Alias 1939–2006
9
Carl Allen 1961
10
Barry Altschul 1943
11
Leon Anderson NaN
12
Charly Antolini 1937
13
Steve Argüelles 1963
14
John Armatage 1929
15
Joe Ascione 1961–2016
16
Jon Audun Baar 1986
17
Colin Bailey 1934
18
Dave Bailey 1926
19
Donald Bailey 1933–2013
20
Ginger Baker 1939–2019
21
Newman Taylor Baker 1943
22
Butch Ballard 1918–2011
23
Tom Bancroft 1967
24
Paul Barbarin 1899–1969
25
Danny Barcelona 1929–2007
26
Joey Baron 1955
27
Ray Barretto 1929–2006
28
Ray Bauduc 1906–1988
29
Louie Bellson 1924–2009
30
Warren Benbow 1954
31
Gregg Bendian 1963
32
Tommy Benford 1905–1994
33
Han Bennink 1942
34
Black Benny 1890–1924
35
Chief Bey 1913–2004
36
Big Bill Bissonnette 1937-2018
37
Snorre Bjerck 1962
38
Ivar Loe Bjørnstad 1981
39
James N. Black 1940–1988
40
Jim Black 1967
41
Cindy Blackman 1959
42
Ed Bla

In [28]:
# Show the complete DataFrame (no row/column/width truncation) when printing
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
# To make it easier to loop through DataFrame? (note the column name is lowercase)
# pDF = pDF.set_index('artist')

# Deleting Birth Year and Death Year columns for now because I don't think they're that necessary given the solution
# I've decided to pursue. However, I will leave this like so for now because I don't want to commit to this solution
# yet.
# errors='ignore' and re-assignment (instead of inplace=True) keep this cell re-runnable: running it a second
# time no longer raises a KeyError because the columns are already gone.
pDF = pDF.drop(columns = ['birth_year', 'death_year'], errors = 'ignore')

print(pDF)

                           artist                  instrument   birthdate  \
0                   Espen Aalberg                       Drums        1975   
1                   Knut Aalefjær                       Drums  1974-12-21   
2                  Brian Abrahams                    vocalist  1947-06-26   
3               Clarence Acox Jr.                       Drums        1952   
4                   Airto Moreira                       Drums  1941-08-05   
5                      Alex Acuña                       Drums  1944-12-12   
6              Annette A. Aguilar                  Percussion        1957   
7                     Rashied Ali                       Drums  1933-07-01   
8                       Don Alias                       Drums  1939-12-25   
9                      Carl Allen                       Drums        1961   
10                 Barry Altschul                       Drums  1943-01-06   
11                  Leon Anderson                       Drums         NaN   

In [29]:
# Adding 'alive' column to the DataFrame using np.select()
# The column holds the strings 'NaN' (no date information at all), 'True' (no death date) or 'False'
# (a death date is known).

conditions = [ (pDF['birthdate'] == 'NaN') & (pDF['deathdate'] == 'NaN'), (pDF['deathdate'] == 'NaN'),
             (pDF['deathdate'] != 'NaN') ]

# All choices and the default are strings. Mixing 'NaN' with the booleans True/False (and the implicit
# integer default 0) only worked through NumPy's promotion to strings, which newer NumPy releases reject.
# The resulting values are the same strings as before.
values = ['NaN', 'True', 'False']

pDF['alive'] = np.select(conditions, values, default = 'NaN')
print(pDF)

                           artist                  instrument   birthdate  \
0                   Espen Aalberg                       Drums        1975   
1                   Knut Aalefjær                       Drums  1974-12-21   
2                  Brian Abrahams                    vocalist  1947-06-26   
3               Clarence Acox Jr.                       Drums        1952   
4                   Airto Moreira                       Drums  1941-08-05   
5                      Alex Acuña                       Drums  1944-12-12   
6              Annette A. Aguilar                  Percussion        1957   
7                     Rashied Ali                       Drums  1933-07-01   
8                       Don Alias                       Drums  1939-12-25   
9                      Carl Allen                       Drums        1961   
10                 Barry Altschul                       Drums  1943-01-06   
11                  Leon Anderson                       Drums         NaN   

In [30]:
# Forcing all of the dates into datetime objects and deriving each artist's age in whole years

def _to_datetime_column(column):
    """Parse a column of date strings ('YYYY', 'YYYY-MM-DD', 'NaN', ...) into a datetime Series.

    Every value is parsed on its own, so a column that mixes year-only and full dates does not depend on
    pandas inferring a single format for the whole column. Unparseable values become NaT.
    """
    parsed = column.map(lambda value: pd.to_datetime(value, errors = 'coerce'))
    return pd.to_datetime(parsed, errors = 'coerce')


pDF['bday_dt'] = _to_datetime_column(pDF['birthdate'])
pDF['dday_dt'] = _to_datetime_column(pDF['deathdate'])

# Average Gregorian year (365.2425 days) expressed in seconds
SECONDS_PER_YEAR = 365.2425 * 24 * 60 * 60

# 'alive' holds the strings 'True' / 'False' / 'NaN'; astype(str) also tolerates real booleans
aliveFlag = pDF['alive'].astype(str)

# End of the measured life span: the death date for the deceased, today for the living
# (account for the age of people who are alive today as well), NaT when nothing is known
endDate = pDF['dday_dt'].where(aliveFlag == 'False')
endDate = endDate.mask(aliveFlag == 'True', pd.to_datetime('today'))

# Completed years as a float column; NaN when the age cannot be determined.
# (Replaces astype('timedelta64[Y]'), which pandas 2.x no longer supports.)
elapsedSeconds = (endDate - pDF['bday_dt']).dt.total_seconds()
pDF['age_yrs'] = np.floor(elapsedSeconds / SECONDS_PER_YEAR)

display(pDF)

Unnamed: 0,artist,instrument,birthdate,deathdate,alive,bday_dt,dday_dt,age_yrs
0,Espen Aalberg,Drums,1975,,True,1975-01-01,NaT,45.0
1,Knut Aalefjær,Drums,1974-12-21,,True,1974-12-21,NaT,45.0
2,Brian Abrahams,vocalist,1947-06-26,,True,1947-06-26,NaT,73.0
3,Clarence Acox Jr.,Drums,1952,,True,1952-01-01,NaT,68.0
4,Airto Moreira,Drums,1941-08-05,,True,1941-08-05,NaT,79.0
5,Alex Acuña,Drums,1944-12-12,,True,1944-12-12,NaT,75.0
6,Annette A. Aguilar,Percussion,1957,,True,1957-01-01,NaT,63.0
7,Rashied Ali,Drums,1933-07-01,2009-08-12,False,1933-07-01,2009-08-12,76.0
8,Don Alias,Drums,1939-12-25,2006-03-28,False,1939-12-25,2006-03-28,66.0
9,Carl Allen,Drums,1961,,True,1961-01-01,NaT,59.0


In [31]:
# Writes pDF to 'jazz_drummers1.csv' (relative to the working directory) without the index column.
# Outputs the .csv file that was then cleaned in Excel and put back into the jazz-pianist-files folder as 
# jazz_pianists_cleaned1.csv
# NOTE(review): the folder and cleaned-file names above mention pianists while this cell writes the drummers
# file - confirm whether the comment was carried over from the pianists notebook.
pDF.to_csv('jazz_drummers1.csv', index = False)