In [1]:
# A project to build a basic database of jazz musicians using wikipedia data, then do analysis on it. Keep in mind
# that all of this information is from Wikipedia, so some of it may be inaccurate. 

In [1]:
import numpy as np
# numpy is used directly later in the notebook (np.select); pandas also depends on it.
import pandas as pd
# For dataframes
import requests
# For fetching data from Wikipedia
import bs4
# For html navigation and web scraping
import re
# For regex operations
from time import sleep
# For time delay on for loop
from spellchecker import SpellChecker
# For spellchecking words
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Starting out with just the list of jazz pianists; the plan is to later expand this to the Wikipedia lists of
# drummers, bassists, saxophonists, trumpeters, flautists, trombonists, etc.
LIST_URL = "https://en.wikipedia.org/wiki/List_of_jazz_pianists"
result = requests.get(LIST_URL)
# Fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error page, which would
# leave every dictionary built from this response empty.
result.raise_for_status()

In [3]:
# Parse the downloaded list page with Python's built-in HTML parser; every later cell
# navigates the resulting bs4 tree through the name 'soup'.
soup = bs4.BeautifulSoup(markup=result.text, features="html.parser")

In [4]:
# Builds uPianistsLinks, a dictionary mapping each artist's name (the link text) to the relative
# wiki link of that artist (the href of the <a> tag), e.g. 'Don Abney' -> '/wiki/Don_Abney'.
uPianistsLinks = dict()

# Each letter category on the page is wrapped in a <div class="div-col">
for div_col in soup.find_all('div', {'class' : 'div-col'}):
    # href=True skips any <a> tag that has no href attribute (e.g. a bare anchor), which would
    # otherwise raise a KeyError when the link is read.
    for artist_tag in div_col.find_all('a', href=True):
        artist_name = artist_tag.text
        artist_href = artist_tag['href']
        print(f'{artist_name}, {artist_href}')
        # If the same link text appears twice, the later link wins
        uPianistsLinks[artist_name] = artist_href
            


Irving Aaronson, /wiki/Irving_Aaronson
Anders Aarum, /wiki/Anders_Aarum
Mike Abene, /wiki/Mike_Abene
Don Abney, /wiki/Don_Abney
Chris Abrahams, /wiki/Chris_Abrahams
Muhal Richard Abrams, /wiki/Muhal_Richard_Abrams
John Adriano Acea, /wiki/John_Adriano_Acea
Beegie Adair, /wiki/Beegie_Adair
Kei Akagi, /wiki/Kei_Akagi
Toshiko Akiyoshi, /wiki/Toshiko_Akiyoshi
Erling Aksdal Jr., /wiki/Erling_Aksdal
Joe Albany, /wiki/Joe_Albany
Tony Aless, /wiki/Tony_Aless
Charlie Alexander, /wiki/Charlie_Alexander
Joey Alexander, /wiki/Joey_Alexander
Monty Alexander, /wiki/Monty_Alexander
Sinan Alimanović, /wiki/Sinan_Alimanovi%C4%87
Esther Allan, /wiki/Esther_Allan
Geri Allen, /wiki/Geri_Allen
Steve Allen, /wiki/Steve_Allen
Mose Allison, /wiki/Mose_Allison
Mikhail Alperin, /wiki/Mikhail_Alperin
Helio Alves, /wiki/Helio_Alves
Jimmy Amadie, /wiki/Jimmy_Amadie
Albert Ammons, /wiki/Albert_Ammons
Franck Amsallem, /wiki/Franck_Amsallem
Chris Anderson, /wiki/Chris_Anderson_(piano)
Bill Anschell, /wiki/Bill_Ansche

In [5]:
# Builds uPianistsYears, a dictionary mapping each artist's name to the years listed next to the
# name on the "list of jazz pianists" page. This matters because some of the artists' individual
# wiki pages don't list their dates, or don't list them in a table that is easy to scrape, so the
# list page is used as the fallback source for the years.
#
# Each value is one of:
#   'YYYY'       - only a birth year is listed (e.g. "(born 1974)")
#   'YYYY–YYYY'  - birth and death year, always joined with an en dash so that later cells can
#                  slice the two years out by position
#   'NaN'        - no usable year information is listed
uPianistsYears = dict()

# Raw strings avoid the invalid '\(' escape-sequence warning of a plain string literal.
# Everything after the first '(' that has at least one character before the next ')':
PAREN_CONTENT_RE = re.compile(r'\(([^)]+)')
# A four-digit year, optionally followed by a dash (hyphen, en dash or em dash, with or without
# surrounding spaces) and a second four-digit year. Searching for the years directly is more
# robust than splitting on whitespace: "born c. 1950" and "1895 – 1963" are handled too.
YEARS_RE = re.compile(r'(\d{4})(?:\s*[–—-]\s*(\d{4}))?')

# Again using the div-col tags to get the letter categories
for div_col in soup.find_all('div', {'class' : 'div-col'}):
    # This time the <li> tags are used, because their text contains the year data
    for info in div_col.find_all('li'):
        # info.text is a string like "Bill Evans (1929–1980)"
        s = info.text

        paren_match = PAREN_CONTENT_RE.search(s)
        if paren_match:
            # The name is everything before the first '('
            fullName = s[0:s.find('(')].strip()

            years_match = YEARS_RE.search(paren_match.group(1))
            if years_match is None:
                # Parentheses without any year in them
                yearDates = 'NaN'
            elif years_match.group(2):
                yearDates = f'{years_match.group(1)}–{years_match.group(2)}'
            else:
                yearDates = years_match.group(1)
        else:
            # No parentheses means no year data; the whole text is the name
            fullName = s.strip()
            yearDates = 'NaN'

        uPianistsYears[fullName] = yearDates
            

In [6]:
# Sanity check: dump the name -> wiki link dictionary
print(uPianistsLinks)

{'Irving Aaronson': '/wiki/Irving_Aaronson', 'Anders Aarum': '/wiki/Anders_Aarum', 'Mike Abene': '/wiki/Mike_Abene', 'Don Abney': '/wiki/Don_Abney', 'Chris Abrahams': '/wiki/Chris_Abrahams', 'Muhal Richard Abrams': '/wiki/Muhal_Richard_Abrams', 'John Adriano Acea': '/wiki/John_Adriano_Acea', 'Beegie Adair': '/wiki/Beegie_Adair', 'Kei Akagi': '/wiki/Kei_Akagi', 'Toshiko Akiyoshi': '/wiki/Toshiko_Akiyoshi', 'Erling Aksdal Jr.': '/wiki/Erling_Aksdal', 'Joe Albany': '/wiki/Joe_Albany', 'Tony Aless': '/wiki/Tony_Aless', 'Charlie Alexander': '/wiki/Charlie_Alexander', 'Joey Alexander': '/wiki/Joey_Alexander', 'Monty Alexander': '/wiki/Monty_Alexander', 'Sinan Alimanović': '/wiki/Sinan_Alimanovi%C4%87', 'Esther Allan': '/wiki/Esther_Allan', 'Geri Allen': '/wiki/Geri_Allen', 'Steve Allen': '/wiki/Steve_Allen', 'Mose Allison': '/wiki/Mose_Allison', 'Mikhail Alperin': '/wiki/Mikhail_Alperin', 'Helio Alves': '/wiki/Helio_Alves', 'Jimmy Amadie': '/wiki/Jimmy_Amadie', 'Albert Ammons': '/wiki/Albert

In [7]:
# Sanity check: dump the name -> years dictionary
print(uPianistsYears)

{'Irving Aaronson': '1895–1963', 'Anders Aarum': '1974', 'Mike Abene': '1942', 'Don Abney': '1923–2000', 'Chris Abrahams': '1961', 'Muhal Richard Abrams': '1930–2017', 'John Adriano Acea': '1917–1963', 'Beegie Adair': '1937', 'Kei Akagi': '1953', 'Toshiko Akiyoshi': '1929', 'Erling Aksdal Jr.': '1953', 'Joe Albany': '1924–1988', 'Tony Aless': '1921–1985', 'Charlie Alexander': '1890–1970', 'Joey Alexander': '2003', 'Monty Alexander': '1944', 'Sinan Alimanović': '1954', 'Esther Allan': '1914–1985', 'Geri Allen': '1957–2017', 'Steve Allen': '1921–2000', 'Mose Allison': '1927–2016', 'Mikhail Alperin': '1956', 'Helio Alves': '1966', 'Jimmy Amadie': '1937–2013', 'Albert Ammons': '1907–1949', 'Franck Amsallem': '1961', 'Chris Anderson': '1926–2008', 'Bill Anschell': 'NaN', 'Jim Aton': '1925–2008', 'Ivar Antonsen': '1946', 'Lil Hardin Armstrong': '1898–1971', 'Dag Arnesen': '1950', 'Lynne Arriale': '1957', 'Roy Assaf': 'NaN', 'Ehud Asherie': '1979', 'Fahir Atakoğlu': 'NaN', 'Eivind Austad': '1

In [8]:
# Combine the two dictionaries so that each artist's name maps to a (wiki link, years) tuple,
# e.g. 'Don Abney' -> ('/wiki/Don_Abney', '1923–2000').
# Only artists that have a link are kept. If an artist has no entry in uPianistsYears (for
# example because the link text and the list-item text differ), the years fall back to 'NaN',
# the same marker used for "no year data", instead of raising a KeyError.
combinedList = [uPianistsLinks, uPianistsYears]
fullPianistsList = {
    artist: (link, uPianistsYears.get(artist, 'NaN'))
    for artist, link in uPianistsLinks.items()
}

In [9]:
# Sanity check: dump the combined name -> (wiki link, years) dictionary
print(fullPianistsList)

{'Irving Aaronson': ('/wiki/Irving_Aaronson', '1895–1963'), 'Anders Aarum': ('/wiki/Anders_Aarum', '1974'), 'Mike Abene': ('/wiki/Mike_Abene', '1942'), 'Don Abney': ('/wiki/Don_Abney', '1923–2000'), 'Chris Abrahams': ('/wiki/Chris_Abrahams', '1961'), 'Muhal Richard Abrams': ('/wiki/Muhal_Richard_Abrams', '1930–2017'), 'John Adriano Acea': ('/wiki/John_Adriano_Acea', '1917–1963'), 'Beegie Adair': ('/wiki/Beegie_Adair', '1937'), 'Kei Akagi': ('/wiki/Kei_Akagi', '1953'), 'Toshiko Akiyoshi': ('/wiki/Toshiko_Akiyoshi', '1929'), 'Erling Aksdal Jr.': ('/wiki/Erling_Aksdal', '1953'), 'Joe Albany': ('/wiki/Joe_Albany', '1924–1988'), 'Tony Aless': ('/wiki/Tony_Aless', '1921–1985'), 'Charlie Alexander': ('/wiki/Charlie_Alexander', '1890–1970'), 'Joey Alexander': ('/wiki/Joey_Alexander', '2003'), 'Monty Alexander': ('/wiki/Monty_Alexander', '1944'), 'Sinan Alimanović': ('/wiki/Sinan_Alimanovi%C4%87', '1954'), 'Esther Allan': ('/wiki/Esther_Allan', '1914–1985'), 'Geri Allen': ('/wiki/Geri_Allen', '

In [10]:
# Spell checker restricted to a hand-made vocabulary: no built-in language dictionary is loaded
# (language = None), matching is case sensitive, and the only known words are the instrument
# names below, which cover most, if not all, instruments used to play jazz.
jazz_instruments = [
    'Piano', 'Vocals', 'Keyboards', 'Double bass', 'Saxophone', 'Clarinet',
    'Trumpet', 'Trombone', 'Drums', 'Guitar', 'Flute', 'Vibraphone',
    'Banjo', 'Violin', 'Viola', 'Cello', 'Synthesizer',
]
spell = SpellChecker(language = None, case_sensitive = True)
a = spell.word_frequency.load_words(jazz_instruments)

In [11]:
# Main script: for every artist in fullPianistsList, download the artist's wiki page, scrape the
# biographical infobox, and build the DataFrame pDF with one row per artist.
#
# Columns of pDF:
#   artist      - the artist's name
#   instrument  - first instrument listed in the infobox ('Piano' if none is found)
#   birthdate   - 'YYYY-MM-DD' from the infobox, else the birth year from the list page, else 'NaN'
#   deathdate   - 'YYYY-MM-DD' from the infobox, else the death year from the list page, else 'NaN'
#   birth_year  - first four characters of birthdate
#   death_year  - first four characters of deathdate
#
# On the legality and ethicality of this operation: it is okay to run a script like this as long
# as there is at least a one-second delay between requests, so as to not create too much traffic.
# The delay is therefore applied on every iteration; shorten it only for small test runs.
REQUEST_DELAY_SECONDS = 1.0

PIANIST_COLUMNS = ['artist', 'instrument', 'birthdate', 'deathdate', 'birth_year', 'death_year']

# The biographical table is looked up by this exact class string; pages without such a table
# fall back entirely to the data from the list page.
INFOBOX_ATTRS = {'class' : 'infobox vcard plainlist'}

# Rows are collected in a plain list and converted to a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
records = []

# Running counter of processed artists (printed as progress)
runs = 0

# pianist = pianist's name; linkYear = (wiki link, years string from the list page), where the
# years string is 'YYYY', 'YYYY–YYYY' or 'NaN'
for pianist, linkYear in fullPianistsList.items():

    listYears = linkYear[1]
    print(pianist, listYears)

    bday = ''
    dday = ''
    bYear = ''
    dYear = ''
    # None = no instrument found yet; only the first instrument found is kept
    instrument = None

    # Create the link by appending the artist's relative link to the general wikipedia address
    wikiLink = 'https://en.wikipedia.org' + linkYear[0]

    # Following the basic requests and bs4 process to read in a webpage
    pInfo = requests.get(wikiLink)
    pSoup = bs4.BeautifulSoup(pInfo.text, "html.parser")

    aTable = pSoup.find('table', INFOBOX_ATTRS)
    if aTable is not None:

        # All of the necessary information is within the <tr> row elements
        for table in aTable.find_all('tr'):

            # Primary instrument: the first string inside the first <td class="note"> cell, cut
            # at the first comma (sorry multi-instrumentalists) and run through the spell checker.
            noteCell = table.find('td', {'class' : 'note'})
            if noteCell is not None and instrument is None:
                firstString = next(noteCell.stripped_strings, None)
                if firstString is not None:
                    candidate = firstString.split(',')[0]
                    # Depending on the pyspellchecker version, correction() can return None when
                    # it has no suggestion; keep the scraped text in that case.
                    corrected = spell.correction(candidate) or candidate
                    instrument = corrected.split(',')[0]

            # Birthday: text of the <span class="bday"> tag when present. The .string guard
            # covers spans whose content is not a single string (.string is None then).
            bdaySpan = table.find('span', {'class' : 'bday'})
            if bdaySpan is not None and bdaySpan.string:
                print(bdaySpan.string)
                bday = str(bdaySpan.string)
                bYear = bday[0:4]

            # Death date: taken from <span style="display:none"> tags that consist of a single
            # string; the first and last character (the surrounding parentheses) are removed.
            # If several such spans exist, the last one wins.
            for row in table.find_all('span', {'style' : 'display:none'}):
                if row.string:
                    print(row.string[1:-1])
                    dday = row.string[1:-1]
                    dYear = dday[0:4]

    # Anything the infobox did not provide (or everything, when there is no infobox) falls back
    # to the years from the list page. For 'NaN' and for 'YYYY' the first four characters are the
    # whole string; for 'YYYY–YYYY' characters 5-8 are the death year.
    if not bday:
        bday = listYears[0:4]
        bYear = bday

    if not dday:
        dday = listYears[5:9] if len(listYears) > 4 else 'NaN'
        dYear = dday

    if instrument is None:
        instrument = 'Piano'

    print('\n')

    records.append( {'artist' : pianist, 'instrument' : instrument, 'birthdate' : str(bday),
                     'deathdate' : str(dday), 'birth_year' : bYear, 'death_year' : dYear} )

    runs += 1
    print(runs)

    # Time delay between requests (see the note at the top of this cell)
    sleep(REQUEST_DELAY_SECONDS)

# DataFrame storing the information on the pianists
pDF = pd.DataFrame(records, columns = PIANIST_COLUMNS)



Irving Aaronson 1895–1963
1963-03-10


1
Anders Aarum 1974
1974-12-17


2
Mike Abene 1942


3
Don Abney 1923–2000
2000-01-20


4
Chris Abrahams 1961


5
Muhal Richard Abrams 1930–2017
1930-09-19
2017-10-29


6
John Adriano Acea 1917–1963


7
Beegie Adair 1937
1937-12-11


8
Kei Akagi 1953
1953-03-16


9
Toshiko Akiyoshi 1929
1929-12-12


10
Erling Aksdal Jr. 1953
1953-02-18


11
Joe Albany 1924–1988
1924-01-24
1988-01-12


12
Tony Aless 1921–1985
1921-08-28
1988-01-11


13
Charlie Alexander 1890–1970


14
Joey Alexander 2003
2003-06-25


15
Monty Alexander 1944
1944-06-06


16
Sinan Alimanović 1954
1954-02-11


17
Esther Allan 1914–1985


18
Geri Allen 1957–2017


19
Steve Allen 1921–2000


20
Mose Allison 1927–2016
1927-11-11
2016-11-15


21
Mikhail Alperin 1956
1956-11-07
2018-05-11


22
Helio Alves 1966


23
Jimmy Amadie 1937–2013


24
Albert Ammons 1907–1949
1907-03-01
1949-12-02


25
Franck Amsallem 1961
1961-10-25


26
Chris Anderson 1926–2008
1926-02-26
2008-02-04


27
Bill Ansc

1932-05-20
2008-05-15


229
Roberto Fonseca 1975


230
Joel Forrester 1946


231
Sullivan Fortner 1986
1986-12-29


232
Herman Foster 1928–1999
1928-04-26
1999-04-03


233
Ray Foxley 1928–2002


234
Harmen Fraanje 1976
1976-01-23


235
Ming Freeman NaN
1963-10-26


236
Russ Freeman 1926–2002
1926-05-28
2002-06-27


237
Sharon Freeman NaN


238
Don Friedman 1935–2016
1935-05-04
2016-06-30


239
David Frishberg 1933
1933-03-23


240
Satoko Fujii 1958
1958-10-09


241
Larry Fuller 1965
1965-07-14


242
Arve Furset 1964
1964-12-05


243
Joel Futterman 1946


244
Hal Galper 1938
1938-04-18


245
Salman Gambarov 1959
1959-04-18


246
Laszlo Gardony 1956


247
Red Garland 1923–1984
1923-05-13
1984-04-23


248
Erroll Garner 1921–1977
1921-06-15
1977-01-02


249
Michael Garrick 1933–2011


250
Giorgio Gaslini 1929–2014
1929-10-22
2014-07-29


251
Charles Gayle 1939
1939-02-28


252
Chris Geith NaN


253
Leo Genovese 1979


254
George Gershwin 1898–1937


255
Lafayette Gilchrist 1967
1967-08-03


1983-07-13


457
Harold Mabern 1936–2019
1936-03-20
2019-09-17


458
Earl MacDonald 1970
1970-07-26


459
Dave Mackay 1932


460
Shai Maestro 1987
1987-02-05


461
Roberto Magris 1959
1959-06-19


462
Adam Makowicz 1940
1940-08-18


463
John Malachi 1919–1987
1987-02-11


464
Junior Mance 1928
1928-10-10


465
Henry Mancini 1924–1994
1924-04-16
1994-06-14


466
Emanuele Maniscalco 1983


467
Frank Mantooth 1947–2004


468
Fate Marable 1890–1947


469
Tânia Maria 1948
1948-05-09


470
César Camargo Mariano 1943


471
Phil Markowitz 1952


472
Jon Marks 1947–2007


473
Michael "Dodo" Marmarosa 1925–2002
1925-12-12
2002-09-17


474
Denman Maroney 1949


475
André Marques 1975


476
Ellis Marsalis, Jr. 1934–2020
1934-11-14
2020-04-01


477
Peter Martin 1970


478
Ronnie Mathews 1935–2008
1935-12-02
2008-06-28


479
Keiko Matsui 1961
1961-07-26


480
Takashi Matsunaga 1986


481
David Matthews 1942


482
Rebeca Mauleon 1962


483
Jon Mayer 1938
1938-09-07


484
Bill Mays 1944
1944-02-05


4

2013-06-29


676
Willie "The Lion" Smith 1897–1973
1893-11-23
1973-04-18


677
Martial Solal 1927
1927-08-23


678
Jess Stacy 1904–1994
1904-08-11
1995-01-01


679
Bobo Stenson 1944
1944-08-04


680
Tommy Stewart 1939
1939-11-19


681
Vigleik Storaas 1963
1963-02-02


682
Ståle Storløkken 1969
1969-02-22


683
Geoff Stradling 1955


684
Charley Straight 1891–1940


685
Billy Strayhorn 1915–1967
1915-11-29
1967-05-31


686
Frank Strazzeri 1930–2014
1930-04-24
2014-05-09


687
Dana Suesse 1909–1987
1987-10-16


688
Joe Sullivan 1906–1971
1971-10-13


689
Helen Sung NaN


690
Ralph Sutton 1922–2001
1922-11-04
2001-12-30


691
Esbjörn Svensson 1964–2008
1964-04-16
2008-06-14


692
Hermann Szobel 1958


693
Craig Taborn 1970
1970-02-20


694
Aki Takase 1948
1948-01-26


695
Ayumi Tanaka 1986


696
Israel Tanenbaum 1961


697
Horace Tapscott 1934–1999
1934-04-06
1999-02-27


698
Art Tatum 1909–1956
1909-10-13
1956-11-05


699
Billy Taylor 1921–2010
1921-07-24
2010-12-28


700
Cecil Taylor 19

In [12]:
# Show complete frames: no row/column truncation, no line wrapping, no cell-text truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Dropping the birth_year and death_year columns for now because they aren't needed for the
# solution pursued below. Reassigning instead of inplace = True, together with errors = 'ignore',
# makes this cell safe to re-run: a second run no longer raises a KeyError for columns that are
# already gone.
pDF = pDF.drop(columns = ['birth_year', 'death_year'], errors = 'ignore')

print(pDF)

                          artist       instrument   birthdate   deathdate
0                Irving Aaronson            Piano        1895  1963-03-10
1                   Anders Aarum            Piano  1974-12-17         NaN
2                     Mike Abene            Piano        1942         NaN
3                      Don Abney            Piano        1923  2000-01-20
4                 Chris Abrahams            Piano        1961         NaN
5           Muhal Richard Abrams            Piano  1930-09-19  2017-10-29
6              John Adriano Acea            Piano        1917        1963
7                   Beegie Adair            Piano  1937-12-11         NaN
8                      Kei Akagi            Piano  1953-03-16         NaN
9               Toshiko Akiyoshi            Piano  1929-12-12         NaN
10             Erling Aksdal Jr.            Piano  1953-02-18         NaN
11                    Joe Albany            Piano  1924-01-24  1988-01-12
12                    Tony Aless      

In [13]:
# Adding 'alive' column to the DataFrame using np.select()
# https://www.dataquest.io/blog/tutorial-add-column-pandas-dataframe-based-on-if-else-condition/
#
# 'alive' holds the strings 'NaN' (neither date known), 'True' (no death date) or 'False'
# (death date known). The values are spelled as strings on purpose: np.select needs one common
# dtype for all choices and its default. Mixing the string 'NaN' with real booleans (and the
# implicit integer default 0) made older NumPy silently convert the booleans to strings, and
# recent NumPy versions refuse the mix with a TypeError.

no_dates = (pDF['birthdate'] == 'NaN') & (pDF['deathdate'] == 'NaN')
no_death = (pDF['deathdate'] == 'NaN')

# Conditions are checked in order, the first match wins
conditions = [ no_dates, no_death, ~no_death ]

values = ['NaN', 'True', 'False']

# The three conditions cover every row; the default only keeps the result dtype a string
pDF['alive'] = np.select(conditions, values, default = 'NaN')
print(pDF)

                          artist       instrument   birthdate   deathdate  \
0                Irving Aaronson            Piano        1895  1963-03-10   
1                   Anders Aarum            Piano  1974-12-17         NaN   
2                     Mike Abene            Piano        1942         NaN   
3                      Don Abney            Piano        1923  2000-01-20   
4                 Chris Abrahams            Piano        1961         NaN   
5           Muhal Richard Abrams            Piano  1930-09-19  2017-10-29   
6              John Adriano Acea            Piano        1917        1963   
7                   Beegie Adair            Piano  1937-12-11         NaN   
8                      Kei Akagi            Piano  1953-03-16         NaN   
9               Toshiko Akiyoshi            Piano  1929-12-12         NaN   
10             Erling Aksdal Jr.            Piano  1953-02-18         NaN   
11                    Joe Albany            Piano  1924-01-24  1988-01-12   

In [14]:
# Convert the scraped date strings into datetime columns and compute each artist's age in years.

# Length of a year in seconds (365.2425 days, the same year length NumPy's 'Y' unit uses)
SECONDS_PER_YEAR = 365.2425 * 24 * 60 * 60


def _parse_partial_dates(column):
    """Parse a column of 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' strings into datetimes.

    Year-only values become January 1st of that year and year-month values the 1st of that
    month. Everything else (including the 'NaN' marker) becomes NaT. The strings are
    normalised to one format first, because pandas >= 2.0 infers a single format from the
    first value and would coerce every value written in a different format to NaT.
    """
    text = column.astype(str).str.strip()
    year_only = text.str.match(r'^\d{4}$')
    year_month = text.str.match(r'^\d{4}-\d{2}$')
    text = text.mask(year_only, text + '-01-01').mask(year_month, text + '-01')
    return pd.to_datetime(text, format = '%Y-%m-%d', errors = 'coerce')


def _whole_years(delta):
    """Convert a timedelta Series to completed years as floats (NaN where the delta is NaT)."""
    # astype('timedelta64[Y]') is not supported by pandas >= 2.0, so divide explicitly
    return np.floor(delta.dt.total_seconds() / SECONDS_PER_YEAR)


pDF['bday_dt'] = _parse_partial_dates(pDF['birthdate'])
pDF['dday_dt'] = _parse_partial_dates(pDF['deathdate'])

# 'alive' holds the strings 'True' / 'False' / 'NaN'; astype(str) makes the comparison work
# even if the column contains real booleans.
alive_text = pDF['alive'].astype(str)

# Account for the age of people who are alive today as well
conditions = [ (alive_text == 'True'), (alive_text == 'False') ]

values = [ _whole_years(pd.to_datetime('today') - pDF['bday_dt']),
           _whole_years(pDF['dday_dt'] - pDF['bday_dt']) ]

# Rows with unknown status get a real NaN (not the string 'NaN'), so the column stays numeric
pDF['age_yrs'] = np.select(conditions, values, default = np.nan)

display(pDF)

Unnamed: 0,artist,instrument,birthdate,deathdate,alive,bday_dt,dday_dt,age_yrs
0,Irving Aaronson,Piano,1895,1963-03-10,False,1895-01-01,1963-03-10,68.0
1,Anders Aarum,Piano,1974-12-17,,True,1974-12-17,NaT,45.0
2,Mike Abene,Piano,1942,,True,1942-01-01,NaT,78.0
3,Don Abney,Piano,1923,2000-01-20,False,1923-01-01,2000-01-20,77.0
4,Chris Abrahams,Piano,1961,,True,1961-01-01,NaT,59.0
5,Muhal Richard Abrams,Piano,1930-09-19,2017-10-29,False,1930-09-19,2017-10-29,87.0
6,John Adriano Acea,Piano,1917,1963,False,1917-01-01,1963-01-01,45.0
7,Beegie Adair,Piano,1937-12-11,,True,1937-12-11,NaT,82.0
8,Kei Akagi,Piano,1953-03-16,,True,1953-03-16,NaT,67.0
9,Toshiko Akiyoshi,Piano,1929-12-12,,True,1929-12-12,NaT,90.0


In [15]:
# Save the finished table to a CSV file in the working directory, leaving out the row index
output_csv = 'jazz_pianists1.csv'
pDF.to_csv(output_csv, index = False)