In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import math


from bs4 import BeautifulSoup
import requests
import re
from fuzzywuzzy import fuzz



# 1 Scrape Genre

## 1.1 Main function

In [46]:
# Sample song used to try the scraper by hand. The page URL ends in an
# encoded apostrophe (%27), matching the apostrophe in the title.
url = 'https://en.wikipedia.org/wiki/Groovin%27'
title = "Groovin'"
artist = "Booker T. and The M.G.'s"

In [160]:
def scrape_genre(url, title, artist, threshold=70, timeout=30):
    ''' Scrape a song's genre from the infobox of its web page.

    Two or more songs with the same title (but different artists) can share
    one url, so every infobox on the page is inspected and the first one whose
    artist matches ``artist`` decides the result.

    Args:
        url (str):       page to download.
        title (str):     song title. Currently unused; kept so existing
                         callers keep working.
        artist (str):    artist to look for. eg. 'The Young Rascals'
        threshold (int): minimum fuzz.ratio score for two artist names to be
                         considered the same.
        timeout (float): seconds to wait for the server before giving up.
    Return:
        (str): the cleaned genre of the first matching infobox, or '' when no
               infobox matches or the matching infobox has no genre cell.

    Errors raised by requests.get (eg. connection failure or timeout) are not
    caught here and propagate to the caller.
    '''
    response = requests.get(url, timeout=timeout)               # url  -> html
    soup = BeautifulSoup(response.content, 'html.parser')       # html -> soup object

    for infobox in soup.find_all('table', class_=re.compile('infobox.*')):
        header = infobox.find('th', class_='description')
        if header is None:                                      # no description
            continue
        des_artist = get_artist(header.get_text())

        if fuzz.ratio(des_artist, artist) < threshold:
            # Fallback: 'the Beatles and George Martin' vs 'The Beatles' score
            # low but are the same artist, so accept a case-insensitive
            # substring match in either direction. Local lowercase copies are
            # used so that `artist` is compared unchanged for every infobox.
            des_lower = des_artist.lower()
            artist_lower = artist.lower()
            # An empty description artist is a substring of everything and
            # must not count as a match.
            is_substring = bool(des_lower) and (
                artist_lower in des_lower or des_lower in artist_lower
            )
            if not is_substring:
                continue

        category = infobox.find('td', class_='category hlist')
        if category is None:                                    # infobox without a genre cell
            return ''
        return clean_genre(category.get_text())
    return ''

In [None]:
# Smoke test with the notebook-level url / title / artist variables (performs a
# live HTTP request). scrape_genre relies on get_artist and clean_genre, which
# are defined in later cells of this notebook - run those cells first.
scrape_genre(url, title, artist)

In [115]:
# Why scrape_genre has a substring fallback: the infobox can credit the artist
# with extra words. str.find returns the index where the text starts (0 here)
# and -1 when it is absent.
# NOTE: this cell rebinds the notebook-level `artist` variable.
artist = 'Bob Moore'
des_artist = 'Bob Moore & His Orchestra'
des_artist.find(artist)

0

## 1.2 Clean Genre

1. before cleaning: `Rock, country, soul,[1] pop[2]`
2. after cleaning: `rock, country, soul, pop` (citation markers removed, text lowercased)

In [242]:
def clean_genre_2(genre):
    ''' Remove one bracketed citation marker from the genre.

    Only a single bracketed span is removed per call. Because the first group
    of the pattern is greedy, it is the last such span in the string, so a
    caller has to call this repeatedly to remove every marker.

    Args:
        genre (str): need to be cleaned. eg. 'R&B, blue-eyed soul[1]'
    Return:
        (str): 'R&B, blue-eyed soul'. The input is returned unchanged when it
               contains no bracketed span.
    '''
    # Raw string: a backslash followed by '[' is an invalid escape sequence in
    # a plain string literal.
    re_genre = r'(.*)(\[.*\])(.*)'   # Rock, country, soul,[1] pop[2]
    match = re.search(re_genre, genre)
    if match:
        return match.group(1) + match.group(3)
    return genre

In [269]:
def clean_genre(genre):
    ''' Normalise the raw genre text taken from an infobox.

    Leading/trailing newlines are dropped, the remaining newlines become
    ', ', bracketed citation markers such as [1] are removed and the result
    is lowercased.

    Args:
        genre (str): eg. 'Rock, country, soul,[1] pop[2]'
    Return:
        (str):       eg. 'rock, country, soul, pop'
    '''
    genre = genre.strip('\n')
    genre = genre.replace('\n', ', ')          # get rid of newlines
    # Remove every [..] marker in one pass. A stray '[' without a closing ']'
    # is left in place instead of being retried forever.
    genre = re.sub(r'\[[^\]]*\]', '', genre)
    return genre.lower()                       # lowercase

In [244]:
# Sanity check of clean_genre on a string with two citation markers.
# NOTE(review): clean_genre lowercases its result; the saved output of this
# cell shows capitalised text, so it appears to come from an earlier version -
# re-run to refresh it.
genre = 'Rock, country, soul,[1] pop[2]'
print('before cleaning: ', genre)
print('after cleaning:  ', clean_genre(genre))

before cleaning:  Rock, country, soul,[1] pop[2]
after cleaning:   Rock, country, soul, pop


## 1.3 Extract Artist

In [4]:
def get_artist(description):
    ''' Get the artist from infobox's description

    The artist is everything after the first ' by ' in the description, with
    surrounding whitespace removed.

    Args:
        description (str): eg. 'Single by The Young Rascals'
    Return:
        (str):             eg. 'The Young Rascals'. The string 'null' is
                           returned when the description has no ' by ' or
                           nothing follows it.
    '''
    # The first group is non-greedy so the split happens at the FIRST ' by ';
    # a greedy group would cut an artist name that itself contains ' by '.
    re_artist = '(.*?) by (.*)'   # eg. Single by The Young Rascals

    match = re.search(re_artist, description)
    if match:
        name = match.group(2).strip()
        if name:
            return name
    return 'null'

In [7]:
# Quick check of get_artist: the artist is the text that follows ' by '.
description = 'Single by Chubby Checker'
get_artist(description)

'Chubby Checker'


## 1.4 Approximate String Match

The following two strings name the same artist, so they should be treated as a match by an edit-distance-based similarity score (`fuzz.ratio`):

1. Booker T & the M.G.s  
2. Booker T. and The M.G.'s

In [41]:
# Similarity score of the two spellings; the saved output (82) is above the
# threshold of 70 that scrape_genre uses to accept a match.
fuzz.ratio('Booker T & the M.G.s',  'Booker T. and The M.G.\'s')

82

# 2 Add Genre to Dataframe

In [281]:
# Load the song table; the cells below read its Artist, Title and URL columns.
titles_url = pd.read_csv('../music_data/titles_url.csv')
# Create a new, all-missing column for the scraped genre. The explicit object
# dtype avoids the default-dtype warning for an empty Series and gives a
# column that can hold the genre strings written later.
titles_url['GenreRaw'] = pd.Series(dtype='object')

In [279]:
# test: try the scraper on the first row of the table before running it on
# every song (performs a live HTTP request).
# NOTE: this cell rebinds the notebook-level artist / title / url variables.
song = titles_url.iloc[0,:]
artist = song.Artist
title  = song.Title
url    = song.URL
scrape_genre(url, title, artist)

'easy listening'

In [288]:
# Scrape the genre of every song that has a URL and store it in GenreRaw.
# Rows without a URL are skipped, so their GenreRaw stays missing.

# Make sure the target column can hold strings before writing into it.
titles_url['GenreRaw'] = titles_url['GenreRaw'].astype(object)
genre_col = titles_url.columns.get_loc('GenreRaw')

for i in range(len(titles_url)):
    song = titles_url.iloc[i]
    artist = song.Artist
    title  = song.Title
    url    = song.URL
    if pd.isna(url):              # url is nan (looked up by name, not by position)
        continue
    genre = scrape_genre(url, title, artist)
    titles_url.iat[i, genre_col] = genre   # write only the genre cell of row i
    print(i, genre)

titles_url.to_csv('../music_data/genres.csv')

0 easy listening
1 country, pop
2 pop
3 pop
4 pop
5 country
6 rock
7 rock
8 
9 rock and roll
10 pop, teenage tragedy song
11 pop, country
12 
13 pop
14 rockabilly
15 pop
16 
17 vocal, country
18 vocal
19 pop
20 
21 pop
22 pop
23 rock and roll, rhythm and blues
24 rock and roll
25 pop
26 rock and roll, pop
27 pop
28 pop rock, rock and roll
29 pop
30 rhythm and blues, rock and roll
31 rock and roll, bossa nova
32 
33 rock, country, soul, pop
34 country, pop, soul
35 rock and roll
36 r&b
37 rock and roll
38 
40 rock and roll
41 rock and roll
42 rhythm and blues, rock and roll, soul
43 pop
44 pop
45 pop
46 pop
47 traditional pop music
48 country pop
49 country
50 rock and roll
51 
52 rock and roll
53 pop
54 rock and roll
55 pop
57 
59 country, tex-mex
60 country
61 
62 bubblegum pop
63 pop
64 pop, rock 'n' roll
65 country, rock
66 bolero
67 
68 rockabilly
69 rock
70 rock
71 
72 doo-wop
73 traditional pop
74 pop
79 pop
80 rock, surf rock
81 rock, surf rock
83 r&b, pop
84 
85 
86 pop, soul
8

647 soul, r&b, pop
648 pop, vaudeville, bubblegum
649 pop
650 country, schlager
651 pop, schlager
652 
653 
654 folk rock
655 garage rock
656 soul, funk, rhythm and blues
657 funk
658 beat, garage rock, blues rock
659 r&b, novelty
660 
661 folk rock
662 blues rock, jazz
663 country rock
664 beat
666 pop rock
667 raga rock
668 psychedelic rock
669 
670 pop, orchestral pop, r&b
671 country, rock, beat
672 
673 garage rock, folk rock
674 garage rock, folk rock, pop rock
675 
676 folk rock, jazz, soul
677 pop
678 garage rock, tex-mex
679 
680 sunshine pop, folk rock, psychedelic rock, psychedelic pop
681 sunshine pop
682 sunshine pop
683 psychedelic pop
684 pop rock
685 
686 
687 pop
688 rhythm and blues
689 folk rock, sunshine pop
690 pop rock, psychedelic pop
691 folk rock
692 folk rock
693 pop, folk rock
694 folk rock
695 
696 
697 psychedelic pop
698 soul
699 easy listening
700 
701 soul
702 country, folk, pop
703 pop
704 sunshine pop
705 baroque pop, sunshine pop, psychedelic pop
706 