Build a text classification model to predict the artist from a piece of text.

- Download HTML pages
- Get a list of song urls
- Extract lyrics from song urls
- Convert text to numbers by applying the Bag Of Words method
- Build and train a Naive Bayes classifier
- Balance out your dataset
- Write a command-line interface
- Give a 5-minute lightning talk by the end of the week

In [6]:
import requests
import re
from bs4 import BeautifulSoup

# Download HTML pages

In [2]:
# Choose the URL you want to request
url = 'https://www.lyrics.com/artist/Funkadelic/4323'

In [3]:
# Send the request. requests waits indefinitely by default, so an explicit
# timeout keeps the cell from hanging if the server never answers.
request = requests.get(url, timeout=30)

In [4]:
# Check the status code in order to assess whether our request was successful
request.status_code

200

In [5]:
# The result is a requests Response object
type(request)

requests.models.Response

In [6]:
# request.text returns the HTML of the page as a string
type(request.text)

str

In [7]:
# Save the HTML in a txt file so it can be searched in an editor.
# The encoding is given explicitly: with the platform default, writing can
# fail with UnicodeEncodeError on characters the locale cannot represent.
with open('Funkadelic.txt', 'w', encoding='utf-8') as file:
    file.write(request.text)

## Find and save all links for lyrics

In [8]:
# with regular expressions:

In [9]:
text = request.text

In [10]:
# Raw string so the regex engine receives \d untouched; in a plain string
# "\d" is an invalid escape sequence and triggers a warning on recent Python.
# Matches /lyric/<digits>/Funkadelic/<URL-encoded title made of letters,
# digits, '+' and '%'>.
pattern = r"/lyric/\d+/Funkadelic/[A-Za-z+%\d+]+"

In [11]:
re.findall(pattern, text, re.IGNORECASE)

['/lyric/35625143/Funkadelic/Nappy+Dugout',
 '/lyric/34578626/Funkadelic/Maggot+Brain+%5BBMG+Dub%5D',
 '/lyric/36001012/Funkadelic/%28Not+Just%29+Knee+Deep',
 '/lyric/32423403/Funkadelic/%28Not+Just%29+Knee+Deep',
 '/lyric/23784932/Funkadelic/Cholly+%28Funk+Getting%27+Ready+To+Roll%29',
 '/lyric/23784931/Funkadelic/Smokey',
 '/lyric/23784925/Funkadelic/Into+You',
 '/lyric/23784919/Funkadelic/If+You+Got+Funk%2C+You+Got+Style',
 '/lyric/23784918/Funkadelic/Freak+of+the+Week',
 '/lyric/23784922/Funkadelic/Soul+Mate',
 '/lyric/23784912/Funkadelic/Oh%2C+I',
 '/lyric/21037081/Funkadelic/Freak+of+the+Week',
 '/lyric/21037032/Funkadelic/If+You+Got+Funk+You+Got+Style',
 '/lyric/16863157/Funkadelic/This+Broken+Heart',
 '/lyric/14406897/Funkadelic/Soul+Mate',
 '/lyric/15343678/Funkadelic/Cosmic+Slop',
 '/lyric/26445147/Funkadelic/Sunshine+of+Your+Love',
 '/lyric/23968820/Funkadelic/Sunshine+of+Your+Love',
 '/lyric/21063263/Funkadelic/Cholly+%28Funk+Getting+Ready+To+Roll%29',
 '/lyric/20279949/Fun

### With BeautifulSoup

In [12]:
funka_soup = BeautifulSoup(text, 'html.parser')

In [13]:
funka_soup.body.find_all(class_='tal qx')

[<td class="tal qx"><strong><a href="/lyric/35625143/Funkadelic/Nappy+Dugout">Nappy Dugout</a></strong></td>,
 <td class="tal qx"><strong><a href="/lyric/34578626/Funkadelic/Maggot+Brain+%5BBMG+Dub%5D">Maggot Brain [BMG Dub]</a></strong></td>,
 <td class="tal qx"><strong><a href="/lyric/36001012/Funkadelic/%28Not+Just%29+Knee+Deep">(Not Just) Knee Deep</a></strong></td>,
 <td class="tal qx"><strong><a href="/lyric/32423403/Funkadelic/%28Not+Just%29+Knee+Deep">(Not Just) Knee Deep</a></strong></td>,
 <td class="tal qx"><strong><a href="/lyric/23784932/Funkadelic/Cholly+%28Funk+Getting%27+Ready+To+Roll%29">Cholly (Funk Getting' Ready To Roll)</a></strong></td>,
 <td class="tal qx"><strong><a href="/lyric/23784931/Funkadelic/Smokey">Smokey</a></strong></td>,
 <td class="tal qx"><strong><a href="/lyric/23784925/Funkadelic/Into+You">Into You</a></strong></td>,
 <td class="tal qx"><strong><a href="/lyric/23784919/Funkadelic/If+You+Got+Funk%2C+You+Got+Style">If You Got Funk, You Got Style</a>

In [14]:
funka_soup.find(class_='tal qx').find('a')['href']

'/lyric/35625143/Funkadelic/Nappy+Dugout'

In [15]:
# Collect the absolute URL of every song linked from a table cell whose
# class list contains "tal".
links = []

for td in funka_soup.find_all('td'):
    if "tal" not in td.get('class', []):
        continue
    anchor = td.find('a')
    # td.find('a') returns None when the cell holds no link; skip such cells
    # instead of failing with a TypeError on the subscript.
    if anchor is None or not anchor.get('href'):
        continue
    links.append('https://www.lyrics.com' + anchor['href'])

In [16]:
len(links)

283

In [17]:
links[4:10]

['https://www.lyrics.com/lyric/23784932/Funkadelic/Cholly+%28Funk+Getting%27+Ready+To+Roll%29',
 'https://www.lyrics.com/lyric/23784931/Funkadelic/Smokey',
 'https://www.lyrics.com/lyric/23784925/Funkadelic/Into+You',
 'https://www.lyrics.com/lyric/23784919/Funkadelic/If+You+Got+Funk%2C+You+Got+Style',
 'https://www.lyrics.com/lyric/23784918/Funkadelic/Freak+of+the+Week',
 'https://www.lyrics.com/lyric/23784922/Funkadelic/Soul+Mate']

In [18]:
# Loop to create all lyric files: download every song page in `links` and
# store its HTML as Funkadelic_lyrics/<title>.txt
import os

# open() does not create missing directories, so make sure the target exists
os.makedirs("Funkadelic_lyrics", exist_ok=True)

for i, temp_url in enumerate(links):
    # The last URL segment is the URL-encoded song title; it doubles as the
    # file name. NOTE: songs sharing a title map to the same file, so a later
    # download overwrites an earlier one.
    title = temp_url.split('/')[-1]
    print(i)

    # The timeout prevents one stalled request from blocking the whole loop
    temp_req = requests.get(temp_url, timeout=30)
    if not temp_req.ok:
        # Do not store error pages among the lyric files
        print('skipped', temp_url, temp_req.status_code)
        continue

    with open("Funkadelic_lyrics/" + title + '.txt', 'w') as file:
        file.write(temp_req.text)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

## Extract lyrics corpus

#### Open a song txt file in Atom
 Lyrics start at `<pre id="lyric-body-text" class="lyric-body">`  and end at  `</pre>`

..

To read many files, use the expression:

import os

for fn in os.listdir('madonna/'):

     text = open('madonna/' + fn).read()

In [14]:
# Test on one song: load the saved HTML page into a string.
# The with-statement closes the file on exit, so no explicit close() is needed.
with open("Funkadelic_lyrics/Soul+Mate.txt") as reader:
    songfile = reader.read()

In [15]:
type(songfile)

str

In [16]:
song_soup = BeautifulSoup(songfile, 'html.parser')

In [18]:
# Select the lyrics element by its id instead of taking whichever <pre>
# happens to come first in the page.
lyrics = song_soup.find('pre', id='lyric-body-text').get_text()

In [19]:
lyrics

"Given the chance\nI would love the hell out of you, little girl\nDon't you know\nGive me a sign (give me a sign)\nLet me know if it's alright that I desire you\n\nYou've got just what it takes\nTo turn me on\nYou're out of sight girl, don't you know?\n\nI just want to kiss you on your\nDesire, baby\nI've got a thing for you\nI want to kiss ya!\nI just want to kiss you on your\nDesire, baby\nI've got a thing for you\n\nI feel like trying (feel like trying) my love out on you, girl\nFeel like trying (feel like trying, oh!)\nTrying my love out on you, girl (feel like trying)\nI feel like trying out (feel like trying)\n\nMovin' around\nGirl you got me so upset, don't you know\nDon't it show\nGive me a place (give me a place)\nTo say, baby let's get down\nAll the way\n\nYou've got just what it takes\nTo turn me on\nAnd I'm out of sight girl, don't you know?\n\nI just want to kiss you on your\nDesire, baby\nI've got a thing for you\nI want to kiss you\nI just want to kiss you on your\nDesir

In [23]:
corpus = []

