In [1]:
import config
import requests
import urllib3
import json
import re
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
base = "https://api.genius.com"

In [3]:
def send_request(path, params=None, headers=None):
    requrl = '/'.join([base, path])
    token = "Bearer {}".format(config.access_token)
    if headers:
        headers['Authorization'] = token
    else:
        headers = {"Authorization": token}
        
    response = requests.get(url=requrl, params=params, headers=headers)
    response.raise_for_status()
    return response.json()

In [4]:
def normalize_name(name):
    return re.sub(r'\W+', '', name)

In [5]:
def verify_primary_artist(artist, song):
    normalized_artist = normalize_name(artist).lower()
    normalize_primary = normalize_name(song['primary_artist']['name']).lower()
    return normalized_artist == normalize_primary

In [6]:
def is_valid_song(song):
    is_valid = True
    is_valid &= '(' not in song['title']
    is_valid &= '-' not in song['title']
    is_valid &= 'annotated' not in song['url']
    return is_valid

In [7]:
def get_artist_id(artist):
    artist_encoded = artist.replace(' ', '%20')
    artist_hits = send_request('search/?q=%s' % artist_encoded)['response']['hits']
    for hit in artist_hits:
        if hit['type'] != 'song':
            continue
        if verify_primary_artist(artist, hit['result']):
            return hit['result']['primary_artist']['id']
    raise Exception('Artist %s Not Found' % artist)

In [8]:
def get_artist_songs(artist):
    try:
        artist_id = get_artist_id(artist)
    except:
        return []
    page = 1
    result = []
    while True:
        songs = send_request('artists/%s/songs?per_page=50&page=%s' % (artist_id, page))['response']['songs']
        if len(songs) == 0:
            return result
        for song in songs:
            if not is_valid_song(song):
                continue
            if verify_primary_artist(artist, song):
                result.append({
                    'artist': artist,
                    'title': song['title'],
                    'url': song['url'],
                    'lyrics': scrape_lyrics(song['url'])
                })
        print('Done with page %s' % page)
        page += 1
    

In [10]:
def scrape_lyrics(url):
    resp = requests.get(url)
    s = BeautifulSoup(resp.text)
    lyrics = s.findAll("div", {"class": "lyrics"})
    if len(lyrics) == 1:
        return lyrics[0].text
    return ''

In [125]:
get_artist_id('Denzel Curry')

16690

In [126]:
more_artists = ['The Notorious BIG', '2pac', 'Travis Scott', 'Young Thug', 'Lil Wayne', 'Big Sean', 'Danny Brown',
               'J. Cole', 'Earl Sweatshirt', 'Future', 'Gucci Mane', 'JID', 'Lil Uzi Vert', '21 Savage',
                'Denzel Curry', 'Migos', 'Nas', 'Schoolboy Q', 'Skepta', 'Vince Staples', 'YG']

In [11]:
artists = ['The Weeknd', 'Kanye West', 'Kendrick Lamar', 'Eminem', 'Tyler the Creator', 'Jay-Z', 'Drake', 'Pusha-T']

In [12]:
def get_artist_songs_batch(artists):
    res = []
    for artist in artists:
        print("Starting to scrape %s's songs..." % artist)
        songs = get_artist_songs(artist)
        res.append(songs)
        print('Done')
    return res

In [13]:
all_songs = get_artist_songs_batch(artists)

Starting to scrape The Weeknd's songs...
Done with page 1
Done with page 2
Done with page 3
Done with page 4
Done with page 5
Done with page 6
Done with page 7
Done with page 8
Done with page 9
Done
Starting to scrape Kanye West's songs...
Done with page 1
Done with page 2
Done with page 3
Done with page 4
Done with page 5
Done with page 6
Done with page 7
Done with page 8
Done with page 9
Done with page 10
Done with page 11
Done with page 12
Done with page 13
Done with page 14
Done with page 15
Done with page 16
Done with page 17
Done with page 18
Done with page 19
Done with page 20
Done with page 21
Done with page 22
Done with page 23
Done with page 24
Done with page 25
Done with page 26
Done with page 27
Done with page 28
Done with page 29
Done with page 30
Done with page 31
Done with page 32
Done with page 33
Done
Starting to scrape Kendrick Lamar's songs...
Done with page 1
Done with page 2
Done with page 3
Done with page 4
Done with page 5
Done with page 6
Done with page 7
Done w

In [127]:
all_songs = get_artist_songs_batch(more_artists)

Starting to scrape The Notorious BIG's songs...
Done with page 1
Done with page 2
Done with page 3
Done with page 4
Done with page 5
Done with page 6
Done with page 7
Done with page 8
Done with page 9
Done with page 10
Done
Starting to scrape 2pac's songs...
Done with page 1
Done with page 2
Done with page 3
Done with page 4
Done with page 5
Done with page 6
Done with page 7
Done with page 8
Done with page 9
Done with page 10
Done with page 11
Done with page 12
Done with page 13
Done with page 14
Done with page 15
Done with page 16
Done with page 17
Done with page 18
Done with page 19
Done with page 20
Done with page 21
Done with page 22
Done with page 23
Done
Starting to scrape Travis Scott's songs...
Done with page 1
Done with page 2
Done with page 3
Done with page 4
Done with page 5
Done with page 6
Done with page 7
Done with page 8
Done with page 9
Done with page 10
Done with page 11
Done with page 12
Done with page 13
Done
Starting to scrape Young Thug's songs...
Done with page 1


In [128]:
df_songs = pd.DataFrame(all_songs[0])
for songs in all_songs[1:]:
    df_songs = df_songs.append(songs)

In [129]:
df_songs

Unnamed: 0,artist,title,url,lyrics
0,The Notorious BIG,1970 Somethin’,https://genius.com/The-notorious-big-1970-some...,\n\n[Intro: Notorious B.I.G. (Faith Evans)]\n1...
1,The Notorious BIG,1991 Block Party Freestyle,https://genius.com/The-notorious-big-1991-bloc...,"\n\n[The Notorious B.I.G]\n\nYes it's me, the ..."
2,The Notorious BIG,Another,https://genius.com/The-notorious-big-another-l...,"\n\n[Intro: Big, Lil' Kim]\nYeah, fuck you\nFu..."
3,The Notorious BIG,Basement Freestyle,https://genius.com/The-notorious-big-basement-...,\n\n[Intro: Notorious B.I.G]\nYo\nWe up in the...
4,The Notorious BIG,Beef,https://genius.com/The-notorious-big-beef-lyrics,\n\n[Intro: Notorious B.I.G.]\nWe here\nWe ain...
...,...,...,...,...
180,YG,Yo Nigga Ain’t Me,https://genius.com/Yg-yo-nigga-aint-me-lyrics,\n\nHook: Charlie Hood and YG\nSee shawty be r...
181,YG,Yo Pussy,https://genius.com/Yg-yo-pussy-lyrics,\n\n[Intro: Raw Smooth]\nRaw Smooth with a ban...
182,YG,You Betta Kno,https://genius.com/Yg-you-betta-kno-lyrics,"\n\n[Intro]\nAy, You don't even know it\nI'm o..."
183,YG,You Broke,https://genius.com/Yg-you-broke-lyrics,"\n\n[Chorus: YG]\nBitch you broke, shut up\nDo..."


In [130]:
import string

In [131]:
df_songs.to_csv('hiphop_lyrics2.csv', index=False)

In [132]:
df_songs = df_songs.reset_index(drop=True)

In [133]:
import os

In [134]:
def clean_lyrics(l):
    l = re.sub(r'[\(\[].*?[\)\]]', '', l)
    l = os.linesep.join([s for s in l.splitlines() if s])
    l = l.replace('\r', '')
    l += '\n'
    l = ''.join([i for i in l if i in string.printable])
    #l = l.replace('\n', '$')
    return l

In [135]:
df_songs['lyrics'] = df_songs['lyrics'].apply(clean_lyrics)

In [136]:
lyrics_txt = ''.join(list(df_songs['lyrics']))

In [137]:
lyrics_txt[:100]

"]\n1970 somethin', 1970 somethin'\n1970 somethin', 1970 somethin' \n1970 somethin', 1970 somethin' \n197"

In [116]:
with open('lyrics_more.txt', 'w') as file:
    file.write(lyrics_txt)

In [74]:
lyrics_txt[6200:6300]

"her day$God bid yesterday goodbye$Thought I almost died in my dream again $Fightin' for my life, I\u2005c"

In [75]:
len(lyrics_txt)

5407105

In [2]:
with open('lyrics_more.txt', 'r') as file:
    lyrics_more = file.read()

In [4]:
with open('lyrics.txt', 'r') as file:
    lyrics = file.read()

In [5]:
lyrics[-100:]

" the world\nLet's define what my world is\nKnee-deep in this dope money\nDamn near where my world ends\n"

In [6]:
lyrics_more = lyrics_more.replace('[', '').replace(']', '')

In [8]:
lyrics_more[1:100]

"1970 somethin', 1970 somethin'\n1970 somethin', 1970 somethin' \n1970 somethin', 1970 somethin' \n1970"

In [9]:
all_lyrics = lyrics + lyrics_more[1:]

In [10]:
len(all_lyrics)

21290845

In [11]:
with open('all_lyrics.txt', 'w') as file:
    file.write(all_lyrics)