In [1]:

import requests
from bs4 import BeautifulSoup
import re
from collections import Counter
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


BASE_URL = "https://api.genius.com"
TOKEN = "lM7YS2hNb7u45Ol2Q06EwHe4IuqZqTeWwkUOx3giZYmYB2grlK1GVQkJ8oJQnDdR" 
HEADERS = {"Authorization": f"Bearer {TOKEN}"}

FILLER = ["like", "yeah", "uh", "um", "you know", "huh", "ayy", "yo", "ooh", "hey"]
SWEARS = ["fuck", "shit", "bitch", "ass", "damn", "hoe"]
STOPWORDS = set(stopwords.words("english"))

def search_artist(artist_name):
    url = BASE_URL + "/search"
    params = {"q": artist_name}
    response = requests.get(url, params=params, headers=HEADERS)
    return response.json()

def get_song_info(song_id):
    url = BASE_URL + f"/songs/{song_id}"
    response = requests.get(url, headers=HEADERS)
    return response.json()

def extract_lyrics_from_url(url):
    # Scrape lyrics from a Genius song page. The site uses different structures so try a few fallbacks.
    import requests
    from bs4 import BeautifulSoup

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"}
    try:
        page = requests.get(url, headers=headers, timeout=10)
    except Exception as e:
        # network or timeout - return empty string so caller can skip this song
        return ""

    html = BeautifulSoup(page.text, "html.parser")
    # Old page layout: single div.lyrics
    lyrics_div = html.find("div", class_="lyrics")
    if lyrics_div:
        return lyrics_div.get_text(separator=" " ).strip()
    # Newer layout: multiple divs with the data-lyrics-container attribute
    parts = html.find_all("div", attrs={"data-lyrics-container": "true"})
    if parts:
        return " ".join(p.get_text(separator=" " ).strip() for p in parts)
    # Fallbacks: try to find any element that looks like lyrics text
    paragraphs = html.find_all("p")
    if paragraphs:
        return " ".join(p.get_text(separator=" " ).strip() for p in paragraphs)
    return ""


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pranjalpatel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pranjalpatel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pranjalpatel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Identify nounds, verbs, and adjectives in the lyrics using NLTK
#NLP Functions
def clean_lyrics(text):
    text = text.lower()
    # remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    # remove filler words
    for f in FILLER:
        text = re.sub(r"\b" + re.escape(f) + r"\b", "", text)
    # remove stopwords
    words = [w for w in text.split() if w not in STOPWORDS]
    return " ".join(words)

def get_pos_counts(text):
    words = word_tokenize(text.lower())
    tagged = pos_tag(words)

    nouns = [w for w, t in tagged if t.startswith("NN")]
    verbs = [w for w, t in tagged if t.startswith("VB")]
    adjs  = [w for w, t in tagged if t.startswith("JJ")]

    return {
        "nouns": Counter(nouns),
        "verbs": Counter(verbs),
        "adjectives": Counter(adjs)
    }

In [3]:

def count_fillers(text):
    """Count occurrences of filler words/phrases in text (case-insensitive)."""
    text_low = (text or "").lower()
    count = 0
    for f in FILLER:
        # handles both single and multi-word fillers
        count += len(re.findall(r"\b" + re.escape(f) + r"\b", text_low))
    return count

def count_swears(text):
    """Count occurrences of swear words from SWEARS list. Returns int."""
    text_low = (text or "").lower()
    count = 0
    for s in SWEARS:
        count += len(re.findall(r"\b" + re.escape(s) + r"\b", text_low))
    return count

In [4]:
song_data = {
    "artist": "",
    "genre": "",
    "title": "",
    "release_year": "",
    "lyrics": "",
    "city": "",
    "gender": ""
}

In [5]:
top_words = {
    "rap": {
        "verbs": [],
        "nouns": [],
        "adjectives": []
    },
    "country": {
        "verbs": [],
        "nouns": [],
        "adjectives": []
    }
}

In [6]:
#Pick 3 Genres:
GENRES = {
    "rap": ["Drake", "Kendrick Lamar", "Travis Scott"],
    "pop": ["Taylor Swift", "Ariana Grande", "Olivia Rodrigo"],
    "country": ["Luke Combs", "Morgan Wallen", "Kane Brown"]
}


In [None]:
# Collect Data for each genre and store in song_data dictionary (robustified)
import time
all_songs = []

for genre, artists in GENRES.items():
    for artist in artists:
        try:
            search_result = search_artist(artist)
        except Exception as e:
            print(f"Search failed for {artist}: {e}")
            continue

        hits = search_result.get("response", {}).get("hits", [])
        if not hits:
            print(f"No hits for {artist}")
            continue

        for hit in hits[:5]:  # take top 5 songs if available
            try:
                song_id = hit.get("result", {}).get("id")
                song_title = hit.get("result", {}).get("title")
                if not song_id:
                    print("Skipping a hit with no song id")
                    continue

                song_info = get_song_info(song_id)
                url = song_info.get("response", {}).get("song", {}).get("url", ""
,
                    continue

                lyrics = extract_lyrics_from_url(url)
                if not lyrics:
                    print(f"No lyrics found for {song_title} by {artist}, skipping")
                    continue

                clean_text = clean_lyrics(lyrics)
                pos_counts = get_pos_counts(clean_text)
                swear_count = count_swears(lyrics)
                filler_count = count_fillers(lyrics)

                all_songs.append({
                    "genre": genre,
                    "artist": artist,
                    "song_title": song_title,
                    "lyrics": lyrics,
                    "clean_lyrics": clean_text,
                    "top_nouns": pos_counts["nouns"].most_common(10),
                    "top_verbs": pos_counts["verbs"].most_common(10),
                    "top_adjectives": pos_counts["adjectives"].most_common(10),
                    "swear_count": swear_count,
                    "filler_count": filler_count
                })

            except Exception as e:
                print(f"Error processing song for {artist}: {e}")
                continue
            finally:
                # be polite to the API
                time.sleep(1)

# Convert to DataFrame
df = pd.DataFrame(all_songs)
df.to_csv("genre_lyrics_dataset.csv", index=False)
print("Dataset collected and saved! Shape:", df.shape)

SyntaxError: unterminated string literal (detected at line 27) (1171392905.py, line 27)