## Scraping Data

I'm interested in visualizing the similarity of Award Show Winning Songs from popular TV-programs such as Music Bank, Ingikayo, Hanteo, etc. This data will be scraped from the wikipedia's for each program as they are well-preserved and appear to be fairly accurate, even maintaining the date/episode number for some of the tables. This scraped data will then be parsed/cleaned to extract Spotify data that will be observed later in this process.

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
import pandas as pd

pd.set_option('display.max_rows', 100)

In [2]:
def scrape_award_show_wiki(music_program: str, year: int) -> pd.DataFrame:
    """
    Scrapes a Wikipedia table for award show winners, handling row and column spans.

    Args:
        music_program (str): The music program (e.g., "Music Bank", "Hanteo").
        year (int): The year to scrape.

    Returns:
        pd.DataFrame: DataFrame containing the parsed table data.
    """
    url = f"https://en.wikipedia.org/wiki/List_of_{music_program.replace(' ', '_')}_Chart_winners_({year})"
    url = quote(url, safe=":/()")

    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page. Status Code: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all("table", {"class": "wikitable"})

    if len(tables) < 2:
        raise Exception("This page does not contain 2 tables")

    table = tables[-1]

    # Extract headers
    header_row = table.find('thead')
    if header_row:
        header_row = header_row.find('tr')
    else:
        header_row = table.find('tr')
    headers = [th.get_text(strip=True) for th in header_row.find_all('th')[:-1]]

    # Extract rows with rowspan/colspan handling
    rows = []
    active_rowspans = []

    for tr in table.find('tbody').find_all("tr")[1:]:
        current_row = [None] * len(headers)

        # Apply active rowspans
        for span in active_rowspans:
            if span['remaining'] > 0:
                start_col = span['start_col']
                end_col = start_col + span['colspan']
                for col in range(start_col, min(end_col, len(current_row))):
                    current_row[col] = span['value']
                span['remaining'] -= 1

        # Remove expired rowspans
        active_rowspans = [span for span in active_rowspans if span['remaining'] > 0]

        # Process current cells
        cells = tr.find_all(['th', 'td'])[:-1]  # Exclude last cell as per original
        current_col = 0

        for cell in cells:
            # Skip filled columns
            while current_col < len(current_row) and current_row[current_col] is not None:
                current_col += 1

            if current_col >= len(current_row):
                break

            rowspan = int(cell.get('rowspan', 1))
            colspan = int(cell.get('colspan', 1))
            cell_text = cell.get_text(strip=True)

            # Determine end column
            end_col = current_col + colspan
            end_col = min(end_col, len(current_row))

            # Fill current row
            for col in range(current_col, end_col):
                current_row[col] = cell_text

            # Record rowspan for future rows
            if rowspan > 1:
                active_rowspans.append({
                    'value': cell_text,
                    'remaining': rowspan - 1,
                    'start_col': current_col,
                    'colspan': colspan
                })

            current_col = end_col

        # Replace None with empty strings
        current_row = [cell if cell is not None else '' for cell in current_row]
        rows.append(current_row)

    df = pd.DataFrame(rows, columns=headers)
    df = preprocess_special_chars(df, music_program, year)
    
    return df

In [68]:
def preprocess_special_chars(df, music_program, year):
    df["Award Show"] = music_program
    df["Date"] = df["Date"] + f", {year}"
    
    # Define the regex pattern to remove special characters and bracketed sequences
    pattern = r"[‡†]|\[[a-zA-Z]\]$"

    # Check if 'Song' column contains '‡', '†', or bracketed characters like [c], [b]
    contains_special_chars_Episode = df["Episode"].str.contains(pattern, na=False)
    contains_special_chars_Artist = df["Artist"].str.contains(pattern, na=False)
    contains_special_chars_song = df["Song"].str.contains(pattern, na=False)
    contains_special_chars_points = df["Points"].str.contains(pattern, na=False)


    # Remove special characters and bracketed sequences from the 'Song' and 'Points' columns
    df.loc[contains_special_chars_Episode, "Episode"] = df.loc[contains_special_chars_Episode, "Episode"].str.replace(pattern, "", regex=True)
    df.loc[contains_special_chars_Artist, "Artist"] = df.loc[contains_special_chars_Artist, "Artist"].str.replace(pattern, "", regex=True)
    df.loc[contains_special_chars_song, "Song"] = df.loc[contains_special_chars_song, "Song"].str.replace(pattern, "", regex=True)
    df.loc[contains_special_chars_points, "Points"] = df.loc[contains_special_chars_points, "Points"].str.replace(pattern, "", regex=True)

    # Strip quotes from all rows in the 'Song' column
    df["Song"] = df["Song"].str.replace('"', '', regex=False).fillna('')
    df["Song"] = df["Song"].str.strip()

    return df

In [69]:
music_bank_list = []
for year in range(2022, 2026):
    music_bank_list.append(scrape_award_show_wiki("Music Bank", year))

music_bank_df = pd.concat(music_bank_list).reset_index(drop=True)
music_bank_df

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
0,1102,"January 7, 2022",NCT U,Universe,5930,Music Bank
1,1103,"January 14, 2022",Kep1er,Wa Da Da,3678,Music Bank
2,1104,"January 21, 2022",Enhypen,Blessed-Cursed,8377,Music Bank
3,1105,"January 28, 2022",IU,Winter Sleep,2949,Music Bank
4,1106,"February 4, 2022",Pentagon,Feelin' Like,3426,Music Bank
...,...,...,...,...,...,...
155,—[note 1],"December 27, 2024",Stray Kids,Walkin on Water,5599,Music Bank
156,—,"January 3, 2025",Rosé&Bruno Mars,Apt.,4087,Music Bank
157,1233,"January 10, 2025",N.SSign,Love Potion,7525,Music Bank
158,1234,"January 17, 2025",BSS,CBZ (Prime Time),10191,Music Bank


In [70]:
m_countdown_list = []
for year in range(2022, 2026):
    m_countdown_list.append(scrape_award_show_wiki("M Countdown", year))

m_countdown_df = pd.concat(m_countdown_list, axis=0).reset_index(drop=True)
m_countdown_df

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
0,—,"January 6, 2022",No Broadcast or Winner,No Broadcast or Winner,No Broadcast or Winner,M Countdown
1,735,"January 13, 2022",Kep1er,Wa Da Da,6500,M Countdown
2,736,"January 20, 2022",Kep1er,Wa Da Da,6569,M Countdown
3,737,"January 27, 2022",Wheein,Make Me Happy,7220,M Countdown
4,738,"February 3, 2022",Got the Beat,Step Back,—,M Countdown
...,...,...,...,...,...,...
155,—,"December 26, 2024",No Broadcast or Winner,No Broadcast or Winner,No Broadcast or Winner,M Countdown
156,—,"January 2, 2025",No Broadcast or Winner,No Broadcast or Winner,No Broadcast or Winner,M Countdown
157,866,"January 9, 2025",Stray Kids,Walkin on Water,8656,M Countdown
158,867,"January 16, 2025",BoyNextDoor,"If I Say, I Love You",9500,M Countdown


In [71]:
award_programs = ["Inkigayo", "M Countdown", "Music Bank", "Show Champion", "Show! Music Core", "The Show"]
all_show_list = []
for show in award_programs:
     for year in range(2022, 2026):
        all_show_list.append(scrape_award_show_wiki(show, year)) 

all_show_df = pd.concat(all_show_list, axis=0).reset_index(drop=True)
all_show_df

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
0,—,"January 2, 2022","No show, winner not announced","No show, winner not announced","No show, winner not announced",Inkigayo
1,1121,"January 9, 2022",Ive,Eleven,8533,Inkigayo
2,1122,"January 16, 2022",Ive,Eleven,6583,Inkigayo
3,1123,"January 23, 2022",Ive,Eleven,5927,Inkigayo
4,1124,"January 30, 2022",Got the Beat,Step Back,5612,Inkigayo
...,...,...,...,...,...,...
956,—,", 2024",No Broadcast or Winner,No Broadcast or Winner,No Broadcast or Winner,The Show
957,—,"January 7, 2025",No Broadcast or Winner,No Broadcast or Winner,No Broadcast or Winner,The Show
958,—,"January 14, 2025",No Broadcast or Winner,No Broadcast or Winner,No Broadcast or Winner,The Show
959,—,"January 21, 2025",No Broadcast or Winner,No Broadcast or Winner,No Broadcast or Winner,The Show


In [72]:
all_show_df_winners = all_show_df.loc[(all_show_df["Points"].str.len() < 7) & (all_show_df["Points"].str.len() > 0), :].reset_index(drop=True)

#### Cleaning Song and Artist Names

There are entries where the current naming convention of a given track or artist will make it impossible to find through the Spotify API. This section will focus on finding those troublesome values and altering them for better search queries.

In [73]:
all_show_df_winners.loc[all_show_df_winners.Song == "That That", :]

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
15,1137,"May 15, 2022",Psy,That That,9402,Inkigayo
16,1138,"May 22, 2022",Psy,That That,9108,Inkigayo
17,1139,"May 29, 2022",Psy,That That,7620,Inkigayo
149,753,"May 19, 2022",Psyfeat.Suga,That That,—,M Countdown
432,433,"May 11, 2022",Psyfeat.Suga,That That,4514,Show Champion
433,434,"May 18, 2022",Psyfeat.Suga,That That,4506,Show Champion
550,766,"May 21, 2022",Psy,That That,6586,Show! Music Core


In [74]:
all_show_df_winners.loc[all_show_df_winners.Artist.str.contains(r"feat")]

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
149,753,"May 19, 2022",Psyfeat.Suga,That That,—,M Countdown
432,433,"May 11, 2022",Psyfeat.Suga,That That,4514,Show Champion
433,434,"May 18, 2022",Psyfeat.Suga,That That,4506,Show Champion


In [75]:
all_show_df_winners.loc[all_show_df_winners.Artist.str.contains(r"feat"), "Artist"] = "Psy"

all_show_df_winners.loc[all_show_df_winners.Song == "That That", :]

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
15,1137,"May 15, 2022",Psy,That That,9402,Inkigayo
16,1138,"May 22, 2022",Psy,That That,9108,Inkigayo
17,1139,"May 29, 2022",Psy,That That,7620,Inkigayo
149,753,"May 19, 2022",Psy,That That,—,M Countdown
432,433,"May 11, 2022",Psy,That That,4514,Show Champion
433,434,"May 18, 2022",Psy,That That,4506,Show Champion
550,766,"May 21, 2022",Psy,That That,6586,Show! Music Core


In [76]:
all_show_df_winners.loc[all_show_df_winners.Song.str.contains(r"\([a-zA-Z]*\)")]

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
230,834,"March 21, 2024",V,Fri(end)s,7950,M Countdown
231,835,"March 28, 2024",V,Fri(end)s,7767,M Countdown
583,801,"March 18, 2023",Onew,O (Circle),6590,Show! Music Core
652,874,"October 19, 2024",Aespa,Up(Karinasolo),6698,Show! Music Core


In [77]:
all_show_df_winners.loc[all_show_df_winners.Song.str.contains(r"Karina"), "Song"] = "Up"
all_show_df_winners.iloc[655]

Episode                    877
Date          November 9, 2024
Artist         Rosé&Bruno Mars
Song                      Apt.
Points                   6,707
Award Show    Show! Music Core
Name: 655, dtype: object

In [168]:
# Replace the song title directly using .loc to avoid the SettingWithCopyWarning
all_show_df_winners.loc[all_show_df_winners["Song"] == "A Travel to the Sky", "Song"] = "하늘 여행"
all_show_df_winners.loc[all_show_df_winners["Song"] == "Wish Lanterns", "Song"] = "풍등"
all_show_df_winners.loc[all_show_df_winners["Song"] == "My Name is Malgeum", "Song"] = "My Name is Malguem"
all_show_df_winners.loc[all_show_df_winners["Song"] == "Bon Voyage", "Song"] = "BONVOYAGE"
all_show_df_winners.loc[all_show_df_winners.Song.str.contains("(Korean ver.)"), "Song"] = all_show_df_winners.loc[all_show_df_winners.Song.str.contains("(Korean ver.)"), "Song"].str.replace(r"(Korean ver.)", "")

all_show_df_winners.loc[all_show_df_winners["Artist"] == 'Jungkook', "Artist"] = "Jung Kook"
all_show_df_winners.loc[all_show_df_winners["Artist"] == 'Jo Yu-ri', "Artist"] = "Jo Yuri"
all_show_df_winners.loc[all_show_df_winners["Artist"] == 'Park Jae-chan', "Artist"] = "Jaechan"
all_show_df_winners.loc[all_show_df_winners["Song"] == "At That Moment", "Artist"] = "WSG Wannabe (Gaya-G)"

  all_show_df_winners.loc[all_show_df_winners.Song.str.contains("(Korean ver.)"), "Song"] = all_show_df_winners.loc[all_show_df_winners.Song.str.contains("(Korean ver.)"), "Song"].str.replace(r"(Korean ver.)", "")


#### Pre-processing Data

While trying to extract the songs and their respective Spotify data, I came across several issues. For example, there are some songs such as "A Trip to the Sky" that isn't listed on Spotify in English but instead represented in Hangul (Korean written language). Another issue that I was encountering is that the search was returning the wrong song, especially if the artist had a track title that is a subset of another track title of theirs. 

In [170]:
# Construct the "Search Query" column using .loc for proper assignments
all_show_df_winners.loc[:,"Search Query"] = (
    "track:" + all_show_df_winners["Song"] + " artist:" +
     all_show_df_winners["Artist"] + " year:2000-" + 
     all_show_df_winners["Date"].str.slice(-4)
)

# Display the updated DataFrame
all_show_df_winners


Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show,Search Query
0,1121,"January 9, 2022",Ive,Eleven,8533,Inkigayo,track:Eleven artist:Ive year:2000-2022
1,1122,"January 16, 2022",Ive,Eleven,6583,Inkigayo,track:Eleven artist:Ive year:2000-2022
2,1123,"January 23, 2022",Ive,Eleven,5927,Inkigayo,track:Eleven artist:Ive year:2000-2022
3,1124,"January 30, 2022",Got the Beat,Step Back,5612,Inkigayo,track:Step Back artist:Got the Beat year:2000-...
4,1125,"February 20, 2022",Got the Beat,Step Back,7224,Inkigayo,track:Step Back artist:Got the Beat year:2000-...
...,...,...,...,...,...,...,...
740,364,"September 24, 2024",P1Harmony,Sad Song,7049,The Show,track:Sad Song artist:P1Harmony year:2000-2024
741,366,"October 29, 2024",Kiss of Life,Get Loud,6552,The Show,track:Get Loud artist:Kiss of Life year:2000-2024
742,367,"November 5, 2024",TripleS,Hit the Floor,6900,The Show,track:Hit the Floor artist:TripleS year:2000-2024
743,368,"November 12, 2024",Kep1er,Tipi-tap,7680,The Show,track:Tipi-tap artist:Kep1er year:2000-2024


In [171]:
# Group by 'Artist' and count occurrences of each song
song_counts = (
    all_show_df_winners
     .groupby(["Artist", "Song"])["Song"]
     .value_counts()
     .reset_index(name="Frequency"))
song_counts.loc[range(100,157), :]

Unnamed: 0,Artist,Song,Frequency
100,Jung Kook,3D,3
101,Jung Kook,Seven,13
102,Jung Kook,Standing Next to You,2
103,Kai,Rover,2
104,Kang Daniel,Electric Shock,1
105,Kang Daniel,Upside Down,1
106,Kara,When I Move,1
107,Kep1er,Shooting Star,1
108,Kep1er,Tipi-tap,1
109,Kep1er,Up!,1


# Deprecated Aspects of Analysis

### Extracting Song Data Using Spotipy

To broaden the available data and areas for analysis, I wanted to incorporate another dataset that can be joined to the Award Show Winner dataset above. Therefore, I thought it would be suitable to gather characteristics about a song which would allow us to see what characteristics are shared amongst winning songs, the types of songs that are popular in certain seasons, etc.

#### Spotipy Authentication

To perform API calls, I need to authenticate myself to use the Spotify and access the dashboard for the needed keys.

In [7]:
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

with open('.keys/spotipy.json', 'r') as file:
    data = json.load(file)

auth_manager = SpotifyClientCredentials(client_id=data["client_id"], client_secret=data["client_secret"])
sp = spotipy.Spotify(auth_manager=auth_manager)

In [172]:


data = sp.search('track:BonVoyage artist:Dreamcatcher year:2000-2023', limit=1, type=['track', 'artist'], market='KR')['tracks']['items']
print(f"{data[0]['artists'][0]['name']} - {data[0]['name']}: {data[0]['id']}\n")

Dreamcatcher - BONVOYAGE: 3Jnwl9zlbFNEqKQjydxLxe



In [173]:
uniq_tracks = list(set(all_show_df_winners["Search Query"]))
uniq_tracks

['track:Love artist:Monsta X year:2000-2022',
 'track:Way 4 Luv artist:Plave year:2000-2024',
 'track:Perfect Night artist:Le Sserafim year:2000-2023',
 'track:Deja Vu artist:Tomorrow X Together year:2000-2024',
 "track:Killin' Me Good artist:Jihyo year:2000-2023",
 'track:Be There For Me artist:NCT 127 year:2000-2024',
 'track:Lighthouse artist:Tempest year:2000-2024',
 'track:S-Class artist:Stray Kids year:2000-2023',
 'track:Cream Soda artist:Exo year:2000-2023',
 'track:Birthday artist:Red Velvet year:2000-2022',
 'track:Love Shhh! artist:Jo Yuri year:2000-2022',
 'track:Selfish artist:YooA year:2000-2022',
 'track:Eunoia artist:Billlie year:2000-2023',
 'track:Erase Me artist:Oneus year:2000-2023',
 'track:Loser artist:AB6IX year:2000-2023',
 'track:Hit the Floor artist:TripleS year:2000-2024',
 'track:Give Me That artist:WayV year:2000-2024',
 'track:Girls Never Die artist:TripleS year:2000-2024',
 'track:OMG artist:NewJeans year:2000-2023',
 'track:Whisper artist:The Boyz year:2

In [174]:
uniq_song_ids = {
    ## Song Query : song ID
}

In [175]:
def extract_song_ids(uniq_tracks, uniq_song_ids):
    for i in range(len(uniq_tracks)):
        if uniq_tracks[i] not in uniq_song_ids:
            print(f"Searching for: {uniq_tracks[i]}")
            data = sp.search(uniq_tracks[i], limit=1, type=['track', 'artist'], market='KR')['tracks']['items']
            print(f"{data[0]['artists'][0]['name']} - {data[0]['name']}: {data[0]['id']}\n")
            uniq_song_ids[uniq_tracks[i]] = data[0]['id']
    return uniq_song_ids

In [176]:
uniq_song_ids = extract_song_ids(uniq_tracks, uniq_song_ids)

Searching for: track:Love artist:Monsta X year:2000-2022
MONSTA X - Love Killa: 3sPju6MEJhk7Sz8dsRBkLQ

Searching for: track:Way 4 Luv artist:Plave year:2000-2024
PLAVE - WAY 4 LUV: 1T6xi2QrnmwaebXGvWAjLg

Searching for: track:Perfect Night artist:Le Sserafim year:2000-2023
LE SSERAFIM - Perfect Night: 74X2u8JMVooG2QbjRxXwR8

Searching for: track:Deja Vu artist:Tomorrow X Together year:2000-2024
TOMORROW X TOGETHER - Deja Vu: 3aAnwyBJY9OLNLqSgd4fZU

Searching for: track:Killin' Me Good artist:Jihyo year:2000-2023
JIHYO - Killin’ Me Good: 3gafQxekHAbM52PxdX9SDR

Searching for: track:Be There For Me artist:NCT 127 year:2000-2024
NCT 127 - Be There For Me: 1k5b4EAewkP3sqLWcCmWRQ

Searching for: track:Lighthouse artist:Tempest year:2000-2024
TEMPEST - LIGHTHOUSE: 6Zv31tXdpqXPuMXIT4Pq7p

Searching for: track:S-Class artist:Stray Kids year:2000-2023
Stray Kids - 특 S-Class: 54zRGA28tVRKRmFCpywWko

Searching for: track:Cream Soda artist:Exo year:2000-2023
EXO - Cream Soda: 42h7yc9Rda1IOMYLACVg

In [177]:
## This song is consistently searched incorrectly by the API and I made the decision
## to hard-code the value rather than altering and potentially breaking the query
uniq_song_ids["track:Love artist:Monsta X year:2000-2022"] = "0dLenhMYqqeTlHrZqcXkm6"
uniq_song_ids["track:Love artist:Monsta X year:2000-2022"]

'0dLenhMYqqeTlHrZqcXkm6'

In [180]:
all_show_df_winners['Song ID'] = all_show_df_winners["Search Query"].map(uniq_song_ids)
all_show_df_winners.to_csv("all_award_show_winners.csv")

### Extracting Audio Features

At this point, we now have a fairly clean dataset along with the Spotify ID for each track. With this track id, we are going to create another dataset pertaining to the tracks audio features.

In [None]:
sp.audio_features("3LCwQoTrdQgHsGJE5gGVqx")

### **IMPORTANT**: Pivot to MusicXMatch

As of recent, the Spotify API has deprecated the endpoints for the `audio_features` and `audio_analysis` features. Therefore, I spent time to find an alternative which is MusicXMatch. This API has a similar endpoint to Spotify's `audio_features` endpoint and the additional capability of extracting a track's lyrics. 

To proceed this pivot to the alternative API, I'm going to import the previous dataframe and begin extracting information for each track. 

In [9]:
df_all_shows = pd.read_csv('all_award_show_winners.csv').drop("Unnamed: 0", axis=1)
df_all_shows.head(5)

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show,Search Query,Song ID
0,1121,"January 9, 2022",Ive,Eleven,8533,Inkigayo,track:Eleven artist:Ive year:2000-2022,7n2FZQsaLb7ZRfRPfEeIvr
1,1122,"January 16, 2022",Ive,Eleven,6583,Inkigayo,track:Eleven artist:Ive year:2000-2022,7n2FZQsaLb7ZRfRPfEeIvr
2,1123,"January 23, 2022",Ive,Eleven,5927,Inkigayo,track:Eleven artist:Ive year:2000-2022,7n2FZQsaLb7ZRfRPfEeIvr
3,1124,"January 30, 2022",Got the Beat,Step Back,5612,Inkigayo,track:Step Back artist:Got the Beat year:2000-...,3LCwQoTrdQgHsGJE5gGVqx
4,1125,"February 20, 2022",Got the Beat,Step Back,7224,Inkigayo,track:Step Back artist:Got the Beat year:2000-...,3LCwQoTrdQgHsGJE5gGVqx


*Minor edit that I missed: Snip the text with parentheses to match the other instances of Future Perfect*

In [3]:
df_all_shows.loc[df_all_shows.Song == "Future Perfect (Pass the MIC)", "Song"] = "Future Perfect"

In [4]:
uniq_artists = df_all_shows.Artist.unique().tolist()
uniq_songs = df_all_shows.Song.unique().tolist()
uniq_artists.sort(), uniq_songs.sort()
print(f"Artists: {uniq_artists}\nSongs: {uniq_songs}")

Artists: ['(G)I-dle', 'AB6IX', 'AKMU', 'Aespa', 'Apink', 'Astro', 'Ateez', 'BSS', 'BTS', 'BabyMonster', 'Baekhyun', 'Bibi', 'Big Bang', 'Billlie', 'Blackpink', 'BoyNextDoor', 'BtoB', 'CSR', 'Chungha', 'Cravity', 'DKZ', 'Day6', 'Doyoung', 'Dreamcatcher', 'Enhypen', 'Everglow', 'Evnne', 'Exo', 'Fromis 9', 'G-Dragon', 'Got the Beat', 'H1-Key', 'Han Seung-woo', 'Highlight', 'IU', 'Illit', 'Itzy', 'Ive', 'J-Hope&J. Cole', 'Jaechan', 'Jaehyun', 'Jennie', 'Jihyo', 'Jimin', 'Jin', 'Jisoo', 'Jo Yuri', 'Jung Kook', 'Kai', 'Kang Daniel', 'Kara', 'Kep1er', 'Key', 'Kim Jae-hwan', 'Kim Min-seok', 'Kim Woo Seok', 'Kiss of Life', 'Kwon Eunbi', 'Le Sserafim', 'Lee Chan-won', 'Lee Gi-kwang', 'Lee Young-ji', 'Lim Young-woong', 'Loona', 'Miyeon', 'Monsta X', 'N.SSign', 'NCT 127', 'NCT DoJaeJung', 'NCT Dream', 'NCT U', 'NCT Wish', 'Nayeon', 'NewJeans', 'NiziU', 'Nmixx', 'ONF', 'Oh My Girl', 'Oneus', 'Onew', 'P1Harmony', 'Pentagon', 'Plave', 'Psy', 'QWER', 'Red Velvet', 'Riize', 'Rosé&Bruno Mars', 'SF9', 'S

In [10]:
# If you need to make a high volume of requests, consider using proxies
import json
from musicxmatch_api import MusixMatchAPI
import urllib
api = MusixMatchAPI()

In [6]:
query_track = set(df_all_shows.Song.str.cat([df_all_shows['Artist'], df_all_shows['Song ID']], sep=" | ").tolist())

In [7]:
df_all_shows.loc[(df_all_shows.Artist == "NCT 127") & (df_all_shows.Song == "Be There For Me"), "Search Query"].tolist()

['track:Be There For Me artist:NCT 127 year:2000-2024',
 'track:Be There For Me artist:NCT 127 year:2000-2024']

### Gathering the Track IDs

In [4]:
import requests
import pandas as pd
import json
from time import sleep

# Load your DataFrame (Assuming it has 'artist' and 'track' columns)
df = pd.read_csv('all_award_show_winners.csv').drop("Unnamed: 0", axis=1) 

BASE_URL = "https://api.musixmatch.com/ws/1.1/"
APP_ID = "cf4d7395cd2d7f5618c4057426e93f26" 

results = []

for _, row in df.iterrows():
    artist = row["Artist"]
    track = row["Song"]
    
    query_url = f"{BASE_URL}matcher.track.get?apikey={APP_ID}&q_artist={artist}&q_track={track}&format=json"
    
    try:
        response = requests.get(query_url)
        data = response.json()
        
        if response.status_code == 200:
            results.append(data)
        else:
            print(f"Error {response.status_code}: {data}")

        sleep(1)  # Prevent hitting rate limits
        
    except Exception as e:
        print(f"Request failed for {artist} - {track}: {e}")

# Save results to a JSON file
with open("musixmatch_results.json", "w") as f:
    json.dump(results, f, indent=4)

print("Finished retrieving track data.")


Finished retrieving track data.


In [17]:
def extract_unique_artist_tracks(json_filename):
    """
    Reads a JSON file and extracts unique (artist, track) combinations.
    
    :param json_filename: Path to the JSON file containing Musixmatch results
    :return: Set of unique (artist, track) tuples
    """
    unique_tracks = set()

    try:
        # Load JSON data
        with open(json_filename, "r", encoding="utf-8") as file:
            data = json.load(file)

        # Ensure data is a list of dictionaries
        if not isinstance(data, list):
            raise ValueError(f"Expected JSON data to be a list, but got {type(data)}")

        # Extract artist-track pairs
        for entry in data:
            if not isinstance(entry, dict):
                print(f"Skipping invalid entry (not a dict): {entry}")
                continue

            try:
                if entry["message"]["body"] == "":
                    continue
                # Ensure the structure exists before accessing keys
                track_info = entry["message"]["body"]["track"]

                # Extract values
                artist_name = track_info["artist_name"].strip()
                track_name = track_info["track_name"].strip()

                # Debugging: Print extracted data
                print(f"Extracted: Artist = {artist_name}, Track = {track_name}")

                # Add to set if both values exist
                if artist_name and track_name:
                    unique_tracks.add((artist_name, track_name))

            except KeyError as e:
                print(f"Skipping entry due to missing key: {e}")

    except json.JSONDecodeError:
        print("Error: Invalid JSON format in file.")
    except FileNotFoundError:
        print(f"Error: File {json_filename} not found.")
    except Exception as e:
        print(f"Unexpected error: {e}")

    return unique_tracks

json_file = "musixmatch_results.json"
unique_combinations = extract_unique_artist_tracks(json_file)

Extracted: Artist = IVE, Track = ELEVEN
Extracted: Artist = IVE, Track = ELEVEN
Extracted: Artist = IVE, Track = ELEVEN
Extracted: Artist = GOT the beat, Track = Step Back
Extracted: Artist = GOT the beat, Track = Step Back
Extracted: Artist = TAEYEON, Track = INVU
Extracted: Artist = TAEYEON, Track = INVU
Extracted: Artist = TAEYEON, Track = INVU
Extracted: Artist = Kim MinSeok, Track = DrunKen Confession
Extracted: Artist = (G)I-DLE, Track = TOMBOY
Extracted: Artist = NCT DREAM, Track = Glitch Mode
Extracted: Artist = BIGBANG, Track = Still Life
Extracted: Artist = BIGBANG, Track = Still Life
Extracted: Artist = BIGBANG, Track = Still Life
Extracted: Artist = IVE, Track = LOVE DIVE
Extracted: Artist = PSY feat. SUGA, Track = That That (prod.&feat. SUGA of BTS)
Extracted: Artist = PSY feat. SUGA, Track = That That (prod.&feat. SUGA of BTS)
Extracted: Artist = PSY feat. SUGA, Track = That That (prod.&feat. SUGA of BTS)
Extracted: Artist = (G)I-DLE, Track = TOMBOY
Extracted: Artist = NC

In [None]:
def extract_all_artist_tracks(json_filename):
    """
    Reads a JSON file and extracts all of the song ids to merge it with our original data frame.
    
    :param json_filename: Path to the JSON file containing Musixmatch results
    :return: Set of unique (artist, track) tuples
    """
    tracks = list()

    try:
        # Load JSON data
        with open(json_filename, "r", encoding="utf-8") as file:
            data = json.load(file)

        # Ensure data is a list of dictionaries
        if not isinstance(data, list):
            raise ValueError(f"Expected JSON data to be a list, but got {type(data)}")

        # Extract artist-track pairs
        for entry in data:
            if not isinstance(entry, dict):
                print(f"Skipping invalid entry (not a dict): {entry}")
                continue

            try:
                if entry["message"]["body"] == "":
                    tracks.append("Not Found")
                # Ensure the structure exists before accessing keys
                track_info = entry["message"]["body"]["track"]

                # Extract values
                track_id = track_info["track_id"].strip()
                track_name = track_info["track_name"].strip()

                # Debugging: Print extracted data
                print(f"Extracted: Artist = {artist_name}, Track = {track_name}")

                # Add to set if both values exist
                if artist_name and track_name:
                    unique_tracks.add((artist_name, track_name))

            except KeyError as e:
                print(f"Skipping entry due to missing key: {e}")

    except json.JSONDecodeError:
        print("Error: Invalid JSON format in file.")
    except FileNotFoundError:
        print(f"Error: File {json_filename} not found.")
    except Exception as e:
        print(f"Unexpected error: {e}")

    return unique_tracks

json_file = "musixmatch_results.json"
unique_combinations = extract_unique_artist_tracks(json_file)

In [30]:
unique_df_tracks = set(zip(df["Artist"], df["Song"]))

In [29]:
def normalize_and_print_side_by_side(list1, list2, header1="DataFrame Tracks", header2="JSON Tracks", spacing=40):
    """
    Prints two lists side by side with aligned formatting after normalizing (lowercasing) and sorting.
    
    :param list1: First list (e.g., DataFrame unique tracks)
    :param list2: Second list (e.g., JSON unique tracks)
    :param header1: Title for first column
    :param header2: Title for second column
    :param spacing: Width allocated for each column
    """
    # Normalize to lowercase and sort
    list1 = sorted({(artist.lower(), track.lower()) for artist, track in list1})
    list2 = sorted({(artist.lower(), track.lower()) for artist, track in list2})

    # Print headers
    print(f"{header1.ljust(spacing)} {header2}")
    print("=" * (spacing * 2))

    # Print items side by side
    for item1, item2 in zip(list1, list2):
        print(f"{str(item1).ljust(spacing)} {str(item2)}")

    # Handle cases where lists have different lengths
    longer_list = list1 if len(list1) > len(list2) else list2
    extra_items = longer_list[len(list1):] if len(list1) < len(list2) else longer_list[len(list2):]

    for item in extra_items:
        print(f"{str(item).ljust(spacing)} -") if list1 is longer_list else print(f"{'-'.ljust(spacing)} {str(item)}")

# Example usage
normalize_and_print_side_by_side(unique_df_tracks, unique_combinations)


DataFrame Tracks                         JSON Tracks
('(g)i-dle', 'fate')                     ('(g)i-dle', 'fate')
('(g)i-dle', 'klaxon')                   ('(g)i-dle', 'klaxon')
('(g)i-dle', 'nxde')                     ('(g)i-dle', 'nxde')
('(g)i-dle', 'queencard')                ('(g)i-dle', 'queencard')
('(g)i-dle', 'super lady')               ('(g)i-dle', 'super lady')
('(g)i-dle', 'tomboy')                   ('(g)i-dle', 'tomboy')
('ab6ix', 'loser')                       ('ab6ix', 'loser')
('aespa', 'armageddon')                  ('aespa', 'armageddon')
('aespa', 'drama')                       ('aespa', 'drama')
('aespa', 'girls')                       ('aespa', 'girls')
('aespa', 'spicy')                       ('aespa', 'spicy')
('aespa', 'supernova')                   ('aespa', 'supernova')
('aespa', 'up')                          ('aespa', 'up (karina solo)')
('aespa', 'whiplash')                    ('aespa', 'whiplash')
('akmu', 'love lee')                     ('akmu', 'love l

**Wrong Search**
- Seventeen - _World
- Seventeen - love, money, fame is a remix
- TXT - Good Boy Gone Bad is a remix
- Jihyo - Killin' Me Good is english ver
- TXT - Chasing That Feeling found a song by viva
- Jhope Jcole song is just the solo
- NCT - Songbird is the Japanese version

**404 error**
- Le Sserafim - Eve Psyche & Bluebeards Wife
- Wheein - Make Me Happy **Doesn't appear to be in their catalog**
- TXT - Sugar Rush Ride
- CSR - ♡Ticon
- Almost all of Fromis9 songs 3 (Songs)
- lee gi-kwang - predator
- Yooa - Selfish

Insights Findings:
- Wheein appears as Whee In
- TXT can be found under TOMORROW X TOGETHER, which fixes all of their issues
- ♡Ticon => LOVETICON
- Fromis_9 songs have difficulty finding despite being a verified artist. Therefore, opting to use their artist id for their queries
- Lee Gi-Kwang => LEEGIKWANG
- Yooa => 유아
- SEVENTEEN - _WORLD cannot be via queries so I found its trackID: 242182341
- Most of the queries can be resolved using the track search endpoint instead of the matcher endpoint
- Jihyo => Ji Hyo
- Songbird => Songbird (Korean Version)

In [19]:
import requests
import urllib.parse

track = "Songbird (Korean)"
artist = "NCT"
APP_ID = "cf4d7395cd2d7f5618c4057426e93f26"
BASE_URL = "https://api.musixmatch.com/ws/1.1/"

## Matcher Query
query_url = f"{BASE_URL}matcher.track.get?apikey={APP_ID}&q_artist={artist}&q_track={track}&format=json"

## Track Search Query
query_url = f"{BASE_URL}track.search?apikey={APP_ID}&q_artist={artist}&q_track={track}&page_size=2&page=1&s_track_rating=desc&format=json"

## Find artist
# query_url = f"{BASE_URL}artist.search?apikey={APP_ID}&q_artist={artist}&page_size=1"

## Test Query
# query_url = f"{BASE_URL}track.search?apikey={APP_ID}&f_artist_id=34857912&q_track=menow&page_size=1&page=1&format=json"
# query_url = f"{BASE_URL}track.search?apikey={APP_ID}&f_artist_id=31899946&f_album_id=53067298&q_track=Circles&page_size=2&page=1&format=json"
# query_url = f"{BASE_URL}album.tracks.get?apikey={APP_ID}&album_id=53067298&page_size=2&page=1"

# Get mood values
query_url = f"{BASE_URL}track.lyrics.mood.get?apikey={APP_ID}&commontrack_id=5920049"



search = requests.get(query_url).json()
print(json.dumps(search, indent=4))

{
    "message": {
        "header": {
            "status_code": 401,
            "execute_time": 0.01217794418335,
            "hint": "moods not enabled on this plan"
        },
        "body": []
    }
}


In [10]:
import requests

track = "Eve, Psyche & The Bluebeard's wife"
artist = "LE SSERAFIM"
APP_ID = "cf4d7395cd2d7f5618c4057426e93f26"
BASE_URL = "https://api.musixmatch.com/ws/1.1/"

## Matcher Query
# query_url = f"{BASE_URL}matcher.track.get?apikey={APP_ID}&q_artist={artist}&q_track={track}&format=json"

## Track Search Query
query_url = f"{BASE_URL}track.search?apikey={APP_ID}&q_artist={artist}&q_track={track}&format=json&page_size=1"

search = requests.get(query_url).json()
print(json.dumps(search, indent=4))

{
    "message": {
        "header": {
            "status_code": 200,
            "execute_time": 0.025938987731934,
            "available": 5
        },
        "body": {
            "track_list": [
                {
                    "track": {
                        "track_id": 256226625,
                        "track_name": "Eve, Psyche & The Bluebeard\u2019s wife",
                        "track_name_translation_list": [],
                        "track_rating": 71,
                        "commontrack_id": 159108270,
                        "instrumental": 0,
                        "explicit": 1,
                        "has_lyrics": 1,
                        "has_subtitles": 1,
                        "has_richsync": 1,
                        "num_favourite": 34,
                        "album_id": 57090328,
                        "album_name": "UNFORGIVEN",
                        "artist_id": 53333048,
                        "artist_name": "LE SSERAFIM",
           