## Scraping Data

I'm interested in visualizing the similarity of Award Show Winning Songs from popular TV-programs such as Music Bank, Ingikayo, Hanteo, etc. This data will be scraped from the wikipedia's for each program as they are well-preserved and appear to be fairly accurate, even maintaining the date/episode number for some of the tables. This scraped data will then be parsed/cleaned to extract Spotify data that will be observed later in this process.

In [146]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
import pandas as pd
import os

pd.set_option('display.max_rows', 100)

In [147]:
def scrape_award_show_wiki(music_program: str, year: int) -> pd.DataFrame:
    """
    Scrapes a Wikipedia table for award show winners, handling row and column spans.

    Args:
        music_program (str): The music program (e.g., "Music Bank", "Hanteo").
        year (int): The year to scrape.

    Returns:
        pd.DataFrame: DataFrame containing the parsed table data.
    """
    url = f"https://en.wikipedia.org/wiki/List_of_{music_program.replace(' ', '_')}_Chart_winners_({year})"
    url = quote(url, safe=":/()")

    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page. Status Code: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all("table", {"class": "wikitable"})

    if len(tables) < 2:
        raise Exception("This page does not contain 2 tables")

    table = tables[-1]

    # Extract headers
    header_row = table.find('thead')
    if header_row:
        header_row = header_row.find('tr')
    else:
        header_row = table.find('tr')
    headers = [th.get_text(strip=True) for th in header_row.find_all('th')[:-1]]

    # Extract rows with rowspan/colspan handling
    rows = []
    active_rowspans = []

    for tr in table.find('tbody').find_all("tr")[1:]:
        current_row = [None] * len(headers)

        # Apply active rowspans
        for span in active_rowspans:
            if span['remaining'] > 0:
                start_col = span['start_col']
                end_col = start_col + span['colspan']
                for col in range(start_col, min(end_col, len(current_row))):
                    current_row[col] = span['value']
                span['remaining'] -= 1

        # Remove expired rowspans
        active_rowspans = [span for span in active_rowspans if span['remaining'] > 0]

        # Process current cells
        cells = tr.find_all(['th', 'td'])[:-1]  # Exclude last cell as per original
        current_col = 0

        for cell in cells:
            # Skip filled columns
            while current_col < len(current_row) and current_row[current_col] is not None:
                current_col += 1

            if current_col >= len(current_row):
                break
            
            
            rowspan = int(str(cell.get('rowspan', 1)).replace('"',"").replace("'", ""))
            colspan = int(str(cell.get('colspan', 1)).replace('"',"").replace("'", ""))
            cell_text = cell.get_text(strip=True)

            # Determine end column
            end_col = current_col + colspan
            end_col = min(end_col, len(current_row))

            # Fill current row
            for col in range(current_col, end_col):
                current_row[col] = cell_text

            # Record rowspan for future rows
            if rowspan > 1:
                active_rowspans.append({
                    'value': cell_text,
                    'remaining': rowspan - 1,
                    'start_col': current_col,
                    'colspan': colspan
                })

            current_col = end_col

        # Replace None with empty strings
        current_row = [cell if cell is not None else '' for cell in current_row]
        rows.append(current_row)

    df = pd.DataFrame(rows, columns=headers)
    df = preprocess_special_chars(df, music_program, year)
    
    return df

In [148]:
def preprocess_special_chars(df, music_program, year):
    df["Award Show"] = music_program
    
    # Define the regex pattern to remove special characters and bracketed sequences
    pattern = r"[‡†]|\[[a-zA-Z]\]$"

    # Check if 'Song' column contains '‡', '†', or bracketed characters like [c], [b]
    contains_special_chars_Episode = df["Episode"].str.contains(pattern, na=False)
    contains_special_chars_Artist = df["Artist"].str.contains(pattern, na=False)
    contains_special_chars_song = df["Song"].str.contains(pattern, na=False)
    contains_special_chars_points = df["Points"].str.contains(pattern, na=False)
    contains_special_chars_dates = df["Date"].str.contains(pattern, na=False)


    # Remove special characters and bracketed sequences from the 'Song' and 'Points' columns
    df.loc[contains_special_chars_Episode, "Episode"] = df.loc[contains_special_chars_Episode, "Episode"].str.replace(pattern, "", regex=True)
    df.loc[contains_special_chars_Artist, "Artist"] = df.loc[contains_special_chars_Artist, "Artist"].str.replace(pattern, "", regex=True)
    df.loc[contains_special_chars_song, "Song"] = df.loc[contains_special_chars_song, "Song"].str.replace(pattern, "", regex=True)
    df.loc[contains_special_chars_points, "Points"] = df.loc[contains_special_chars_points, "Points"].str.replace(pattern, "", regex=True)
    df.loc[contains_special_chars_dates, "Date"] = df.loc[contains_special_chars_dates, "Date"].str.replace(pattern, "", regex=True)

    # Attach the year onto the 'Date' column
    df["Date"] = df["Date"] + f", {year}"
    # Strip quotes from all rows in the 'Song' column
    df["Song"] = df["Song"].str.replace('"', '', regex=False).fillna('')
    df["Song"] = df["Song"].str.strip()

    df["Points"] = df["Points"].str.replace('—','0',regex=False)
    df["Points"] = df["Points"].str.replace(',','',regex=False)

    return df

In [149]:
music_bank_list = []
for year in range(2022, 2026):
    music_bank_list.append(scrape_award_show_wiki("Music Bank", year))

music_bank_df = pd.concat(music_bank_list).reset_index(drop=True)
music_bank_df

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
0,1102,"January 7, 2022",NCT U,Universe,5930,Music Bank
1,1103,"January 14, 2022",Kep1er,Wa Da Da,3678,Music Bank
2,1104,"January 21, 2022",Enhypen,Blessed-Cursed,8377,Music Bank
3,1105,"January 28, 2022",IU,Winter Sleep,2949,Music Bank
4,1106,"February 4, 2022",Pentagon,Feelin' Like,3426,Music Bank
...,...,...,...,...,...,...
167,1237,"March 21, 2025",Le Sserafim,Hot,9917,Music Bank
168,1238,"March 28, 2025",Nmixx,Know About Me,11040,Music Bank
169,—,"April 4, 2025",Ten,Stunner,7752,Music Bank
170,1239,"April 11, 2025",Close Your Eyes,All My Poetry,5507,Music Bank


In [150]:
m_countdown_list = []
for year in range(2022, 2026):
    m_countdown_list.append(scrape_award_show_wiki("M Countdown", year))

m_countdown_df = pd.concat(m_countdown_list, axis=0).reset_index(drop=True)
m_countdown_df

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
0,—,"January 6, 2022",No Broadcast or Winner,No Broadcast or Winner,No Broadcast or Winner,M Countdown
1,735,"January 13, 2022",Kep1er,Wa Da Da,6500,M Countdown
2,736,"January 20, 2022",Kep1er,Wa Da Da,6569,M Countdown
3,737,"January 27, 2022",Wheein,Make Me Happy,7220,M Countdown
4,738,"February 3, 2022",Got the Beat,Step Back,0,M Countdown
...,...,...,...,...,...,...
167,878,"March 20, 2025",Jennie,Like Jennie,6856,M Countdown
168,879,"March 27, 2025",Jennie,Like Jennie,6719,M Countdown
169,880,"April 3, 2025",Jennie,Like Jennie,6636,M Countdown
170,881,"April 10, 2025",J-Hope,Mona Lisa,7638,M Countdown


In [151]:
award_programs = ["Inkigayo", "M Countdown", "Music Bank", "Show Champion", "Show! Music Core", "The Show"]
all_show_list = []
for show in award_programs:
     for year in range(2022, 2026):
        all_show_list.append(scrape_award_show_wiki(show, year)) 

all_show_df = pd.concat(all_show_list, axis=0).reset_index(drop=True)
all_show_df

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
0,—,"January 2, 2022","No show, winner not announced","No show, winner not announced",No show winner not announced,Inkigayo
1,1121,"January 9, 2022",Ive,Eleven,8533,Inkigayo
2,1122,"January 16, 2022",Ive,Eleven,6583,Inkigayo
3,1123,"January 23, 2022",Ive,Eleven,5927,Inkigayo
4,1124,"January 30, 2022",Got the Beat,Step Back,5612,Inkigayo
...,...,...,...,...,...,...
1028,372,"March 25, 2025",STAYC,Bebe,7421,The Show
1029,373,"April 1, 2025",Ten,Stunner,8600,The Show
1030,374,"April 8, 2025",Close Your Eyes,All My Poetry,8605,The Show
1031,375,"April 15, 2025",Izna,Sign,8450,The Show


In [152]:
all_show_df_winners = all_show_df.loc[(all_show_df["Points"].str.len() < 7) & (all_show_df["Points"].str.len() > 0), :].reset_index(drop=True)

#### Cleaning Song and Artist Names

There are entries where the current naming convention of a given track or artist will make it impossible to find through the Spotify API. This section will focus on finding those troublesome values and altering them for better search queries.

In [153]:
all_show_df_winners.loc[all_show_df_winners.Song == "That That", :]

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
16,1137,"May 15, 2022",Psy,That That,9402,Inkigayo
17,1138,"May 22, 2022",Psy,That That,9108,Inkigayo
18,1139,"May 29, 2022",Psy,That That,7620,Inkigayo
162,753,"May 19, 2022",Psyfeat.Suga,That That,0,M Countdown
473,433,"May 11, 2022",Psyfeat.Suga,That That,4514,Show Champion
474,434,"May 18, 2022",Psyfeat.Suga,That That,4506,Show Champion
600,766,"May 21, 2022",Psy,That That,6586,Show! Music Core


In [154]:
all_show_df_winners.loc[all_show_df_winners.Artist.str.contains(r"feat")]

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
162,753,"May 19, 2022",Psyfeat.Suga,That That,0,M Countdown
473,433,"May 11, 2022",Psyfeat.Suga,That That,4514,Show Champion
474,434,"May 18, 2022",Psyfeat.Suga,That That,4506,Show Champion


In [155]:
all_show_df_winners.loc[all_show_df_winners.Artist.str.contains(r"feat"), "Artist"] = "Psy"

all_show_df_winners.loc[all_show_df_winners.Song == "That That", :]

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
16,1137,"May 15, 2022",Psy,That That,9402,Inkigayo
17,1138,"May 22, 2022",Psy,That That,9108,Inkigayo
18,1139,"May 29, 2022",Psy,That That,7620,Inkigayo
162,753,"May 19, 2022",Psy,That That,0,M Countdown
473,433,"May 11, 2022",Psy,That That,4514,Show Champion
474,434,"May 18, 2022",Psy,That That,4506,Show Champion
600,766,"May 21, 2022",Psy,That That,6586,Show! Music Core


In [156]:
all_show_df_winners.loc[all_show_df_winners.Song.str.contains(r"\([a-zA-Z]*\)")]

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show
243,834,"March 21, 2024",V,Fri(end)s,7950,M Countdown
244,835,"March 28, 2024",V,Fri(end)s,7767,M Countdown
633,801,"March 18, 2023",Onew,O (Circle),6590,Show! Music Core
702,874,"October 19, 2024",Aespa,Up(Karinasolo),6698,Show! Music Core


In [157]:
all_show_df_winners.loc[all_show_df_winners.Song.str.contains(r"Karina"), "Song"] = "Up"
all_show_df_winners.iloc[655]

Episode                     825
Date          September 9, 2023
Artist                 Jungkook
Song                      Seven
Points                     7026
Award Show     Show! Music Core
Name: 655, dtype: object

In [158]:
# Replace the song title directly using .loc to avoid the SettingWithCopyWarning
all_show_df_winners.loc[all_show_df_winners["Song"] == "A Travel to the Sky", "Song"] = "하늘 여행"
all_show_df_winners.loc[all_show_df_winners["Song"] == "Wish Lanterns", "Song"] = "풍등"
all_show_df_winners.loc[all_show_df_winners["Song"] == "My Name is Malgeum", "Song"] = "My Name is Malguem"
all_show_df_winners.loc[all_show_df_winners["Song"] == "Bon Voyage", "Song"] = "BONVOYAGE"
all_show_df_winners.loc[all_show_df_winners.Song.str.contains("(Korean ver.)"), "Song"] = all_show_df_winners.loc[all_show_df_winners.Song.str.contains("(Korean ver.)"), "Song"].str.replace(r"(Korean ver.)", "")

all_show_df_winners.loc[all_show_df_winners["Artist"] == 'Jungkook', "Artist"] = "Jung Kook"
all_show_df_winners.loc[all_show_df_winners["Artist"] == 'Jo Yu-ri', "Artist"] = "Jo Yuri"
all_show_df_winners.loc[all_show_df_winners["Artist"] == 'Park Jae-chan', "Artist"] = "Jaechan"
all_show_df_winners.loc[all_show_df_winners["Song"] == "At That Moment", "Artist"] = "WSG Wannabe (Gaya-G)"

  all_show_df_winners.loc[all_show_df_winners.Song.str.contains("(Korean ver.)"), "Song"] = all_show_df_winners.loc[all_show_df_winners.Song.str.contains("(Korean ver.)"), "Song"].str.replace(r"(Korean ver.)", "")


#### Pre-processing Data

While trying to extract the songs and their respective Spotify data, I came across several issues. For example, there are some songs such as "A Trip to the Sky" that isn't listed on Spotify in English but instead represented in Hangul (Korean written language). Another issue that I was encountering is that the search was returning the wrong song, especially if the artist had a track title that is a subset of another track title of theirs. 

In [159]:
# Construct the "Search Query" column using .loc for proper assignments
all_show_df_winners.loc[:,"Search Query"] = (
    "track:" + all_show_df_winners["Song"] + " artist:" +
     all_show_df_winners["Artist"] + " year:2000-" + 
     all_show_df_winners["Date"].str.slice(-4)
)

# Display the updated DataFrame
all_show_df_winners


Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show,Search Query
0,1121,"January 9, 2022",Ive,Eleven,8533,Inkigayo,track:Eleven artist:Ive year:2000-2022
1,1122,"January 16, 2022",Ive,Eleven,6583,Inkigayo,track:Eleven artist:Ive year:2000-2022
2,1123,"January 23, 2022",Ive,Eleven,5927,Inkigayo,track:Eleven artist:Ive year:2000-2022
3,1124,"January 30, 2022",Got the Beat,Step Back,5612,Inkigayo,track:Step Back artist:Got the Beat year:2000-...
4,1125,"February 20, 2022",Got the Beat,Step Back,7224,Inkigayo,track:Step Back artist:Got the Beat year:2000-...
...,...,...,...,...,...,...,...
806,371,"March 11, 2025",Hearts2Hearts,The Chase,8500,The Show,track:The Chase artist:Hearts2Hearts year:2000...
807,372,"March 25, 2025",STAYC,Bebe,7421,The Show,track:Bebe artist:STAYC year:2000-2025
808,373,"April 1, 2025",Ten,Stunner,8600,The Show,track:Stunner artist:Ten year:2000-2025
809,374,"April 8, 2025",Close Your Eyes,All My Poetry,8605,The Show,track:All My Poetry artist:Close Your Eyes yea...


In [160]:
# Group by 'Artist' and count occurrences of each song
song_counts = (
    all_show_df_winners
     .groupby(["Artist", "Song"])["Song"]
     .value_counts()
     .reset_index(name="Frequency"))
song_counts.loc[range(100,157), :]

Unnamed: 0,Artist,Song,Frequency
100,Jihyo,Killin' Me Good,1
101,Jimin,Closer Than This,1
102,Jimin,Like Crazy,4
103,Jimin,Set Me Free Pt. 2,1
104,Jimin,Smeraldo Garden Marching Band,2
105,Jimin,Who,2
106,Jin,The Astronaut,1
107,Jisoo,Earthquake,2
108,Jisoo,Flower,9
109,Jo Yuri,Love Shhh!,1


In [161]:
all_show_df_winners

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show,Search Query
0,1121,"January 9, 2022",Ive,Eleven,8533,Inkigayo,track:Eleven artist:Ive year:2000-2022
1,1122,"January 16, 2022",Ive,Eleven,6583,Inkigayo,track:Eleven artist:Ive year:2000-2022
2,1123,"January 23, 2022",Ive,Eleven,5927,Inkigayo,track:Eleven artist:Ive year:2000-2022
3,1124,"January 30, 2022",Got the Beat,Step Back,5612,Inkigayo,track:Step Back artist:Got the Beat year:2000-...
4,1125,"February 20, 2022",Got the Beat,Step Back,7224,Inkigayo,track:Step Back artist:Got the Beat year:2000-...
...,...,...,...,...,...,...,...
806,371,"March 11, 2025",Hearts2Hearts,The Chase,8500,The Show,track:The Chase artist:Hearts2Hearts year:2000...
807,372,"March 25, 2025",STAYC,Bebe,7421,The Show,track:Bebe artist:STAYC year:2000-2025
808,373,"April 1, 2025",Ten,Stunner,8600,The Show,track:Stunner artist:Ten year:2000-2025
809,374,"April 8, 2025",Close Your Eyes,All My Poetry,8605,The Show,track:All My Poetry artist:Close Your Eyes yea...


In [162]:
all_show_df_winners.to_csv('data/tables/updated_all_award_show_winners.csv')

In [163]:
all_show_df_winners.loc[all_show_df_winners.Artist == 'Nmixx']

Unnamed: 0,Episode,Date,Artist,Song,Points,Award Show,Search Query
92,1213,"January 28, 2024",Nmixx,Dash,5901,Inkigayo,track:Dash artist:Nmixx year:2000-2024
117,1238,"September 1, 2024",Nmixx,See That?,6591,Inkigayo,track:See That? artist:Nmixx year:2000-2024
141,1262,"March 30, 2025",Nmixx,Know About Me,5070,Inkigayo,track:Know About Me artist:Nmixx year:2000-2025
235,826,"January 25, 2024",Nmixx,Dash,9338,M Countdown,track:Dash artist:Nmixx year:2000-2024
264,856,"August 29, 2024",Nmixx,See That?,10293,M Countdown,track:See That? artist:Nmixx year:2000-2024
396,1192,"January 26, 2024",Nmixx,Dash,11952,Music Bank,track:Dash artist:Nmixx year:2000-2024
427,1219,"August 30, 2024",Nmixx,See That?,11821,Music Bank,track:See That? artist:Nmixx year:2000-2024
457,1238,"March 28, 2025",Nmixx,Know About Me,11040,Music Bank,track:Know About Me artist:Nmixx year:2000-2025
508,469,"March 29, 2023",Nmixx,Love Me Like This,9176,Show Champion,track:Love Me Like This artist:Nmixx year:2000...
583,548,"March 26, 2025",Nmixx,Know About Me,6865,Show Champion,track:Know About Me artist:Nmixx year:2000-2025
