In [1]:
import pandas as pd
from pathlib import Path 

In [2]:
# Folder with the tabular exports, relative to the notebook's working directory.
tables_path = Path('data') / 'tables'
# Load the merged award-show table and preview the first rows.
df = pd.read_csv(tables_path.joinpath('merged_award_show_winners.csv'))
df.head()

Unnamed: 0,Show,Date,Artist,Song,Total,Winner,file_path,vggish_dance,vggish_party,vggish_happy,vggish_sad,effnet_party,effnet_happy,effnet_sad,effnet_approachability,effnet_engagement,effnet_timbre_bright,tempo
0,M Countdown,2022-10-27,(G)I-dle,Nxde,0,1,data/audio/nxde_(g)i-dle.flac,91.352266,20.125487,73.873174,82.415563,9.883992,65.411216,90.908891,73.7064,94.578671,50.409603,136.0
1,Show! Music Core,2023-06-10,(G)I-dle,Queencard,6460,1,data/audio/queencard_(g)i-dle.flac,92.974657,14.019252,81.47186,88.088149,1.055119,87.547588,94.290292,80.357134,98.429692,48.345432,130.0
2,Show! Music Core,2023-06-03,(G)I-dle,Queencard,8133,1,data/audio/queencard_(g)i-dle.flac,92.974657,14.019252,81.47186,88.088149,1.055119,87.547588,94.290292,80.357134,98.429692,48.345432,130.0
3,Inkigayo,2023-06-04,(G)I-dle,Queencard,9423,1,data/audio/queencard_(g)i-dle.flac,92.974657,14.019252,81.47186,88.088149,1.055119,87.547588,94.290292,80.357134,98.429692,48.345432,130.0
4,Show Champion,2022-11-09,(G)I-dle,Nxde,5435,1,data/audio/nxde_(g)i-dle.flac,91.352266,20.125487,73.873174,82.415563,9.883992,65.411216,90.908891,73.7064,94.578671,50.409603,136.0


### Minor Processing

I noticed some inconsistencies in the naming conventions used in the `Artist` column, so I am going back to rename any artist whose name conflicts with the spelling used in their winning entries (this only applies if that artist has won before; otherwise the name is left unchanged).

In [3]:
import pandas as pd
from thefuzz import process

def fuzzy_match_artists(non_winners, df_full, artist_col="Artist", score_cutoff=85):
    """Fuzzy-match each name in ``non_winners`` against the artists in ``df_full``.

    Parameters
    ----------
    non_winners : iterable of str
        Artist names to look up.
    df_full : pandas.DataFrame
        Frame whose ``artist_col`` column supplies the candidate names.
        Missing values and duplicates are dropped before matching.
    artist_col : str, default "Artist"
        Name of the column in ``df_full`` holding the candidate names.
    score_cutoff : int, default 85
        Forwarded to ``process.extractOne``; when that call returns ``None``
        the name is recorded as unmatched.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``non_winners`` with the columns
        ``original_artist``, ``matched_artist`` and ``match_score``.
        ``matched_artist`` and ``match_score`` are empty for names without a
        match. An empty ``non_winners`` yields an empty frame that still
        carries these three columns, so callers can index them safely.
    """
    result_columns = ["original_artist", "matched_artist", "match_score"]
    candidates = df_full[artist_col].dropna().unique().tolist()

    rows = []
    for artist in non_winners:
        best = process.extractOne(artist, candidates, score_cutoff=score_cutoff)
        # extractOne returns None when nothing reaches the cutoff; otherwise a
        # (match, score) pair because the choices are passed as a list.
        matched, score = best if best is not None else (None, None)
        rows.append({
            "original_artist": artist,
            "matched_artist": matched,
            "match_score": score
        })

    # Pin the columns explicitly: pd.DataFrame([]) would otherwise have none.
    return pd.DataFrame(rows, columns=result_columns)


In [9]:
# Count wins per artist inside this cell. The original relied on `most_wins`,
# which is only defined further down the notebook, so a fresh
# Restart & Run All failed here with a NameError.
wins_per_artist = df.groupby('Artist')['Winner'].sum()
non_winners = wins_per_artist[wins_per_artist == 0].index.tolist()

# Rows belonging to artists with at least one win; their spellings are the
# reference names that the non-winners are matched against.
winners = df[~df.Artist.isin(non_winners)].copy()

matches = fuzzy_match_artists(non_winners, winners, score_cutoff=92)
# Unmatched artists map to None here and are left untouched when the mapping
# is applied with a fallback to the original name.
match_dict = dict(zip(matches['original_artist'], matches['matched_artist']))

# Show only the artists that found a match, best scores first.
display(
    matches[matches['match_score'].notna()]
    .sort_values(by=['match_score', 'original_artist'], ascending=[False, True])
)

Unnamed: 0,original_artist,matched_artist,match_score
104,(G)I-DLE,(G)I-dle,100.0
97,BABYMONSTER,BabyMonster,100.0
87,DAY6,Day6,100.0
79,ENHYPEN,Enhypen,100.0
81,EVERGLOW,Everglow,100.0
82,EVNNE,Evnne,100.0
74,EXO,Exo,100.0
45,ILLIT,Illit,100.0
47,ITZY,Itzy,100.0
48,IVE,Ive,100.0


In [10]:
# Swap in the matched spelling where one exists; otherwise keep the original name.
df['Artist'] = df['Artist'].map(match_dict).fillna(df['Artist'])

One last inconsistency that wasn't found via fuzzy matching was that Hwang Karam's name was misspelled for an entry.

In [11]:
# Manual correction for the one misspelling the fuzzy matching did not catch.
df.loc[df['Artist'].eq('Hwang Kagam'), 'Artist'] = 'Hwang Karam'
df.head()

Unnamed: 0,Show,Date,Artist,Song,Total,Winner,file_path,vggish_dance,vggish_party,vggish_happy,vggish_sad,effnet_party,effnet_happy,effnet_sad,effnet_approachability,effnet_engagement,effnet_timbre_bright,tempo
0,M Countdown,2022-10-27,(G)I-dle,Nxde,0,1,data/audio/nxde_(g)i-dle.flac,91.352266,20.125487,73.873174,82.415563,9.883992,65.411216,90.908891,73.7064,94.578671,50.409603,136.0
1,Show! Music Core,2023-06-10,(G)I-dle,Queencard,6460,1,data/audio/queencard_(g)i-dle.flac,92.974657,14.019252,81.47186,88.088149,1.055119,87.547588,94.290292,80.357134,98.429692,48.345432,130.0
2,Show! Music Core,2023-06-03,(G)I-dle,Queencard,8133,1,data/audio/queencard_(g)i-dle.flac,92.974657,14.019252,81.47186,88.088149,1.055119,87.547588,94.290292,80.357134,98.429692,48.345432,130.0
3,Inkigayo,2023-06-04,(G)I-dle,Queencard,9423,1,data/audio/queencard_(g)i-dle.flac,92.974657,14.019252,81.47186,88.088149,1.055119,87.547588,94.290292,80.357134,98.429692,48.345432,130.0
4,Show Champion,2022-11-09,(G)I-dle,Nxde,5435,1,data/audio/nxde_(g)i-dle.flac,91.352266,20.125487,73.873174,82.415563,9.883992,65.411216,90.908891,73.7064,94.578671,50.409603,136.0


## Initial Inspections

In this section, I want to explore some basic insights in the data, such as which group has the most wins, who is always on the cusp of achieving their first win, and the trends in the vote totals observed within certain time periods. Most of this analysis will likely pertain to the first 6 columns: `Show`, `Date`, `Artist`, `Song`, `Total`, and `Winner`.

In [None]:
# Total wins per artist (sum of the `Winner` column), highest first.
most_wins = (
    df.groupby('Artist')['Winner']
    .sum()
    .sort_values(ascending=False)
)
display(most_wins.head(10))

Artist
Ive            64
(G)I-dle       42
NewJeans       35
Le Sserafim    34
Aespa          27
Stray Kids     26
Seventeen      26
NCT Dream      23
Ateez          19
Jung Kook      18
Name: Winner, dtype: int64

In [17]:
# Artists whose win total is exactly zero.
never_won = most_wins.loc[most_wins.eq(0)].index
never_won

Index(['xikers', 'A.C.E', 'ALL(H)OURS', 'ARrC', 'ATBO', 'YOUNG POSSE',
       'DAVICHI', 'BamBam', 'Tzuyu', 'UNIS', 'BBGIRLS', 'BIGNaughty', 'BOL4',
       'WENDY', 'Lee Jegyu', 'Lim Jae-hyun', 'NouerA', 'Odd Eye Circle',
       'Kim Sejeong', 'LAPILLUS', 'LUN8', 'Kassy', 'KickFlip', 'Kim Hee-Jae',
       'Parc Jae-jung', 'Jukjae', 'JxW', 'KARD', 'KUN8', 'JD1', 'RESCENE',
       'ROSÉ', 'SAY MY NAME', 'SOMI', 'Infinite', 'JAESSBEE', 'Secret Number',
       'Sung Si-Kyung, Naul', 'THE NEW SIX', 'HUI', 'HWASA', 'Hwang Karam',
       'IRENE', 'TIOT', 'TONY', 'TRI.BE', 'GENBLUE', 'GOT7', 'Golden Child',
       'GroovyRoom', 'HORI7ON', 'HOSHI X WOOZI', 'FANTASY BOYS', 'FIFTY FIFTY',
       'Taeyoung', 'Tophyun', 'DRIPPIN', 'DXMON', 'EPEX', 'CHUU', 'CIX', 'D.O',
       'WEi', 'Woody', 'XG', 'Xdinary Heroes', '8TURN', 'ZiA', '82MAJOR',
       'YOUNITE', 'YUQI', 'ZOZAZZ', 'NEXZ', 'NOWADAYS', 'MEOVV', 'MIRAE',
       'Moon Byul', 'Lee Chae-yeon'],
      dtype='object', name='Artist')