In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests as rq
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

## Collecting and Scraping Data from Melon

In [94]:
titles = []
artists = []
release_dates = []
genres = []
num_comments = []
num_likes = []

# Open chrome window to Melon Top 100 webpage to extract HTML
driver = sl.webdriver.Chrome()
driver.get('https://www.melon.com/chart/index.htm')
time.sleep(5)  

# Find all song information links on Top 100
top_100_links = driver.find_elements(By.CSS_SELECTOR, 'a.song_info')

# Parse title, artist name, release date, genre, # likes, and star ratings of each song
for link in top_100_links:
    link.click()
    time.sleep(3)
    html = driver.page_source
    # individual song parsing
    parsed = bs(html, 'html.parser')

    title = parsed.find('div', {'class:', 'song_name'}).strong.next_sibling.strip()
    artist = parsed.find('a', {'class:', 'artist_name'}).text.strip()
    release_date = parsed.find('dt', text='발매일').find_next_sibling('dd').text.strip()
    genre = parsed.find('dt', text='장르').find_next_sibling('dd').text.strip()
    num_comment = parsed.find('dt', text='댓글').find_next_sibling('dd').text.strip()[:-1]
    num_like = parsed.find('span', {'id': 'd_like_count'}).text.strip()
    
    driver.back()
    time.sleep(3)
    
    titles.append(title)
    artists.append(artist)
    release_dates.append(release_date)
    genres.append(genre)
    num_comments.append(num_comment)
    num_likes.append(num_like)
    
driver.quit()

In [95]:
df = pd.DataFrame()

In [146]:
df['titles'] = titles
df['artists'] = artists
df['release_date'] = release_dates
df['genres'] = genres
df['num_comments'] = num_comments
df['num_likes'] = num_likes

In [147]:
display(df)

Unnamed: 0,titles,title_lengths,artists,release_date,genres,num_likes,num_comments,release_date_quantized
0,Kitsch,6,IVE (아이브),2023.03.27,댄스,32162,1062,0.237430
1,Ditto,5,NewJeans,2022.12.19,댄스,243157,2926,0.969274
2,OMG,3,NewJeans,2023.01.02,댄스,166401,1678,0.000000
3,Hype boy,8,NewJeans,2022.08.01,댄스,249280,1946,0.583799
4,Teddy Bear,10,STAYC(스테이씨),2023.02.14,댄스,65309,1555,0.122905
...,...,...,...,...,...,...,...,...
95,네가 보고 싶은 건 자연스러운 거겠지,20,신예영,2023.01.24,발라드,22199,120,0.061453
96,봄 사랑 벚꽃 말고,10,HIGH4 (하이포),2014.04.08,발라드,216020,405,0.268156
97,Off My Face,11,Justin Bieber,2021.10.08,POP,160843,280,0.770950
98,그댄 행복에 살텐데 (2022),17,최유리,2022.12.27,"발라드, 인디음악",31892,57,0.991620


In [148]:
print(num_comments)
print(num_likes)

['1,062', '2,926', '1,678', '1,946', '1,555', '119', '4,168', '3,119', '309', '8,454', '353', '394', '1,239', '90', '3,757', '284', '1,822', '2,216', '952', '2,536', '805', '1,247', '5,986', '554', '125', '3,561', '1,956', '753', '506', '451', '505', '514', '1,502', '151', '2,723', '43', '389', '179', '238', '253', '1,035', '1,778', '7,220', '726', '452', '19,340', '215', '12,912', '46', '75', '207', '2,281', '2,108', '293', '275', '124', '208', '133', '7,900', '960', '916', '1,249', '407', '5,590', '57', '883', '32', '218', '198', '2,053', '401', '892', '3,604', '4,367', '758', '701', '482', '97', '1,826', '329', '1,620', '3,102', '607', '246', '414', '612', '55', '208', '93', '156', '1,073', '85', '333', '338', '312', '120', '405', '280', '57', '3,650']
['32,162', '243,157', '166,401', '249,280', '65,309', '23,245', '122,223', '291,329', '41,692', '191,530', '140,819', '27,534', '38,250', '15,000', '89,666', '25,685', '47,086', '72,565', '42,451', '217,100', '60,482', '42,191', '165,

## A Little Cleaning and Manipulation

In [149]:
df['title_lengths'] = df.titles.apply(lambda x: len(x))

In [150]:
df = df[['titles', 'title_lengths', 'artists', 'release_date', 'genres', 'num_likes', 'num_comments']]

In [151]:
# I'm simplifying the date calculations because the specifics aren't quite relevant to my goal.
# We'll assume all months have 30 days, except February, which has 28 days.
def quantize_date(date):
    ymd = date.split('.')
    if (ymd[1] == '02'):
        return ( 30 + int(ymd[2])) / 358
    else:
        return ( (30 * (int(ymd[1]) - 2)) + int(ymd[2]) + 28) / 358

# Parsing the numerical values to integers
df['num_comments'] = df.num_comments.apply(lambda x: int(x.replace(',', '')))
df['num_likes'] = df.num_likes.apply(lambda x: int(x.replace(',', '')))
df['release_date_quantized'] = df.release_date.apply(lambda x: quantize_date(x))

In [153]:
display(df)

Unnamed: 0,titles,title_lengths,artists,release_date,genres,num_likes,num_comments,release_date_quantized
0,Kitsch,6,IVE (아이브),2023.03.27,댄스,32162,1062,0.237430
1,Ditto,5,NewJeans,2022.12.19,댄스,243157,2926,0.969274
2,OMG,3,NewJeans,2023.01.02,댄스,166401,1678,0.000000
3,Hype boy,8,NewJeans,2022.08.01,댄스,249280,1946,0.583799
4,Teddy Bear,10,STAYC(스테이씨),2023.02.14,댄스,65309,1555,0.122905
...,...,...,...,...,...,...,...,...
95,네가 보고 싶은 건 자연스러운 거겠지,20,신예영,2023.01.24,발라드,22199,120,0.061453
96,봄 사랑 벚꽃 말고,10,HIGH4 (하이포),2014.04.08,발라드,216020,405,0.268156
97,Off My Face,11,Justin Bieber,2021.10.08,POP,160843,280,0.770950
98,그댄 행복에 살텐데 (2022),17,최유리,2022.12.27,"발라드, 인디음악",31892,57,0.991620


## Exploratory Data Analysis!

In [157]:
df.genres.value_counts()

댄스              27
발라드             25
랩/힙합            10
록/메탈             8
POP              8
발라드, 국내드라마       5
R&B/Soul         5
성인가요/트로트         3
발라드, 인디음악        3
포크/블루스           2
일렉트로니카           1
재즈               1
J-POP            1
인디음악, 포크/블루스     1
Name: genres, dtype: int64

In [162]:
# Comparison of average number of likes across genres
print(df.groupby('genres')['num_likes'].mean().sort_values(ascending=False))

genres
발라드, 국내드라마      213087.600000
댄스              142933.592593
인디음악, 포크/블루스    139311.000000
랩/힙합            133275.300000
POP             129407.625000
록/메탈            103887.500000
R&B/Soul         95011.600000
성인가요/트로트         71966.000000
발라드              61222.920000
J-POP            55901.000000
포크/블루스           46295.500000
재즈               46181.000000
발라드, 인디음악        40604.000000
일렉트로니카           23245.000000
Name: num_likes, dtype: float64


As expected, it seems ballad and dance/K-pop are trending. Surprisingly, indie/blues music also seems to be quite popular.

In [154]:
# Let's see the numerical value correlations
df.corr()

Unnamed: 0,title_lengths,num_likes,num_comments,release_date_quantized
title_lengths,1.0,0.015291,-0.057542,0.086227
num_likes,0.015291,1.0,0.667444,0.140397
num_comments,-0.057542,0.667444,1.0,0.067399
release_date_quantized,0.086227,0.140397,0.067399,1.0


Number of likes and number comments are positively correlated, nothing particularly special there.<br>Being released later in the year is slightly related to higher number of likes, but I wonder why...?<br>Longer title length seems to very very slightly correlate with higher number of likes.

## We can explore the relationship between release date, genre, and number of likes a bit more. Koreans might prefer a certain genre during certain times of the year.