In [1]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import json
import csv
from datetime import datetime

In [664]:
def get_lrclib_lyrics(artist, track):
    """Look up a track on lrclib.net and return the API's JSON answer.

    Args:
        artist: Artist name to search for.
        track: Track name to search for.

    Returns:
        The decoded JSON body when the API answers with HTTP 200, otherwise
        the sentinel string "Lyrics not found." (callers compare against it).
    """
    # Hand the values to requests via `params` so spaces, '&', '#', '?' and
    # non-ASCII characters are percent-encoded instead of being spliced raw
    # into the query string, where they could corrupt the request.
    response = requests.get(
        "https://lrclib.net/api/get",
        params={"artist_name": artist, "track_name": track},
    )

    if response.status_code == 200:
        return response.json()
    return "Lyrics not found."
    

In [665]:
def get_timestamped_lyrics(song, directory="/Users/alicemao/Downloads/"):
    """Read the official ``.srt`` timing file for a song, if one exists.

    The file is looked up as ``<directory>/<song> official_times.srt``.

    Args:
        song: Song name used as the file-name prefix.
        directory: Folder to search. Defaults to the location that used to be
            hard-coded, so existing one-argument calls behave as before.

    Returns:
        The file contents as a string, or the ``FileNotFoundError`` class
        itself (returned, not raised) when the file does not exist. Callers
        detect the missing file with ``result is FileNotFoundError``.
    """
    import os  # local import keeps this cell independent of the import cell

    file_path = os.path.join(directory, song + " official_times.srt")
    try:
        # Explicit UTF-8: the lyrics contain Hangul and the platform default
        # encoding is not UTF-8 on every system.
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        return FileNotFoundError

In [666]:
def convertSecondsToMinutes(duration):
    """Format a duration in seconds as an ``MM:SS.hh`` timestamp.

    The fractional part is expressed in hundredths of a second, the same
    two-digit precision the ``[MM:SS.hh]`` lyric timestamps in this notebook
    use. (The previous version computed tenths but printed them in a
    two-digit field, so 180.5 came out as "03:00.05".)

    Args:
        duration: Non-negative length in seconds (int or float).

    Returns:
        A string such as "03:00.50".
    """
    # Work in whole hundredths so float noise (e.g. 0.29 * 100 == 28.999...)
    # cannot truncate the last digit.
    total_hundredths = int(round(duration * 100))
    minutes, remainder = divmod(total_hundredths, 60 * 100)
    seconds, hundredths = divmod(remainder, 100)
    return f"{minutes:02}:{seconds:02}.{hundredths:02}"

In [667]:
def getDelta(line, replaceChar):
    """Return how long a lyric line lasts, formatted as ``MM:SS:ff``.

    Args:
        line: Dict with ``'start'`` and ``'end'`` timestamps. After every
            ``replaceChar`` is swapped for ':' each must parse with the
            ``%M:%S:%f`` format.
        replaceChar: Separator in the timestamps that stands in for ':'
            (the notebook passes '.' or ',').

    Returns:
        ``end - start`` as a string with ':' between minutes, seconds and the
        two-digit fraction, e.g. "00:03:85".
    """
    stamp_format = "%M:%S:%f"

    def _parse(stamp):
        return datetime.strptime(stamp.replace(replaceChar, ':'), stamp_format)

    began = _parse(line['start'])
    finished = _parse(line['end'])

    # str(timedelta) looks like "0:00:03.850000"; keep MM:SS and, when there
    # is a fractional part, its first two digits.
    elapsed_text = str(finished - began)
    matches = re.findall(r"\d{1}:(\d{2}:\d{2}(?:\.\d{2})?)", elapsed_text)
    trimmed = matches[0]

    # Whole-second differences have no fractional part in str(timedelta).
    if "." not in trimmed:
        trimmed += ".00"

    return trimmed.replace('.', ':')
    

In [668]:
def parse_and_normalize_synced_lyrics(synced_lyrics, officialORlrclib, duration):
    """Parse raw timed lyrics into a list of ``start``/``end``/``lyric``/``delta`` dicts.

    Two input formats are supported:

    * ``'official'`` - SRT-style text. Hours and the last millisecond digit
      are dropped by the pattern, and ',' is replaced by ':' so timestamps
      come out as ``MM:SS:ff``. Only the first text line after each timing
      line is captured.
    * anything else - LRC-style text (``[MM:SS.ff] lyric``). Timestamps are
      kept as ``MM:SS.ff``; each line ends where the next parsed timestamp
      starts, and the last line ends at ``duration``. Entries with an empty
      lyric are skipped.

    Args:
        synced_lyrics: Raw subtitle/LRC text. ``None`` or ``''`` yields ``[]``.
        officialORlrclib: ``'official'`` selects the SRT parser; any other
            value selects the LRC parser.
        duration: Track length in seconds, used as the end of the last LRC
            line.

    Returns:
        A list of dicts with keys ``start``, ``end``, ``lyric`` and ``delta``
        (``delta`` comes from ``getDelta``).
    """
    # No synced lyrics available: nothing to parse.
    if not synced_lyrics:
        return []

    new_synced_lyrics = []

    if officialORlrclib == 'official':
        pattern = r"\d{2}:(\d{2}:\d{2},\d{2})\d{1} --> \d{2}:(\d{2}:\d{2},\d{2})\d{1}\s*(.+)"
        for raw_start, raw_end, lyric in re.findall(pattern, synced_lyrics):
            # Replace the SRT decimal comma with ':' so the timestamps read
            # MM:SS:ff.
            newLine = {
                'start': raw_start.replace(',', ':'),
                'end': raw_end.replace(',', ':'),
                'lyric': lyric,
            }
            newLine["delta"] = getDelta(newLine, ',')
            new_synced_lyrics.append(newLine)
        return new_synced_lyrics

    pattern = r"\[(\d{2}:\d{2}\.\d{2})\]\s*(.*?)(?=\[|$)"
    # Newlines are removed so that '$' in the pattern only matches at the
    # very end of the text.
    parsed_synced_lyrics = re.findall(pattern, synced_lyrics.replace('\n', ''))

    for i, (start, lyric) in enumerate(parsed_synced_lyrics):
        if lyric == '':
            continue

        # Compare against the number of PARSED lines. Using the length of the
        # raw text here made the last line index past the end of the list.
        if i + 1 >= len(parsed_synced_lyrics):
            end = convertSecondsToMinutes(duration)
        else:
            end = parsed_synced_lyrics[i + 1][0]

        newLine = {'start': start, 'end': end, 'lyric': lyric}
        newLine["delta"] = getDelta(newLine, '.')
        new_synced_lyrics.append(newLine)

    return new_synced_lyrics
        

In [669]:
def get_synced_lyrics(artist, song):
    """Fetch a track from lrclib and replace its synced lyrics with parsed lines.

    A local official timing file (via ``get_timestamped_lyrics``) is preferred
    when present; otherwise the ``syncedLyrics`` text from lrclib is used.

    Args:
        artist: Artist name passed to the lrclib lookup.
        song: Track name; also used to locate the local timing file.

    Returns:
        The lrclib result with ``'syncedLyrics'`` replaced by the parsed list
        of line dicts, or ``None`` (after printing "NOT FOUND") when lrclib
        has no match.
    """
    lrclib_results = get_lrclib_lyrics(artist, song)
    if lrclib_results == 'Lyrics not found.':
        print("NOT FOUND")
        return
    print('lrclib_results success')

    # get_timestamped_lyrics hands back the FileNotFoundError class itself as
    # its "no local file" marker.
    official_text = get_timestamped_lyrics(song)
    if official_text is FileNotFoundError:
        raw_text = lrclib_results['syncedLyrics']
        source = 'lrclib'
    else:
        raw_text = official_text
        source = 'official'

    print('timestamp success', source)

    lrclib_results['syncedLyrics'] = parse_and_normalize_synced_lyrics(
        raw_text, source, lrclib_results['duration']
    )
    return lrclib_results

In [670]:
def get_soup(song):
    """Download the colour-coded lyrics page registered for a song.

    The URL is looked up by name in ``./json/urls.json``, which is read as a
    list of objects with ``name`` and ``url`` keys. If several entries share
    the name, the last one is used.

    Args:
        song: Song name to look up.

    Returns:
        The ``<body>`` element of the downloaded page, parsed with
        BeautifulSoup's ``html.parser``.

    Raises:
        ValueError: If no entry in the URL file matches ``song``.
    """
    url_path = "./json/urls.json"

    with open(url_path, 'r', encoding='utf-8') as file:
        all_urls = json.load(file)

    url = ''
    for u in all_urls:
        if u['name'] == song:
            url = u['url']
            print("url found:", url)

    # Fail with a clear message instead of letting requests choke on an
    # empty URL.
    if not url:
        raise ValueError(f"No URL registered for song {song!r} in {url_path}")

    response = requests.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.body
    

In [840]:
def clean_main_body_html(main_lyrics_body):
    """Rebuild the lyric markup so every lyric line is its own ``<span>`` inside a ``<p>``.

    * ``<span>`` children that appear before the first ``<p>`` are wrapped
      together in one new ``<p>``.
    * Inside every ``<p>``, each ``<span>`` is split into one span per text
      line, all carrying the original span's ``style`` value, joined by
      ``<br>``.

    Args:
        main_lyrics_body: Parsed element whose direct children are the lyric
            ``<span>``/``<p>`` nodes.

    Returns:
        A new BeautifulSoup document containing the regrouped markup.
    """
    # Collect the loose spans that precede the first paragraph.
    leading_spans = []
    for node in main_lyrics_body:
        if node.name == 'span':
            leading_spans.append(node)
        elif node.name == 'p':
            break
    print(leading_spans)

    final_span = ''
    if leading_spans:
        # The spans are Tag objects, so they must be rendered with str()
        # before joining; joining the Tags directly raises TypeError.
        final_span = f"<p>{'<br>'.join(str(span) for span in leading_spans)}</p>"

    for node in main_lyrics_body:
        if node.name == 'p':
            new_spans = []
            for span in node.find_all('span'):
                style = span.get('style')
                # get_text puts "|" between the span's text fragments, so
                # splitting on it yields one entry per fragment.
                for text_line in span.get_text(separator="|", strip=True).split("|"):
                    new_spans.append(f'<span style="{style}">{text_line}</span>')
            final_span += f"<p>{'<br>'.join(new_spans)}</p>"

    new_html = BeautifulSoup(final_span, 'html.parser')
    print(new_html.prettify())
    return new_html


In [705]:
def get_color_lyrics_with_container(soup):
    """Locate the colour key and the lyric body on a container-based page.

    Starting from the first inner-container ``<div>``, the function walks its
    first ``<div>`` child, then the next ``<p>`` sibling (the colour key), the
    next ``<div>`` sibling after that, and the ``<div>`` after that one, whose
    second nested inner-container is returned as the lyric body.

    Args:
        soup: Parsed page to search.

    Returns:
        Dict with ``"color_key"`` and ``"main_lyrics_body"`` elements.
    """
    container_class = "wp-block-group__inner-container is-layout-flow wp-block-group-is-layout-flow"

    outer_container = soup.find("div", class_=container_class)

    header_div = outer_container.div
    key_paragraph = header_div.find_next_sibling('p')
    spacer_div = key_paragraph.find_next_sibling('div')
    lyrics_wrapper = spacer_div.find_next_sibling('div')

    inner_containers = lyrics_wrapper.find_all("div", class_=container_class)

    return {"color_key": key_paragraph, "main_lyrics_body": inner_containers[1]}

In [837]:
def get_color_lyrics_with_table(soup):
    """Locate the header, colour key and lyric body on a table-based page.

    Uses fixed positions: the second ``<tr>`` is the header row, its first
    ``<td>`` is the colour key, and the second ``<td>`` of the fourth ``<tr>``
    holds the lyrics, which are regrouped by ``clean_main_body_html``.

    Args:
        soup: Parsed page to search.

    Returns:
        Dict with ``"song_header"``, ``"color_key"`` and
        ``"main_lyrics_body"`` entries.
    """
    table_rows = soup.findAll("tr")

    header_row = table_rows[1]
    lyrics_cell = table_rows[3].findAll("td")[1]
    key_cell = header_row.find('td')

    return {
        "song_header": header_row,
        "color_key": key_cell,
        "main_lyrics_body": clean_main_body_html(lyrics_cell),
    }

In [894]:
def normalize_lyrics(text):
    """Reduce a lyric string to a compact form for comparison.

    Steps, in order: remove all whitespace; turn '(' (and newlines) into a
    space; drop every character that is neither a word character, whitespace
    nor Hangul; collapse whitespace runs and trim. The result is printed and
    returned.

    Args:
        text: Lyric text to normalise.

    Returns:
        The normalised string.
    """
    compact = "".join(text.split())

    substitutions = (
        (r'[\n\(]', ' '),
        (r'[^\w\s가-힣]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        compact = re.sub(pattern, replacement, compact)

    compact = compact.strip()
    print(compact)
    return compact

In [674]:
# print('song_header', song_header)
# print('color_key', color_key)
# print('spacing', spacing)
# print('main_lyrics_body', main_lyrics_body)

In [None]:
def get_color_key(color_key):
    """Build the colour -> member-name lookup from the colour legend.

    Every ``<span>`` in the legend contributes one entry: the key is the part
    of its ``style`` attribute after the first '#', the value is the span's
    text. The list of spans is printed before the mapping is built.

    Args:
        color_key: Parsed legend element containing the coloured spans.

    Returns:
        Dict mapping colour string to member name.
    """
    legend_spans = color_key.find_all('span')
    print(legend_spans)

    return {span['style'].split('#')[1]: span.text for span in legend_spans}

In [772]:
def get_colored_lyrics(lyrics, color_coded_key):
    """Turn the coloured lyric markup into a flat list of attributed lines.

    Top-level ``<span>`` elements are read first (section 0), then every
    ``<p>`` becomes one section whose children are numbered from line 0.
    A span's member is looked up from the colour in its ``style`` attribute;
    colours missing from the key, and spans without a '#' colour, are
    attributed to "all". Non-span children with text keep the most recently
    seen member.

    Args:
        lyrics: Parsed lyric body. It is modified in place: every ``<br>`` is
            removed before the lines are read.
        color_coded_key: Mapping of colour string -> member name, keyed by
            the part of the style after the first '#'.

    Returns:
        List of dicts with keys ``member``, ``lyric``, ``section`` and
        ``line``. ``lyric`` is the output of ``normalize_lyrics``.

    NOTE(review): top-level spans and the first ``<p>`` both use section 0,
    and ``normalize_lyrics`` as currently defined strips all whitespace from
    the stored lyric - confirm both are intended.
    """
    def _member_for(span):
        """Map a span's inline colour to a member name ("all" if unknown)."""
        style = span.get('style') or ''
        # Same key derivation as the colour legend: text after the first '#'.
        color = style.split('#')[1] if '#' in style else None
        return color_coded_key.get(color, "all")

    for br in lyrics.find_all("br"):
        br.decompose()

    colored_lyrics = []
    section_num = 0
    line_num = 0
    member = ''

    for span in lyrics.find_all('span', recursive=False):
        text = normalize_lyrics(span.text)
        if text:
            member = _member_for(span)
            colored_lyrics.append({'member': member, 'lyric': text, 'section': section_num, 'line': line_num})
            # Previously "+= 0", which left every top-level span on line 0.
            line_num += 1

    for paragraph in lyrics.find_all('p'):
        line_num = 0
        for child in paragraph:
            text = normalize_lyrics(child.text)
            if text:
                if child.name == 'span':
                    member = _member_for(child)
                colored_lyrics.append({'member': member, 'lyric': text, 'section': section_num, 'line': line_num})
                line_num += 1
        section_num += 1
    return colored_lyrics

In [901]:
def combine_lines(longer_line, shorter_lines, longer_line_is_official):
    """Check whether consecutive shorter lines together make up one longer line.

    Each candidate's normalised, lower-cased lyric is removed from the longer
    line if it occurs in it. The longer line counts as found once nothing is
    left. Candidates that do not occur are skipped, not treated as a failure.

    Args:
        longer_line: Normalised text of the longer line.
        shorter_lines: Candidate line dicts (each with a ``'lyric'`` key), in
            order.
        longer_line_is_official: When true, the returned ``line`` is
            ``longer_line`` as passed in; when false, it is the matched
            candidates' original lyrics joined by single spaces.

    Returns:
        Dict with:
        ``index`` - position of the last candidate examined, plus one (the
        number of candidates consumed when ``found`` is true);
        ``line`` - the combined text described above;
        ``found`` - whether the longer line was fully covered.
    """
    remaining = longer_line
    matched_lyrics = []
    index = 0
    found = False

    while remaining and index < len(shorter_lines):
        shorter_line = shorter_lines[index]
        shorter_norm = normalize_lyrics(shorter_line['lyric'].lower())

        if shorter_norm in remaining:
            remaining = remaining.replace(shorter_norm, '', 1).strip()

            if not longer_line_is_official:
                matched_lyrics.append(shorter_line['lyric'])

            if remaining == '':
                found = True
                break
        index += 1

    if longer_line_is_official:
        # Return the line as given. The old code returned the consumed
        # remainder here, which is always '' after a successful match.
        line = longer_line
    else:
        # The old code called line.strip() without using the result, leaving
        # a trailing space.
        line = ' '.join(matched_lyrics).strip()

    return {'index': index + 1, 'line': line, 'found': found}

In [902]:
def get_synced_and_assigned_lyrics(colored_lyrics, synced_lyrics):
    """Merge member-attributed lines with timestamped lines.

    Both lists are walked in parallel. For each pair the normalised,
    lower-cased lyrics are compared:

    * equal - the coloured line receives the synced line's timing;
    * synced line longer - up to ``check_further`` upcoming coloured lines
      are combined to try to cover it;
    * otherwise - up to ``check_further`` upcoming synced lines are combined
      to try to cover the coloured line.

    When no combination covers the longer line, the output row has
    ``'LINE NOT SYNCED'`` in ``start``, ``end`` and ``delta``.

    Args:
        colored_lyrics: Line dicts with ``member``, ``lyric``, ``section``
            and ``line``.
        synced_lyrics: Line dicts with ``start``, ``end``, ``lyric`` and
            ``delta``.

    Returns:
        List of merged line dicts (copies of the coloured lines with timing
        added). Iteration stops when either input is exhausted.

    NOTE(review): when several synced lines are merged into one coloured
    line, the row keeps the timing of the FIRST synced line only - confirm
    whether ``end``/``delta`` should span the whole group.
    """
    check_further = 5  # how many upcoming lines to try when the texts differ

    colored_index = 0
    synced_index = 0

    synced_and_assigned_lyrics = []

    while colored_index < len(colored_lyrics) and synced_index < len(synced_lyrics):
        print(colored_lyrics[colored_index], synced_lyrics[synced_index])

        colored_line = colored_lyrics[colored_index]
        synced_line = synced_lyrics[synced_index]

        final_line = colored_line.copy()

        colored_norm = normalize_lyrics(colored_line['lyric'].lower())
        synced_norm = normalize_lyrics(synced_line['lyric'].lower())

        found = True

        if colored_norm == synced_norm:
            colored_index += 1
            synced_index += 1

        elif len(synced_norm) > len(colored_norm):
            # Slicing already stops at the end of the list. The old manual
            # clamp (len - check_further - 1) could shrink the window to
            # nothing near the end of short lists.
            window = colored_lyrics[colored_index:colored_index + check_further]
            combined_results = combine_lines(synced_norm, window, False)
            found = combined_results['found']

            synced_index += 1
            if found:
                colored_index += combined_results['index']
                final_line['lyric'] = combined_results['line']
            else:
                final_line['lyric'] = synced_line['lyric']

        else:
            window = synced_lyrics[synced_index:synced_index + check_further]
            combined_results = combine_lines(colored_norm, window, True)
            found = combined_results['found']

            colored_index += 1
            if found:
                synced_index += combined_results['index']
            # final_line keeps the coloured line's own lyric in both cases.
            # Overwriting it with combine_lines' 'line' blanked the lyric,
            # because that value is the consumed remainder in official mode.

        if found:
            final_line['start'] = synced_line['start']
            final_line['end'] = synced_line['end']
            final_line['delta'] = synced_line['delta']
        else:
            final_line['start'] = 'LINE NOT SYNCED'
            final_line['end'] = 'LINE NOT SYNCED'
            final_line['delta'] = 'LINE NOT SYNCED'

        synced_and_assigned_lyrics.append(final_line)
        print(colored_index, synced_index, final_line)
        print()
    return synced_and_assigned_lyrics


In [679]:
# Export the merged lyric rows to CSV; the column list is defined inside.
def export_to_csv(rows=None, filename="./data/rock_with_you_line_distribution.csv"):
    """Write lyric rows to a CSV file.

    Args:
        rows: List of row dicts to write. Defaults to the notebook-level
            ``synced_and_assigned_lyrics`` so the original zero-argument
            call keeps working.
        filename: Destination path. Defaults to the original hard-coded
            file name.

    The columns written are ``member``, ``lyric``, ``section``, ``line``,
    ``start`` and ``end``. Any other keys in a row (for example ``delta``)
    are ignored; without ``extrasaction='ignore'`` they make ``DictWriter``
    raise ``ValueError``.
    """
    if rows is None:
        rows = synced_and_assigned_lyrics

    fields = ["member", "lyric", "section", "line", "start", "end"]

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, extrasaction='ignore')

        # Header row first, then every data row in one call.
        writer.writeheader()
        writer.writerows(rows)

    print(f"Data successfully exported to {filename}")

In [846]:
song = "very nice"
artist = "seventeen"

meta_info = get_synced_lyrics(artist, song)
# get_synced_lyrics returns None when lrclib has no match; stop here with a
# clear message instead of a TypeError on the subscript below.
if meta_info is None:
    raise LookupError(f"No lrclib entry found for {artist!r} - {song!r}")
synced_lyrics = meta_info['syncedLyrics']

for s in synced_lyrics:
    print(s)

soup = get_soup(song)

lrclib_results success
timestamp success lrclib
{'start': '00:03.37', 'end': '00:07.22', 'lyric': '아침엔 모닝콜 필수던 내가', 'delta': '00:03:85'}
{'start': '00:07.22', 'end': '00:11.29', 'lyric': '오늘은 번쩍 번쩍 눈이 떠지는가', 'delta': '00:04:07'}
{'start': '00:11.29', 'end': '00:13.63', 'lyric': '데이트 날이라 그런지', 'delta': '00:02:34'}
{'start': '00:13.63', 'end': '00:15.72', 'lyric': '어제 꿈도 좋은 꿈 꿨지', 'delta': '00:02:09'}
{'start': '00:15.72', 'end': '00:17.77', 'lyric': '새 신발을 신고', 'delta': '00:02:05'}
{'start': '00:17.77', 'end': '00:19.79', 'lyric': '현관문을 열고 나가면', 'delta': '00:02:02'}
{'start': '00:19.79', 'end': '00:22.90', 'lyric': '오늘 날씬 너를 많이 닮아', 'delta': '00:03:11'}
{'start': '00:22.90', 'end': '00:26.81', 'lyric': '너에게 가는 길은 꽃길이 되고', 'delta': '00:03:91'}
{'start': '00:26.81', 'end': '00:30.92', 'lyric': '보일 듯 말듯한 네 마음 보인다면', 'delta': '00:04:11'}
{'start': '00:30.92', 'end': '00:33.52', 'lyric': '온몸이 간질간질 두근두근', 'delta': '00:02:60'}
{'start': '00:33.52', 'end': '00:37.00', 'lyric': '이 기분은 뭐야 어떡해', '

In [861]:
# Pick the scraper that matches the page layout: pages containing a <table>
# use the table parser, all others use the container parser.
html_results = (
    get_color_lyrics_with_table(soup)
    if soup.find('table') is not None
    else get_color_lyrics_with_container(soup)
)

color_key = html_results['color_key']
main_lyrics_body = html_results['main_lyrics_body']


[<span style="color: #63b76c">Four tres two uno uno two</span>]
<p>
 <span style="color: #63b76c">
  Four tres two uno uno two
 </span>
</p>
<p>
 <span style="color: #3d72e7">
  아침엔 모닝콜 필수던 내가
 </span>
 <br/>
 <span style="color: #3d72e7">
  오늘은 번쩍 번쩍 눈이 떠지는가
 </span>
 <br/>
 <span style="color: #c04737">
  데이트 날이라 그런지
 </span>
 <br/>
 <span style="color: #c04737">
  어제 꿈도 좋은 꿈 꿨지
 </span>
 <br/>
 <span style="color: #855fc4">
  새 신발을 신고
 </span>
 <br/>
 <span style="color: #04a497">
  현관문을 열고 나가면
 </span>
</p>
<p>
 <span style="color: #f1ce5f">
  오늘 날씬 너를 많이 닮아
 </span>
 <br/>
 <span style="color: #f1ce5f">
  너에게 가는 길은 꽃길이 되고
 </span>
 <br/>
 <span style="color: #3d72e7">
  보일 듯 말 듯한 니 맘 보인다면
 </span>
 <br/>
 <span style="color: #04a497">
  온몸이 간질간질 두근두근
 </span>
 <br/>
 <span style="color: #df73ff">
  이 기분은 뭐야 어떡해
 </span>
</p>
<p>
 <span style="color: #ffffff">
  아주 Nice
 </span>
 <br/>
 <span style="color: #ffffff">
  아주 Nice
 </span>
 <br/>
 <span style="color: #ffffff">
  기분 기분 기

In [862]:
# Build the colour -> member-name lookup from the page's colour legend.
color_coded_key = get_color_key(color_key)
print(color_coded_key)

# Attribute every lyric line to a member via the colour of its span.
colored_lyrics = get_colored_lyrics(html_results['main_lyrics_body'], color_coded_key)

for c in colored_lyrics:
    print(c)

[<span style="color: #4ea8d4">S. Coups</span>, <span style="color: #e32636">Jeonghan</span>, <span style="color: #df73ff">Joshua</span>, <span style="color: #bede0d">Jun</span>, <span style="color: #c04737">Hoshi</span>, <span style="color: #855fc4">Wonwoo</span>, <span style="color: #ff7d07">Woozi</span>, <span style="color: #3d72e7">DK</span>, <span style="color: #04a497">Mingyu</span>, <span style="color: #a1dad7">The8</span>, <span style="color: #f1ce5f">Seungkwan</span>, <span style="color: #63b76c">Vernon</span>, <span style="color: #d962b6">Dino</span>]
{'4ea8d4': 'S. Coups', 'e32636': 'Jeonghan', 'df73ff': 'Joshua', 'bede0d': 'Jun', 'c04737': 'Hoshi', '855fc4': 'Wonwoo', 'ff7d07': 'Woozi', '3d72e7': 'DK', '04a497': 'Mingyu', 'a1dad7': 'The8', 'f1ce5f': 'Seungkwan', '63b76c': 'Vernon', 'd962b6': 'Dino'}
{'member': 'Vernon', 'lyric': 'Four tres two uno uno two', 'section': 0, 'line': 0}
{'member': 'DK', 'lyric': '아침엔 모닝콜 필수던 내가', 'section': 1, 'line': 0}
{'member': 'DK', 'lyric':

In [903]:
# Walk the member-attributed lines and the timestamped lines in parallel and
# merge them into one list of rows.
synced_and_assigned_lyrics = get_synced_and_assigned_lyrics(colored_lyrics, synced_lyrics)

print(synced_and_assigned_lyrics)

{'member': 'Vernon', 'lyric': 'Four tres two uno uno two', 'section': 0, 'line': 0} {'start': '00:03.37', 'end': '00:07.22', 'lyric': '아침엔 모닝콜 필수던 내가', 'delta': '00:03:85'}
fourtrestwounounotwo
아침엔모닝콜필수던내가
아침엔모닝콜필수던내가
오늘은번쩍번쩍눈이떠지는가
데이트날이라그런지
어제꿈도좋은꿈꿨지
새신발을신고
1 0 {'member': 'Vernon', 'lyric': 'Four tres two uno uno two', 'section': 0, 'line': 0, 'start': 'LINE NOT SYNCED', 'end': 'LINE NOT SYNCED', 'delta': 'LINE NOT SYNCED'}

{'member': 'DK', 'lyric': '아침엔 모닝콜 필수던 내가', 'section': 1, 'line': 0} {'start': '00:03.37', 'end': '00:07.22', 'lyric': '아침엔 모닝콜 필수던 내가', 'delta': '00:03:85'}
아침엔모닝콜필수던내가
아침엔모닝콜필수던내가
2 1 {'member': 'DK', 'lyric': '아침엔 모닝콜 필수던 내가', 'section': 1, 'line': 0, 'start': '00:03.37', 'end': '00:07.22', 'delta': '00:03:85'}

{'member': 'DK', 'lyric': '오늘은 번쩍 번쩍 눈이 떠지는가', 'section': 1, 'line': 1} {'start': '00:07.22', 'end': '00:11.29', 'lyric': '오늘은 번쩍 번쩍 눈이 떠지는가', 'delta': '00:04:07'}
오늘은번쩍번쩍눈이떠지는가
오늘은번쩍번쩍눈이떠지는가
3 2 {'member': 'DK', 'lyric': '오늘은 번쩍 번쩍 눈이 떠지는가', 'section':

In [547]:
meta_info['synced_lyrics'] = synced_and_assigned_lyrics

# pop(..., None) instead of del: re-running this cell, or a response that
# lacks one of these keys, must not raise KeyError.
meta_info.pop("plainLyrics", None)
meta_info.pop("syncedLyrics", None)

print(meta_info)

{'id': 2433441, 'name': 'Rock with you', 'trackName': 'Rock with you', 'artistName': 'SEVENTEEN', 'albumName': "SEVENTEEN 9th Mini Album 'Attacca'", 'duration': 180.0, 'instrumental': False, 'synced_lyrics': [{'member': 'Joshua', 'lyric': '지금 이 노래가 내가 될 수 있게', 'section': 0, 'line': 0, 'start': '00:00:00', 'end': '00:06:33', 'delta': '00:06:33'}, {'member': 'Joshua', 'lyric': '만들어 준 네가 다가온다', 'section': 0, 'line': 1, 'start': '00:06:33', 'end': '00:10:01', 'delta': '00:03:68'}, {'member': 'Joshua', 'lyric': '셋 둘 하나', 'section': 0, 'line': 2, 'start': '00:10:01', 'end': '00:11:38', 'delta': '00:01:37'}, {'member': 'Jeonghan', 'lyric': '뭐든지 다 주고 싶어', 'section': 1, 'line': 0, 'start': '00:11:38', 'end': '00:14:38', 'delta': '00:03:00'}, {'member': 'Jeonghan', 'lyric': '나에게 너만 있다면', 'section': 1, 'line': 1, 'start': '00:14:38', 'end': '00:16:93', 'delta': '00:02:55'}, {'member': 'Hoshi', 'lyric': 'Won’t let them break your heart oh no', 'section': 1, 'line': 2, 'start': '00:16:93', 'end': '

In [551]:
# Persist the merged metadata as JSON. ensure_ascii=False writes the Hangul
# lyrics as-is instead of \uXXXX escapes.
with open('./data/meta_info_'+song+'.json', 'w', encoding='utf-8') as f:
    json.dump(meta_info, f, ensure_ascii=False)

[<span style="color: #63b76c">Four tres two uno uno two</span>]
<span style="color: #3d72e7">아침엔 모닝콜 필수던 내가
오늘은 번쩍 번쩍 눈이 떠지는가</span>
lines ['아침엔 모닝콜 필수던 내가', '오늘은 번쩍 번쩍 눈이 떠지는가']
new_spans ['<span style="color: #3d72e7">아침엔 모닝콜 필수던 내가</span>', '<span style="color: #3d72e7">오늘은 번쩍 번쩍 눈이 떠지는가</span>']
<span style="color: #c04737">데이트 날이라 그런지
어제 꿈도 좋은 꿈 꿨지</span>
lines ['데이트 날이라 그런지', '어제 꿈도 좋은 꿈 꿨지']
new_spans ['<span style="color: #3d72e7">아침엔 모닝콜 필수던 내가</span>', '<span style="color: #3d72e7">오늘은 번쩍 번쩍 눈이 떠지는가</span>', '<span style="color: #c04737">데이트 날이라 그런지</span>', '<span style="color: #c04737">어제 꿈도 좋은 꿈 꿨지</span>']
<span style="color: #855fc4">새 신발을 신고</span>
lines ['새 신발을 신고']
new_spans ['<span style="color: #3d72e7">아침엔 모닝콜 필수던 내가</span>', '<span style="color: #3d72e7">오늘은 번쩍 번쩍 눈이 떠지는가</span>', '<span style="color: #c04737">데이트 날이라 그런지</span>', '<span style="color: #c04737">어제 꿈도 좋은 꿈 꿨지</span>', '<span style="color: #855fc4">새 신발을 신고</span>']
<span style="color: #04a497">현관문을 