In [446]:
import re
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import json
import csv
from datetime import datetime

In [447]:
def get_soup(song):
    url_path = "../json/urls.json"

    with open(url_path, 'r') as file:
        all_urls = json.load(file)

    url = ''
    for u in all_urls:
        if u['name'] == song:
            url = u['url']
            print("url found:", url)
    
    response = requests.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')
    soup = soup.body
    return soup
    

In [448]:
def get_color_key(color_key):
    colors = color_key.find_all('span')
    print(colors)
    color_coded_key = {}
    members = []

    for c in colors:
        style = c['style']
        hex = style.split('#')[1]
        name = c.text

        color_coded_key.update({hex: name})
        members.append(name)
    return color_coded_key

In [449]:
def clean_main_body_html(html_results):
    combined_lines_test = ''
    line = ''

    html_results = list(html_results)

    i = 0
    while i < len(html_results):
        line = html_results[i]
        if isinstance(line, Tag) and line.name == 'p':
            combined_lines_test += '<p>'
            combined_lines_test += clean_main_body_html(line.children)
            combined_lines_test += '</p>'
       

        if isinstance(line, Tag) and line.name == 'br':
            combined_lines_test += str(line)
    

        if isinstance(line, Tag) and line.name == 'span':
            combined_lines_test += str(line)
    
            
        if isinstance(line, NavigableString):
            combine_members = ''

            if line.strip() == '[':
                not_bracket = True
                shared_lyrics = ''

                while not_bracket:
                    i += 1
                    line = html_results[i]
                    if line.name == 'span':
                        hex = line['style'].split('#')[1]
                        combine_members += (' #'+ hex)
                     
                    elif isinstance(line, NavigableString):
                        if "]" in line.strip():
                            shared_lyrics = line.strip().split(']')[1]
                            not_bracket = False
                    
                insert_line = f"<span class='multiple_members' style='color:{combine_members}'>{shared_lyrics}</span>"
                line.insert_before(insert_line)
                combined_lines_test += insert_line
            else:
                combined_lines_test += (' ' + line.strip())
        i += 1
    return combined_lines_test

In [450]:
def get_color_lyrics_with_container(soup):
    body = soup.find("div", class_="wp-block-group__inner-container is-layout-flow wp-block-group-is-layout-flow")

    find_divs = body.findAll('div', recursive=False)

    color_key = get_color_key(body.find('p', recursive=False))
    main_lyrics_body = find_divs[-1]

    # Hangul lyrics
    main_lyrics_body = main_lyrics_body.find_all("div", class_="wp-block-group__inner-container is-layout-flow wp-block-group-is-layout-flow")[1]
    # print(main_lyrics_body)

    main_lyrics_body = main_lyrics_body.findAll("div")[-1]
    # print(main_lyrics_body)

    main_lyrics_body = clean_main_body_html(main_lyrics_body)

    return {"color_key": color_key, "main_lyrics_body": main_lyrics_body}

In [451]:
def get_color_lyrics_with_table(soup):
    body = soup.findAll("tr")

    song_header = body[1]
    main_lyrics_body = body[3].findAll("td")[1]
    color_key = get_color_key(song_header.find('td'))

    main_lyrics_body = clean_main_body_html(main_lyrics_body.children)
    
    return {"song_header": song_header, "color_key": color_key, "main_lyrics_body": main_lyrics_body}

In [452]:
def normalize_lyrics(text):
    text = "".join(text.split())
    text = re.sub(r'[\n\(]', ' ', text)
    text = re.sub(r'[^\w\s가-힣]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [453]:
def get_raw_color_coded_html(song):
    soup = get_soup(song)

    html_results = []
    if soup.find(('table')) != None:
        html_results = get_color_lyrics_with_table(soup)
    else:
        html_results = get_color_lyrics_with_container(soup)
    
    lyrics = html_results['main_lyrics_body'].strip()
    
    p_index = lyrics.find('<p>')
    if p_index:
        print('found p', p_index)
        lyrics = '<p>' + lyrics[:p_index] + '</p>' + lyrics[p_index:]

    color_key = html_results['color_key']
    main_lyrics_body = BeautifulSoup(lyrics, 'html.parser')

    return {'color_key': color_key, 'main_lyrics_body': main_lyrics_body}


In [454]:
def export_testing_html(html):
    with open('../html/testing.html', 'w') as f:
        f.write(html)


In [455]:
def get_colored_lyrics(html):

    lyrics = html['main_lyrics_body']
    color_coded_key = html['color_key']

    for br in lyrics.find_all("br"):
        br.decompose()
    
    colored_lyrics = []
    section_num = 0
    line_num = 0
    members = []

    span_lyrics_html = lyrics.find_all('span', recursive=False)

    for s in span_lyrics_html:
        print('something broke', s)


    colored_lyrics_html = lyrics.find_all('p')
    for c in colored_lyrics_html:
        

        line_num = 0
        for line in c:
            text = normalize_lyrics(line.text)
            
            if text:
                if line.name == 'span':
                    members = []
                    hex_re = r"\#([a-zA-Z0-9]{6})"
                    hex_value = re.findall(hex_re, line['style'])

                    for hex in hex_value:
                        print(hex)
                        if hex in color_coded_key:
                            members.append(color_coded_key[hex])
                        else:
                            members = ["SOMETHING BROKE"]
                else:
                    members = ["adlib"]
                colored_lyrics.append({'member': members, 'lyric':line.text, 'section': section_num, 'line': line_num})
                line_num += 1
        section_num += 1
    return colored_lyrics

In [456]:
html = get_raw_color_coded_html('rock with you')
print("html", html['color_key'])
export_testing_html(str(html['main_lyrics_body']))

get_colored_lyrics(html)

url found: https://colorcodedlyrics.com/2021/10/21/seventeen-rock-with-you/
[<span style="color: #4ea8d4">S.Coups</span>, <span style="color: #e32636">Jeonghan</span>, <span style="color: #df73ff">Joshua</span>, <span style="color: #bede0d">Jun</span>, <span style="color: #c04737">Hoshi</span>, <span style="color: #855fc4">Wonwoo</span>, <span style="color: #ff7d07">Woozi</span>, <span style="color: #3d72e7">DK</span>, <span style="color: #04a497">Mingyu</span>, <span style="color: #a1dad7">THE8</span>, <span style="color: #f1ce5f">Seungkwan</span>, <span style="color: #63b76c">Vernon</span>, <span style="color: #d962b6">Dino</span>]
html {'4ea8d4': 'S.Coups', 'e32636': 'Jeonghan', 'df73ff': 'Joshua', 'bede0d': 'Jun', 'c04737': 'Hoshi', '855fc4': 'Wonwoo', 'ff7d07': 'Woozi', '3d72e7': 'DK', '04a497': 'Mingyu', 'a1dad7': 'THE8', 'f1ce5f': 'Seungkwan', '63b76c': 'Vernon', 'd962b6': 'Dino'}
df73ff
df73ff
df73ff
e32636
e32636
c04737
855fc4
04a497
04a497
4ea8d4
4ea8d4
3d72e7
3d72e7
a1dad7
a

[{'member': ['Joshua'],
  'lyric': '지금 이 노래가 내가 될 수 있게',
  'section': 0,
  'line': 0},
 {'member': ['Joshua'], 'lyric': '만들어 준 네가 다가온다', 'section': 0, 'line': 1},
 {'member': ['Joshua'], 'lyric': '셋 둘 하나', 'section': 0, 'line': 2},
 {'member': ['Jeonghan'], 'lyric': '뭐든지 다 주고 싶어', 'section': 1, 'line': 0},
 {'member': ['Jeonghan'], 'lyric': '나에게 너만 있다면', 'section': 1, 'line': 1},
 {'member': ['Hoshi'],
  'lyric': 'Won’t let them break your heart oh no',
  'section': 1,
  'line': 2},
 {'member': ['Wonwoo'], 'lyric': '네가 없다면 난 아무것도 아냐', 'section': 1, 'line': 3},
 {'member': ['Mingyu'],
  'lyric': 'No words are enough for you',
  'section': 2,
  'line': 0},
 {'member': ['Mingyu'], 'lyric': '노랫말로 담고 싶어', 'section': 2, 'line': 1},
 {'member': ['S.Coups'], 'lyric': 'So, 모든 나의 감정 ', 'section': 2, 'line': 2},
 {'member': ['S.Coups'], 'lyric': '너로 읽고 쓰게 해줘', 'section': 2, 'line': 3},
 {'member': ['DK'],
  'lyric': 'I just want to love you',
  'section': 3,
  'line': 0},
 {'member': ['DK'], 'lyr