In [16]:
import re
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import json
import csv
from datetime import datetime

In [17]:
def get_soup(song):
    """Download the lyrics page registered for ``song`` and return its ``<body>``.

    The URL is looked up in ``../json/urls.json``, which is read as a sequence
    of entries exposing ``name`` and ``url`` keys. When several entries share
    the same name, the last one wins.

    Args:
        song: Value compared against each entry's ``name`` field.

    Returns:
        The ``body`` element of the parsed page (``None`` if the page has no
        ``<body>``).

    Raises:
        ValueError: If no entry in the URL file matches ``song``.
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    url_path = "../json/urls.json"

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(url_path, 'r', encoding='utf-8') as file:
        all_urls = json.load(file)

    url = ''
    for entry in all_urls:
        if entry['name'] == song:
            url = entry['url']
            print("url found:", url)

    # Fail with a clear message instead of letting requests choke on ''.
    if not url:
        raise ValueError(f"no URL registered for song {song!r} in {url_path}")

    # A timeout keeps a stalled server from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Do not parse an error page as if it were lyrics.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.body
    

In [18]:
def get_color_key(color_key):
    """Build a ``{hex colour: member name}`` mapping from the colour-key element.

    Every ``<span>`` found under ``color_key`` contributes one entry: the key is
    the run of letters/digits that follows the first ``#`` in the span's
    ``style`` attribute, and the value is the span's text.

    Only the alphanumeric run is kept, so trailing CSS such as ``;`` or
    ``!important`` no longer leaks into the key. Spans without a ``style``
    attribute, or whose style holds no ``#`` colour, are skipped.

    Args:
        color_key: Element exposing ``find_all('span')``.

    Returns:
        dict mapping the hex string (without ``#``) to the member name.
    """
    color_coded_key = {}

    for span in color_key.find_all('span'):
        style = span.get('style') or ''
        match = re.search(r'#([0-9A-Za-z]+)', style)
        if match is None:
            # Not a colour-coded span; nothing to register.
            continue
        color_coded_key[match.group(1)] = span.text

    return color_coded_key

In [19]:
def clean_main_body_html(html_results):
    """Flatten a sequence of parsed lyric nodes into a simplified HTML string.

    Handling per node:

    * ``<p>`` tags are re-emitted as ``<p>...</p>`` around the recursively
      cleaned children.
    * ``<br>`` and ``<span>`` tags are copied verbatim.
    * A text node that is exactly ``[`` (after stripping) starts a
      shared-line group: the following ``<span>`` tags contribute their
      colours, and the text after the first ``]`` of the first text node
      containing ``]`` becomes the lyric. The group is emitted as one
      ``<span class='multiple_members'>`` whose ``style`` lists every
      collected colour.
    * Any other text node is appended stripped, preceded by one space.
    * Other tags are ignored.

    Side effect: for each shared-line group the generated markup is also
    inserted into the source tree (``insert_before``) in front of the node
    where the scan stopped.

    An unterminated ``[`` group (no ``]`` before the nodes run out) is emitted
    with whatever colours were collected and an empty lyric, rather than
    raising ``IndexError``.

    Args:
        html_results: A list of nodes, or any iterable of nodes (for example a
            tag's ``children``); non-lists are materialised with ``list()``.

    Returns:
        The cleaned markup as a ``str``.
    """
    if not isinstance(html_results, list):
        html_results = list(html_results)

    combined_lines_test = ''
    node_count = len(html_results)

    i = 0
    while i < node_count:
        line = html_results[i]

        if isinstance(line, Tag):
            if line.name == 'p':
                combined_lines_test += '<p>' + clean_main_body_html(line.children) + '</p>'
            elif line.name in ('br', 'span'):
                combined_lines_test += str(line)

        elif isinstance(line, NavigableString):
            if line.strip() == '[':
                combine_members = ''
                shared_lyrics = ''

                # Consume nodes up to the closing bracket, never past the end.
                while i + 1 < node_count:
                    i += 1
                    line = html_results[i]
                    if isinstance(line, Tag):
                        if line.name == 'span':
                            style = line.get('style') or ''
                            # Spans without a colour carry no member info.
                            if '#' in style:
                                combine_members += ' #' + style.split('#')[1]
                    elif isinstance(line, NavigableString) and ']' in line.strip():
                        shared_lyrics = line.strip().split(']')[1]
                        break

                insert_line = f"<span class='multiple_members' style='color:{combine_members}'>{shared_lyrics}</span>"
                line.insert_before(insert_line)
                combined_lines_test += insert_line
            else:
                combined_lines_test += ' ' + line.strip()

        i += 1

    return combined_lines_test

In [20]:
def get_color_lyrics_with_container(soup):
    """Extract the colour key and cleaned lyrics from a div-based page layout.

    The first ``div`` carrying the block-group container class is taken as the
    page body; its first direct ``<p>`` child holds the colour key.

    Lyrics are located in one of two ways:

    * If the last direct child ``div`` of the body holds at least three nested
      container divs, the lyrics are the last ``div`` inside the second one.
    * Otherwise the text node ``English`` is located, its enclosing ``<p>`` is
      found by walking up the parents, and every following sibling ``<p>`` is
      used as the lyrics. This is also the path taken when the body has no
      direct child ``div`` at all.

    Args:
        soup: Parsed page (or its ``<body>``).

    Returns:
        dict with ``color_key`` (see ``get_color_key``) and
        ``main_lyrics_body`` (cleaned markup string).

    Raises:
        ValueError: If an element required by the chosen layout is missing.
    """
    container_class = "wp-block-group__inner-container is-layout-flow wp-block-group-is-layout-flow"

    body = soup.find("div", class_=container_class)
    if body is None:
        raise ValueError("lyrics container div not found in page")

    child_divs = body.find_all('div', recursive=False)

    key_paragraph = body.find('p', recursive=False)
    if key_paragraph is None:
        raise ValueError("colour key paragraph not found in lyrics container")
    color_key = get_color_key(key_paragraph)

    # Query the nested containers once and reuse the result.
    nested_containers = child_divs[-1].find_all("div", class_=container_class) if child_divs else []

    if len(nested_containers) >= 3:
        inner_divs = nested_containers[1].find_all("div")
        if not inner_divs:
            raise ValueError("lyrics div not found inside nested container")
        main_lyrics_body = clean_main_body_html(inner_divs[-1])
    else:
        marker = body.find(string='English')
        if marker is None:
            raise ValueError("'English' marker not found in lyrics container")

        # Walk up to the paragraph that holds the marker, stopping at the root.
        tag = marker.parent
        while tag is not None and tag.name != 'p':
            tag = tag.parent
        if tag is None:
            raise ValueError("'English' marker is not inside a <p> element")

        # find_next_siblings('p') already yields only <p> tags.
        paragraphs = list(tag.find_next_siblings('p'))
        main_lyrics_body = clean_main_body_html(paragraphs)

    return {"color_key": color_key, "main_lyrics_body": main_lyrics_body}

In [None]:
def get_color_lyrics_with_table(soup):
    """Extract the header, colour key and cleaned lyrics from a table-based page.

    Two layouts are supported:

    * More than one ``<table>``: the first table is the song header, whose
      third ``<td>`` holds the colour key; the second table holds the lyrics.
    * Exactly one ``<table>``: it holds the lyrics, and the header is the
      first ``div`` with the ``wp-block-column`` layout class, whose first
      ``<p>`` holds the colour key.

    In both layouts the lyrics are the children of the second ``<td>`` of the
    lyrics table.

    Args:
        soup: Parsed page (or its ``<body>``).

    Returns:
        dict with ``song_header`` (the header element), ``color_key`` (see
        ``get_color_key``) and ``main_lyrics_body`` (cleaned markup string).

    Raises:
        ValueError: If a table, cell or header element required by the layout
            is missing.
    """
    tables = soup.find_all("table")
    if not tables:
        raise ValueError("no <table> found in page")

    if len(tables) > 1:
        song_header = tables[0]
        lyrics_table = tables[1]
        header_cells = song_header.find_all('td')
        if len(header_cells) < 3:
            raise ValueError("header table has fewer than three cells")
        color_key = get_color_key(header_cells[2])
    else:
        lyrics_table = tables[0]
        song_header = soup.find('div', class_='wp-block-column is-layout-flow wp-block-column-is-layout-flow')
        if song_header is None:
            raise ValueError("song header column not found in page")
        key_paragraph = song_header.find('p')
        if key_paragraph is None:
            raise ValueError("colour key paragraph not found in song header")
        color_key = get_color_key(key_paragraph)

    lyric_cells = lyrics_table.find_all("td")
    if len(lyric_cells) < 2:
        raise ValueError("lyrics table has fewer than two cells")
    main_lyrics_body = clean_main_body_html(lyric_cells[1].children)

    return {"song_header": song_header, "color_key": color_key, "main_lyrics_body": main_lyrics_body}

In [22]:
def normalize_lyrics(text):
    """Reduce a lyric fragment to a compact comparison form.

    Steps, in order:

    1. Drop every whitespace character.
    2. Turn each newline or opening parenthesis into a single space.
    3. Delete every character that is neither a word character, whitespace,
       nor in the Hangul syllable range.
    4. Collapse whitespace runs to one space and trim both ends.

    Because step 1 removes all whitespace, the only spaces in the result are
    the ones introduced for opening parentheses.

    Args:
        text: Raw lyric text.

    Returns:
        The normalised string; empty when nothing but punctuation/whitespace
        was supplied.
    """
    compact = "".join(text.split())

    # (pattern, replacement) pairs applied one after another.
    substitutions = (
        (r'[\n\(]', ' '),
        (r'[^\w\s가-힣]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        compact = re.sub(pattern, replacement, compact)

    return compact.strip()

In [23]:
def get_raw_color_coded_html(song):
    """Fetch a song page and return its colour key plus re-parsed lyric markup.

    The page is handled by the table extractor when it contains a ``<table>``
    and by the container extractor otherwise. The cleaned lyric markup is then
    adjusted so that all content sits inside ``<p>`` elements:

    * no ``<p>`` at all: the whole markup is wrapped in one ``<p>``;
    * content before the first ``<p>``: that leading part gets its own ``<p>``;
    * markup already starting with ``<p>``: left unchanged.

    Args:
        song: Song name, as understood by ``get_soup``.

    Returns:
        dict with ``color_key`` (hex -> member name) and ``main_lyrics_body``
        (a ``BeautifulSoup`` document built from the adjusted markup).

    Raises:
        ValueError: If the downloaded page has no ``<body>``.
    """
    soup = get_soup(song)
    if soup is None:
        raise ValueError(f"page for song {song!r} has no <body>")

    if soup.find('table') is not None:
        html_results = get_color_lyrics_with_table(soup)
    else:
        html_results = get_color_lyrics_with_container(soup)

    lyrics = html_results['main_lyrics_body'].strip()

    # str.find returns -1 (truthy) when absent and 0 (falsy) when the markup
    # already starts with <p>, so the index must be compared explicitly.
    p_index = lyrics.find('<p>')
    if p_index == -1:
        if lyrics:
            lyrics = '<p>' + lyrics + '</p>'
    elif p_index > 0:
        lyrics = '<p>' + lyrics[:p_index] + '</p>' + lyrics[p_index:]

    color_key = html_results['color_key']
    main_lyrics_body = BeautifulSoup(lyrics, 'html.parser')

    return {'color_key': color_key, 'main_lyrics_body': main_lyrics_body}


In [24]:
def export_testing_html(html, path='../html/testing.html'):
    """Write ``html`` to a file for manual inspection.

    The file is always written as UTF-8 so that non-ASCII lyrics (for example
    Hangul) are stored identically on every platform.

    Args:
        html: Markup to write, as a ``str``.
        path: Destination file; defaults to ``../html/testing.html``. An
            existing file is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)


In [25]:
def get_colored_lyrics(html):
    """Turn parsed, colour-coded lyric markup into a list of line records.

    Every ``<p>`` in the markup is one section; each of its direct children
    whose normalised text is non-empty becomes one record. ``<span>`` children
    are attributed to the members whose 6-character colour codes appear in the
    span's ``style``; any other child is attributed to ``"adlib"``. A colour
    that is missing from the key resets the member list to
    ``["SOMETHING BROKE"]``.

    Side effects: all ``<br>`` tags are removed from ``html['main_lyrics_body']``
    in place, and every ``<span>`` sitting directly at the top level (outside
    any ``<p>``, and therefore not converted into a record) is printed as a
    diagnostic.

    Args:
        html: dict with ``main_lyrics_body`` (parsed markup) and ``color_key``
            (hex string -> member name).

    Returns:
        list of dicts with keys ``member`` (list of names), ``lyric`` (the
        child's raw text), ``section`` (index of the ``<p>``) and ``line``
        (index of the record within its section).
    """
    lyrics = html['main_lyrics_body']
    color_coded_key = html['color_key']

    for br in lyrics.find_all("br"):
        br.decompose()

    # Top-level spans are never visited by the <p> loop below; surface them.
    for stray_span in lyrics.find_all('span', recursive=False):
        print('something broke', stray_span)

    # Compiled once instead of once per lyric line.
    hex_pattern = re.compile(r"\#([a-zA-Z0-9]{6})")

    colored_lyrics = []
    for section_num, paragraph in enumerate(lyrics.find_all('p')):
        line_num = 0
        for line in paragraph:
            # Skip children that hold only whitespace or punctuation.
            if not normalize_lyrics(line.text):
                continue

            if line.name == 'span':
                members = []
                # A span without a style simply yields no members.
                for hex_code in hex_pattern.findall(line.get('style') or ''):
                    if hex_code in color_coded_key:
                        members.append(color_coded_key[hex_code])
                    else:
                        members = ["SOMETHING BROKE"]
            else:
                members = ["adlib"]

            colored_lyrics.append({'member': members, 'lyric': line.text, 'section': section_num, 'line': line_num})
            line_num += 1

    return colored_lyrics

In [53]:
# Run the whole pipeline for one song: download, clean, and re-parse the lyrics.
html = get_raw_color_coded_html('left & right')
print("html", html['color_key'])
# Uncomment to dump the cleaned markup to a file for manual inspection.
# export_testing_html(str(html['main_lyrics_body']))

# Last expression of the cell, so the notebook displays the returned records.
get_colored_lyrics(html)

url found: https://colorcodedlyrics.com/2020/06/22/seventeen-left-amp-right/
song header <div class="wp-block-column is-layout-flow wp-block-column-is-layout-flow">
<p class="has-text-align-center"><span class="has-inline-color" style="color:#4ea8d4">S.Coups</span>, <span class="has-inline-color" style="color:#e32636">Jeonghan</span>, <span class="has-inline-color" style="color:#df73ff">Joshua</span>, <span class="has-inline-color" style="color:#bede0d">Jun</span>,<br/><span class="has-inline-color" style="color:#c04737">Hoshi</span>, <span class="has-inline-color" style="color:#855fc4">Wonwoo</span>, <span class="has-inline-color" style="color:#ff7d07">Woozi</span>, <span class="has-inline-color" style="color:#3d72e7">DK</span>, <span class="has-inline-color" style="color:#04a497">Mingyu</span>,<br/><span class="has-inline-color" style="color:#a1dad7">THE8</span>, <span class="has-inline-color" style="color:#f1ce5f">Seungkwan</span>, <span class="has-inline-color" style="color:#63b76c

[{'member': ['Vernon'], 'lyric': '하나 둘 셋 넷', 'section': 0, 'line': 0},
 {'member': ['Vernon'], 'lyric': 'Left and right', 'section': 1, 'line': 0},
 {'member': ['Vernon'], 'lyric': 'Left and right', 'section': 1, 'line': 1},
 {'member': ['Vernon'], 'lyric': 'Left and right', 'section': 1, 'line': 2},
 {'member': ['Vernon'], 'lyric': 'Seventeen ha', 'section': 2, 'line': 0},
 {'member': ['Jeonghan'],
  'lyric': '잊지 말아야 해 출발선에 설 때',
  'section': 3,
  'line': 0},
 {'member': ['Jeonghan'], 'lyric': '두 눈 부릅뜨고 고갤 들어', 'section': 3, 'line': 1},
 {'member': ['adlib'], 'lyric': ' Come on!', 'section': 3, 'line': 2},
 {'member': ['Wonwoo'], 'lyric': '무릎 꿇고서 추진력을 얻고 나면', 'section': 4, 'line': 0},
 {'member': ['Wonwoo'], 'lyric': '제일 먼저 Baby 앞서갈래', 'section': 4, 'line': 1},
 {'member': ['adlib'], 'lyric': ' Come on!', 'section': 4, 'line': 2},
 {'member': ['S.Coups'],
  'lyric': '친구들 불러 I’ma celebrate',
  'section': 5,
  'line': 0},
 {'member': ['S.Coups'],
  'lyric': '아무도 못 말려 We party today',
  