In [159]:
import base64
import re
import sqlite3
import operator
from bs4 import BeautifulSoup

In [2]:
# Open the scraped database; sqlite3.Row lets columns be read by name (row['title']).
raw_conn = sqlite3.connect(database='raw_data/mojim.db')
raw_conn.row_factory = sqlite3.Row

### Process Album

```
href TEXT PRIMARY KEY, name TEXT, title TEXT, meta TEXT, songs TEXT, lyrics_raw TEXT
```

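The `name`, `title`, `songs` and `lyrics_raw` values are stored base64-encoded and must be decoded before use; `href` is stored as a plain URL.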

In [89]:
# Select the rows whose `name` column equals the given key. `name` is stored
# base64-encoded, so the key is the encoded form ('Uy5ILkU=' decodes to 'S.H.E').
# The value is bound as a parameter rather than written as "...": in SQL,
# double quotes delimit identifiers, and SQLite only falls back to treating
# them as a string literal through a legacy quirk that can be disabled.
cur = raw_conn.execute(
    'SELECT href, name, title, meta, songs, lyrics_raw '
    'FROM mojim '
    'WHERE name = ?',
    ('Uy5ILkU=',),
)

In [107]:
# Take the next row from the query cursor (raises StopIteration once the
# cursor is exhausted). NOTE: re-running this cell advances to the following row.
row = next(cur)

In [73]:
def b64_to_utf8(col):
    """Decode a base64-encoded value and return the result as a UTF-8 string."""
    raw_bytes = base64.b64decode(col)
    return str(raw_bytes, encoding='utf8')

In [108]:
# Display the decoded `title` column of the current row.
b64_to_utf8(row['title'])

'PLAY 歌詞 S.H.E( 女朋友 )'

In [109]:
# Decode the `songs` column of the current row into text.
songs = b64_to_utf8(row['songs'])

In [110]:
# Parse the decoded song-list markup with the lxml parser.
soup = BeautifulSoup(songs, 'lxml')

In [116]:
# Collect the text of every <a> element in the parsed song-list markup.
song_list = [anchor.text for anchor in soup.select('a')]
song_list

['中國話',
 '謝謝你的溫柔',
 '聽袁惟仁彈吉他',
 '五月天',
 '藉口',
 'Boom',
 '再別康橋',
 '倫敦大橋垮下來',
 '說你愛我',
 '好心情',
 '老婆']

In [132]:
# Number of entries collected in song_list.
len(song_list)

11

In [206]:
# Decode the `lyrics_raw` column of the current row into text.
lyrics_raw = b64_to_utf8(row['lyrics_raw'])

In [207]:
# Parse the decoded lyrics markup with the lxml parser.
# NOTE: this rebinds `soup`, which earlier held the parsed song-list markup.
soup = BeautifulSoup(lyrics_raw, 'lxml')

In [123]:
# Display the first <h2> element of the parsed lyrics markup.
soup.find('h2')

<h2>【 PLAY 】【 國語 】【 2007-05 】</h2>

In [169]:
# Display the row's `href` value (the table's primary key).
row['href']

'http://mojim.com/tw101283x17.htm'

In [280]:
def filter_lyrics(lyric_strings):
    """Split the text lines of one song into writer credits and lyrics.

    Lines are dropped when they start with ``[`` (dynamic lyrics), are site
    boilerplate (start with ``更多更詳盡歌詞`` or contain ``魔鏡歌詞網``), or
    contain ``****``, ``----`` or ``CDATA``.  A line containing ``作詞`` is
    recorded as the lyricist credit and a line containing ``作曲`` as the
    composer credit; a single line carrying both markers fills both credits.
    Every other line is kept as lyrics.

    Args:
        lyric_strings: iterable of strings, one per text line.

    Returns:
        A ``(song_writer, lyric_writer, lyrics)`` tuple.  Each credit is the
        full matching line, or ``None`` if no line matched (when several
        lines match, the last one wins).  ``lyrics`` is the kept lines joined
        with newlines.
    """
    lyric_writer = None
    song_writer = None
    filtered_lyrics = []
    for line in lyric_strings:
        if line.startswith('['):
            # dynamic lyrics
            continue
        if line.startswith('更多更詳盡歌詞') or '魔鏡歌詞網' in line:
            continue
        if '****' in line or '----' in line or 'CDATA' in line:
            continue
        # Test both markers independently so that a combined credit line
        # (both markers on one line) does not lose the composer credit.
        has_lyricist = '作詞' in line
        has_composer = '作曲' in line
        if has_lyricist:
            lyric_writer = line
        if has_composer:
            song_writer = line
        if not (has_lyricist or has_composer):
            filtered_lyrics.append(line)
    return song_writer, lyric_writer, '\n'.join(filtered_lyrics)

In [281]:
def parse_lyric_table(soup):
    """Yield one record per song found in the lyrics table of a parsed page.

    The last ``<dl>`` inside the first ``<table>`` of ``soup`` is scanned for
    ``<dt>`` and ``<dd>`` elements, which are consumed in consecutive pairs:
    the first element of each pair supplies the song name (text of its first
    ``<a>`` that has an ``href``) and the second supplies the text lines
    passed to ``filter_lyrics``.

    Args:
        soup: parsed document (e.g. a ``BeautifulSoup`` object).

    Yields:
        ``(song_name, song_writer, lyric_writer, lyrics)`` tuples.  Parsing is
        best effort: ``song_name`` is ``None`` when no link is found, and the
        other three are ``None`` when ``filter_lyrics`` fails.

    Raises:
        AttributeError: if ``soup`` contains no ``<table>``.
        IndexError: if the table contains no ``<dl>``.
    """
    lyric_table = soup.find('table')
    # Match tag names exactly. A regular expression such as r'(dd|dt)' is
    # applied with search(), so it would also match tags like <address>.
    elems = lyric_table.select('dl')[-1].find_all(['dt', 'dd'])
    # Zipping the same iterator with itself pairs consecutive elements.
    for dt, dd in zip(*[iter(elems)] * 2):
        try:
            song_name = dt.find(lambda tag: tag.name == 'a' and tag.has_attr('href')).text
        except Exception:
            song_name = None
        try:
            song_writer, lyric_writer, lyrics = filter_lyrics(dd.strings)
        except Exception:
            # Best effort, but unlike a bare `except:` this does not swallow
            # KeyboardInterrupt / SystemExit.
            song_writer, lyric_writer, lyrics = None, None, None
        yield song_name, song_writer, lyric_writer, lyrics

In [282]:
# Materialise the generator to display every parsed song record.
list(parse_lyric_table(soup))

[('中國話',
  '作曲：鄭楠',
  '作詞：鄭楠 / 施人誠',
  '\u3000扁擔寬\u3000板凳長\u3000扁擔\u3000想綁在板凳上\n\u3000扁擔寬\u3000板凳長\u3000扁擔\u3000想綁在板凳上\n\u3000倫敦瑪莉蓮\u3000買了\u3000旗袍送\u3000媽媽\n\u3000莫斯科的夫司基\u3000愛上牛肉麵疙瘩\n\u3000各種顏色的皮膚\u3000各種顏色的頭髮\n\u3000嘴裡唸的\u3000說的\u3000開始流行中國話\n\u3000多少年我們苦練\u3000英文發音和文法\n\u3000這幾年\u3000換他們\u3000捲著舌頭學\n\u3000平上去入的變化\u3000平平仄仄平平仄\n\u3000好聰明的中國人\u3000好優美的中國話\n\u3000扁擔寬\u3000板凳長\u3000扁擔\u3000想綁在板凳上\n\u3000板凳\u3000不讓\u3000扁擔\u3000綁在板凳上\n\u3000扁擔\u3000偏要綁在板凳上\n\u3000板凳\u3000偏偏\u3000不讓\u3000扁擔\u3000綁在那板凳上\n\u3000到底扁擔寬\u3000還是板凳長\n\u3000哥哥\u3000弟弟\u3000坡前坐\n\u3000坡上\u3000臥著一隻鵝\u3000坡下\u3000流著一條河\n\u3000哥哥說\u3000寬寬的河\u3000弟弟說\u3000白白的鵝\n\u3000鵝要過河\u3000河要渡鵝\n\u3000不知是那鵝過河\u3000還是河渡鵝\n＃全世界\u3000都在學\u3000中國話\n\u3000孔夫子的話\u3000越來越國際化\n\u3000全世界\u3000都在講\u3000中國話\n\u3000我們說的話\u3000讓世界都認真聽話＃\n\u3000紐約蘇珊娜\u3000開了間禪風Lounge Bar\n\u3000柏林來的沃夫岡\u3000拿胡琴配著電吉他\n\u3000各種顏色的皮膚\u3000各種顏色的頭髮\n\u3000嘴裡唸的\u3000說的\u3000開始流行中國話\n\u3000多少年我們苦練\u3000英文發音和文法\n\u3000這幾年\u3000換他們\u3000捲著舌頭學\n\u3000平上去入的變化\u3000仄仄平平仄