In [1]:
import os
import pickle
from collections import defaultdict

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from app.model.pipeline import SongPipeline
from app.model.utils import clean

In [2]:
# The classifier weights and the tokenizer are both read from the same
# saved checkpoint directory, so the path is spelled only once.
saved_model_dir = "./app/saved_model"
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

In [3]:
# Options for the inference pipeline, collected in one place so they are
# easy to tweak before the pipeline object is built.
pipe_options = {
    "model": model,
    "tokenizer": tokenizer,
    "device": 0,  # GPU index; set to -1 to run on the CPU instead
    "return_all_scores": True,  # ask for a score for every label
    "function_to_apply": 'sigmoid',
}
pipe = SongPipeline(**pipe_options)

In [11]:
def top_k_emotions(data, threshold: float, k: int):
    """Score every song's lyrics and keep the k strongest emotion labels.

    Parameters
    ----------
    data
        Column-oriented song table providing the columns ``'lyric'``,
        ``'title'``, ``'singer'`` and ``'url'`` (e.g. a pandas DataFrame, or a
        mapping of column name to equally long sequences).
    threshold : float
        Labels whose score is not strictly greater than this value are
        dropped. Must lie in ``[0, 1]``.
    k : int
        Maximum number of labels kept per song.

    Returns
    -------
    dict
        ``{title: (labels, singer, url, preview)}`` where ``labels`` is a list
        of ``(label, score)`` pairs sorted by descending score (scores rounded
        to 2 decimals, at most ``k`` entries) and ``preview`` is the first 50
        characters of the cleaned lyrics followed by ``"..."``.
        The dict is keyed by title, so when two rows share a title the later
        row replaces the earlier one.

    Raises
    ------
    ValueError
        If ``threshold`` is outside ``[0, 1]``.

    Notes
    -----
    Rows that fail while being cleaned or scored are skipped, and their
    position within ``data`` is appended to the module-level list
    ``error_idss``, which must exist before this function is called.
    """
    global error_idss

    # Validate first so a bad argument is reported before any data is touched.
    if not 0 <= threshold <= 1:
        raise ValueError("threshold must be a float b/w 0 ~ 1.")

    # Walk the four columns in lock-step instead of indexing them with the
    # loop counter: Series `[]` is label-based, so positional indexing would
    # silently fail on a table whose index is not 0..n-1.
    rows = zip(data['lyric'], data['title'], data['singer'], data['url'])

    results = {}
    for idx, (text, title, singer, url) in enumerate(rows):
        try:
            text = clean(text)
            # Only the first element of the pipeline output is consumed.
            # NOTE(review): presumably that element holds the per-label scores
            # for the whole lyric - confirm how SongPipeline handles the
            # overflowing windows requested here.
            scores = pipe(text, stride=128, return_overflowing_tokens=True,
                          padding=True, truncation=True)[0]
            kept = {
                out["label"]: round(out["score"], 2)
                for out in scores
                if out["score"] > threshold
            }
            ranked = sorted(kept.items(), key=lambda item: item[1], reverse=True)
            preview = text[:50] + "..."
            results[title] = (ranked[:k], singer, url, preview)
        except Exception:
            # Best effort: one bad row must not abort the whole batch.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            error_idss.append(idx)

    return results

In [12]:
# Build the emotion -> songs index from every CSV file in the data directory.
# (`os`, `defaultdict` and `pd` come from the notebook's import cell.)
DATA_DIR = "data"
# Label that is never used as a song's primary emotion (Korean for "none").
EXCLUDED_LABEL = '없음'

songs_database = defaultdict(list)
# Filled by top_k_emotions with the row positions that failed. Positions are
# relative to each file, so `errors_by_file` records which file they belong to.
error_idss = []
errors_by_file = {}

# Sorted so the database is built in the same order on every run;
# os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(DATA_DIR)):
    path = os.path.join(DATA_DIR, filename)
    # Skip sub-directories and hidden entries (e.g. .ipynb_checkpoints).
    if filename.startswith(".") or not os.path.isfile(path):
        continue

    songs = pd.read_csv(path)
    failures_before = len(error_idss)
    result = top_k_emotions(data=songs, threshold=0.0, k=5)
    if len(error_idss) > failures_before:
        errors_by_file[filename] = error_idss[failures_before:]

    for title, (feelings, singer, url, preview) in result.items():
        # Primary emotion = highest-scoring label other than the excluded one.
        # Songs with no such label are skipped instead of raising IndexError.
        feeling = next(
            (label for label, _score in feelings if label != EXCLUDED_LABEL),
            None,
        )
        if feeling is None:
            continue
        songs_database[feeling].append((title, singer, url, preview))



In [13]:
# Row positions of the songs that could not be processed. Each position is
# relative to the CSV it came from, so the same number can appear more than
# once when several files had a failure at the same row.
error_idss

[49, 35, 49, 4, 17, 37, 49]

In [15]:
# Emotion labels that ended up as the primary emotion of at least one song.
songs_database.keys()

dict_keys(['즐거움/신남', '슬픔', '행복', '아껴주는', '기대감', '안타까움/실망', '기쁨', '짜증', '깨달음', '힘듦/지침', '비장함', '고마움', '화남/분노', '당황/난처', '불안/걱정', '감동/감탄', '흐뭇함(귀여움/예쁨)'])

In [16]:
# Persist the emotion -> songs index to disk as a pickle file.
with open('./songs_database.pkl', mode='wb') as pickle_out:
    pickle.dump(songs_database, pickle_out)

In [17]:
# Read the pickle back in as a round-trip check of what was just written.
# NOTE: pickle.load can execute arbitrary code - only use it on trusted files.
with open('./songs_database.pkl', mode='rb') as pickle_in:
    d = pickle.load(pickle_in)

In [18]:
# Display the reloaded database: emotion label -> list of
# (title, singer, url, lyric preview) tuples.
d

defaultdict(list,
            {'즐거움/신남': [('강남스타일',
               '싸이 (PSY)',
               'https://www.melon.com/song/detail.htm?songId=3853978',
               '오빤 강남스타일 강남스타일 낮에는 따사로운 인간적인 여자 커피 한잔의 여유를 아는 품격 있...'),
              ('크레용 (Crayon)',
               'G-DRAGON',
               'https://www.melon.com/song/detail.htm?songId=3906377',
               '크레용 (Crayon) GET YOUR CRAYON GET YOUR CRAYON 머리 어깨...'),
              ('립스틱 (LIPSTICK)',
               '오렌지 캬라멜',
               'https://www.melon.com/song/detail.htm?songId=3903316',
               '립스틱 스틱 세우고 립스틱 스틱 세우고 어찌나 눈이 높던지 애인이 있는 건지 살짝 시크해보...'),
              ('너는 나 나는 너',
               '지코 (ZICO)',
               'https://www.melon.com/song/detail.htm?songId=8028724',
               '넌 나고 난 너야 난 너고 넌 나야 마음이 같다면 둘은 서로가 될 거야 넌 나고 난 너야 ...'),
              ('러시안 룰렛 (Russian Roulette)',
               'Red Velvet (레드벨벳)',
               'https://www.melon.com/song/detail.htm?songId=9634954',
               'La La