In [24]:
# load config

import json
import os.path
import types

# Read the notebook settings from config.json into `config`.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale default encoding.
with open('config.json', encoding='utf-8') as f:
    config = json.load(f)

In [25]:
# init db

from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base

# Connect to the database named by the 'db' entry of the loaded config.
engine = create_engine(config['db'])

# Reflect the existing schema into automapped ORM classes.
# The `reflect=True` flag is deprecated since SQLAlchemy 1.4;
# `autoload_with=engine` is the supported spelling of the same operation.
Base = automap_base()
Base.prepare(autoload_with=engine)

# Short aliases for the mapped table classes.
Track = Base.classes.tracks
Comment = Base.classes.comments
TrackEmotion = Base.classes.track_emotions

print('classes:', list(Base.classes))
print('track:', vars(Track))

classes: [<class 'sqlalchemy.ext.automap.artists'>, <class 'sqlalchemy.ext.automap.albums'>, <class 'sqlalchemy.ext.automap.comments'>, <class 'sqlalchemy.ext.automap.users'>, <class 'sqlalchemy.ext.automap.tracks'>, <class 'sqlalchemy.ext.automap.playlists'>, <class 'sqlalchemy.ext.automap.playlist_tracks'>, <class 'sqlalchemy.ext.automap.track_emotions'>]
track: {'__table__': Table('tracks', MetaData(), Column('id', BIGINT(), table=<tracks>, primary_key=True, nullable=False, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x117db5be0>, for_update=False)), Column('name', TEXT(), table=<tracks>), Column('pop', BIGINT(), table=<tracks>), Column('publish_time', BIGINT(), table=<tracks>), Column('lyric', TEXT(), table=<tracks>), schema=None), '__module__': 'sqlalchemy.ext.automap', '__doc__': None, '_sa_class_manager': <ClassManager of <class 'sqlalchemy.ext.automap.tracks'> at 114e1cea0>, '__init__': <function __init__ at 0x1122ac160>, 'comments_collection': <s

In [26]:
# 获取已标注好的情感: track_emotions_collection

from sqlalchemy.orm import Session

# ORM session bound to the engine; later cells reuse it for their queries.
session = Session(engine)

# Take the track with the highest `pop` and list its annotated emotions as
# "<emotion> <intensity>" strings, strongest first.
t = session.query(Track).order_by(Track.pop.desc())[0]
[
    f'{emo.emotion} {emo.intensity}'
    for emo in sorted(
        t.track_emotions_collection,
        key=lambda emo: -emo.intensity,
    )
]

['PB 0.28915776373271723',
 'PH 0.22500498277984793',
 'NB 0.13196447527078803',
 'NC 0.10762529059675584',
 'PG 0.0669708566180034',
 'PA 0.06277133598592072',
 'PF 0.05137391504696592',
 'NN 0.0371243899615215',
 'PE 0.015441845294230462',
 'ND 0.007877356250153731',
 'NE 0.004687788463095236']

In [27]:
# Goal: map results like the one above onto emotext.EmotionResult, i.e. put
# the values in one fixed order and fill missing emotions with zero.

import emotext

emotext.emotions  # the fixed order

['PA',
 'PE',
 'PD',
 'PH',
 'PG',
 'PB',
 'PK',
 'NA',
 'NB',
 'NJ',
 'NH',
 'PF',
 'NI',
 'NC',
 'NG',
 'NE',
 'ND',
 'NN',
 'NK',
 'NL',
 'PC']

In [28]:
from typing import List
from collections import namedtuple

# emotext.EmotionResult was, for whatever reason, originally built on a dict;
# a namedtuple would have been the better choice. Leaving that as is and
# converting here instead.

Emotion = namedtuple('Emotion', emotext.emotions)


def keys(self):
    """Return the ``_fields`` tuple of ``self`` (its field names, in order)."""
    field_names = self._fields
    return field_names


def values(self):
    """Return the elements of ``self`` as a plain tuple, in iteration order."""
    return (*self,)


# Attach the two helpers as methods, giving Emotion dict-like
# keys()/values() accessors.
Emotion.keys = keys
Emotion.values = values


def emotion_vector(emotions: List[TrackEmotion]) -> Emotion:
    """Convert a track's emotion rows into a fixed-order ``Emotion`` tuple.

    :param emotions: rows exposing ``emotion`` (a name from
        ``emotext.emotions``) and ``intensity``
    :return: an ``Emotion`` with one slot per name in ``emotext.emotions``;
        names that do not occur in ``emotions`` are 0
    """
    intensity_by_name = {name: 0 for name in emotext.emotions}
    for row in emotions:
        intensity_by_name[row.emotion] = row.intensity
    return Emotion(**intensity_by_name)


# Try the conversion on the emotions of the track `t` loaded earlier.
e = emotion_vector(t.track_emotions_collection)
e

Emotion(PA=0.06277133598592072, PE=0.015441845294230462, PD=0, PH=0.22500498277984793, PG=0.0669708566180034, PB=0.28915776373271723, PK=0, NA=0, NB=0.13196447527078803, NJ=0, NH=0, PF=0.05137391504696592, NI=0, NC=0.10762529059675584, NG=0, NE=0.004687788463095236, ND=0.007877356250153731, NN=0.0371243899615215, NK=0, NL=0, PC=0)

In [29]:
# 做训练数据集了

import time
import os.path

# Upper bound on how many tracks (most popular first) are read from the DB.
MAX_DATA_SIZE = 12000

# Parallel lists: data['emo'][i] is the emotion vector of track data['ids'][i].
data = {
    'ids': [],
    'emo': [],
}

# query db: most popular first, ties broken by id so the order is stable.
# The loop variable is deliberately not called `t`, so the global `t` that
# other cells read is not silently overwritten by running this cell.
for track in session.query(Track).order_by(Track.pop.desc(), Track.id)[:MAX_DATA_SIZE]:
    # tracks without any annotated emotion cannot be used as samples
    if not track.track_emotions_collection:
        continue
    data['ids'].append(track.id)
    data['emo'].append(emotion_vector(track.track_emotions_collection))

# save JSON (create the output directory on first run instead of failing)
os.makedirs('savedata', exist_ok=True)
savefile = os.path.join('savedata', f'{int(time.time())}-{len(data["ids"])}.json')
with open(savefile, 'w') as f:
    json.dump(data, f)

print(len(data['ids']), f'samples: saved as {savefile}')
if data['ids']:  # nothing to preview when no sample was collected
    print('sample [0]:', data['ids'][0], data['emo'][0])

8989 samples: saved as savedata/1646210871-8989.json
sample [0]: 60008 Emotion(PA=0.1398622996458419, PE=0, PD=0, PH=0.41148582859157773, PG=0, PB=0.11334713828394757, PK=0, NA=0, NB=0.040660937589600016, NJ=0, NH=0, PF=0.06367286256663753, NI=0, NC=0.06174120171419717, NG=0, NE=0, ND=0.13856680617549905, NN=0.030662925432699092, NK=0, NL=0, PC=0)


In [30]:
# 训练样本

import numpy as np

# Stack the collected emotion vectors into a 2-D array: one row per sample.
X = np.array(data['emo'])
X.shape

(8989, 21)

In [31]:
# 拟合邻近模型

from sklearn.neighbors import NearestNeighbors
from joblib import dump, load  # for model persistence, see: https://scikit-learn.org/stable/modules/model_persistence.html#model-persistence


def neighbors_fit(X, n_neighbors: int = 10, algorithm: str = 'ball_tree',
                  save_dir: str = 'savemodels') -> NearestNeighbors:
    """Fit a nearest-neighbors model on ``X`` and persist it with joblib.

    :param X: samples, [[...]...]
    :param n_neighbors: number of results ``nbrs.kneighbors([...])`` returns
        when the caller does not pass a count
    :param algorithm: search algorithm forwarded to ``NearestNeighbors``
    :param save_dir: directory that receives ``<unix-time>-<len(X)>.joblib``;
        created if it does not exist
    :return: the fitted NearestNeighbors
    """
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm).fit(X)

    # Create the target directory up front so saving does not fail when the
    # directory has not been created yet.
    os.makedirs(save_dir, exist_ok=True)
    savefile = os.path.join(save_dir, f'{int(time.time())}-{len(X)}.joblib')
    dump(nbrs, savefile)

    return nbrs


# Fit (and save) the model on the training matrix X.
nbrs = neighbors_fit(X)

In [41]:
# 测试

# Chinese sample text used as the query for this test.
t = '后悔也都没有用 还不如一切没有发生过 不过就是又少一个诗人 换一个人沉迷你的笑'

# emotext.Emotions instance; its emotion_count() scores a text.
Emotext = emotext.Emotions()


def softmax_dict(x: dict):
    """Normalize the values of ``x`` in place so that they sum to 1.

    Despite the name this is a plain sum-normalization (each value divided by
    the total), not an exponential softmax.

    :param x: mapping of key -> number; modified in place
    :return: the same dict object ``x``. When the values sum to 0 (for
        example all zeros) the dict is returned unchanged, because the
        division would be undefined.
    """
    total = sum(x.values())
    if total == 0:
        # nothing to scale; avoids ZeroDivisionError on an all-zero input
        return x
    for k in x:
        x[k] /= total
    return x


# Score the sample text, normalize the scores, and pack them into an Emotion.
r = Emotext.emotion_count(t)
r.emotions = softmax_dict(r.emotions)
e = Emotion(**r.emotions)
e

Emotion(PA=0.0, PE=0.0, PD=0.0, PH=0.0, PG=0.2428551306285703, PB=0.0, PK=0.0, NA=0.0, NB=0.0, NJ=0.41260819965515805, NH=0.175202571704109, PF=0.0, NI=0.0, NC=0.0, NG=0.0, NE=0.1693340980121628, ND=0.0, NN=0.0, NK=0.0, NL=0.0, PC=0.0)

In [42]:
# Ask the fitted model for the 10 nearest training samples to the vector e.
distances, indices = nbrs.kneighbors([e], 10)


def print_nbrs(distances, indices):
    """Print one line per neighbor: distance, track id, title and artists.

    NOTE: a later cell redefines ``print_nbrs`` with a (distances, tracks)
    signature; this version takes the raw ``kneighbors`` output.

    :param distances: ``kneighbors`` distances; only row 0 is read
    :param indices: ``kneighbors`` indices into ``data['ids']``; only row 0
        is read
    """
    for pos, idx in enumerate(indices[0]):
        dst = distances[0][pos]

        # training-sample index -> track id -> ORM row
        track_id = data['ids'][idx]
        track = session.query(Track).where(Track.id == track_id)[0]
        artist_names = [artist.name for artist in track.artists_collection]

        print(f'dist={dst:.4f}: ({track.id})\t {track.name} - {artist_names}')


# Show the neighbors returned by the kneighbors query.
print_nbrs(distances, indices)

dist=0.3137: (108983)	 会有那么一天 - ['林俊杰']
dist=0.3740: (27731486)	 Talk Dirty (feat. 2 Chainz) - ['Jason Derulo', '2 Chainz']
dist=0.3758: (1329999687)	 50 Feet - ['SoMo']
dist=0.3804: (210287)	 遗憾 - ['陈洁仪']
dist=0.3808: (307018)	 遗憾 - ['许美静']
dist=0.3980: (25650033)	 遗憾 - ['李代沫']
dist=0.4004: (424262521)	 Rolling in the deep - ['廖佳琳']
dist=0.4019: (1943186)	 Blanc - ['Sylvain Chauveau']
dist=0.4051: (17405587)	 Still D.R.E. - ['Snoop Dogg', 'Dr. Dre']
dist=0.4052: (34834450)	 雷克雅未克 - ['麦浚龙', '周国贤']


In [34]:
# 整理一下上面的步骤

def recommend_from_text(text: str, n_neighbors: int = 10):
    """Compute the emotion of a text, find its nearest tracks, recommend them.

    :param text: seed text for the recommendation
    :param n_neighbors: how many tracks to recommend (default 10)
    :return: (emotion, distances, tracks): the emotion computed for ``text``
        and the recommendation result, i.e. the distances as returned by
        ``nbrs.kneighbors`` and the matching tracks in the same order
    :raises KeyError: if a recommended track id is no longer in the database
    """

    # emotext: score the text and normalize into an Emotion vector
    r = Emotext.emotion_count(text)
    r.emotions = softmax_dict(r.emotions)
    e = Emotion(**r.emotions)

    # recommend
    distances, indices = nbrs.kneighbors([e], n_neighbors)

    # result tracks: fetch all hits with a single IN query instead of one
    # query per neighbor, then restore the neighbor order (an IN query gives
    # no ordering guarantee)
    track_ids = [data['ids'][idx] for idx in indices[0]]
    found = session.query(Track).where(Track.id.in_(track_ids)).all()
    by_id = {track.id: track for track in found}
    tracks = [by_id[track_id] for track_id in track_ids]

    return e, distances, tracks


def print_nbrs(distances, tracks):
    """Print one line per recommended track: distance, id, title, artists.

    :param distances: distances as returned by ``kneighbors``; only row 0 is
        read
    :param tracks: track objects paired positionally with ``distances[0]``
    """
    for dist, track in zip(distances[0], tracks):
        artist_names = [artist.name for artist in track.artists_collection]
        print(f'dist={dist:.4f}: ({track.id})\t {track.name} - {artist_names}')


# Same sample text as the manual test, now run through the wrapped pipeline.
emotion, distances, tracks = recommend_from_text('后悔也都没有用 还不如一切没有发生过 不过就是又少一个诗人 换一个人沉迷你的笑')
print(emotion)
print_nbrs(distances, tracks)

Emotion(PA=0.0, PE=0.0, PD=0.0, PH=0.0, PG=0.2428551306285703, PB=0.0, PK=0.0, NA=0.0, NB=0.0, NJ=0.41260819965515805, NH=0.175202571704109, PF=0.0, NI=0.0, NC=0.0, NG=0.0, NE=0.1693340980121628, ND=0.0, NN=0.0, NK=0.0, NL=0.0, PC=0.0)
dist=0.3137: (108983)	 会有那么一天 - ['林俊杰']
dist=0.3740: (27731486)	 Talk Dirty (feat. 2 Chainz) - ['Jason Derulo', '2 Chainz']
dist=0.3758: (1329999687)	 50 Feet - ['SoMo']
dist=0.3804: (210287)	 遗憾 - ['陈洁仪']
dist=0.3808: (307018)	 遗憾 - ['许美静']
dist=0.3980: (25650033)	 遗憾 - ['李代沫']
dist=0.4004: (424262521)	 Rolling in the deep - ['廖佳琳']
dist=0.4019: (1943186)	 Blanc - ['Sylvain Chauveau']
dist=0.4051: (17405587)	 Still D.R.E. - ['Snoop Dogg', 'Dr. Dre']
dist=0.4052: (34834450)	 雷克雅未克 - ['麦浚龙', '周国贤']


In [35]:
def _recommend_and_print(text):
    """Run ``recommend_from_text`` on ``text`` and print a readable report.

    Prints the input text, the computed emotion, a 'recommend:' header and
    then one line per recommended track.
    """
    emotion, distances, tracks = recommend_from_text(text)
    for line in (f'{text=}', emotion, 'recommend:'):
        print(line)
    print_nbrs(distances, tracks)


# Probe: 喜欢你 ("I like you").
_recommend_and_print('喜欢你')

text='喜欢你'
Emotion(PA=0.0, PE=0.0, PD=0.0, PH=0.0, PG=0.0, PB=1.0, PK=0.0, NA=0.0, NB=0.0, NJ=0.0, NH=0.0, PF=0.0, NI=0.0, NC=0.0, NG=0.0, NE=0.0, ND=0.0, NN=0.0, NK=0.0, NL=0.0, PC=0.0)
recommend:
dist=0.0000: (467590240)	 Anzap - ['Erkan KILIÇ']
dist=0.1014: (82028)	 好好恋爱 - ['方力申']
dist=0.1542: (453927759)	 宝贝 (in the night) - ['张悬']
dist=0.1663: (538610029)	 Slow - ['落日飞车']
dist=0.1684: (187134)	 朋友 - ['周华健']
dist=0.1799: (36150596)	 Atlas Koynak - ['Hazritiali']
dist=0.1834: (546279760)	 目不转睛 - ['王以太']
dist=0.2207: (28718300)	 形影不离 - ['张纹嘉']
dist=0.2453: (209732)	 爱情36计 - ['蔡依林']
dist=0.2763: (386542)	 拥抱 - ['五月天']


In [36]:
# Probe: 讨厌你 ("I hate you").
_recommend_and_print('讨厌你')

text='讨厌你'
Emotion(PA=0.0, PE=0.0, PD=0.0, PH=0.0, PG=0.0, PB=0.0, PK=0.0, NA=0.0, NB=0.0, NJ=0.0, NH=0.0, PF=0.0, NI=0.0, NC=0.0, NG=0.0, NE=0.0, ND=1.0, NN=0.0, NK=0.0, NL=0.0, PC=0.0)
recommend:
dist=0.0000: (1311035602)	 Gucci Flip Flops (feat. Lil Yachty) - ['Lil Yachty', 'Bhad Bhabie']
dist=0.2344: (28303867)	 Come Back Down (Original Mix) - ['TJR', 'Benji Madden']
dist=0.3047: (27808044)	 丑八怪 - ['薛之谦']
dist=0.3094: (1303079913)	 活死人2018Cypher - ['法老', '龙崎', '许宏阳Oxy', 'Buzzy', '小安迪LilAndy', 'YLevoled', '活死人', '小精灵', '戾仁Lyrin', '杨和苏KeyNG']
dist=0.3797: (569282112)	 Superstar (Miami Classic Mix) - ['Chris Decay', 'ELLA']
dist=0.3983: (406346416)	 海盗 - ['周杰伦', '蔡依林']
dist=0.4247: (16660836)	 What Kind Of Woman Is This? - ['Buddy Guy']
dist=0.4367: (29544259)	 No Glamour in the Hammer - ['Whitehorse']
dist=0.4589: (174120)	 厌弃 - ['许廷铿']
dist=0.4621: (21534286)	 I Believe I Can Fly - ['R. Kelly']


In [37]:
# Probe: 想念你 ("I miss you").
_recommend_and_print('想念你')

text='想念你'
Emotion(PA=0.0, PE=0.0, PD=0.0, PH=0.0, PG=0.0, PB=0.0, PK=0.0, NA=0.0, NB=0.0, NJ=0.0, NH=0.0, PF=1.0, NI=0.0, NC=0.0, NG=0.0, NE=0.0, ND=0.0, NN=0.0, NK=0.0, NL=0.0, PC=0.0)
recommend:
dist=0.3521: (375328)	 小情歌 - ['苏打绿']
dist=0.3756: (254059)	 情歌 - ['梁静茹']
dist=0.3993: (68450)	 小情歌 - ['苏打绿']
dist=0.4017: (1306333181)	 想对你说情话 - ['I·D·C']
dist=0.4389: (491787693)	 情话 - ['余佳运']
dist=0.4628: (30148268)	 念念不忘 - ['麦浚龙']
dist=0.4952: (86279)	 情歌王 - ['古巨基']
dist=0.5180: (5231418)	 玛尼情歌 - ['上官红燕']
dist=0.5192: (28577372)	 我就不爱唱情歌 - ['大张伟']
dist=0.5206: (34341351)	 爱的纪念 - ['Richard Clayderman']


In [38]:
# Probe: 开心！ ("Happy!").
_recommend_and_print('开心！')

text='开心！'
Emotion(PA=0.625, PE=0.0, PD=0.0, PH=0.0, PG=0.0, PB=0.0, PK=0.0, NA=0.0, NB=0.0, NJ=0.0, NH=0.0, PF=0.0, NI=0.0, NC=0.0, NG=0.0, NE=0.0, ND=0.0, NN=0.375, NK=0.0, NL=0.0, PC=0.0)
recommend:
dist=0.1928: (423997607)	 Dirty Water - ['Marc E. Bassy']
dist=0.1942: (544937574)	 最亲的人（Cover 陆海涛） - ['汤子龙']
dist=0.2463: (329125)	 眉飞色舞 - ['郑秀文']
dist=0.2693: (537856438)	 BOOMYEAR(新年好) - ['李棒棒Muti', 'HtNine', 'J.Boss', 'BOMMER']
dist=0.2705: (574925512)	 Rollin' On - ['椅子乐团 The Chairs']
dist=0.2756: (78266)	 情人 - ['杜德伟']
dist=0.2799: (487587087)	 愿望 - ['司徒骏文']
dist=0.2879: (536243886)	 Stranger Things (Alan Walker Remix) - ['OneRepublic', 'Kygo', 'Alan Walker']
dist=0.3000: (422094342)	 Kalle - ['Adrian Gaxha']
dist=0.3010: (29984255)	 Call of the ambulance - ['flash8', 'oldliu']


In [39]:
# Probe: 伤心 ("sad / heartbroken").
_recommend_and_print('伤心')

text='伤心'
Emotion(PA=0.0, PE=0.0, PD=0.0, PH=0.0, PG=0.0, PB=0.0, PK=0.0, NA=0.0, NB=1.0, NJ=0.0, NH=0.0, PF=0.0, NI=0.0, NC=0.0, NG=0.0, NE=0.0, ND=0.0, NN=0.0, NK=0.0, NL=0.0, PC=0.0)
recommend:
dist=0.0352: (545084839)	 Saga Man - ['Miradiljan-Bawudun']
dist=0.1209: (546985787)	 Olganimda Yeglama - ['Young~Ali']
dist=0.1830: (436514179)	 Bitalay - ['али музыка']
dist=0.2160: (28310935)	 无能为力 - ['纣王老胡']
dist=0.2442: (30569561)	 黯然销魂 - ['玄觞']
dist=0.2468: (484311380)	 Katti Katti - ['Muzo']
dist=0.2554: (1323303678)	 BAD! - ['XXXTENTACION']
dist=0.2641: (484314362)	 Katti Gulum (2) - ['Muzo']
dist=0.2653: (484311378)	 Satarim - ['Muzo']
dist=0.2715: (349079)	 失恋 - ['草蜢']


In [40]:
# Probe: 心如止水 ("a heart as calm as still water").
_recommend_and_print('心如止水')

text='心如止水'
Emotion(PA=0.0, PE=0.0, PD=0.0, PH=1.0, PG=0.0, PB=0.0, PK=0.0, NA=0.0, NB=0.0, NJ=0.0, NH=0.0, PF=0.0, NI=0.0, NC=0.0, NG=0.0, NE=0.0, ND=0.0, NN=0.0, NK=0.0, NL=0.0, PC=0.0)
recommend:
dist=0.0924: (26608741)	 甜蜜蜜 - ['邓丽君']
dist=0.1121: (566435171)	 我们美丽的祖国 - ['宝宝巴士']
dist=0.1731: (33419837)	 安静 - ['黄子韬']
dist=0.1776: (169794)	 天下无双 - ['张靓颖']
dist=0.1948: (188376)	 披星戴月 - ['张敬轩']
dist=0.2017: (556995768)	 曲率飞行 - ['昨夜派对（L.N Party）']
dist=0.2118: (229223)	 月亮代表我的心 - ['邓丽君']
dist=0.2388: (29019489)	 1874 (Live) - ['陈慧娴']
dist=0.2402: (285100)	 霞光 - ['曲锦楠']
dist=0.2526: (400579056)	 可爱女人 - ['周杰伦']


目测 MAX_DATA_SIZE 开到 10000 左右效果差不多已经最好了，后面的数据质量不高，出来的结果乱七八糟的。

效果确实不太行。

可以考虑把推荐筛出来的一堆结果再用某种方法筛一次。