In [122]:
# load config

import json
import os.path
import types

with open('config.json') as f:
    config = json.load(f)

In [123]:
# init db

from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base

engine = create_engine(config['db'])

Base = automap_base()
Base.prepare(engine, reflect=True)

Track = Base.classes.tracks
Comment = Base.classes.comments
TrackEmotion = Base.classes.track_emotions

print('classes:', list(Base.classes))
print('track:', vars(Track))

classes: [<class 'sqlalchemy.ext.automap.artists'>, <class 'sqlalchemy.ext.automap.albums'>, <class 'sqlalchemy.ext.automap.comments'>, <class 'sqlalchemy.ext.automap.users'>, <class 'sqlalchemy.ext.automap.tracks'>, <class 'sqlalchemy.ext.automap.playlists'>, <class 'sqlalchemy.ext.automap.playlist_tracks'>, <class 'sqlalchemy.ext.automap.track_emotions'>]
track: {'__table__': Table('tracks', MetaData(), Column('id', BIGINT(), table=<tracks>, primary_key=True, nullable=False, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x130f3aee0>, for_update=False)), Column('name', TEXT(), table=<tracks>), Column('pop', BIGINT(), table=<tracks>), Column('publish_time', BIGINT(), table=<tracks>), Column('lyric', TEXT(), table=<tracks>), schema=None), '__module__': 'sqlalchemy.ext.automap', '__doc__': None, '_sa_class_manager': <ClassManager of <class 'sqlalchemy.ext.automap.tracks'> at 12db67770>, '__init__': <function __init__ at 0x1245b85e0>, 'id': <sqlalchemy.orm.att

In [124]:
# 获取已标注好的情感: track_emotions_collection

from sqlalchemy.orm import Session

session = Session(engine)

t = session.query(Track).order_by(Track.pop.desc())[0]
list(
    map(
        lambda x: f'{x.emotion} {x.intensity}',
        sorted(
            t.track_emotions_collection,
            key=lambda x: -x.intensity
        )
    )
)

['PB 0.28915776373271723',
 'PH 0.22500498277984793',
 'NB 0.13196447527078803',
 'NC 0.10762529059675584',
 'PG 0.0669708566180034',
 'PA 0.06277133598592072',
 'PF 0.05137391504696592',
 'NN 0.0371243899615215',
 'PE 0.015441845294230462',
 'ND 0.007877356250153731',
 'NE 0.004687788463095236']

In [125]:
# 把上面这种结果映射到 emotext.EmotionResult: 让数据按照一个固定顺序，缺失上零值

import emotext

emotext.emotions  # 顺序

['PA',
 'PE',
 'PD',
 'PH',
 'PG',
 'PB',
 'PK',
 'NA',
 'NB',
 'NJ',
 'NH',
 'PF',
 'NI',
 'NC',
 'NG',
 'NE',
 'ND',
 'NN',
 'NK',
 'NL',
 'PC']

In [126]:
from typing import List
from collections import namedtuple

# 一开始 emotext.EmotionResult 不知道怎么想的用了 dict
# 应该用 namedtuple 的。。不管了，在这里转一下吧。

Emotion = namedtuple('Emotion', emotext.emotions)


def keys(self):
    return self._fields


def values(self):
    return tuple(self)


Emotion.keys = keys
Emotion.values = values


def emotion_vector(emotions: List[TrackEmotion]) -> Emotion:
    elems = dict.fromkeys(emotext.emotions, 0)
    elems.update({x.emotion: x.intensity for x in emotions})
    ev = Emotion(**elems)

    return ev


e = emotion_vector(t.track_emotions_collection)
e

Emotion(PA=0.06277133598592072, PE=0.015441845294230462, PD=0, PH=0.22500498277984793, PG=0.0669708566180034, PB=0.28915776373271723, PK=0, NA=0, NB=0.13196447527078803, NJ=0, NH=0, PF=0.05137391504696592, NI=0, NC=0.10762529059675584, NG=0, NE=0.004687788463095236, ND=0.007877356250153731, NN=0.0371243899615215, NK=0, NL=0, PC=0)

In [127]:
# 做训练数据集了

import time
import os.path

MAX_DATA_SIZE = 12000

# data_ids = []
# data = []
data = {
    'ids': [],
    'emo': [],
}

# query db
for t in session.query(Track).order_by(Track.pop.desc(), Track.id)[:MAX_DATA_SIZE]:
    if not t.track_emotions_collection:
        continue
    data['ids'].append(t.id)
    data['emo'].append(emotion_vector(t.track_emotions_collection))

# save JSON
savefile = os.path.join('savedata', f'{int(time.time())}-{len(data["ids"])}.json')
with open(savefile, 'w') as f:
    json.dump(data, f)

print(len(data['ids']), f'samples: saved as {savefile}')
print('sample [0]:', data['ids'][0], data['emo'][0])

8989 samples: saved as savedata/1646114955-8989.json
sample [0]: 60008 Emotion(PA=0.1398622996458419, PE=0, PD=0, PH=0.41148582859157773, PG=0, PB=0.11334713828394757, PK=0, NA=0, NB=0.040660937589600016, NJ=0, NH=0, PF=0.06367286256663753, NI=0, NC=0.06174120171419717, NG=0, NE=0, ND=0.13856680617549905, NN=0.030662925432699092, NK=0, NL=0, PC=0)


In [128]:
# 训练样本

import numpy as np

X = np.array(data['emo'])
X.shape

(8989, 21)

In [129]:
# 拟合邻近模型

from sklearn.neighbors import NearestNeighbors
from joblib import dump, load  # for model persistence, see: https://scikit-learn.org/stable/modules/model_persistence.html#model-persistence

def neighbors_fit(X) -> NearestNeighbors:
    """拟合并保存模型

    :param X: [[...]...]
    :return: 训练好的 NearestNeighbors
    """
    nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(X)
    # 上面的参数 n_neighbors 是查询 nbrs.kneighbors([...]) 不传结果数量时返回的默认值

    savefile = os.path.join('savemodels', f'{int(time.time())}-{len(X)}.joblib')
    dump(nbrs, savefile)

    return nbrs

nbrs = neighbors_fit(X)

In [130]:
# 测试

t = '后悔也都没有用 还不如一切没有发生过 不过就是又少一个诗人 换一个人沉迷你的笑'

Emotext = emotext.Emotions()

r = Emotext.emotion_count(t)
e = Emotion(**r.emotions)

e

Emotion(PA=0, PE=0, PD=0, PH=0, PG=2.5371288177625, PB=0, PK=0, NA=0, NB=0, NJ=4.310553996041667, NH=1.8303566099925, PF=0, NI=0, NC=0, NG=0, NE=1.7690481514, ND=0, NN=0, NK=0, NL=0, PC=0)

In [131]:
distances, indices = nbrs.kneighbors([e], 10)

def print_nbrs(distances, indices):
    for i in range(len(indices[0])):
        idx = indices[0][i]
        dst = distances[0][i]

        id = data['ids'][idx]
        t = session.query(Track).where(Track.id == id)[0]
        ar = list(map(lambda a: a.name, t.artists_collection))

        print(f'dist={dst:.4f}: ({t.id})\t {t.name} - {ar}')

print_nbrs(distances, indices)

dist=5.2126: (210287)	 遗憾 - ['陈洁仪']
dist=5.2308: (27731486)	 Talk Dirty (feat. 2 Chainz) - ['Jason Derulo', '2 Chainz']
dist=5.2419: (34834450)	 雷克雅未克 - ['麦浚龙', '周国贤']
dist=5.2424: (307018)	 遗憾 - ['许美静']
dist=5.2449: (108983)	 会有那么一天 - ['林俊杰']
dist=5.2636: (25650033)	 遗憾 - ['李代沫']
dist=5.2736: (409647388)	 3 Strikes - ['Terror Jr']
dist=5.2784: (64825)	 太阳照常升起 - ['陈奕迅']
dist=5.2963: (566436203)	 加油歌 - ['宝宝巴士']
dist=5.3183: (1943186)	 Blanc - ['Sylvain Chauveau']


In [132]:
# 整理一下上面的步骤

def recommend_from_text(text: str):
    """给文本，算情感，找近邻，作推荐

    :param text: 推荐种子文本
    :return: (emotion, distances, tracks): 计算得到的 text 情感，和推荐结果：距离与曲目
    """

    # emotext
    r = Emotext.emotion_count(text)
    e = Emotion(**r.emotions)

    # recommend
    distances, indices = nbrs.kneighbors([e], 10)

    # result tracks
    tracks = []
    for i in range(len(indices[0])):
        idx = indices[0][i]
        id = data['ids'][idx]
        t = session.query(Track).where(Track.id == id)[0]
        tracks.append(t)

    return e, distances, tracks

def print_nbrs(distances, tracks):
    for d, t in zip(distances[0], tracks):
        ar = list(map(lambda a: a.name, t.artists_collection))
        print(f'dist={d:.4f}: ({t.id})\t {t.name} - {ar}')

emotion, distances, tracks = recommend_from_text('后悔也都没有用 还不如一切没有发生过 不过就是又少一个诗人 换一个人沉迷你的笑')
print(emotion)
print_nbrs(distances, tracks)

Emotion(PA=0, PE=0, PD=0, PH=0, PG=2.5371288177625, PB=0, PK=0, NA=0, NB=0, NJ=4.310553996041667, NH=1.8303566099925, PF=0, NI=0, NC=0, NG=0, NE=1.7690481514, ND=0, NN=0, NK=0, NL=0, PC=0)
dist=5.2126: (210287)	 遗憾 - ['陈洁仪']
dist=5.2308: (27731486)	 Talk Dirty (feat. 2 Chainz) - ['Jason Derulo', '2 Chainz']
dist=5.2419: (34834450)	 雷克雅未克 - ['麦浚龙', '周国贤']
dist=5.2424: (307018)	 遗憾 - ['许美静']
dist=5.2449: (108983)	 会有那么一天 - ['林俊杰']
dist=5.2636: (25650033)	 遗憾 - ['李代沫']
dist=5.2736: (409647388)	 3 Strikes - ['Terror Jr']
dist=5.2784: (64825)	 太阳照常升起 - ['陈奕迅']
dist=5.2963: (566436203)	 加油歌 - ['宝宝巴士']
dist=5.3183: (1943186)	 Blanc - ['Sylvain Chauveau']


In [133]:
def _recommend_and_print(text):
    emotion, distances, tracks = recommend_from_text(text)
    print(f'{text=}')
    print(emotion)
    print('recommend:')
    print_nbrs(distances, tracks)

_recommend_and_print('喜欢你')

text='喜欢你'
Emotion(PA=0, PE=0, PD=0, PH=0, PG=0, PB=28.512942015100002, PK=0, NA=0, NB=0, NJ=0, NH=0, PF=0, NI=0, NC=0, NG=0, NE=0, ND=0, NN=0, NK=0, NL=0, PC=0)
recommend:
dist=27.5129: (467590240)	 Anzap - ['Erkan KILIÇ']
dist=27.6027: (82028)	 好好恋爱 - ['方力申']
dist=27.6463: (538610029)	 Slow - ['落日飞车']
dist=27.6511: (453927759)	 宝贝 (in the night) - ['张悬']
dist=27.6672: (36150596)	 Atlas Koynak - ['Hazritiali']
dist=27.6699: (187134)	 朋友 - ['周华健']
dist=27.6810: (546279760)	 目不转睛 - ['王以太']
dist=27.7144: (28718300)	 形影不离 - ['张纹嘉']
dist=27.7308: (209732)	 爱情36计 - ['蔡依林']
dist=27.7646: (386542)	 拥抱 - ['五月天']


In [134]:
_recommend_and_print('讨厌你')

text='讨厌你'
Emotion(PA=0, PE=0, PD=0, PH=0, PG=0, PB=0, PK=0, NA=0, NB=0, NJ=0, NH=0, PF=0, NI=0, NC=0, NG=0, NE=0, ND=57.262561127590004, NN=0, NK=0, NL=0, PC=0)
recommend:
dist=56.2626: (1311035602)	 Gucci Flip Flops (feat. Lil Yachty) - ['Lil Yachty', 'Bhad Bhabie']
dist=56.4572: (28303867)	 Come Back Down (Original Mix) - ['TJR', 'Benji Madden']
dist=56.5381: (1303079913)	 活死人2018Cypher - ['法老', '龙崎', '许宏阳Oxy', 'Buzzy', '小安迪LilAndy', 'YLevoled', '活死人', '小精灵', '戾仁Lyrin', '杨和苏KeyNG']
dist=56.5398: (27808044)	 丑八怪 - ['薛之谦']
dist=56.5987: (569282112)	 Superstar (Miami Classic Mix) - ['Chris Decay', 'ELLA']
dist=56.6339: (406346416)	 海盗 - ['周杰伦', '蔡依林']
dist=56.6539: (16660836)	 What Kind Of Woman Is This? - ['Buddy Guy']
dist=56.6709: (29544259)	 No Glamour in the Hammer - ['Whitehorse']
dist=56.6727: (5141647)	 Viva La Vida - ['Coldplay']
dist=56.6824: (21534286)	 I Believe I Can Fly - ['R. Kelly']


In [135]:
_recommend_and_print('想念你')

text='想念你'
Emotion(PA=0, PE=0, PD=0, PH=0, PG=0, PB=0, PK=0, NA=0, NB=0, NJ=0, NH=0, PF=44.551225326, NI=0, NC=0, NG=0, NE=0, ND=0, NN=0, NK=0, NL=0, PC=0)
recommend:
dist=43.8703: (375328)	 小情歌 - ['苏打绿']
dist=43.8875: (254059)	 情歌 - ['梁静茹']
dist=43.9200: (1306333181)	 想对你说情话 - ['I·D·C']
dist=43.9218: (68450)	 小情歌 - ['苏打绿']
dist=43.9311: (491787693)	 情话 - ['余佳运']
dist=43.9710: (30148268)	 念念不忘 - ['麦浚龙']
dist=43.9910: (86279)	 情歌王 - ['古巨基']
dist=43.9967: (28577372)	 我就不爱唱情歌 - ['大张伟']
dist=43.9982: (34341351)	 爱的纪念 - ['Richard Clayderman']
dist=44.0050: (5231418)	 玛尼情歌 - ['上官红燕']


In [136]:
_recommend_and_print('开心！')

text='开心！'
Emotion(PA=40.6617832809, PE=0, PD=0, PH=0, PG=0, PB=0, PK=0, NA=0, NB=0, NJ=0, NH=0, PF=0, NI=0, NC=0, NG=0, NE=0, ND=0, NN=24.397069968540002, NK=0, NL=0, PC=0)
recommend:
dist=46.5647: (566435190)	 把舞儿跳起来 - ['宝宝巴士']
dist=46.6690: (5281403)	 欢乐年年 - ['杜德伟', '叶蒨文']
dist=46.6705: (5281404)	 恭喜你 - ['太极乐队']
dist=46.6870: (5281402)	 祝新岁 - ['吕方', '张卫健']
dist=46.7089: (5281405)	 恭喜恭喜 - ['林志颖', '钟镇涛']
dist=46.7378: (28953350)	 七夕 - ['许嵩']
dist=46.7581: (29450091)	 小宝贝 - ['夏天播放']
dist=46.7615: (26548584)	 Happy - ['Pharrell Williams']
dist=46.7830: (395167)	 恭喜恭喜 - ['中国娃娃']
dist=46.7876: (422094342)	 Kalle - ['Adrian Gaxha']


In [137]:
_recommend_and_print('伤心')

text='伤心'
Emotion(PA=0, PE=0, PD=0, PH=0, PG=0, PB=0, PK=0, NA=0, NB=37.3467328502, NJ=0, NH=0, PF=0, NI=0, NC=0, NG=0, NE=0, ND=0, NN=0, NK=0, NL=0, PC=0)
recommend:
dist=36.3770: (545084839)	 Saga Man - ['Miradiljan-Bawudun']
dist=36.4492: (546985787)	 Olganimda Yeglama - ['Young~Ali']
dist=36.4968: (436514179)	 Bitalay - ['али музыка']
dist=36.5484: (28310935)	 无能为力 - ['纣王老胡']
dist=36.5704: (484311380)	 Katti Katti - ['Muzo']
dist=36.5727: (30569561)	 黯然销魂 - ['玄觞']
dist=36.5784: (1323303678)	 BAD! - ['XXXTENTACION']
dist=36.5845: (484311378)	 Satarim - ['Muzo']
dist=36.5875: (484314362)	 Katti Gulum (2) - ['Muzo']
dist=36.5967: (349079)	 失恋 - ['草蜢']


In [143]:
_recommend_and_print('心如止水')

text='心如止水'
Emotion(PA=0, PE=0, PD=0, PH=58.517265373, PG=0, PB=0, PK=0, NA=0, NB=0, NJ=0, NH=0, PF=0, NI=0, NC=0, NG=0, NE=0, ND=0, NN=0, NK=0, NL=0, PC=0)
recommend:
dist=57.5966: (566435171)	 我们美丽的祖国 - ['宝宝巴士']
dist=57.6006: (26608741)	 甜蜜蜜 - ['邓丽君']
dist=57.6722: (33419837)	 安静 - ['黄子韬']
dist=57.6825: (169794)	 天下无双 - ['张靓颖']
dist=57.6825: (229223)	 月亮代表我的心 - ['邓丽君']
dist=57.6979: (556995768)	 曲率飞行 - ['昨夜派对（L.N Party）']
dist=57.7002: (188376)	 披星戴月 - ['张敬轩']
dist=57.7340: (29019489)	 1874 (Live) - ['陈慧娴']
dist=57.7374: (454698352)	 Mercy - ['Bishop Briggs']
dist=57.7386: (285100)	 霞光 - ['曲锦楠']


目测 MAX 开 10000 效果差不多已经最好了，后面的数据质量不高，出来结果乱七八糟的。

效果确实不太行。

可以考虑把推荐筛除来的一堆结果再用某种方法筛一次。