In [1]:
# load config

import json
import types

# Read the notebook settings from config.json in the working directory.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding (which differs between systems).
with open('config.json', encoding='utf-8') as f:
    config = json.load(f)

In [2]:
# init db

from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base

# Connect to the database named by the ``db`` entry of the loaded config.
engine = create_engine(config['db'])

# Reflect the existing schema and generate one mapped class per table.
# ``prepare(engine, reflect=True)`` is deprecated since SQLAlchemy 1.4 and
# removed in 2.0; passing ``autoload_with`` is the supported way to request
# reflection.
Base = automap_base()
Base.prepare(autoload_with=engine)

# Short aliases for some of the generated classes (named after their tables).
Track = Base.classes.tracks
Comment = Base.classes.comments
TrackEmotion = Base.classes.track_emotions

print('classes:', list(Base.classes))
print('track:', vars(Track))

classes: [<class 'sqlalchemy.ext.automap.artists'>, <class 'sqlalchemy.ext.automap.albums'>, <class 'sqlalchemy.ext.automap.comments'>, <class 'sqlalchemy.ext.automap.users'>, <class 'sqlalchemy.ext.automap.tracks'>, <class 'sqlalchemy.ext.automap.playlists'>, <class 'sqlalchemy.ext.automap.playlist_tracks'>, <class 'sqlalchemy.ext.automap.track_emotions'>]
track: {'__table__': Table('tracks', MetaData(), Column('id', BIGINT(), table=<tracks>, primary_key=True, nullable=False, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x1127852e0>, for_update=False)), Column('name', TEXT(), table=<tracks>), Column('pop', BIGINT(), table=<tracks>), Column('publish_time', BIGINT(), table=<tracks>), Column('lyric', TEXT(), table=<tracks>), schema=None), '__module__': 'sqlalchemy.ext.automap', '__doc__': None, '_sa_class_manager': <ClassManager of <class 'sqlalchemy.ext.automap.tracks'> at 1127b3590>, '__init__': <function __init__ at 0x1127b7b80>, 'comments_collection': <s

In [3]:
# Fetch the already-annotated emotions of a track: track_emotions_collection

from sqlalchemy.orm import Session

session = Session(engine)

# Order tracks by descending `pop` and take the first row of that query.
t = session.query(Track).order_by(Track.pop.desc())[0]

# Render "<emotion> <intensity>" strings, highest intensity first
# (the sort key is the negated intensity).
[
    f'{emotion.emotion} {emotion.intensity}'
    for emotion in sorted(
        t.track_emotions_collection,
        key=lambda emotion: -emotion.intensity,
    )
]

['PB 0.28915776373271723',
 'PH 0.22500498277984793',
 'NB 0.13196447527078803',
 'NC 0.10762529059675584',
 'PG 0.0669708566180034',
 'PA 0.06277133598592072',
 'PF 0.05137391504696592',
 'NN 0.0371243899615215',
 'PE 0.015441845294230462',
 'ND 0.007877356250153731',
 'NE 0.004687788463095236']

In [4]:
# Map results like the one above onto emotext.EmotionResult: put the values
# in one fixed order and fill missing emotions with zero.

import emotext

emotext.emotions  # the order

['PA',
 'PE',
 'PD',
 'PH',
 'PG',
 'PB',
 'PK',
 'NA',
 'NB',
 'NJ',
 'NH',
 'PF',
 'NI',
 'NC',
 'NG',
 'NE',
 'ND',
 'NN',
 'NK',
 'NL',
 'PC']

In [5]:
from typing import List
from collections import namedtuple

# For some reason emotext.EmotionResult was originally implemented as a dict.
# It should have been a namedtuple... never mind, convert it here instead.

Emotion = namedtuple(typename='Emotion', field_names=emotext.emotions)


def keys(self):
    """Return the field names of ``self``, i.e. its ``_fields`` attribute.

    Written to be attached to a namedtuple class as a method.
    """
    field_names = self._fields
    return field_names


def values(self):
    """Return the items of ``self`` as a plain tuple, in iteration order.

    Written to be attached to a namedtuple class as a method.
    """
    return (*self,)


# Attach the two helpers as methods so an Emotion also offers
# ``.keys()`` / ``.values()``.
Emotion.keys, Emotion.values = keys, values


def emotion_vector(emotions: List[TrackEmotion]) -> Emotion:
    """Turn a collection of emotion records into a fixed-order ``Emotion``.

    Every name in ``emotext.emotions`` starts at 0 and is then overwritten by
    the ``intensity`` of the record carrying that ``emotion``; if a name
    occurs more than once, the last record wins.

    Args:
        emotions: records exposing ``emotion`` and ``intensity`` attributes.

    Returns:
        An ``Emotion`` built from the resulting name -> intensity mapping.
    """
    intensities = {name: 0 for name in emotext.emotions}
    for record in emotions:
        intensities[record.emotion] = record.intensity

    return Emotion(**intensities)


# Vectorise the emotions of the track bound to `t` and display the result.
e = emotion_vector(t.track_emotions_collection)
e

Emotion(PA=0.06277133598592072, PE=0.015441845294230462, PD=0, PH=0.22500498277984793, PG=0.0669708566180034, PB=0.28915776373271723, PK=0, NA=0, NB=0.13196447527078803, NJ=0, NH=0, PF=0.05137391504696592, NI=0, NC=0.10762529059675584, NG=0, NE=0.004687788463095236, ND=0.007877356250153731, NN=0.0371243899615215, NK=0, NL=0, PC=0)

In [32]:
# Build the training dataset

MAX_DATA_SIZE = 1000  # upper bound on the number of tracks fetched

data_ids = []  # track ids, position-aligned with `data`
data = []      # one Emotion vector per kept track

# A dedicated loop variable is used instead of `t`, so building the dataset
# does not silently replace the notebook-level `t` that other cells read.
# Tracks are ordered by descending `pop`, with `id` as the tie-breaker.
top_tracks = session.query(Track).order_by(Track.pop.desc(), Track.id)[:MAX_DATA_SIZE]
for track in top_tracks:
    track_emotions = track.track_emotions_collection
    if not track_emotions:
        # Tracks without any emotion records are skipped.
        continue
    data_ids.append(track.id)
    data.append(emotion_vector(track_emotions))

print(len(data), data_ids[0], data[0])

726 60008 Emotion(PA=0.1398622996458419, PE=0, PD=0, PH=0.41148582859157773, PG=0, PB=0.11334713828394757, PK=0, NA=0, NB=0.040660937589600016, NJ=0, NH=0, PF=0.06367286256663753, NI=0, NC=0.06174120171419717, NG=0, NE=0, ND=0.13856680617549905, NN=0.030662925432699092, NK=0, NL=0, PC=0)


In [33]:
import numpy as np

# Convert the list of emotion vectors into a NumPy array and show its shape.
X = np.asarray(data)
X.shape

(726, 21)

In [34]:
from sklearn.neighbors import NearestNeighbors

# Configure a ball-tree nearest-neighbour index, then fit it on X.
# `fit` returns the estimator itself, so `nbrs` ends up bound to the same
# fitted object as when construction and fitting are chained.
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
nbrs = nbrs.fit(X)

In [36]:
# Query point: a hand-written emotion vector.
s = Emotion(
    PA=0.1398622996458419, PE=0, PD=0, PH=0.41148582859157773, PG=0,
    PB=0.11334713828394757, PK=0, NA=0, NB=0.040660937589600016, NJ=0,
    NH=0, PF=0.06367286256663753, NI=0, NC=0.06174120171419717, NG=0,
    NE=0, ND=0.13856680617549905, NN=0.030662925432699092, NK=0, NL=0,
    PC=0,
)
# Ask for 10 neighbours of the single query point (passed as a one-item list).
distances, indices = nbrs.kneighbors([s], n_neighbors=10)

print(f'kneighbors of {s}: \n{indices=}\n{distances=}')

kneighbors of Emotion(PA=0.1398622996458419, PE=0, PD=0, PH=0.41148582859157773, PG=0, PB=0.11334713828394757, PK=0, NA=0, NB=0.040660937589600016, NJ=0, NH=0, PF=0.06367286256663753, NI=0, NC=0.06174120171419717, NG=0, NE=0, ND=0.13856680617549905, NN=0.030662925432699092, NK=0, NL=0, PC=0): 
indices=array([[  0, 226, 442, 532, 593, 588, 666, 104, 617, 181]])
distances=array([[0.        , 0.14438086, 0.1590296 , 0.1638293 , 0.16469269,
        0.16578893, 0.16783254, 0.17223264, 0.1739159 , 0.17633226]])


In [37]:
# Print every neighbour of the query: distance, track id, title and artists.
# Only one point was queried, so row 0 of each result array is used.
# `track_id` / `track` are used instead of `id` / `t` so that the builtin
# `id` is not shadowed for the rest of the kernel session and the
# notebook-level `t` is not overwritten.
for idx, dist in zip(indices[0], distances[0]):
    track_id = data_ids[idx]  # position in the dataset -> track id
    track = session.query(Track).where(Track.id == track_id)[0]
    artist_names = [artist.name for artist in track.artists_collection]

    print(f'dist={dist:.4f}: ({track.id})\t{track.name} - {artist_names}')

dist=0.0000: (60008)	桃花朵朵开 - ['阿牛']
dist=0.1444: (101906)	只要为你活一天 - ['刘家昌']
dist=0.1590: (144378)	死不了 - ['任贤齐']
dist=0.1638: (165347)	风度 - ['汪苏泷']
dist=0.1647: (168053)	故乡 - ['许巍']
dist=0.1658: (167955)	完美生活(Live) - ['许巍']
dist=0.1678: (185792)	爱的初体验 - ['张震岳']
dist=0.1722: (67358)	你有心 - ['关心妍']
dist=0.1739: (171254)	沉默是金(Live) - ['许冠杰']
dist=0.1763: (92311)	西厢 - ['后弦']


……有点意外，效果还比较不错。再试一次：

In [39]:
# Use an existing row of the dataset as the query point.
s = data[500]

distances, indices = nbrs.kneighbors([s], 10)

# Print every neighbour of the query: distance, track id, title and artists.
# Only one point was queried, so row 0 of each result array is used.
# `track_id` / `track` are used instead of `id` / `t` so that the builtin
# `id` is not shadowed for the rest of the kernel session and the
# notebook-level `t` is not overwritten.
for idx, dist in zip(indices[0], distances[0]):
    track_id = data_ids[idx]  # position in the dataset -> track id
    track = session.query(Track).where(Track.id == track_id)[0]
    artist_names = [artist.name for artist in track.artists_collection]

    print(f'dist={dist:.4f}: ({track.id})\t{track.name} - {artist_names}')

dist=0.0000: (156193)	为你写诗 - ['吴克群']
dist=0.1192: (185884)	退后 - ['周杰伦']
dist=0.1263: (60102)	天使的翅膀 - ['安琥']
dist=0.1592: (92255)	你还欠我一个拥抱 - ['后弦', 'Sara']
dist=0.1600: (66525)	有没有人告诉你 - ['陈楚生']
dist=0.1626: (165377)	等不到你 - ['汪苏泷']
dist=0.1630: (96087)	爱情诺曼底 - ['黄征']
dist=0.1654: (185697)	花海 - ['周杰伦']
dist=0.1775: (103886)	爱情惹的祸 - ['kenta-k.uz.']
dist=0.1781: (167732)	千百度 - ['许嵩']


In [40]:
# Use an existing row of the dataset as the query point.
s = data[250]

distances, indices = nbrs.kneighbors([s], 10)

# Print every neighbour of the query: distance, track id, title and artists.
# Only one point was queried, so row 0 of each result array is used.
# `track_id` / `track` are used instead of `id` / `t` so that the builtin
# `id` is not shadowed for the rest of the kernel session and the
# notebook-level `t` is not overwritten.
for idx, dist in zip(indices[0], distances[0]):
    track_id = data_ids[idx]  # position in the dataset -> track id
    track = session.query(Track).where(Track.id == track_id)[0]
    artist_names = [artist.name for artist in track.artists_collection]

    print(f'dist={dist:.4f}: ({track.id})\t{track.name} - {artist_names}')

dist=0.0000: (108242)	她说 - ['林俊杰']
dist=0.1029: (157276)	Last Dance - ['伍佰 & China Blue']
dist=0.1121: (110452)	爱转角 - ['罗志祥']
dist=0.1218: (139359)	且听风吟 - ['朴树']
dist=0.1225: (150565)	二十二 - ['陶喆']
dist=0.1230: (108254)	一眼万年 - ['林俊杰']
dist=0.1360: (167937)	断桥残雪 - ['许嵩']
dist=0.1379: (65642)	一生何求 - ['陈百强']
dist=0.1410: (187341)	春夏秋冬 - ['张国荣']
dist=0.1412: (168107)	故乡 - ['许巍']


其实这些歌都差不多，数据比较特殊。明天再研究。