# Packages importing

In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.test.utils import get_tmpfile
from gensim.models import word2vec

# Model training

In [3]:
# Open data file
file = open('./dataset/yes_i2v/train2.txt','r')
playlists = word2vec.LineSentence(file)

#
path = get_tmpfile("item2vec.model")
    
# Modelling
model = word2vec.Word2Vec(playlists, sg=1, min_count=1, window=10, iter=10, size=100)
model.save("item2vec.model")

file.close()

2018-09-11 22:55:51,067 : INFO : collecting all words and their counts
2018-09-11 22:55:51,077 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-09-11 22:55:51,109 : INFO : PROGRESS: at sentence #10000, processed 63920 words, keeping 3151 word types
2018-09-11 22:55:51,140 : INFO : PROGRESS: at sentence #20000, processed 129243 words, keeping 3167 word types
2018-09-11 22:55:51,169 : INFO : PROGRESS: at sentence #30000, processed 187038 words, keeping 3167 word types
2018-09-11 22:55:51,199 : INFO : PROGRESS: at sentence #40000, processed 242147 words, keeping 3167 word types
2018-09-11 22:55:51,232 : INFO : PROGRESS: at sentence #50000, processed 309108 words, keeping 3167 word types
2018-09-11 22:55:51,263 : INFO : PROGRESS: at sentence #60000, processed 378158 words, keeping 3167 word types
2018-09-11 22:55:51,292 : INFO : PROGRESS: at sentence #70000, processed 443365 words, keeping 3167 word types
2018-09-11 22:55:51,323 : INFO : PROGRESS: at sentence

2018-09-11 22:56:13,673 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-11 22:56:13,674 : INFO : EPOCH - 6 : training on 1580122 raw words (1560723 effective words) took 3.3s, 469363 effective words/s
2018-09-11 22:56:14,710 : INFO : EPOCH 7 - PROGRESS: at 28.67% examples, 440455 words/s, in_qsize 5, out_qsize 0
2018-09-11 22:56:15,722 : INFO : EPOCH 7 - PROGRESS: at 54.66% examples, 420109 words/s, in_qsize 5, out_qsize 0
2018-09-11 22:56:16,740 : INFO : EPOCH 7 - PROGRESS: at 85.59% examples, 438404 words/s, in_qsize 5, out_qsize 0
2018-09-11 22:56:17,151 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-11 22:56:17,192 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-11 22:56:17,206 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-11 22:56:17,207 : INFO : EPOCH - 7 : training on 1580122 raw words (1560865 effective words) took 3.5s, 442270 effective words/s
2018-09-11 22:56:18,251

# Getting similar items by name

In [4]:
model.wv.similar_by_word('3029', topn =20)

2018-09-11 22:56:33,898 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('2610', 0.9918410778045654),
 ('3115', 0.9901841282844543),
 ('1110', 0.98928302526474),
 ('1106', 0.988652229309082),
 ('2777', 0.9873285293579102),
 ('1108', 0.9860349297523499),
 ('2801', 0.9849622249603271),
 ('3027', 0.9832320213317871),
 ('1107', 0.9832187294960022),
 ('3028', 0.980350136756897),
 ('1077', 0.9749118685722351),
 ('1072', 0.9734851121902466),
 ('105', 0.9733776450157166),
 ('1070', 0.9730795621871948),
 ('1041', 0.9713771343231201),
 ('1079', 0.9712492227554321),
 ('1087', 0.9678834080696106),
 ('1062', 0.9653787016868591),
 ('1037', 0.9644817113876343),
 ('1066', 0.9640263915061951)]

# t-SNE

In [179]:
from sklearn.manifold import TSNE
import pandas as pd

vocab = list(model.wv.vocab)
X = model[vocab]

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])
print(df.head())

  """


           x         y
0 -48.885067  5.443846
1 -44.316376 -5.828230
2 -52.608604  2.166635
3 -48.853947 -2.362439
4 -49.298435 -2.150290


In [180]:
# Representative points
# usher
df2 = df.loc[['3', '1744','79']]

# Guns'n rose
df3 = df.loc[['318', '365','314']]

# Bob Marley
df4 = df.loc[['1040', '3029','1110']]

# Fall out boys
df5 = df.loc[['2641', '2966','2348']]

print(df2)

              x         y
3    -48.853947 -2.362439
1744 -42.078548  1.058095
79   -42.875626 -5.258737


In [181]:
%matplotlib
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import numpy as np

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

x = df['x']
y = df['y']

# Calculate the point density
xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)

# Plot a scatter
ax.scatter(x, y, c=z, s=10, edgecolor='')

#
for i,pos in df2.iterrows():
    print(pos)
    ax.plot(pos[0],pos[1],'m^')

#
for i,pos in df3.iterrows():
    print(pos)
    ax.plot(pos[0],pos[1],'bv')
    
#
for i,pos in df4.iterrows():
    print(pos)
    ax.plot(pos[0],pos[1],'yD')

#
for i,pos in df5.iterrows():
    print(pos)
    ax.plot(pos[0],pos[1],'rx')
    
# # Show annotating of items in df2
# for word, pos in df2.iterrows():
#     ax.annotate(word, pos)
# for word, pos in df3.iterrows():
#     ax.annotate(word, pos)
    
plt.show()

Using matplotlib backend: Qt5Agg
x   -48.853947
y    -2.362439
Name: 3, dtype: float32
x   -42.078548
y     1.058095
Name: 1744, dtype: float32
x   -42.875626
y    -5.258737
Name: 79, dtype: float32
x    23.850960
y   -36.922859
Name: 318, dtype: float32
x    28.731955
y   -40.050190
Name: 365, dtype: float32
x    28.174059
y   -15.112495
Name: 314, dtype: float32
x    -1.294396
y    64.735275
Name: 1040, dtype: float32
x     4.238662
y    70.469826
Name: 3029, dtype: float32
x     5.145927
y    71.447586
Name: 1110, dtype: float32
x   -19.509104
y    -8.304080
Name: 2641, dtype: float32
x   -15.760060
y   -12.127275
Name: 2966, dtype: float32
x   -19.897903
y    -9.086158
Name: 2348, dtype: float32


# Use of genre data

In [5]:
# 
import pandas as pd

df_sum = pd.read_table("./dataset/yes_i2v/summary.txt", delimiter='\t', header=None, names=('Num', 'Music Name', 'Artist', 'Tags'))
print(df_sum.head(),'\n')

# Print similiraties by number of music
def print_simi(music_num):
    simi_list = model.wv.similar_by_word(str(music_num), topn =5)
    print("Searching Music: ",music_num, '\t',df_sum.iat[music_num, 1], '\t',df_sum.iat[music_num, 2], "\n")
    for l in simi_list:
        print("Similar Music: ", df_sum.iat[int(l[0]), 1], '\t',df_sum.iat[int(l[0]), 2])
        print(l)

        
print_simi(1040)
        

   Num                                         Music Name      Artist  \
0    0                       Gucci Time (w\/ Swizz Beatz)  Gucci Mane   
1    1  Aston Martin Music (w\/ Drake & Chrisette Mich...   Rick Ross   
2    2                      Get Back Up (w\/ Chris Brown)        T.I.   
3    3                 Hot Toddy (w\/ Jay-Z & Ester Dean)       Usher   
4    4                                       Whip My Hair      Willow   

                                             Tags  
0                                         115 173  
1      14 27 62 88 90 110 115 123 155 173 190 214  
2                                         115 173  
3                                        2 72 173  
4  2 6 24 52 62 72 88 107 115 126 141 155 173 190   

Searching Music:  1040 	 Three Little Birds 	 Bob Marley & The Wailers 

Similar Music:  One Love \/ People Get Ready 	 Bob Marley & The Wailers
('1059', 0.9395015835762024)
Similar Music:  Is This Love? 	 Bob Marley & The Wailers
('1099', 0.92983

  if np.issubdtype(vec.dtype, np.int):


In [6]:
from collections import Counter

df_tag = pd.read_table("./dataset/yes_small/tag_hash.txt", delimiter=',', header=None, names=('Num', 'Tag'))

# Get tags of all of five similar musics
def get_tags(music_num):
    simi_list = model.wv.similar_by_word(str(music_num), topn =5)
    tags = []
    for l in simi_list:
        tags_str = df_sum.iat[int(l[0]), 3]
        tags_list = tags_str.split()
        for i in tags_list:
            tags.append(i)
    return(tags)
    
print(get_tags(8),'\n')

# Get top n of common tags in five similar musics
def top_n_tags(music_num, topn=3):
    tags = get_tags(music_num)
    tags_counts = Counter(tags)
    top_n = tags_counts.most_common(topn)
    top_n_tag_name = []
    for i in top_n:
        top_n_tag_name.append(df_tag.iat[int(i[0]),1])
#         print(df_tag.iat[int(i[0]),1])
#     for j in top_n:
#         times.append(j[1])
    return top_n, top_n_tag_name

print(top_n_tags(8,topn=4))  
top_n_tags(8)

['50', '62', '72', '88', '90', '98', '115', '88', '115', '155', '173', '90', '115', '155', '173', '250'] 

([('115', 3), ('88', 2), ('90', 2), ('155', 2)], [' wjlb-fm', ' hip hop', ' rap', ' wkqi-fm'])


  if np.issubdtype(vec.dtype, np.int):


([('115', 3), ('88', 2), ('90', 2)], [' wjlb-fm', ' hip hop', ' rap'])

In [24]:
def genre_of_simis(music_num):
    print_simi(music_num)
    print('\n')
    print("Top N common tags: ",top_n_tags(music_num)[1])
    times = []
    for i in top_n_tags(music_num)[0]:
        times.append(i[1])
    print("Appearance times of common tags: ", times,'\n')
    
    
genre_of_simis(2519)

Searching Music:  2519 	 Watching Airplanes 	 Gary Allan 

Similar Music:  Good Morning Beautiful 	 Steve Holy
('513', 0.9254266619682312)
Similar Music:  International Harvester 	 Craig Morgan
('1888', 0.9051247835159302)
Similar Music:  Something To Be Proud Of 	 Montgomery Gentry
('481', 0.9023962616920471)
Similar Music:  Lesson In Leavin' 	 Jo Dee Messina
('586', 0.9023303389549255)
Similar Music:  Free And Easy (Down The Road I Go) 	 Dierks Bentley
('479', 0.8975435495376587)


Top N common tags:  [' country', ' 00s', ' good song']
Appearance times of common tags:  [4, 3, 2] 



  if np.issubdtype(vec.dtype, np.int):


## Comparing rec-music's tags with searching-music's tags

In [8]:
def cal_common_tags(music_num):
    tags_str = df_sum.iat[music_num, 3]
    tags_list = tags_str.split()
    list_a = []
    for i in tags_list:
        list_a.append(i)
#     print(list_a)
    #
    list_b = []
    top_n = top_n_tags(music_num, topn=5)[0]
    for j in top_n:
        list_b.append(j[0])
#     print(list_b,'\n')
    #
    result = list(set(list_a) & set(list_b))
    if len(result):    # not empty
#         print(result)
#         print('true')
        return 1
    else:
#         print('false')
        return 0
        

def cal_acc():
    tf_list = []
    for i in range(0, 2015):
#         print(i)
        tf_list.append(cal_common_tags(i))
    for j in range(2016, 3167):
#         print(j)
        tf_list.append(cal_common_tags(j))
    total = len(tf_list)
    right_count =[]
    for i in tf_list:
        if i ==1:
            right_count.append(i)
    right = len(right_count)
    print(right/total)

    
cal_acc()

  if np.issubdtype(vec.dtype, np.int):


0.9071383449147189
