In [11]:
import itertools

import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
# from spotlight.factorization.implicit import ImplicitFactorizationModel

# 1 Загрузка и обработка данных

---
## 1.1 История взаимодействия пользователей и айтемов
---

In [42]:
# Read the interaction log (JSON Lines, one record per line) and keep only the
# three columns used below: user, time and track.
interaction_columns = ["user", "time", "track"]
raw_interactions = pd.read_json("../data.json", lines=True)
data_interaction_history = raw_interactions[interaction_columns].copy()
data_interaction_history.head()

Unnamed: 0,user,time,track
0,3255,1.0,2659
1,3255,0.09,2910
2,3255,0.26,45829
3,3255,0.04,39183
4,3255,0.01,46211


In [43]:
# Interactions whose `time` value is not strictly above this threshold are discarded.
# NOTE(review): `time` looks like a 0..1 listened fraction in the preview above — confirm.
deletion_threshold = 0.3

print(f'Количество взаимодействий до удаления: {data_interaction_history.shape}')
keep_mask = data_interaction_history["time"] > deletion_threshold
data_interaction_history = data_interaction_history[keep_mask]
print(f'Количество взаимодействий после удаления: {data_interaction_history.shape}')

Количество взаимодействий до удаления: (8035, 3)
Количество взаимодействий после удаления: (3291, 3)


In [44]:
# Collapse the interaction log to one row per user, with that user's `time` and
# `track` values gathered into lists.
#
# The sort must be stable: pandas' default quicksort may reorder rows that share the
# same `user`, which would scramble each user's track order. That order matters later,
# because the per-user lists become Word2Vec "sentences" with a small context window.
# groupby itself keeps the row order inside every group.
data_interaction_history = data_interaction_history.sort_values(by="user", kind="mergesort")
grouped_interaction_history = data_interaction_history.groupby("user").agg(list)
grouped_interaction_history.head()

Unnamed: 0_level_0,time,track
user,Unnamed: 1_level_1,Unnamed: 2_level_1
4,[1.0],[26145]
66,[1.0],[30417]
75,[1.0],[40019]
114,"[1.0, 1.0]","[31095, 41710]"
123,[1.0],[933]


In [45]:
# Average number of retained interactions per user.
# `mean_len` holds the total number of tracks over all users and `cnt_len` the number
# of users; the mean itself is only computed inside the print below.
mean_len = sum(len(user_tracks) for user_tracks in grouped_interaction_history["track"])
cnt_len = len(grouped_interaction_history)

print(f"Среднее количество прослушиваний: {mean_len/cnt_len}")

Среднее количество прослушиваний: 3.428125


---
## 1.2 Треки
---

In [30]:
# Read the track catalogue (JSON Lines) and keep only artist, title and track id.
track_columns = ["artist", "title", "track"]
tracks = pd.read_json("../tracks.json", lines=True)[track_columns].copy()
tracks.head(n=3)

Unnamed: 0,artist,title,track
0,Jack Johnson,The Cove,0
1,Billy Preston,Nothing from Nothing,1
2,Paco De Lucia,Entre Dos Aguas,2


In [31]:
# Spot check: show the catalogue row for track id 2659 (the first track in the
# interaction preview above).
tracks.loc[tracks["track"] == 2659]

Unnamed: 0,artist,title,track
2659,Young Money featuring Lloyd,BedRock (Radio Edit) (feat.Lloyd),2659


In [32]:
# Turn every artist name into a single lower-case token: lower-case it, split on
# whitespace and re-join the parts with '_' (e.g. "Jack Johnson" -> "jack_johnson").
artist_new = tracks['artist'].map(
    lambda name_artist: '_'.join(name_artist.lower().split())
).tolist()
tracks["artist"] = artist_new
tracks.head(3)

Unnamed: 0,artist,title,track
0,jack_johnson,The Cove,0
1,billy_preston,Nothing from Nothing,1
2,paco_de_lucia,Entre Dos Aguas,2


---
## 1.3 Создание набора данных для обучения Word2Vec

----

In [78]:
# Build the Word2Vec training corpus: one "sentence" per user, made of the normalised
# artist names of the tracks that user interacted with, in the order they are stored
# in `grouped_interaction_history`.
#
# The track -> artist mapping is built once, so each lookup is a dictionary access
# instead of a full boolean scan of `tracks` per track. `drop_duplicates` keeps the
# first row for a repeated track id, matching the previous `.iloc[0]` behaviour.
# A track id missing from `tracks` raises KeyError.
artist_by_track = (
    tracks.drop_duplicates(subset="track")
    .set_index("track")["artist"]
    .to_dict()
)

lines = [
    [artist_by_track[cur_track] for cur_track in user_tracks]
    for user_tracks in grouped_interaction_history["track"]
]


# 2 Обучение модели

In [79]:
# Train Word2Vec on the per-user artist sequences in `lines`, then persist the model.
#
# workers=1 is deliberate: gensim documents that `seed` only gives a deterministic run
# when training is limited to a single worker thread, because multi-threaded training
# introduces ordering jitter. For reproducibility across interpreter launches,
# PYTHONHASHSEED must also be fixed.
model = Word2Vec(
    sentences=lines,
    vector_size=100,  # embedding dimensionality
    window=2,         # context window size
    workers=1,
    min_count=2,      # artists occurring fewer than 2 times are left out of the vocabulary
    seed=42,
)

model.save("../word2vec.model")

In [82]:
# Vocabulary of the trained model, in the model's own key order.
word_list = list(model.wv.index_to_key)

# Print the nearest neighbours of the vocabulary entries at positions 1, 2 and 3.
preview_words = word_list[1:4]
for words in preview_words:
    print('Similar Words for :', words)

    neighbours = model.wv.similar_by_word(words)
    print(neighbours)
    print('--------------------------\n')

Similar Words for : justin_bieber
[('interpol', 0.3228917717933655), ('daddy_yankee', 0.3192751109600067), ('ke$ha', 0.2966158092021942), ('jimmy_eat_world', 0.29390156269073486), ("plain_white_t's", 0.27308106422424316), ('scandinavian_music_group', 0.2667798697948456), ('dean_evenson', 0.26605960726737976), ('kid_cudi_/_mgmt_/_ratatat', 0.24694415926933289), ('dimmu_borgir', 0.24367579817771912), ('tiny_vipers', 0.23995180428028107)]
--------------------------

Similar Words for : train
[('southside_spinners', 0.3363018333911896), ('edward_sharpe_&_the_magnetic_zeros', 0.2604934275150299), ('mark_knopfler', 0.25393134355545044), ('traveling_wilburys', 0.24604935944080353), ('pj_harvey', 0.2302273064851761), ('morcheeba', 0.22037379443645477), ('billy_currington', 0.21870946884155273), ("plain_white_t's", 0.21692246198654175), ('propellerheads', 0.2140015959739685), ('godsmack', 0.21158666908740997)]
--------------------------

Similar Words for : kings_of_leon
[('falling_up', 0.37430

In [93]:
# Look up the embedding of every vocabulary entry, in `word_list` order, and wrap the
# result in a DataFrame (row i corresponds to word_list[i]).
vectors = model.wv[word_list]
vectors_df = pd.DataFrame(data=vectors)

In [94]:
# Preview the first five embedding rows.
vectors_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.007944,0.005371,0.003019,-0.001133,-0.001642,0.007355,-0.008097,0.004292,-0.006284,-0.008046,...,-0.001918,0.006652,0.006943,-0.002577,0.008332,-0.004402,-0.005211,0.003981,0.002564,-0.007583
1,0.00686,-0.005943,0.005826,-0.009747,0.005588,0.00615,0.005901,0.003475,-0.000934,0.004015,...,0.003264,0.003084,0.002365,-0.003178,0.009462,-0.008594,-0.003258,-0.007477,-0.003446,0.009126
2,-0.002558,0.008159,-0.0003,0.003969,-0.001116,-0.004512,0.005522,0.009624,-0.005077,0.005583,...,0.005629,-0.004227,0.00899,-0.003693,0.001294,0.000196,0.006913,-0.001066,-0.00451,-0.009893
3,-0.002344,0.006586,-0.010041,0.007977,0.000512,-0.006612,-0.003439,0.001513,-0.004313,-0.007983,...,0.00402,-0.0054,-0.00255,0.001399,0.004862,0.007019,-0.009248,-0.005785,-0.00457,-0.004015
4,0.010364,0.005645,-0.003869,0.009489,-0.000336,0.000377,-0.000719,-0.006542,0.008115,-0.009791,...,0.007875,0.008335,0.005089,0.001581,-0.003737,-0.002183,-0.007616,-0.002219,0.008318,-0.001873


In [97]:
# Pairwise cosine similarity between all artist embeddings.
# Rows keep the default positional index (row i is word_list[i]); columns are labelled
# with the artist tokens. `cosine_similarity` is imported in the notebook's top import
# cell rather than here, so all dependencies are declared in one place.
cosine_matrix = pd.DataFrame(cosine_similarity(vectors_df), columns=list(word_list))

In [122]:
# For every artist in the vocabulary, collect the most cosine-similar other artists.
#
# `reco_articles` maps a running integer to a dict with the chosen artist and the list
# of recommended artists (the "Articles" key names are kept for downstream cells).
#
# The artist's own row is dropped explicitly by position instead of assuming that it
# is always ranked first: with tied similarities (e.g. duplicate vectors) the old
# "skip the first row" approach could discard a real neighbour and keep the artist itself.
TOP_N_RECOMMENDATIONS = 20

reco_articles = {}
for row_pos, col_name in enumerate(cosine_matrix.columns):
    # Rows of cosine_matrix are indexed 0..n-1 in the same order as its columns,
    # so row `row_pos` is the artist's similarity with itself.
    similarities = cosine_matrix[col_name].drop(index=row_pos)
    nearest = similarities.nlargest(TOP_N_RECOMMENDATIONS)
    recommended_articles = [word_list[ind] for ind in nearest.index]
    reco_articles[row_pos] = {'Chosen-Articles': col_name,
                              'Recommended-Articles': recommended_articles}
print('Ended')

Ended


In [123]:
# Expand every recommendation entry into its own small frame (the chosen artist is
# repeated once per recommended artist) and stack them into one long table.
# The per-entry row labels are kept as they are, so index values repeat across artists.
per_artist_frames = [pd.DataFrame(entry) for entry in reco_articles.values()]
df_reco = pd.concat(per_artist_frames)
df_reco.head(100)

Unnamed: 0,Chosen-Articles,Recommended-Articles
0,linkin_park,john_brown\'s_body
1,linkin_park,brand_new
2,linkin_park,moving_hearts
3,linkin_park,nickel_creek
4,linkin_park,third_eye_blind
...,...,...
15,soltero,simian_mobile_disco
16,soltero,the_temper_trap
17,soltero,barry_tuckwell/academy_of_st_martin-in-the-fie...
18,soltero,gwen_stefani
