# Content-based

### word2vecVN

In [1]:
# Model trained on Le et al.'s data (window-size 5, 400 dims)
!wget https://thiaisotajppub.s3-ap-northeast-1.amazonaws.com/publicfiles/baomoi.model.bin

--2021-12-22 15:15:38--  https://thiaisotajppub.s3-ap-northeast-1.amazonaws.com/publicfiles/baomoi.model.bin
Resolving thiaisotajppub.s3-ap-northeast-1.amazonaws.com (thiaisotajppub.s3-ap-northeast-1.amazonaws.com)... 52.219.136.131
Connecting to thiaisotajppub.s3-ap-northeast-1.amazonaws.com (thiaisotajppub.s3-ap-northeast-1.amazonaws.com)|52.219.136.131|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 708212586 (675M) [application/macbinary]
Saving to: ‘baomoi.model.bin’


2021-12-22 15:16:10 (21.9 MB/s) - ‘baomoi.model.bin’ saved [708212586/708212586]



## File paths

In [1]:
# Every input CSV and the OOV word list sit under a single data directory.
data_root = './data'
user_1, user_2, user_3 = (f'{data_root}/user{n}.csv' for n in (1, 2, 3))
full_data = f'{data_root}/full_data.csv'

# Word list read by remove_oov().
oov400 = f'{data_root}/oov400_list.txt'

# Binary word2vec model file (absolute path; presumably the Colab working directory).
w2vec_400 = '/content/baomoi.model.bin'

## Libraries

In [None]:
!pip install texthero
!pip install num2words
!pip install viet-text-tools
!pip install pyvi

In [27]:
import pickle
import re
from functools import lru_cache

import numpy as np
import pandas as pd
import texthero as hero
from gensim import models
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from num2words import num2words
from pyvi import ViTokenizer
from scipy.stats import pearsonr
from viet_text_tools import normalize_diacritics

In [7]:
# Load the pretrained vectors from the binary word2vec file.
w2v_model = KeyedVectors.load_word2vec_format(w2vec_400, binary=True)

In [8]:
# Sanity check: show the class of the loaded model object.
type(w2v_model)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [9]:
# Spot check: list the vocabulary entries most similar to the token 'lk'.
# The author's inline note names 'tình' as another query word to try.
w2v_model.most_similar('lk')

[('thanh_hà_b', 0.7464016675949097),
 ('s=', 0.716158390045166),
 ('cccc', 0.6638514995574951),
 ('liền_kề', 0.65683913230896),
 ('chcc', 0.6486336588859558),
 ('Đại_học_vân_canh', 0.6474381685256958),
 ('xala', 0.6207592487335205),
 ('minh_giang_Đầm_và', 0.6187067031860352),
 ('dt', 0.5980429649353027),
 ('linh*', 0.5929301977157593)]

In [10]:
# gensim 4 removed KeyedVectors.vocab in favour of key_to_index; gensim 3 only has .vocab.
# Both are dicts keyed by word, which is all the following cells rely on (they iterate the keys).
vocab = w2v_model.key_to_index if hasattr(w2v_model, 'key_to_index') else w2v_model.vocab

In [11]:
# Plain dict from each vocabulary word to its vector, so later lookups need no model object.
word_vec_dict = {token: w2v_model.get_vector(token) for token in vocab}

In [12]:
# Persist the word -> vector mapping. The context manager guarantees the file is
# flushed and closed before any later cell reads it back (the original left the
# handle open). `pickle` is imported with the other imports at the top.
with open('word_vec_dict.sav', 'wb') as fh:
  pickle.dump(word_vec_dict, fh)

In [13]:
# Read the mapping back from the same relative path it was written to; the original
# absolute '/content/...' path only resolved when the working directory was /content.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook produced.
with open('word_vec_dict.sav', 'rb') as fh:
  k = pickle.load(fh)

In [14]:
# Confirm what type of object came back from the pickle round trip.
type(k)

dict

## Data

In [None]:
# Read the full dataset and the three per-user CSV files into DataFrames.
data_df, user1_df, user2_df, user3_df = (
  pd.read_csv(csv_path) for csv_path in (full_data, user_1, user_2, user_3)
)

In [18]:
# Keep a handle on each frame's unprocessed track_name column.
raw_data_track = data_df['track_name']
raw_user1_track, raw_user2_track, raw_user3_track = (
  frame['track_name'] for frame in (user1_df, user2_df, user3_df)
)

## Preprocessing

In [19]:
def num_to_words(s, lang='vi'):
  """Spell out every all-digit token of ``s`` in words.

  Only whole whitespace-delimited tokens are converted, and all other
  characters (including the original whitespace) are left untouched. The
  previous implementation used ``str.replace`` on the full string, so a short
  number also rewrote the same digits inside longer tokens (``"1 10"`` had its
  ``"10"`` corrupted before it could be converted).

  Args:
    s: Input text.
    lang: Language code passed to ``num2words``; defaults to Vietnamese.

  Returns:
    ``s`` with each numeric token replaced by its spelled-out form.
  """
  def _spell(match):
    token = match.group(0)
    # isdecimal() (unlike isdigit()) rejects characters such as superscript
    # digits, which float() cannot parse.
    if token.isdecimal():
      return num2words(float(token), lang=lang)
    return token

  return re.sub(r'\S+', _spell, s)

@lru_cache(maxsize=None)
def _load_oov_words(path):
  """Read the word list at ``path`` (one word per line) once and cache it as a frozenset."""
  # Explicit encoding so the result does not depend on the platform default;
  # assumes the list is stored as UTF-8 - confirm against the data file.
  with open(path, 'r', encoding='utf-8') as f:
    return frozenset(line.strip() for line in f)


def remove_oov(s, oov_path=None):
  """Delete every whitespace-delimited token of ``s`` that is in the OOV word list.

  Differences from the previous implementation:
    * the list is read from disk once per path instead of once per call
      (later edits to the file are therefore not picked up within a session);
    * membership is tested against a set instead of a list;
    * only whole tokens are removed - ``str.replace`` used to delete the OOV
      word wherever it occurred, including inside longer words.

  Whitespace around removed tokens is kept as-is, so callers that want it
  collapsed must do so afterwards.

  Args:
    s: Input text.
    oov_path: Path of the word list; defaults to the module-level ``oov400``.

  Returns:
    ``s`` with the OOV tokens removed.
  """
  oov_words = _load_oov_words(oov400 if oov_path is None else oov_path)

  def _drop(match):
    token = match.group(0)
    return '' if token in oov_words else token

  return re.sub(r'\S+', _drop, s)

def preprocessing(series):
  """Clean a pandas Series of track names and return the results as a list.

  The steps run in this order: texthero ``fillna``, ``lowercase``,
  ``remove_punctuation`` and ``remove_whitespace``; then ``num_to_words``,
  ``normalize_diacritics`` and ``remove_oov`` applied per element; and a
  final ``remove_whitespace`` pass, since removing OOV words can leave gaps.

  Args:
    series: pandas Series of raw track names.

  Returns:
    A list with one cleaned string per input row.
  """
  cleaned = (
    series
    .pipe(hero.fillna)
    .pipe(hero.lowercase)
    .pipe(hero.remove_punctuation)
    .pipe(hero.remove_whitespace)
    .apply(num_to_words)
    .apply(normalize_diacritics)
    .apply(remove_oov)
    .pipe(hero.remove_whitespace)
  )
  return list(cleaned)

# def out_vocab(s):
#   word_vocab = []
#   for word in w2v_model.vocab:
#     word_vocab.append(word)
#   oov_list = []
#   for sentence in s:
#     for word in sentence.split():
#       if word not in word_vocab:
#         oov_list.append(word)
#   return oov_list

In [20]:
# Run the same cleaning pipeline over the full dataset and each user's track names.
data_track, user1_track, user2_track, user3_track = (
  preprocessing(raw_names)
  for raw_names in (raw_data_track, raw_user1_track, raw_user2_track, raw_user3_track)
)

In [21]:
# Number of preprocessed track names in the full dataset.
print(len(data_track))

1781


In [22]:
# Positions of the tracks whose name is an empty string after preprocessing;
# fillback_data() later drops the same positions from the original names.
track_drop = [pos for pos, name in enumerate(data_track) if name == '']
for pos in track_drop:
  print(pos, end = ', ')

47, 110, 232, 234, 243, 271, 517, 617, 647, 1315, 1345, 

In [None]:
# data_df.iloc[track_drop, :]

In [23]:
# Keep only the non-empty preprocessed names, then report how many remain.
data_track = [name for name in data_track if name != '']
print(len(data_track))

1770


## Embedding and similarity functions

In [56]:
def vi_tokenize(ls_track):
  """Apply ``ViTokenizer.tokenize`` to every string in ``ls_track``.

  Args:
    ls_track: Iterable of track-name strings.

  Returns:
    A new list holding the tokenizer output for each input, in the same order.
  """
  return [ViTokenizer.tokenize(name) for name in ls_track]

In [61]:
# Tokenise every list of cleaned names with the Vietnamese tokenizer.
data_track, user1_track, user2_track, user3_track = (
  vi_tokenize(names)
  for names in (data_track, user1_track, user2_track, user3_track)
)

In [63]:
# Width of each pretrained word vector (the downloaded model is described as 400 dims).
embeded_dims = 400

# filters='' passes an empty filter set, so no characters are stripped from the text
# (presumably to keep the multi-word tokens produced by ViTokenizer intact - confirm).
tok = Tokenizer(filters='')
# The word index is built from the full dataset's track names only.
tok.fit_on_texts(data_track)
# One more than the number of distinct words; presumably index 0 is reserved for
# padding, which matches the zero-padded sequences produced later - confirm in Keras docs.
vocab_size = len(tok.word_index) + 1

In [64]:
# Longest track name measured in whitespace-separated tokens (never below 1).
max_seq = max([1] + [len(name.split()) for name in data_track])
print(max_seq)

15


In [65]:
def seq_padding(data, maxlen):
  """Encode texts with the fitted global tokenizer ``tok`` and pad them to ``maxlen``.

  Args:
    data: List of tokenised track-name strings.
    maxlen: Length every output sequence is padded to.

  Returns:
    The result of ``pad_sequences`` with padding added at the end ('post').
  """
  return pad_sequences(
    tok.texts_to_sequences(data),
    maxlen=maxlen,
    padding='post',
  )

def embedding(padded, maxlen):
  """Turn padded index sequences into one scalar per token position.

  A lookup matrix of shape ``(vocab_size, embeded_dims)`` is built from the
  global ``tok.word_index`` and ``word_vec_dict``; rows for indices without a
  pretrained vector (and for index 0) stay all-zero. Every index in every
  sequence is then replaced by the mean of its row.

  NOTE(review): taking the mean collapses each word vector to a single
  number, so a track is represented by one value per position rather than by
  full word vectors. An earlier, disabled variant of this function kept the
  full vectors - confirm which representation is intended.
  NOTE(review): ``maxlen`` is accepted but not used.

  Args:
    padded: Iterable of integer index sequences (as produced by ``seq_padding``).
    maxlen: Unused.

  Returns:
    ``np.array`` with one row per input sequence and one value per position.
  """
  lookup = np.zeros(shape = (vocab_size, embeded_dims))
  for token, row in tok.word_index.items():
    vector = word_vec_dict.get(token)
    if vector is None:
      continue  # no pretrained vector: the row stays zero
    lookup[row] = vector

  per_track = [
    [np.mean(lookup[index]) for index in sequence]
    for sequence in padded
  ]
  return np.array(per_track)

def cosine_sim(x, y):
  """Cosine similarity between ``x`` and ``y``.

  Computed as ``dot(x, y.T) / (norm(x) * norm(y))``. When either input has
  zero norm (e.g. a track whose tokens all lack a pretrained vector) the
  similarity is undefined; ``0.0`` is returned instead of the NaN the plain
  division would produce, so that results remain sortable.

  Args:
    x: numpy array.
    y: numpy array (must support ``.T``).

  Returns:
    The similarity value (a zero of the dot product's shape when a norm is 0).
  """
  dot = np.dot(x, y.T)
  scale = np.linalg.norm(x) * np.linalg.norm(y)
  if scale == 0:
    # Multiplying keeps the shape and type of the dot product while forcing zero.
    return dot * 0.0
  return dot / scale

def pearson_sim(x, y):
  """Pearson correlation coefficient between ``x`` and ``y``.

  The correlation is undefined when either input is constant (e.g. an
  all-zero embedding); ``pearsonr`` yields NaN there, which cannot be ranked.
  ``0.0`` is returned for that case instead.

  Args:
    x: 1-D sequence of numbers.
    y: 1-D sequence of numbers of the same length.

  Returns:
    The correlation coefficient, or ``0.0`` when an input is constant.
  """
  # ptp (max - min) is zero exactly when all values are equal.
  if np.ptp(x) == 0 or np.ptp(y) == 0:
    return 0.0
  k, _ = pearsonr(x, y)
  return k

In [66]:
# Encode and pad every list of names to the same length, then show each matrix.
padded_track, padded_user1, padded_user2, padded_user3 = (
  seq_padding(names, max_seq)
  for names in (data_track, user1_track, user2_track, user3_track)
)
for _padded in (padded_track, padded_user1, padded_user2, padded_user3):
  print(_padded)

[[  20  212  253 ...    0    0    0]
 [ 742   54  213 ...    0    0    0]
 [ 181   18  743 ...    0    0    0]
 ...
 [  90   34  148 ...    0    0    0]
 [ 112  124  160 ...    0    0    0]
 [ 336  129 1637 ...    0    0    0]]
[[ 293   23  309   67   12    0    0    0    0    0    0    0    0    0
     0]
 [1442  215   62  467    0    0    0    0    0    0    0    0    0    0
     0]
 [1001 1002  344    0    0    0    0    0    0    0    0    0    0    0
     0]
 [  16  293    5    7   84    0    0    0    0    0    0    0    0    0
     0]
 [ 181   18  743    0    0    0    0    0    0    0    0    0    0    0
     0]
 [ 181    6  546   12   13    0    0    0    0    0    0    0    0    0
     0]
 [ 756   42    1    0    0    0    0    0    0    0    0    0    0    0
     0]
 [ 742   54  213   35   21    0    0    0    0    0    0    0    0    0
     0]
 [  89  126   16    2    0    0    0    0    0    0    0    0    0    0
     0]
 [ 232  229  361    0    0    0    0    0    0    0 

In [67]:
# Embed every padded matrix, then show each result.
# (The spelling "embeded_track" is kept because later cells refer to that name.)
embeded_track, embedded_user1, embedded_user2, embedded_user3 = (
  embedding(padded, max_seq)
  for padded in (padded_track, padded_user1, padded_user2, padded_user3)
)
for _embedded in (embeded_track, embedded_user1, embedded_user2, embedded_user3):
  print(_embedded)

[[ 0.08403356 -0.02541728  0.03174873 ...  0.          0.
   0.        ]
 [-0.03130917 -0.04701964 -0.1238172  ...  0.          0.
   0.        ]
 [-0.10703138 -0.01422054 -0.00100994 ...  0.          0.
   0.        ]
 ...
 [ 0.09853326  0.01828721  0.04342335 ...  0.          0.
   0.        ]
 [ 0.10175381 -0.05758433  0.02685704 ...  0.          0.
   0.        ]
 [-0.07868152  0.08431231 -0.14304986 ...  0.          0.
   0.        ]]
[[-0.00665436 -0.05368769  0.03820119 -0.14003186 -0.00520911  0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.00451962 -0.02977728  0.01461879 -0.01424627  0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [-0.04711027 -0.00135678  0.02898023  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [-0.07527336 -0.00665436 -0.04217839  0.04

## Arrange track_name and its feature extraction

In [68]:
def _assemble_frame(org_names, preprocessed, padded, embedded):
  """Build one frame with the original name, cleaned name, padded ids and embedding per row."""
  frame = pd.DataFrame()
  frame['org_name'] = org_names
  frame['preprocessed_name'] = preprocessed
  # list(...) splits a 2-D array into one row object per DataFrame cell.
  frame['padding_name'] = list(padded)
  frame['embedding_name'] = list(embedded)
  return frame


def fillback_data(data_track, user1_track, user2_track, user3_track):
  """Pair each track's original name with its preprocessed, padded and embedded forms.

  Besides its arguments, the function reads these module-level names:
  ``data_df``, ``user1_df``, ``user2_df``, ``user3_df``, ``track_drop``,
  ``padded_track``/``embeded_track`` and ``padded_user1..3``/``embedded_user1..3``.

  The rows listed in ``track_drop`` are removed from the full dataset's names
  so they line up with ``data_track``. This is done on a new Series: the
  previous in-place ``drop``/``reset_index`` modified the Series object
  returned by ``data_df.track_name``, which other names may alias.

  Args:
    data_track: Preprocessed names of the full dataset (empty ones removed).
    user1_track: Preprocessed names of user 1.
    user2_track: Preprocessed names of user 2.
    user3_track: Preprocessed names of user 3.

  Returns:
    Tuple ``(df, df1, df2, df3)`` of DataFrames with the columns ``org_name``,
    ``preprocessed_name``, ``padding_name`` and ``embedding_name``.
  """
  data_track_name = data_df.track_name.drop(track_drop).reset_index(drop = True)

  df = _assemble_frame(data_track_name, data_track, padded_track, embeded_track)
  df1 = _assemble_frame(user1_df.track_name, user1_track, padded_user1, embedded_user1)
  df2 = _assemble_frame(user2_df.track_name, user2_track, padded_user2, embedded_user2)
  df3 = _assemble_frame(user3_df.track_name, user3_track, padded_user3, embedded_user3)

  return df, df1, df2, df3

In [69]:
# Build the four name/feature frames: the full dataset first, then users 1-3.
f_data_track, f_user1_track, f_user2_track, f_user3_track = fillback_data(data_track, user1_track, user2_track, user3_track)

In [70]:
# Spot check: the second processed track name of user 1.
user1_track[1]

'dnb chill original mix'

In [71]:
# Spot check: the matching row of the assembled frame, to compare with the value above.
f_user1_track.iloc[1, :]

org_name                                      DnB Chill (Original Mix)
preprocessed_name                               dnb chill original mix
padding_name         [1442, 215, 62, 467, 0, 0, 0, 0, 0, 0, 0, 0, 0...
embedding_name       [0.004519624118984211, -0.0297772849915782, 0....
Name: 1, dtype: object

## Experiments on recommendations

In [72]:
def top_10_similarity(user_embedded_track, raw_user_track, similarity = 'cosine', catalog = None):
  """Return the 10 dataset tracks most similar to one embedded user track.

  Tracks whose original name appears in ``raw_user_track`` are skipped. The
  remaining ones are scored with the chosen measure and sorted by score,
  highest first.

  Args:
    user_embedded_track: Embedding of a single user track.
    raw_user_track: Iterable of the user's original track names to exclude.
    similarity: ``'cosine'`` or ``'pearson'`` (case-insensitive).
    catalog: Optional DataFrame with ``org_name`` and ``embedding_name``
      columns. When omitted it is rebuilt with ``fillback_data`` on every
      call, as before; passing a prebuilt frame avoids that repeated work.

  Returns:
    A list of up to 10 ``(name, score)`` tuples, or ``None`` when
    ``similarity`` is not a supported measure.
  """
  measure = similarity.lower()
  if measure not in ('cosine', 'pearson'):
    return None
  score = cosine_sim if measure == 'cosine' else pearson_sim

  if catalog is None:
    catalog, _, _, _ = fillback_data(data_track, user1_track, user2_track, user3_track)

  # Built once: the original rebuilt this list for every catalogue row.
  known_names = set(raw_user_track)
  ranked = [
    (name, score(user_embedded_track, np.array(embed)))
    for name, embed in zip(catalog.org_name, catalog.embedding_name)
    if name not in known_names
  ]
  ranked.sort(key = lambda pair: pair[1], reverse = True)
  return ranked[:10]

In [None]:
# Top-10 recommendations for every track of user 1, once per similarity measure.
pear_recommend_user1 = [
  top_10_similarity(track, raw_user1_track, similarity = 'pearson')
  for track in embedded_user1
]
cosine_recommend_user1 = [
  top_10_similarity(track, raw_user1_track, similarity = 'cosine')
  for track in embedded_user1
]

In [None]:
# Top-10 recommendations for every track of user 2, once per similarity measure.
pear_recommend_user2 = [
  top_10_similarity(track, raw_user2_track, similarity = 'pearson')
  for track in embedded_user2
]
cosine_recommend_user2 = [
  top_10_similarity(track, raw_user2_track, similarity = 'cosine')
  for track in embedded_user2
]

In [None]:
# Top-10 recommendations for every track of user 3, once per similarity measure.
pear_recommend_user3 = [
  top_10_similarity(track, raw_user3_track, similarity = 'pearson')
  for track in embedded_user3
]
cosine_recommend_user3 = [
  top_10_similarity(track, raw_user3_track, similarity = 'cosine')
  for track in embedded_user3
]

In [74]:
# Pearson-based recommendations for user 2: one top-10 list per user track.
pear_recommend_user2

[[('Càng Lớn Càng Cô Đơn', 0.7765023019832661),
  ('Hãy Để Anh Yêu Em Lần Nữa', 0.7763516957320088),
  ('Cho Em Lời Cuối', 0.7726481899890761),
  ('Ai Rồi Cũng Khác', 0.7656432742762591),
  ('có hẹn với thanh xuân', 0.7619172959991671),
  ('Nếu em thấy cô đơn', 0.7577041658556558),
  ('Ai Đưa Em Về', 0.7502448407343926),
  ('Làm Gì Phải Hốt', 0.7444783556816919),
  ('từ chối nhẹ nhàng thôi', 0.7439345341743858),
  ('Xin Cho Tôi Được Yêu', 0.7416374019128308)],
 [('Đom Đóm', 0.9999999999999999),
  ('Mặt Trăng', 0.9999999999999999),
  ('Katy', 0.9999999999999999),
  ('Tại Sao', 0.9999999999999999),
  ('Tình Cờ', 0.9999999999999999),
  ('Oanh', 0.9999999999999999),
  ('Berlin', 0.9999999999999998),
  ('Lửng Lơ', 0.9999999999999998),
  ('Thất Tình', 0.9999999999999998),
  ('ToGetHer', 0.9999999999999998)],
 [('Anh Sẽ Về Sớm Thôi', 0.7645014735134541),
  ('Chỉ Anh Hiểu Em', 0.7630906420749414),
  ('Chi Anh Hieu Em', 0.7619345123245223),
  ('Lời Sám Hối Của Kẻ Hấp Hối', 0.7369433188353293),


In [None]:
# Cosine-based recommendations for user 2: one top-10 list per user track.
cosine_recommend_user2

[[('Gửi Anh Xa Nhớ', 0.8193181293188707),
  ('Người Đi Xa Mãi', 0.8164796233958908),
  ('Hãy Để Anh Yêu Em Lần Nữa', 0.8129821336131315),
  ('Cho Em Lời Cuối', 0.8107051530607989),
  ('Bao Lâu Ta Lại Yêu Một Người', 0.8047767814841236),
  ('Nếu Có Quay Về', 0.8045541627063351),
  ('Ai Rồi Cũng Khác', 0.8045414429894655),
  ('Đợi Em Trở Về', 0.8044246163670293),
  ('Nếu Phải Xa Nhau', 0.7976081327750625),
  ('Tha Rang Nhu The', 0.7967533378547059)],
 [('Đôi Lời', 0.9998386109107208),
  ('Nho Em', 0.9996348660867693),
  ('Trò Đùa - Lofi', 0.9991390552946141),
  ('Tiễn Em', 0.9965368320029894),
  ('Kém Duyên', 0.9957234481863052),
  ('Do Ai?', 0.995700569408309),
  ('Cưới Thôi', 0.9948733934818893),
  ('1 Phút', 0.9946780212787432),
  ('Thầm Mong', 0.9916216131288974),
  ('Mai đây', 0.9912934595629096)],
 [('Xin Em Cho Tôi Một Cơ Hội', 0.8298991802657743),
  ('vâng anh đi đi (liu riu version)', 0.7914250565887155),
  ('Chi Anh Hieu Em', 0.7679965815183721),
  ('Cho Tôi Lang Thang', 0.7667