# Content-based

### word2vecVN

In [1]:
# Model trained on Le et al.'s data (window-size 5, 400 dims)
!wget https://thiaisotajppub.s3-ap-northeast-1.amazonaws.com/publicfiles/baomoi.model.bin

--2021-12-22 14:42:25--  https://thiaisotajppub.s3-ap-northeast-1.amazonaws.com/publicfiles/baomoi.model.bin
Resolving thiaisotajppub.s3-ap-northeast-1.amazonaws.com (thiaisotajppub.s3-ap-northeast-1.amazonaws.com)... 52.219.17.50
Connecting to thiaisotajppub.s3-ap-northeast-1.amazonaws.com (thiaisotajppub.s3-ap-northeast-1.amazonaws.com)|52.219.17.50|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 708212586 (675M) [application/macbinary]
Saving to: ‘baomoi.model.bin’


2021-12-22 14:43:03 (18.2 MB/s) - ‘baomoi.model.bin’ saved [708212586/708212586]



## File paths

In [1]:
# All CSV / text inputs are resolved against one data directory.
data_root = './data'
user_1, user_2, user_3 = (f'{data_root}/user{n}.csv' for n in (1, 2, 3))
full_data = f'{data_root}/full_data.csv'

# Text file of out-of-vocabulary words, read one word per line by remove_oov().
oov400 = f'{data_root}/oov400_list.txt'

# Location of the word2vec binary that is loaded into w2v_model.
w2vec_400 = '/content/baomoi.model.bin'

## Libraries

In [3]:
# Install the third-party packages used by the import cell below
# (texthero, num2words, viet_text_tools, pyvi). Versions are not pinned here.
!pip install texthero
!pip install num2words
!pip install viet-text-tools
!pip install pyvi

Collecting texthero
  Downloading texthero-1.1.0-py3-none-any.whl (24 kB)
Collecting nltk>=3.3
  Downloading nltk-3.6.6-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 7.0 MB/s 
Collecting unidecode>=1.1.1
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 55.3 MB/s 
Collecting regex>=2021.8.3
  Downloading regex-2021.11.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 48.5 MB/s 
Installing collected packages: regex, unidecode, nltk, texthero
  Attempting uninstall: regex
    Found existing installation: regex 2019.12.20
    Uninstalling regex-2019.12.20:
      Successfully uninstalled regex-2019.12.20
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.6.6 regex-2021.11.10 texthero-1.1.0 unidecode-1.3.2
Collecting

In [4]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from gensim import models
import texthero as hero
from num2words import num2words
from viet_text_tools import normalize_diacritics
from pyvi import ViTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from scipy.stats import pearsonr

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Load the pretrained vectors from the binary word2vec file at `w2vec_400`.
w2v_model = KeyedVectors.load_word2vec_format(w2vec_400, binary=True)

In [6]:
type(w2v_model)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [7]:
w2v_model.most_similar('lk')  # 'tình' — presumably another query word to try here; TODO confirm

[('thanh_hà_b', 0.7464016675949097),
 ('s=', 0.716158390045166),
 ('cccc', 0.6638514995574951),
 ('liền_kề', 0.65683913230896),
 ('chcc', 0.6486336588859558),
 ('Đại_học_vân_canh', 0.6474381685256958),
 ('xala', 0.6207592487335205),
 ('minh_giang_Đầm_và', 0.6187067031860352),
 ('dt', 0.5980429649353027),
 ('linh*', 0.5929301977157593)]

In [8]:
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`; older
# releases only have `.vocab`. Both are mappings keyed by word, and the next cell
# only iterates over those keys, so either one works there.
vocab = w2v_model.key_to_index if hasattr(w2v_model, 'key_to_index') else w2v_model.vocab

In [9]:
# Plain dict from each vocabulary word to its vector, so lookups no longer need the model object.
word_vec_dict = {word: w2v_model.get_vector(word) for word in vocab}

In [10]:
import pickle

# Persist the word -> vector mapping. The context manager guarantees the file is
# flushed and closed before any later cell reads it back.
with open('word_vec_dict.sav', 'wb') as f:
  pickle.dump(word_vec_dict, f)

In [11]:
# Read the mapping back from the same relative path it was dumped to (the dump cell
# writes 'word_vec_dict.sav' into the working directory), instead of assuming the
# working directory is /content. The file handle is closed by the context manager.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('word_vec_dict.sav', 'rb') as f:
  k = pickle.load(f)

In [12]:
type(k)

dict

## Data

In [13]:
# Load the full dataset and the three per-user CSV files, in that order.
data_df, user1_df, user2_df, user3_df = (
  pd.read_csv(csv_path) for csv_path in (full_data, user_1, user_2, user_3)
)

In [36]:
# Untouched track-name column of every frame; these are the inputs to preprocessing().
raw_data_track = data_df['track_name']
raw_user1_track = user1_df['track_name']
raw_user2_track = user2_df['track_name']
raw_user3_track = user3_df['track_name']

## Preprocessing

In [37]:
def num_to_words(s):
  """Spell out every all-digit token of ``s`` in Vietnamese.

  A token is a maximal run of non-whitespace characters. Each token for which
  ``str.isdigit()`` is true is replaced by ``num2words(float(token), lang='vi')``;
  every other token and all whitespace are returned unchanged.

  The replacement is applied token by token. Substituting with
  ``s.replace(token, ...)`` instead would also rewrite the same digits inside
  other tokens: in "1 10" the "1" of "10" would be rewritten and the token "10"
  would then never be converted.

  Args:
    s: input string.

  Returns:
    ``s`` with its all-digit tokens spelled out.
  """
  import re  # local import: only this helper needs it

  def spell_out(match):
    token = match.group()
    if token.isdigit():
      return num2words(float(token), lang='vi')
    return token

  # \S+ visits each whitespace-delimited token in place, so spacing is preserved.
  return re.sub(r'\S+', spell_out, s)

# Cache of OOV word sets keyed by file path, so the list is read from disk once
# instead of once per track name. Re-running this cell clears the cache.
_OOV_CACHE = {}

def _load_oov_words(path):
  """Return the set of stripped lines of the file at ``path``, reading it only once."""
  if path not in _OOV_CACHE:
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
      _OOV_CACHE[path] = {line.strip() for line in f}
  return _OOV_CACHE[path]

def remove_oov(s):
  """Delete from ``s`` every whitespace-delimited token listed in the ``oov400`` file.

  Only whole tokens are removed. Removing with ``s.replace(word, '')`` instead
  would also cut the word out of longer words that merely contain it (with "an"
  listed, "banana" would become "ba"). Surrounding whitespace is left in place,
  so a removed token leaves a gap for a later whitespace clean-up.

  Args:
    s: input string.

  Returns:
    ``s`` without its out-of-vocabulary tokens.
  """
  import re  # local import: only this helper needs it

  oov_words = _load_oov_words(oov400)
  return re.sub(r'\S+', lambda m: '' if m.group() in oov_words else m.group(), s)

def preprocessing(series):
  """Clean a Series of track names and return the cleaned names as a list.

  Applied in order: texthero ``fillna``, ``lowercase``, ``remove_punctuation``
  and ``remove_whitespace`` on the whole Series; then ``num_to_words``,
  ``normalize_diacritics`` and ``remove_oov`` on each name; then a final
  ``remove_whitespace`` pass after the OOV removal.
  """
  for hero_step in (hero.fillna, hero.lowercase, hero.remove_punctuation, hero.remove_whitespace):
    series = hero_step(series)
  for per_name_step in (num_to_words, normalize_diacritics, remove_oov):
    series = series.apply(per_name_step)
  return list(hero.remove_whitespace(series))

# def out_vocab(s):
#   word_vocab = []
#   for word in w2v_model.vocab:
#     word_vocab.append(word)
#   oov_list = []
#   for sentence in s:
#     for word in sentence.split():
#       if word not in word_vocab:
#         oov_list.append(word)
#   return oov_list

In [38]:
# Run the same cleaning pipeline over the full dataset and each user's tracks.
data_track, user1_track, user2_track, user3_track = map(
  preprocessing,
  (raw_data_track, raw_user1_track, raw_user2_track, raw_user3_track),
)

In [39]:
print(len(data_track))

1781


In [40]:
# Collect (and echo) the positions of tracks whose cleaned name came out empty.
track_drop = []
for i, cleaned_name in enumerate(data_track):
  if cleaned_name != '':
    continue
  print(i, end = ', ')
  track_drop.append(i)

47, 110, 232, 234, 243, 271, 517, 617, 647, 1315, 1345, 

In [41]:
# data_df.iloc[track_drop, :]

In [42]:
# Keep only the tracks whose cleaned name is non-empty.
# NOTE: this rebinds data_track, so re-running the track_drop cell afterwards
# finds no empty names and produces an empty track_drop.
data_track = [track for track in data_track if track != '']
print(len(data_track))

1770


## Embedding and similarity functions

In [43]:
# Length of each pretrained word vector (the downloaded model has 400 dims).
embeded_dims = 400

# Fit the tokenizer on the cleaned names of the full dataset only.
tok = Tokenizer()
tok.fit_on_texts(data_track)
# One row more than the number of distinct words; embedding() sizes its matrix with this.
vocab_size = 1 + len(tok.word_index)

In [44]:
# Largest number of whitespace-separated words in any cleaned track name (never below 1).
max_seq = 1
for track in data_track:
  max_seq = max(max_seq, len(track.split()))
print(max_seq)

17


In [45]:
def seq_padding(data, maxlen):
  """Encode ``data`` with the fitted tokenizer ``tok`` and post-pad every sequence to ``maxlen``."""
  return pad_sequences(tok.texts_to_sequences(data), maxlen=maxlen, padding = 'post')

def embedding(padded, maxlen):
  """Replace every token id in ``padded`` by the mean of that word's vector.

  A ``(vocab_size, embeded_dims)`` matrix is filled from ``word_vec_dict`` using
  the tokenizer's word -> index mapping; rows of ids with no known vector (and
  any id that is not in ``tok.word_index``, such as the 0 used for padding in
  the printed sequences) stay all-zero, so they map to 0.0.

  The mean of each row is computed once per vocabulary word and then looked up
  for all sequences in a single indexing step, rather than calling ``np.mean``
  again for every token occurrence.

  Args:
    padded: integer array of token ids, one row per sequence.
    maxlen: unused; kept so existing callers keep working.

  Returns:
    Float array with the same shape as ``padded``.
  """
  embeded_matrix = np.zeros(shape = (vocab_size, embeded_dims))
  for word, i in tok.word_index.items():
    embed_vector = word_vec_dict.get(word)
    if embed_vector is not None:
      embeded_matrix[i] = embed_vector

  token_ids = np.asarray(padded)
  if token_ids.size == 0:
    # An empty array has a float dtype and cannot be used as an index.
    return np.zeros(token_ids.shape)

  word_means = embeded_matrix.mean(axis = 1)
  return word_means[token_ids]

def cosine_sim(x, y):
  """Cosine similarity ``dot(x, y.T) / (norm(x) * norm(y))``.

  If either input has zero norm the ratio is undefined: plain division gives
  NaN (with a runtime warning), and NaN scores make a later sort-based ranking
  unreliable. In that case 0.0 is returned instead.

  Args:
    x, y: NumPy arrays.

  Returns:
    The similarity, or 0.0 when the denominator is zero.
  """
  dot = np.dot(x, y.T)
  denom = np.linalg.norm(x) * np.linalg.norm(y)
  if denom == 0:
    # Multiplying keeps the type and shape of `dot` while forcing the value to zero.
    return dot * 0.0
  return dot / denom

def pearson_sim(x, y):
  """Pearson correlation coefficient of ``x`` and ``y`` via ``scipy.stats.pearsonr``.

  ``pearsonr`` yields NaN when an input is constant (for example an all-zero
  embedding). NaN scores make a later sort-based ranking unreliable, so 0.0 is
  returned in that case.

  Args:
    x, y: equal-length 1-D sequences.

  Returns:
    The correlation coefficient, or 0.0 when it is undefined.
  """
  k, _ = pearsonr(x, y)
  if np.isnan(k):
    return 0.0
  return k

In [46]:
# Encode and pad every collection to the same length (max_seq), then show the results.
padded_track, padded_user1, padded_user2, padded_user3 = (
  seq_padding(tracks, max_seq)
  for tracks in (data_track, user1_track, user2_track, user3_track)
)
for padded_matrix in (padded_track, padded_user1, padded_user2, padded_user3):
  print(padded_matrix)

[[ 23 182 307 ...   0   0   0]
 [363  72 254 ...   0   0   0]
 [216  21  20 ...   0   0   0]
 ...
 [ 45  42 173 ...   0   0   0]
 [132 104 160 ...   0   0   0]
 [421 150 807 ...   0   0   0]]
[[ 310   27  393   87   11    0    0    0    0    0    0    0    0    0
     0    0    0]
 [1313  256   68  560    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 683  171  428 1005  288    0    0    0    0    0    0    0    0    0
     0    0    0]
 [  14  310    4    9   93    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 216   21   20  594  183    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 216   21    7  621   11   13    0    0    0    0    0    0    0    0
     0    0    0]
 [ 483  258   59    1    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 363   72  254   19   26    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 109  125   14    3    0    0    0    0    0    0    0    0    0    0
     0    0    0

In [47]:
# Turn each padded id matrix into its per-token feature matrix, then show the results.
embeded_track, embedded_user1, embedded_user2, embedded_user3 = (
  embedding(padded_matrix, max_seq)
  for padded_matrix in (padded_track, padded_user1, padded_user2, padded_user3)
)
for feature_matrix in (embeded_track, embedded_user1, embedded_user2, embedded_user3):
  print(feature_matrix)

[[ 0.08403356 -0.02541728  0.03174873 ...  0.          0.
   0.        ]
 [-0.03130917 -0.04701964 -0.1238172  ...  0.          0.
   0.        ]
 [-0.04028999 -0.09138102 -0.01422054 ...  0.          0.
   0.        ]
 ...
 [ 0.09853326  0.01828721  0.04342335 ...  0.          0.
   0.        ]
 [ 0.10175381 -0.05758433  0.02685704 ...  0.          0.
   0.        ]
 [-0.07868152  0.08431231 -0.12648652 ...  0.          0.
   0.        ]]
[[-0.00665436 -0.05368769  0.03820119 -0.14003186 -0.00520911  0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.        ]
 [ 0.00451962 -0.02977728  0.01461879 -0.01424627  0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.        ]
 [ 0.06071658  0.02026855  0.05153608  0.06919722  0.02898023  0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.     

## Arrange track names alongside their extracted features

In [48]:
def _track_frame(org_names, preprocessed_names, padded, embedded):
  """Build one frame holding original names, cleaned names, padded ids and embeddings."""
  frame = pd.DataFrame()
  frame['org_name'] = org_names
  frame['preprocessed_name'] = preprocessed_names
  # list(...) yields one row array per track, so each cell stores a whole sequence.
  frame['padding_name'] = list(padded)
  frame['embedding_name'] = list(embedded)
  return frame

def fillback_data(data_track, user1_track, user2_track, user3_track):
  """Line up each original track name with its cleaned, padded and embedded forms.

  The original names, padded sequences and embeddings are read from the
  notebook globals (``data_df``/``user*_df``, ``padded_*``, ``embeded_track``/
  ``embedded_user*``); only the cleaned names come from the arguments. For the
  full dataset, the rows listed in ``track_drop`` are removed from the original
  names and the index is renumbered so the names line up with the filtered
  ``data_track``.

  The drop is done on a new Series rather than in place, so ``data_df`` is not
  touched and the function can be called any number of times.

  Args:
    data_track: cleaned names of the full dataset (empty names already removed).
    user1_track, user2_track, user3_track: cleaned names of each user's tracks.

  Returns:
    Tuple ``(df, df1, df2, df3)`` of frames with columns ``org_name``,
    ``preprocessed_name``, ``padding_name`` and ``embedding_name``.
  """
  data_track_name = data_df.track_name.drop(track_drop).reset_index(drop = True)

  df = _track_frame(data_track_name, data_track, padded_track, embeded_track)
  df1 = _track_frame(user1_df.track_name, user1_track, padded_user1, embedded_user1)
  df2 = _track_frame(user2_df.track_name, user2_track, padded_user2, embedded_user2)
  df3 = _track_frame(user3_df.track_name, user3_track, padded_user3, embedded_user3)

  return df, df1, df2, df3

In [49]:
# Build the four name/feature frames once so the cells below can inspect them.
f_data_track, f_user1_track, f_user2_track, f_user3_track = fillback_data(data_track, user1_track, user2_track, user3_track)

In [50]:
user1_track[1]

'dnb chill original mix'

In [51]:
f_user1_track.iloc[1, :]

org_name                                      DnB Chill (Original Mix)
preprocessed_name                               dnb chill original mix
padding_name         [1313, 256, 68, 560, 0, 0, 0, 0, 0, 0, 0, 0, 0...
embedding_name       [0.004519624118984211, -0.0297772849915782, 0....
Name: 1, dtype: object

## Experiments on recommendations

In [70]:
def top_10_similarity(user_embedded_track, raw_user_track, similarity = 'cosine'):
  """Return the 10 dataset tracks most similar to one user track.

  Every track of the full dataset whose original name is not already among
  ``raw_user_track`` is scored against ``user_embedded_track`` with the chosen
  metric, and the best ten are returned in descending score order.

  Args:
    user_embedded_track: embedding of the user's track.
    raw_user_track: original names of the user's tracks; these are excluded
      from the candidates.
    similarity: ``'cosine'`` or ``'pearson'`` (case-insensitive).

  Returns:
    List of up to ten ``(name, score)`` tuples, or ``None`` when ``similarity``
    is not a supported metric.
  """
  metric = similarity.lower()
  if metric not in ('cosine', 'pearson'):
    return None
  score = cosine_sim if metric == 'cosine' else pearson_sim

  # Only the full-dataset frame (first element) is needed here.
  f_data_track = fillback_data(data_track, user1_track, user2_track, user3_track)[0]
  # Hash the user's names once instead of rebuilding a list for every candidate.
  known_names = set(raw_user_track)

  scored = []
  for name, embed in zip(f_data_track.org_name, f_data_track.embedding_name):
    if name in known_names:
      continue
    scored.append((name, score(user_embedded_track, np.array(embed))))

  # NaN compares false against everything and would leave the order arbitrary;
  # the leading flag ranks all NaN scores below every real score.
  scored.sort(key = lambda pair: (not np.isnan(pair[1]), pair[1]), reverse = True)
  return scored[:10]

In [71]:
# Top-10 recommendations for each of user 1's tracks, under both metrics
# (Pearson first, then cosine, for every track).
pear_recommend_user1, cosine_recommend_user1 = [], []
for track in embedded_user1:
  for metric, results in (('pearson', pear_recommend_user1), ('cosine', cosine_recommend_user1)):
    results.append(top_10_similarity(track, raw_user1_track, similarity = metric))

In [72]:
# Top-10 recommendations for each of user 2's tracks, under both metrics
# (Pearson first, then cosine, for every track).
pear_recommend_user2, cosine_recommend_user2 = [], []
for track in embedded_user2:
  for metric, results in (('pearson', pear_recommend_user2), ('cosine', cosine_recommend_user2)):
    results.append(top_10_similarity(track, raw_user2_track, similarity = metric))

In [73]:
# Top-10 recommendations for each of user 3's tracks, under both metrics
# (Pearson first, then cosine, for every track).
pear_recommend_user3, cosine_recommend_user3 = [], []
for track in embedded_user3:
  for metric, results in (('pearson', pear_recommend_user3), ('cosine', cosine_recommend_user3)):
    results.append(top_10_similarity(track, raw_user3_track, similarity = metric))

In [74]:
pear_recommend_user2

[[('Gửi Anh Xa Nhớ', 0.7883439914457534),
  ('Người Đi Xa Mãi', 0.7847957667433556),
  ('Hãy Để Anh Yêu Em Lần Nữa', 0.7813846498221825),
  ('Cho Em Lời Cuối', 0.777914263379685),
  ('Bao Lâu Ta Lại Yêu Một Người', 0.7724085611624149),
  ('Ai Rồi Cũng Khác', 0.7710145506811358),
  ('Đợi Em Trở Về', 0.7709195837971156),
  ('Nếu Có Quay Về', 0.7708851064053134),
  ('Nếu Phải Xa Nhau', 0.7626524903892149),
  ('Tha Rang Nhu The', 0.7611706659667573)],
 [('Đôi Lời', 0.9998574053706696),
  ('Nho Em', 0.9996773303494944),
  ('Trò Đùa - Lofi', 0.9992399291977879),
  ('Tiễn Em', 0.9969352475276807),
  ('Kém Duyên', 0.9962143558638433),
  ('Do Ai?', 0.9962050614112038),
  ('Cưới Thôi', 0.9954605874458536),
  ('1 Phút', 0.9952872924518334),
  ('Thầm Mong', 0.9926036547467731),
  ('Mai đây', 0.9922822137246519)],
 [('Xin Em Cho Tôi Một Cơ Hội', 0.8225465681800739),
  ('vâng anh đi đi (liu riu version)', 0.8036965898406014),
  ('Cho Tôi Lang Thang', 0.7681218209026226),
  ('Chi Anh Hieu Em', 0.7625

In [75]:
cosine_recommend_user2

[[('Gửi Anh Xa Nhớ', 0.8193181293188707),
  ('Người Đi Xa Mãi', 0.8164796233958908),
  ('Hãy Để Anh Yêu Em Lần Nữa', 0.8129821336131315),
  ('Cho Em Lời Cuối', 0.8107051530607989),
  ('Bao Lâu Ta Lại Yêu Một Người', 0.8047767814841236),
  ('Nếu Có Quay Về', 0.8045541627063351),
  ('Ai Rồi Cũng Khác', 0.8045414429894655),
  ('Đợi Em Trở Về', 0.8044246163670293),
  ('Nếu Phải Xa Nhau', 0.7976081327750625),
  ('Tha Rang Nhu The', 0.7967533378547059)],
 [('Đôi Lời', 0.9998386109107208),
  ('Nho Em', 0.9996348660867693),
  ('Trò Đùa - Lofi', 0.9991390552946141),
  ('Tiễn Em', 0.9965368320029894),
  ('Kém Duyên', 0.9957234481863052),
  ('Do Ai?', 0.995700569408309),
  ('Cưới Thôi', 0.9948733934818893),
  ('1 Phút', 0.9946780212787432),
  ('Thầm Mong', 0.9916216131288974),
  ('Mai đây', 0.9912934595629096)],
 [('Xin Em Cho Tôi Một Cơ Hội', 0.8298991802657743),
  ('vâng anh đi đi (liu riu version)', 0.7914250565887155),
  ('Chi Anh Hieu Em', 0.7679965815183721),
  ('Cho Tôi Lang Thang', 0.7667