In [None]:
# Mount Google Drive into the Colab filesystem; the pickle files used by the
# cells below are read from and written to paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

## **BERT Embeddings Preparation**

In [None]:
# Install tensorflow_text, which the import cell below loads as `text`.
!pip install tensorflow_text

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
import tensorflow_hub as hub
import tensorflow_text as text

Loading Dataset

In [None]:
# Load the pickled 30Music interaction table from Drive.
# The context manager closes the file handle as soon as the load finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('/content/drive/MyDrive/Minor Project/30music.pkl', 'rb') as dataset_file:
    data1 = pickle.load(dataset_file)

In [None]:
# Attach to every row the number of rows recorded for that row's user, then
# order the table so rows of the most frequent users come first.
rows_per_user = data1.groupby('UserId')['UserId'].transform('count')
data1['Frequency'] = rows_per_user
data1.sort_values(by='Frequency', ascending=False, inplace=True)

In [None]:
# Keep only the first 6000 rows of the frequency-sorted table (positional slice).
data = data1.iloc[:6000]

In [None]:
# Vocabulary: every distinct ItemId in the subset, in order of first appearance.
unique_item_ids = data['ItemId'].unique()
words = [*unique_item_ids]

In [None]:
# Show the vocabulary size and a short preview instead of dumping every id
# into the cell output.
len(words), words[:10]

In [None]:
# Text preprocessing layer that turns raw strings into BERT encoder inputs.
bert_preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
bert_preprocess = hub.KerasLayer(bert_preprocess_url)

# BERT encoder layer that consumes the preprocessed inputs.
bert_encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_model = hub.KerasLayer(bert_encoder_url)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarity(sentences):
    """Return the cosine similarity of the first two sentences' BERT vectors.

    Args:
        sentences: Batch of strings accepted by ``bert_preprocess``; only the
            entries at index 0 and 1 are compared.

    Returns:
        The result of ``cosine_similarity`` applied to the two pooled-output
        vectors, each wrapped as a single-row input.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoded = bert_model(encoder_inputs)
    first_vector = encoded['pooled_output'][0]
    second_vector = encoded['pooled_output'][1]
    return cosine_similarity([first_vector], [second_vector])

In [None]:
def get_embedding(sentence, embedding_length):
    """Embed ``sentence`` with BERT and keep a prefix of the pooled vector.

    Args:
        sentence: Batch of strings accepted by ``bert_preprocess``; only the
            first entry's embedding is returned.
        embedding_length: Number of leading components of the pooled output
            to keep.

    Returns:
        The first ``embedding_length`` components of the pooled output for
        the first entry of ``sentence``.
    """
    encoder_inputs = bert_preprocess(sentence)
    pooled_output = bert_model(encoder_inputs)['pooled_output']
    first_vector = pooled_output[0]
    return first_vector[:embedding_length]

In [None]:
def prepare_dataset(window, top_n, words, data):
    """Build (history, next-items) training pairs from session logs.

    Each session's items are ordered by ``Time``. For every split position
    that leaves at least ``top_n`` following items, one sample is emitted:
    the history is the (up to) ``window`` items immediately before the split
    and the target is the ``top_n`` items immediately after it. Early splits
    therefore yield histories shorter than ``window``.

    Args:
        window: Maximum number of history items per sample.
        top_n: Number of following items used as the target of each sample.
        words: Unused; kept so existing callers keep working.
        data: DataFrame with ``SessionId``, ``ItemId`` and ``Time`` columns.

    Returns:
        Tuple ``(basket, next_song)`` of equal-length lists. ``basket[k]`` is
        a list of 1..``window`` item ids and ``next_song[k]`` is a list of
        exactly ``top_n`` item ids.
    """
    # Sort once before grouping: groupby keeps the row order inside each
    # group, so every session is iterated in chronological order. (Sorting
    # the loop variable inside a `for` over the groups has no lasting effect.)
    ordered = data.sort_values(by=['Time'], kind='mergesort')

    basket = []
    next_song = []
    for _, session in ordered.groupby('SessionId'):
        items = list(session['ItemId'])
        # `split` is the index of the first target item; stopping at
        # len(items) - top_n guarantees every target has exactly top_n items.
        for split in range(1, len(items) - top_n + 1):
            basket.append(items[max(0, split - window):split])
            next_song.append(items[split:split + top_n])

    return (basket, next_song)

In [None]:
def get_dataset(window, top_n, words, data, m, emb_length):
    """Turn session samples into dense arrays of embeddings and target ids.

    Args:
        window: Maximum number of history items per sample (second axis of
            the returned history array).
        top_n: Number of target items per sample.
        words: Forwarded unchanged to ``prepare_dataset``.
        data: Interaction table forwarded to ``prepare_dataset``.
        m: Mapping from item id to its embedding vector (anything
            ``np.asarray`` can convert, with at least ``emb_length`` values).
        emb_length: Number of leading embedding components to store.

    Returns:
        Tuple ``(song_basket, recommended_song)``:
        ``song_basket`` is a float32 array of shape
        ``(n_samples, window, emb_length)``; histories shorter than
        ``window`` leave the trailing rows zero.
        ``recommended_song`` is an int32 array of shape
        ``(n_samples, top_n)`` holding the target item ids.

    Raises:
        ValueError: If an embedding has fewer than ``emb_length`` components.
    """
    (basket, next_song) = prepare_dataset(window, top_n, words, data)

    # Embeddings are real-valued, so the buffer must be floating point; an
    # integer buffer would truncate every component on assignment.
    song_basket = np.zeros((len(basket), window, emb_length), dtype=np.float32)
    recommended_song = np.zeros((len(next_song), top_n), dtype=np.int32)

    for i, each_words in enumerate(basket):
        for j, each_word in enumerate(each_words):
            emb = np.asarray(m[each_word], dtype=np.float32)
            if emb.shape[0] < emb_length:
                # Guard against silent broadcasting of a too-short vector.
                raise ValueError(
                    f'embedding for {each_word!r} has {emb.shape[0]} components, '
                    f'expected at least {emb_length}'
                )
            song_basket[i, j, :] = emb[:emb_length]

    for i, top_songs in enumerate(next_song):
        for j, song in enumerate(top_songs):
            recommended_song[i, j] = song

    return (song_basket, recommended_song)

Preparing Embeddings

In [None]:
# Embed every item id (rendered as text) with BERT, keeping the first 50
# components of each pooled output.
m = {w: get_embedding([str(w)], 50) for w in words}

# The file must be opened in binary write mode for pickle.dump to succeed;
# the context manager also guarantees the file is flushed and closed.
with open('/content/drive/MyDrive/Minor Project/bert_embs_50.pkl', 'wb') as embeddings_file:
    pickle.dump(m, embeddings_file)

# Build the model inputs/targets: window of 5 items, 1 target item, 50-dim embeddings.
(song_basket, recommended_song) = get_dataset(5, 1, words, data, m, 50)

with open('/content/drive/MyDrive/Minor Project/song_basket_bert_5_1_50.pkl', 'wb') as basket_file:
    pickle.dump(song_basket, basket_file)
with open('/content/drive/MyDrive/Minor Project/recommended_song_bert_5_1_50.pkl', 'wb') as targets_file:
    pickle.dump(recommended_song, targets_file)

In [None]:
# Reload previously saved embeddings; the context manager closes the handle.
# NOTE(review): the embedding cell above writes 'bert_embs_50.pkl', while this
# cell reads 'bert_embs.pkl' - confirm this is the intended file.
with open('/content/drive/MyDrive/Minor Project/bert_embs.pkl', 'rb') as embeddings_file:
    m = pickle.load(embeddings_file)

In [None]:
# Rebuild the arrays from the loaded embeddings: 5-item window, 1 target item,
# 50 embedding components per item.
song_basket, recommended_song = get_dataset(
    window=5,
    top_n=1,
    words=words,
    data=data,
    m=m,
    emb_length=50,
)

In [None]:
# Persist the history-embedding array; `with` ensures the file is flushed and closed.
with open('/content/drive/MyDrive/Minor Project/song_basket_bert_5_1_50.pkl', 'wb') as basket_file:
    pickle.dump(song_basket, basket_file)

In [None]:
# Persist the target-id array; `with` ensures the file is flushed and closed.
with open('/content/drive/MyDrive/Minor Project/recommended_song_bert_5_1_50.pkl', 'wb') as targets_file:
    pickle.dump(recommended_song, targets_file)