In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **BERT**

In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
data = pickle.load(open('/content/drive/MyDrive/Minor Project/30music.pkl', 'rb')).rename(columns = {'UserId': 'user_id', 'SessionId': 'session_id', 'ItemId': 'song_id', 'Time': 'time', 'ArtistId': 'artist_id'})
data.head(5)

Unnamed: 0,user_id,session_id,song_id,time,artist_id
0,27063,1889046,2691760,1402997433,337496
1,27063,1889046,2691717,1402997784,337496
2,27063,1889046,2691636,1402998064,337496
3,27063,1889046,2691702,1402998298,337496
4,27063,1889046,2691783,1402998576,337496


In [3]:
len(data['user_id'].unique()), len(data['session_id'].unique()), len(data['song_id'].unique())

(21, 63, 817)

In [4]:
un = data['song_id'].unique()
len(un)

817

In [5]:
session_lengths = data.groupby('session_id').size()
data.shape, session_lengths

((1000, 5),
 session_id
 1204341    10
 1204342    10
 1204346    10
 1204347     5
 1218982    11
            ..
 549752     10
 549754     12
 549755      6
 549756      4
 956148      5
 Length: 63, dtype: int64)

In [6]:
data = data[np.in1d(data.session_id, session_lengths[session_lengths>1].index)]
item_supports = data.groupby('song_id').size()
data = data[np.in1d(data.song_id, item_supports[item_supports>=5].index)]
session_lengths = data.groupby('session_id').size()
data1 = data[np.in1d(data.session_id, session_lengths[session_lengths>=2].index)]

In [7]:
groups = data1.groupby('session_id')
aggregated = groups['song_id'].agg(sequence = lambda x: list(map(str, x)))
init_ts = groups['time'].min()
users = groups['user_id'].min()
data = aggregated.join(init_ts).join(users)
data.reset_index(inplace=True)
data.head()

Unnamed: 0,session_id,sequence,time,user_id
0,1914594,"[2026734, 2026731, 2026734, 2026731, 2026734, ...",1411136679,34053
1,3621,"[903913, 903913, 903913, 903913, 903913, 90391...",1421310258,40265
2,3624,"[3550262, 3550262, 3550262, 3550262]",1421345989,40265
3,3626,"[3550262, 1801147, 1801147, 1801147]",1421419115,40265
4,3629,"[1801147, 1801147, 1801147, 1801147, 1801147]",1421435583,40265


In [8]:
from collections import Counter
cnt = Counter()
data.sequence.map(cnt.update)

0    None
1    None
2    None
3    None
4    None
Name: sequence, dtype: object

In [9]:
sequence_length = data.sequence.map(len).values
n_sessions_per_user = data.groupby('user_id').size()

In [10]:
print('Number of items: {}'.format(len(cnt)))
print('Number of users: {}'.format(data.user_id.nunique()))
print('Number of sessions: {}'.format(len(data)))
print('\nSession length:\n\tAverage: {:.2f}\n\tMedian: {}\n\tMin: {}\n\tMax: {}'.format(
    sequence_length.mean(), 
    np.quantile(sequence_length, 0.5), 
    sequence_length.min(), 
    sequence_length.max()))
print('Sessions per user:\n\tAverage: {:.2f}\n\tMedian: {}\n\tMin: {}\n\tMax: {}'.format(
    n_sessions_per_user.mean(), 
    np.quantile(n_sessions_per_user, 0.5), 
    n_sessions_per_user.min(), 
    n_sessions_per_user.max()))

Number of items: 5
Number of users: 2
Number of sessions: 5

Session length:
	Average: 7.60
	Median: 5.0
	Min: 4
	Max: 14
Sessions per user:
	Average: 2.50
	Median: 2.5
	Min: 1
	Max: 4


In [11]:
dictList = data['sequence']
for i in range(0,len(dictList)):
        if len(dictList[i])<6:
            w=6-len(dictList[i])
            dictList[i]=['10000']*w+dictList[i]

data['sequence'] = dictList

data.head()

Unnamed: 0,session_id,sequence,time,user_id
0,1914594,"[2026734, 2026731, 2026734, 2026731, 2026734, ...",1411136679,34053
1,3621,"[903913, 903913, 903913, 903913, 903913, 90391...",1421310258,40265
2,3624,"[10000, 10000, 3550262, 3550262, 3550262, 3550...",1421345989,40265
3,3626,"[10000, 10000, 3550262, 1801147, 1801147, 1801...",1421419115,40265
4,3629,"[10000, 1801147, 1801147, 1801147, 1801147, 18...",1421435583,40265


In [None]:
!pip install tensorflow_text

In [13]:
from nltk.tokenize import sent_tokenize, word_tokenize
import tensorflow_hub as hub
import tensorflow_text as text

In [14]:
bert_preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
bert_encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

bert_preprocess = hub.KerasLayer(bert_preprocess_url)
bert_model = hub.KerasLayer(bert_encoder_url)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarity(sentences):
    preprocessed_text = bert_preprocess(sentences)
    result = bert_model(preprocessed_text)
    return cosine_similarity([result['pooled_output'][0]], [result['pooled_output'][1]])

In [16]:
def get_embedding(sentence, embedding_length):
    preprocessed_text = bert_preprocess(sentence)
    result = bert_model(preprocessed_text)
    return result['pooled_output'][0][0:embedding_length]

In [19]:
emb_length = 50

a = data['sequence']
bert = []
for i in range(len(a)):
    seq = a[i]
    seq_vector = []
    for j in range(len(seq)):
        item = seq[j]
        seq_vector.append(get_embedding([str(item)], emb_length))
    bert.append(seq_vector)

data['bert'] = bert

In [None]:
X = []

for i in range(len(bert)):
    seq = bert[i]
    if len(seq)>= 5:
        for j in range(0,(len(seq)-5)):  
            X.append(seq[j:j+5])
            #y.append(seq[j+5])

X = np.array(X)

X.shape

In [None]:
v = data['sequence'].tolist()
y = []
for i in range(len(v)):
    seq = v[i]
    if len(seq)>= 5:
        for j in range(0,(len(seq)-5)):  
            #X.append(seq[j:j+5])
            y.append(seq[j+5])

In [None]:
y = list(map(int, y))
label_encoding_data = list(m)
from sklearn.preprocessing import LabelEncoder
product_label=LabelEncoder()
product_label.fit(label_encoding_data)
y = product_label.transform(y)

type(y), max(y)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X.reshape(X.shape[0], -1)).reshape(X.shape)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
from numpy import array
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten,Reshape
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.convolutional import MaxPooling1D, MaxPooling2D
import tensorflow as tf
from keras.models import Model

In [None]:
tf.config.run_functions_eagerly(True)

In [None]:
n_steps=5
n_features=50
batch_size=32
total_vocab=len(m)

In [None]:
# define model
# vertical_model = Sequential()
first_input = Input(shape=(n_steps, n_features, 1))
vertical_model = Conv2D(filters=512, kernel_size=(5,1), activation='relu', padding='valid')(first_input)
vertical_model = Reshape(target_shape = (vertical_model.shape[2], vertical_model.shape[3]))(vertical_model)
vertical_model = MaxPooling1D(pool_size = vertical_model.shape[1])(vertical_model)
vertical_model = Flatten()(vertical_model)

# horizontal_model = Sequential()
second_input = Input(shape=(n_steps, n_features, 1))
horizontal_model = Conv2D(filters=512, kernel_size=(3,50), activation='relu', padding='valid')(second_input)
# print(horizontal_model.shape)
horizontal_model = Reshape(target_shape = (horizontal_model.shape[1]*horizontal_model.shape[2], horizontal_model.shape[3]))(horizontal_model)
horizontal_model = MaxPooling1D(pool_size = horizontal_model.shape[1])(horizontal_model)
horizontal_model = Flatten()(horizontal_model)

# merge_model = Sequential()
merged = tf.keras.layers.Concatenate()([vertical_model, horizontal_model])
# merge_model.add(merged)

merge_model = Dense(256, activation='relu')(merged)
merge_model = Dense(64, activation='relu')(merged)
merge_model = Dense(total_vocab, activation='softmax')(merge_model)
model = Model(inputs = [first_input, second_input], outputs = merge_model)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])
model.summary()

In [None]:
# fit model
history = model.fit([X_train, X_train], 
          y_train, 
          epochs=50,
          validation_split=0.1,
          verbose=2)

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(np.arange(len(history.history['accuracy'])), history.history['accuracy'], color='blue')
plt.plot(np.arange(len(history.history['val_accuracy'])), history.history['val_accuracy'], color='orange')
plt.legend(['train', 'validation'])
plt.title('Model accuracy')
plt.show()

In [None]:
plt.plot(np.arange(len(history.history['loss'])), history.history['loss'], color='blue')
plt.plot(np.arange(len(history.history['val_loss'])), history.history['val_loss'], color='orange')
plt.legend(['train', 'validation'])
plt.title('Model loss')
plt.show()