In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset pickle
# loaded further down is read from this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Word2Vec**

In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:
# Load the listening-event table and rename its columns to snake_case.
# The file is opened in a `with` block so the handle is closed as soon as the
# pickle has been read (the bare open() call previously leaked it).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
with open('/content/drive/MyDrive/Minor Project/30music.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file).rename(columns={
        'UserId': 'user_id',
        'SessionId': 'session_id',
        'ItemId': 'song_id',
        'Time': 'time',
        'ArtistId': 'artist_id',
    })
data.head(5)

Unnamed: 0,user_id,session_id,song_id,time,artist_id
0,27063,1889046,2691760,1402997433,337496
1,27063,1889046,2691717,1402997784,337496
2,27063,1889046,2691636,1402998064,337496
3,27063,1889046,2691702,1402998298,337496
4,27063,1889046,2691783,1402998576,337496


In [None]:
# Number of distinct songs in the raw table (a missing id, if any, counts once).
data['song_id'].nunique(dropna=False)

34645

In [None]:
# Count the rows (listening events) recorded for every session id.
session_lengths = data.groupby(by='session_id').size()
(data.shape, session_lengths)

((50000, 5),
 session_id
 1047030     11
 1047034     11
 1047072     21
 1047073     15
 1047074     11
           ... 
 995808      26
 995809      10
 995894       6
 995901      60
 995903     128
 Length: 2942, dtype: int64)

In [None]:
# Prune sparse sessions and songs in three passes.
# Membership tests use Series.isin instead of the deprecated np.in1d.

# 1) Drop sessions that contain a single event.
data = data[data.session_id.isin(session_lengths[session_lengths > 1].index)]

# 2) Keep only songs that occur at least 5 times in what is left.
item_supports = data.groupby('song_id').size()
data = data[data.song_id.isin(item_supports[item_supports >= 5].index)]

# 3) Removing songs can shrink sessions again, so recount and keep sessions
#    that still have at least 2 events.
session_lengths = data.groupby('session_id').size()
data1 = data[data.session_id.isin(session_lengths[session_lengths >= 2].index)]

In [None]:
# Collapse the event table to one row per session:
#   sequence - the session's song ids as strings, in data1 row order
#   time     - the earliest timestamp in the session
#   user_id  - the smallest user id seen in the session
groups = data1.groupby('session_id')
aggregated = groups['song_id'].agg(sequence=lambda songs: [str(song) for song in songs])
init_ts = groups['time'].min()
users = groups['user_id'].min()

# Join the three per-session results and turn session_id back into a column.
data = aggregated.join(init_ts).join(users).reset_index()
data.head()

Unnamed: 0,session_id,sequence,time,user_id
0,1047073,"[2798569, 2798569]",1391361459,12572
1,1047074,"[2798815, 2798815, 3343146]",1391364748,12572
2,1047078,"[2799599, 2799599]",1391535211,12572
3,1072002,"[314212, 136742]",1395868047,21212
4,1072005,"[314212, 314212, 314212, 1857216, 3755626, 185...",1395931174,21212


In [None]:
from collections import Counter
from itertools import chain

# Count how often each song id occurs across all session sequences.
# Building the Counter from the flattened sequences replaces the former
# `Series.map(cnt.update)` call, which was used only for its side effect and
# displayed a meaningless Series of None values.
cnt = Counter(chain.from_iterable(data.sequence))

0      None
1      None
2      None
3      None
4      None
       ... 
805    None
806    None
807    None
808    None
809    None
Name: sequence, Length: 810, dtype: object

In [None]:
# Per-session sequence lengths (NumPy array) and number of sessions per user.
sequence_length = data['sequence'].map(len).to_numpy()
n_sessions_per_user = data.groupby(by='user_id').size()

In [None]:
# Gather (mean, median, min, max) for both distributions up front so the
# print calls below only do formatting.
session_stats = (
    sequence_length.mean(),
    np.quantile(sequence_length, 0.5),
    sequence_length.min(),
    sequence_length.max(),
)
user_stats = (
    n_sessions_per_user.mean(),
    np.quantile(n_sessions_per_user, 0.5),
    n_sessions_per_user.min(),
    n_sessions_per_user.max(),
)

# Dataset size after filtering.
print('Number of items: {}'.format(len(cnt)))
print('Number of users: {}'.format(data.user_id.nunique()))
print('Number of sessions: {}'.format(len(data)))

# Distribution summaries.
print('\nSession length:\n\tAverage: {:.2f}\n\tMedian: {}\n\tMin: {}\n\tMax: {}'.format(*session_stats))
print('Sessions per user:\n\tAverage: {:.2f}\n\tMedian: {}\n\tMin: {}\n\tMax: {}'.format(*user_stats))

Number of items: 863
Number of users: 239
Number of sessions: 810

Session length:
	Average: 7.78
	Median: 4.0
	Min: 2
	Max: 169
Sessions per user:
	Average: 3.39
	Median: 2.0
	Min: 1
	Max: 43


In [None]:
def pad_sequence(seq, min_len=6, pad_token='10000'):
    """Left-pad ``seq`` with ``pad_token`` until it holds ``min_len`` items.

    Sequences that already have ``min_len`` or more items are returned
    unchanged (same object); shorter ones yield a new list with the padding
    in front so the real items stay at the end.

    The default length of 6 matches the windowing done further down, which
    takes 5 consecutive items as input and the 6th as the target.

    NOTE(review): the default pad token '10000' is a numeric string like the
    song ids themselves — confirm no real song id equals it.
    """
    missing = min_len - len(seq)
    if missing <= 0:
        return seq
    return [pad_token] * missing + seq


# Pad every session in one pass. The previous loop assigned element by element
# through `dictList[i]`, which relies on the index labels being 0..n-1 and
# writes into a Series taken straight from the frame.
dictList = data['sequence'].map(pad_sequence)
data['sequence'] = dictList

data.head()

Unnamed: 0,session_id,sequence,time,user_id
0,1047073,"[10000, 10000, 10000, 10000, 2798569, 2798569]",1391361459,12572
1,1047074,"[10000, 10000, 10000, 2798815, 2798815, 3343146]",1391364748,12572
2,1047078,"[10000, 10000, 10000, 10000, 2799599, 2799599]",1391535211,12572
3,1072002,"[10000, 10000, 10000, 10000, 314212, 136742]",1395868047,21212
4,1072005,"[314212, 314212, 314212, 1857216, 3755626, 185...",1395931174,21212


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec

In [None]:
# Train song embeddings: every (padded) session is a sentence, every song id a word.
# min_count=1 keeps all ids in the vocabulary, including the padding token.
# NOTE(review): the name `model` is reused for the Keras network later in the
# notebook, so cells that need the Word2Vec model must run before that point.
model = Word2Vec(
    sentences=data['sequence'],
    vector_size=150,
    window=5,
    min_count=1,
    workers=20,
)

In [None]:
# Full vocabulary, in the Word2Vec model's own index order.
entire_products = list(model.wv.index_to_key)

# Weight matrix used as the per-song vectors below (row i <-> entire_products[i]).
# NOTE(review): syn1neg is the negative-sampling output-layer matrix, not the
# usual input embeddings (model.wv.vectors) — confirm this choice is intended.
vectors = model.syn1neg

In [None]:
# Replace every song id in every session by its vector.
# A dict gives constant-time id -> row lookups; the previous
# `entire_products.index(item)` call rescanned the whole vocabulary list for
# each item. Vocabulary keys are unique, so both give the same row.
product_index = {key: row for row, key in enumerate(entire_products)}

# Iterate over the sequences themselves rather than indexing the Series by
# integer label, so the result does not depend on the frame's index.
# An id missing from the vocabulary raises KeyError (previously ValueError).
word2vec_data = [
    [vectors[product_index[item]] for item in seq]
    for seq in data['sequence']
]

data['word2vec_songs'] = word2vec_data

In [None]:
# Slide a window of 5 consecutive song vectors over every session.
# A session of length n yields n - 5 windows: the last item never starts or
# ends a window because it is reserved as a prediction target. Sessions with
# 5 items or fewer yield nothing, since the range is empty.
X = np.array([
    seq[start:start + 5]
    for seq in word2vec_data
    for start in range(len(seq) - 5)
])

X.shape

(3761, 5, 150)

In [None]:
# Targets: for every 5-item window start, the song id that follows the window.
# The iteration order mirrors the construction of X, so X[k] pairs with y[k].
v = data['sequence'].tolist()
y = [
    seq[start + 5]
    for seq in v
    for start in range(len(seq) - 5)
]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the string song ids to ints, then map them to contiguous class
# indices. The encoder is fitted on the whole vocabulary so every target id
# is known to it.
# NOTE(review): this cell overwrites `y` with encoded labels, so re-running it
# without rebuilding `y` first feeds class indices back in as song ids.
y = [int(song_id) for song_id in y]
label_encoding_data = [int(song_id) for song_id in entire_products]

product_label = LabelEncoder()
y = product_label.fit(label_encoding_data).transform(y)

type(y), max(y)

(numpy.ndarray, 863)

In [None]:
# Display the per-session table, which now includes the `word2vec_songs`
# column holding each session's list of song vectors.
data

Unnamed: 0,session_id,sequence,time,user_id,word2vec_songs
0,1047073,"[10000, 10000, 10000, 10000, 2798569, 2798569]",1391361459,12572,"[[-0.3713296, -0.033110447, -0.024135217, -0.2..."
1,1047074,"[10000, 10000, 10000, 2798815, 2798815, 3343146]",1391364748,12572,"[[-0.3713296, -0.033110447, -0.024135217, -0.2..."
2,1047078,"[10000, 10000, 10000, 10000, 2799599, 2799599]",1391535211,12572,"[[-0.3713296, -0.033110447, -0.024135217, -0.2..."
3,1072002,"[10000, 10000, 10000, 10000, 314212, 136742]",1395868047,21212,"[[-0.3713296, -0.033110447, -0.024135217, -0.2..."
4,1072005,"[314212, 314212, 314212, 1857216, 3755626, 185...",1395931174,21212,"[[-0.009991194, -0.0012773755, 0.00024308628, ..."
...,...,...,...,...,...
805,995801,"[10000, 10000, 10000, 3791273, 3362034, 3791273]",1391645977,36135,"[[-0.3713296, -0.033110447, -0.024135217, -0.2..."
806,995802,"[3791273, 3791273, 3791273, 3791273, 3791273, ...",1391686922,36135,"[[-0.013292257, -0.0018995431, 0.0010803611, -..."
807,995804,"[10000, 10000, 10000, 10000, 3791273, 1035704]",1391732295,36135,"[[-0.3713296, -0.033110447, -0.024135217, -0.2..."
808,995901,"[3785891, 3785941, 3785906, 3785892, 3785938, ...",1405306632,36135,"[[-0.0008515109, -0.0002014215, 0.00075316103,..."


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Split BEFORE scaling. The scaler was previously fitted on all of X, so the
# min/max of the held-out windows leaked into the features the model trains
# on. Fitting on the training part only keeps the test set unseen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# MinMaxScaler expects 2-D input, so each window is flattened to a single row
# for scaling and reshaped back to its original layout afterwards.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train.reshape(X_train.shape[0], -1)).reshape(X_train.shape)
# The test windows reuse the training min/max; values may fall outside [0, 1].
X_test = scaler.transform(X_test.reshape(X_test.shape[0], -1)).reshape(X_test.shape)
# Note: X itself is left unscaled; only X_train / X_test are used for modelling.

In [None]:
from numpy import array
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.layers import Flatten,Reshape
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.convolutional import MaxPooling1D, MaxPooling2D
import tensorflow as tf
from keras.models import Model

In [None]:
# Make TensorFlow run tf.function-wrapped code eagerly instead of as graphs.
# NOTE(review): this is typically a debugging aid and slows training down —
# confirm it is still required before long runs.
tf.config.run_functions_eagerly(True)

In [None]:
# Sizes shared by the network definition and training cells.
n_steps = 5                         # items per input window
n_features = 150                    # vector width; same as Word2Vec vector_size
batch_size = 32
total_vocab = len(entire_products)  # one output class per vocabulary entry

In [None]:
# Two-branch CNN over a (n_steps, n_features, 1) window of song vectors.
# Each branch has its own input of the same shape; their pooled features are
# concatenated and classified over the whole vocabulary.

# --- Vertical branch -------------------------------------------------------
# The kernel spans every time step (n_steps x 1), so the step axis collapses
# to length 1 and one 512-channel response is left per vector dimension.
# The kernel height is tied to n_steps because the Reshape below relies on
# that collapse.
first_input = Input(shape=(n_steps, n_features, 1))
vertical_model = Conv2D(filters=512, kernel_size=(n_steps, 1), activation='relu', padding='valid')(first_input)
# Drop the length-1 step axis -> (n_features, 512), then max-pool over all positions.
vertical_model = Reshape(target_shape=(vertical_model.shape[2], vertical_model.shape[3]))(vertical_model)
vertical_model = MaxPooling1D(pool_size=vertical_model.shape[1])(vertical_model)
vertical_model = Flatten()(vertical_model)

# --- Horizontal branch -----------------------------------------------------
# A 3 x 50 kernel slides over both steps and vector dimensions.
second_input = Input(shape=(n_steps, n_features, 1))
horizontal_model = Conv2D(filters=512, kernel_size=(3, 50), activation='relu', padding='valid')(second_input)
# Merge the two spatial axes into one so a single 1-D max-pool covers every position.
horizontal_model = Reshape(target_shape=(horizontal_model.shape[1] * horizontal_model.shape[2], horizontal_model.shape[3]))(horizontal_model)
horizontal_model = MaxPooling1D(pool_size=horizontal_model.shape[1])(horizontal_model)
horizontal_model = Flatten()(horizontal_model)

# --- Classifier head -------------------------------------------------------
merged = tf.keras.layers.Concatenate()([vertical_model, horizontal_model])

merge_model = Dense(256, activation='relu')(merged)
# Fixed: this layer used to take `merged` again, which silently discarded the
# Dense(256) layer above. It now consumes that layer's output.
merge_model = Dense(64, activation='relu')(merge_model)
merge_model = Dense(total_vocab, activation='softmax')(merge_model)
model = Model(inputs=[first_input, second_input], outputs=merge_model)

# Targets are integer class indices, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_23 (InputLayer)          [(None, 5, 150, 1)]  0           []                               
                                                                                                  
 input_24 (InputLayer)          [(None, 5, 150, 1)]  0           []                               
                                                                                                  
 conv2d_24 (Conv2D)             (None, 1, 150, 512)  3072        ['input_23[0][0]']               
                                                                                                  
 conv2d_25 (Conv2D)             (None, 3, 101, 512)  77312       ['input_24[0][0]']               
                                                                                            

In [None]:
# Train the network; 10% of the training windows are held out for validation.
# The same array is passed twice because both branches read the same window.
# batch_size is now passed explicitly so the value declared with the other
# hyperparameters is the one actually used.
history = model.fit(
    [X_train, X_train],
    y_train,
    batch_size=batch_size,
    epochs=50,
    validation_split=0.1,
    verbose=2,
)

Epoch 1/10




85/85 - 23s - loss: 6.5680 - accuracy: 0.0236 - val_loss: 6.3511 - val_accuracy: 0.0266 - 23s/epoch - 266ms/step
Epoch 2/10
85/85 - 22s - loss: 6.0420 - accuracy: 0.0465 - val_loss: 6.1012 - val_accuracy: 0.0432 - 22s/epoch - 259ms/step
Epoch 3/10
85/85 - 23s - loss: 5.8042 - accuracy: 0.0602 - val_loss: 6.0398 - val_accuracy: 0.0498 - 23s/epoch - 272ms/step
Epoch 4/10
85/85 - 19s - loss: 5.6202 - accuracy: 0.0676 - val_loss: 6.0130 - val_accuracy: 0.0631 - 19s/epoch - 228ms/step
Epoch 5/10
85/85 - 20s - loss: 5.4729 - accuracy: 0.0798 - val_loss: 5.9217 - val_accuracy: 0.0731 - 20s/epoch - 237ms/step
Epoch 6/10
85/85 - 20s - loss: 5.3498 - accuracy: 0.0935 - val_loss: 5.8688 - val_accuracy: 0.0764 - 20s/epoch - 235ms/step
Epoch 7/10
85/85 - 19s - loss: 5.2160 - accuracy: 0.1045 - val_loss: 5.8763 - val_accuracy: 0.0930 - 19s/epoch - 229ms/step
Epoch 8/10
85/85 - 21s - loss: 5.1289 - accuracy: 0.1093 - val_loss: 5.8576 - val_accuracy: 0.0997 - 21s/epoch - 247ms/step
Epoch 9/10
85/85 - 

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-epoch training accuracy recorded by model.fit (one value per epoch).
history.history['accuracy']

In [None]:
# Plot training and validation accuracy against the epoch number.
# The original line could not run: it indexed `history.history[]` with no key,
# closed a parenthesis in the wrong place, and used the key 'acc' although the
# metric is recorded as 'accuracy'.
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']  # present because fit() used validation_split
epoch_numbers = np.arange(1, len(train_acc) + 1)

fig, ax = plt.subplots()
ax.plot(epoch_numbers, train_acc, label='train')
ax.plot(epoch_numbers, val_acc, label='validation')
ax.set(xlabel='Epoch', ylabel='Accuracy', title='Training vs. validation accuracy')
ax.legend();