<a href="https://colab.research.google.com/github/Ying-Yuan07/TensorFlowLearn/blob/main/Attention_on_LSTM_for_effective_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
%tensorflow_version 1.x
!pip install keras==2.1.6

import pandas as pd
import numpy as np
import keras.models as kmodels
import keras.layers as klayers
import keras.backend as K
from gensim.models import Word2Vec
from keras.layers import Input,Dense,LSTM,Activation,RepeatVector,Permute,Flatten,Multiply
from keras.models import Sequential
from keras.models import load_model,Model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
np.random.seed(123)

#Read data into pandas dataframe
df=pd.read_csv('./sample_data/ratings_Beauty.csv')
df=df[['UserId', 'ProductId','Rating','Timestamp']]

#Rename the columns
df.columns=['userid', 'item_id', 'rating', 'reviewTime']
df=df.sort_values(['reviewTime'],ascending=[True])

#Divide the train and test data
#取数据0-1500行为训练集，1500-2000为测试集

length=150
train_end = 200
train_data=df[:length]
test_data=df[length:train_end]
print(train_data.shape, test_data.shape)
train_data = train_data.reset_index()
test_data = test_data.reset_index()
    
#find item sequences by user in train and test
item_seq_train=train_data.groupby("userid")['item_id'].apply(list).values
item_seq_test=test_data.groupby("userid")['item_id'].apply(list).values

# We consider the minimum sequence length as 6. If length is less than 6, we append dummy ids to the sequence  
def min_six_len_seq(dictList):
    new_list=[]
    for i in range(0,len(dictList)):
        if len(dictList[i])<6:
            w=6-len(dictList[i])
            dictList[i]=['padding_id']*w+dictList[i]
    return dictList

#find the train and test sequences of min len 6
item_train = min_six_len_seq(item_seq_train)
item_test = min_six_len_seq(item_seq_test)

# train word2vec model on train_data to get item embeddings
#Word2Vec(train_data,size,window,min_count,iter)
#size是指词向量的维度，默认为100
#min_count:要计算词向量的最小词频。这个值可以去掉一些很生僻的低频词，默认是5.可以对字典做截断， 词频少于min_count次数的单词会被丢弃掉
#window:即词向量上下文最大距离,默认为5。window越大，则和某一词较远的词也会产生上下文关系，一般为[5,10]
#iter: 随机梯度下降法中迭代的最大次数，默认是5。对于大语料，可以增大这个值
#workers：用于控制训练的并行数。
def word2vec_model(train_data):
    model = Word2Vec(train_data,size=30,window = 3,min_count =1,iter=20)
    return model


#train the model and save
wv_model=word2vec_model(item_train)
wv_model.save('word2vec_model')

#find full vocabulary
entire_products=[]
for key,value in wv_model.wv.vocab.items():
    entire_products.append(key)
np.save('./entire_products.npy',entire_products)

# Divide the sequences into length of 6. ( First 5 items are for train, 6th one for target)
def input_sequences(new_list, win_size=5):
    input_seq=[]
    target=[]
    for i in range(0,len(new_list)):
        seq_len = len(new_list[i])
        for j in range(0,seq_len):
            if j+win_size<seq_len:
                if new_list[i][j+5] in entire_products:
                    input_seq.append(new_list[i][j:j+5])
                    target.append(new_list[i][j+5])
    return input_seq,target


# Encoding the target.If new item arrives which i
def num_products(target):
    product_label=LabelEncoder()
    product_label.fit(entire_products)
    target_int = product_label.transform(target)
    return target_int
    
#Create Train and test input and target sequences
train_x,target_train=input_sequences(item_seq_train)
train_y=num_products(target_train)
print("train_x")
print(train_x)
print("train_y")
print(train_y)

test_x,target_test=input_sequences(item_seq_test)
test_y=num_products(target_test)
    
#represent each item with prod2vec embedding and if new item comes in test set, represent it with random vec
unknown_item_id=np.random.random((50,))
def w2v_data_extraction(new_list):
    w2v_data=[]
    for i in range(0,len(new_list)):
        seq_vec=[]
        for j in range(0,len(new_list[i])):
            try:
                embedding=wv_model.wv[new_list[i][j]]
            except KeyError:
                embedding=unknown_item_id
            seq_vec.append(embedding)
                
        w2v_data.append(seq_vec)
    print("w2v_data.len")
    print(len(w2v_data))
    print(w2v_data)
    return np.asarray(w2v_data)

train_x_emb=w2v_data_extraction(train_x)
print("train_x_emb.shape")
print(train_x_emb.shape)
test_x_emb=w2v_data_extraction(test_x)

#model architecture
def model_arch():
    main_input = Input(shape=(5,50), name='main_input')
    lstm_out = LSTM(32)(main_input)
    attention = Dense(1, activation='tanh')(lstm_out)
    attention = Activation('softmax')(attention)
    attention = RepeatVector(32)(attention)
    attention = Permute([2, 1])(attention)
    attention = Flatten()(attention)
    attention_mul = Multiply()([lstm_out, attention])
    main_output = (Dense(total_vocab, activation='softmax', name='main_output')(attention_mul))
    model = Model(inputs=main_input, outputs=main_output)
    model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='ADAM')
    return model


#represent output as one-hot encoded
def one_hot(seq,total_vocab):
    seq_one_hot=np.zeros([len(seq),total_vocab])
    for i in range(0,len(seq)):
        seq_one_hot[i][seq[i]]=1
    return seq_one_hot

#fit the model on our data
def model_fit(model,train_x,train_y,total_vocab):
    print("train_x.shape")
    print(train_x.shape)
    print("train_y")
    print(train_y.shape)
    train_y=one_hot(train_y,total_vocab)
    print("after one_hot train_y,total_vocab")
    print(train_y.shape)
    print(total_vocab)

    print("model is building")
    model.fit(batch_size=64,epochs=10,x=train_x,y=train_y)
    print("model building done")
    model.save('keras_model.h5')
    
    return model


total_vocab=12102
model = model_arch()
model=model_fit(model,train_x_emb,train_y,total_vocab)

# Hit rate at 1 on test data
def hit_rate_at_1(prediction,actual):
    return accuracy_score(prediction,actual)


# Hit rata at 5 on test data
def hit_rate_at_5(pred,actual):
    predics = []
    for i in range(0, len(pred)):
        predics.append(np.argsort(pred[i])[-5:])
    count = 0
    for i in range(0, len(predics)):
        if actual[i] in predics[i]:
            count = count + 1

    return count/len(actual)


# Hit rate at 10 on test data
def hit_rate_at_10(pred, actual):
    predics = []
    for i in range(0, len(pred)):
        predics.append(np.argsort(pred[i])[-10:])
    count = 0
    for i in range(0, len(predics)):
        if actual[i] in predics[i]:
            count = count + 1
    return count /len(actual)
    
# Prediction on test data
def model_predict(model,test_x,test_seq):
    pred=model.predict(x=test_x)
    preddy=np.argmax(a=pred,axis=1)

    print(hit_rate_at_1(preddy,test_seq))
    print(hit_rate_at_5(pred, test_seq))
    print(hit_rate_at_10(pred, test_seq))

#predict on test data
model_predict(model,test_x,test_y)
print("Done")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        0.3680197 , -0.2968575 ,  0.23394647,  0.24941292,  0.54502356,
        0.28388718, -0.41580448,  0.47924513,  0.35727534,  0.07948952],
      dtype=float32), array([-0.21010135,  0.26660705, -0.22901982,  0.39425156,  0.17946853,
        0.47275582,  0.11739594, -0.2903617 ,  0.23361132,  0.27668524,
       -0.36663324, -0.12605081, -0.31255257,  0.01251998,  0.34809384,
       -0.09433711, -0.3129253 ,  0.00512801, -0.00095393, -0.38051113,
        0.3680197 , -0.2968575 ,  0.23394647,  0.24941292,  0.54502356,
        0.28388718, -0.41580448,  0.47924513,  0.35727534,  0.07948952],
      dtype=float32)], [array([-0.21010135,  0.26660705, -0.22901982,  0.39425156,  0.17946853,
        0.47275582,  0.11739594, -0.2903617 ,  0.23361132,  0.27668524,
       -0.36663324, -0.12605081, -0.31255257,  0.01251998,  0.34809384,
       -0.09433711, -0.3129253 ,  0.00512801, -0.00095393, -0.38051113,
        0.3680197 , -0.



train_x.shape
(148, 5, 30)
train_y
(148,)
after one_hot train_y,total_vocab
(148, 12102)
12102
model is building


ValueError: ignored

In [6]:
!pip list

Package                       Version
----------------------------- ------------------------------
absl-py                       1.1.0
alabaster                     0.7.12
albumentations                0.1.12
altair                        4.2.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arviz                         0.12.1
astor                         0.8.1
astropy                       4.3.1
astunparse                    1.6.3
atari-py                      0.2.9
atomicwrites                  1.4.0
attrs                         21.4.0
audioread                     2.1.9
autograd                      1.4
Babel                         2.10.2
backcall                      0.2.0
backports.tempfile            1.0
backports.weakref             1.0.post1
beautifulsoup4                4.6.3
bleach                        5.0.0
blis                          0.7.7
bokeh                         2.3.3
branca                        

In [1]:
import numpy as np
a = np.array([3, 1, 2, 4, 6, 1])
print(np.argmax(a))

4


In [2]:
import numpy as np
a = np.array([[1, 5, 5, 2],
        [9, 6, 2, 8],
        [3, 7, 9, 1]])
print(np.argmax(a, axis=0))

[1 2 2 1]


In [3]:
import numpy as np
a = np.array([[1, 5, 5, 2],
        [9, 6, 2, 8],
        [3, 7, 9, 1]])
print(np.argmax(a, axis=1))

[1 0 2]
