<a href="https://colab.research.google.com/github/Ying-Yuan07/TensorFlowLearn/blob/main/app%E4%BD%BF%E7%94%A8%E6%8E%A8%E8%8D%90Attention_on_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%tensorflow_version 1.x
!pip install keras==2.1.6

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from keras.layers import Input,Dense,LSTM,Activation,RepeatVector,Permute,Flatten,Multiply
from keras.models import Sequential
from keras.models import load_model,Model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
# Fix NumPy's global random seed so the np.random draws made later in this
# script are reproducible.
np.random.seed(123)

# Read the usage log into a pandas DataFrame and keep only the columns used below.
df=pd.read_csv('./sample_data/username_5.csv')
df=df[['week', 'hour_part','app_name']]

# Rename the columns (a no-op here: the names equal the ones just selected).
df.columns=['week', 'hour_part','app_name']
# The bare string below is a disabled sort step; as a string literal it has no effect.
"""
df=df.sort_values(['reviewTime'],ascending=[True])
"""
# Divide the data into train and test sets.

# Train/test ratio is 7:3: the first 30% of the rows become the test set and
# the remaining 70% become the training set.
test_len=int(df.shape[0]*0.3)
train_data=df[test_len:]
test_data=df[:test_len]
print("train_data.shape, test_data.shape")
print(train_data.shape, test_data.shape)
# Renumber the rows of each split from 0 (reset_index() is called without
# drop=True, so pandas keeps the old row labels in an extra "index" column).
train_data = train_data.reset_index()
test_data = test_data.reset_index()
    
# Build the app sequences for the train and test splits.
# Rows are grouped by "hour_part"; the "app_name" values of each group are
# collected into one list, and .values returns all of those lists as one array.
item_seq_train=train_data.groupby("hour_part")['app_name'].apply(list).values
item_seq_test=test_data.groupby("hour_part")['app_name'].apply(list).values
print("item_seq_train.shape")
print(item_seq_train.shape)
print("item_seq_test.shape")
print(item_seq_test.shape)
# The minimum sequence length is 6 (5 history items + 1 target). Shorter
# sequences get dummy ids prepended so they still yield one training window.
def min_six_len_seq(dictList, min_len=6, pad_token='app0'):
    """Left-pad, in place, every sequence shorter than ``min_len``.

    Args:
        dictList: Mutable, index-assignable collection of lists (a plain list
            of lists or an object array of lists).
        min_len: Minimum length every sequence must have after padding.
        pad_token: Dummy item prepended to short sequences.

    Returns:
        The same ``dictList`` object that was passed in. Short entries are
        replaced in place, so the caller's original variable sees the padding
        as well.
    """
    for idx, seq in enumerate(dictList):
        missing = min_len - len(seq)
        if missing > 0:
            dictList[idx] = [pad_token] * missing + seq
    return dictList

# Pad the train and test sequences to a minimum length of 6.
# min_six_len_seq() edits its argument in place and returns that same object,
# so item_train / item_test refer to the same arrays as item_seq_train /
# item_seq_test.
item_train = min_six_len_seq(item_seq_train)
item_test = min_six_len_seq(item_seq_test)

# Train a word2vec model on the training sequences to obtain item embeddings.
def word2vec_model(train_data):
    """Fit and return a gensim ``Word2Vec`` model on ``train_data``.

    Hyper-parameters passed to ``Word2Vec`` (gensim 3.x keyword names):
        size=10      dimensionality of each item vector
        window=3     maximum context distance between two items
        min_count=1  minimum frequency for an item to be kept in the vocabulary
        iter=5       number of training iterations over the corpus

    Args:
        train_data: Iterable of item sequences (each a list of item names).

    Returns:
        The trained ``Word2Vec`` model.
    """
    hyper_params = dict(size=10, window=3, min_count=1, iter=5)
    return Word2Vec(train_data, **hyper_params)


# Train the word2vec model on the padded training sequences and save it to
# the file 'word2vec_model' in the working directory.
wv_model=word2vec_model(item_train)
wv_model.save('word2vec_model')

# Find the full vocabulary learned by word2vec.
# Iterating the vocab mapping yields its keys (the item names) in the
# mapping's own order, so no explicit key/value loop is needed.
entire_products = list(wv_model.wv.vocab)
# Persist the vocabulary so it can be reloaded without retraining.
np.save('./entire_products.npy',entire_products)
print("len(entire_products)")
print(len(entire_products))
print("entire_products")
print(entire_products)

# Slide a window over every sequence: the first `win_size` items are the model
# input and the item right after them is the prediction target
# (by default: 5 history items predict the next one).
def input_sequences(new_list, win_size=5, vocab=None):
    """Cut sequences into (history window, next item) training pairs.

    Args:
        new_list: Iterable of item sequences.
        win_size: Number of history items in each input window.
        vocab: Iterable of known items; a pair is kept only when its target is
            in it. Defaults to the module-level ``entire_products``.

    Returns:
        Tuple ``(input_seq, target)`` of two parallel lists: the history
        windows and the item that follows each window.
    """
    if vocab is None:
        vocab = entire_products
    # Build the lookup once; membership is then O(1) per window.
    known_items = set(vocab)

    input_seq = []
    target = []
    for seq in new_list:
        # A window starting at `start` needs an item at start + win_size.
        for start in range(len(seq) - win_size):
            next_item = seq[start + win_size]
            if next_item in known_items:
                input_seq.append(seq[start:start + win_size])
                target.append(next_item)
    return input_seq, target


def labelencoder_fit():
    """Return a ``LabelEncoder`` fitted on the module-level ``entire_products``."""
    # LabelEncoder.fit() returns the fitted encoder itself.
    return LabelEncoder().fit(entire_products)

# Encode the targets: map each target item name to the integer id the fitted
# encoder assigned to it.
def num_products(product_label,target):
    """Return ``product_label.transform(target)``.

    Args:
        product_label: Fitted encoder exposing ``transform``.
        target: Sequence of item names to encode.
    """
    return product_label.transform(target)

def reverse_num_products(product_label,target_num):
    """Return ``product_label.inverse_transform(target_num)``.

    Args:
        product_label: Fitted encoder exposing ``inverse_transform``.
        target_num: Sequence of integer ids to decode back to item names.
    """
    return product_label.inverse_transform(target_num)

# Create the train and test input and target sequences.
# train_x: list of 5-item history windows; train_y: integer id that
# product_label assigns to the target item of each window.
product_label=labelencoder_fit()
train_x,target_train=input_sequences(item_seq_train)
train_y=num_products(product_label,target_train)

# Same construction for the test split.
test_x,target_test=input_sequences(item_seq_test)
test_y=num_products(product_label,target_test)
    
# Represent each item by its word2vec embedding; an item missing from the
# word2vec vocabulary (e.g. one first seen in the test set) is represented by
# one fixed random vector of length 10.
unknown_item_id=np.random.random((10,))
def w2v_data_extraction(new_list):
    """Replace every item of every sequence with its embedding vector.

    Args:
        new_list: Sequence of item-name sequences.

    Returns:
        ``np.asarray`` of the nested per-sequence lists of embeddings.
    """
    def _embed(item):
        # Unknown items fall back to the shared random vector.
        try:
            return wv_model.wv[item]
        except KeyError:
            return unknown_item_id

    return np.asarray([[_embed(item) for item in seq] for seq in new_list])

# Turn every history window of the train and test sets into embedding vectors.
train_x_emb=w2v_data_extraction(train_x)
test_x_emb=w2v_data_extraction(test_x)
print("train_x_emb.shape")
print(train_x_emb.shape)
#model architecture
def model_arch():
    """Build and compile the attention-over-LSTM next-item classifier.

    Input is a window of 5 items, each a 10-dimensional embedding. The LSTM
    emits one 32-unit state per timestep; a softmax over the 5 timesteps gives
    the attention weights that rescale those states before the final softmax
    classifier.

    An earlier version scored only the last LSTM state with ``Dense(1)`` and
    applied softmax to that single value. Softmax over one element is always
    1.0, so the attention branch multiplied the state by 1 and had no effect.

    Reads the module-level ``total_vocab`` for the number of output classes.

    Returns:
        A compiled Keras ``Model`` mapping (batch, 5, 10) inputs to
        (batch, total_vocab) class probabilities.
    """
    main_input = Input(shape=(5,10), name='main_input')
    # Keep the state of every timestep so there is something to attend over.
    lstm_out = LSTM(32, return_sequences=True)(main_input)   # (batch, 5, 32)
    # One score per timestep, normalised across the 5 timesteps.
    attention = Dense(1, activation='tanh')(lstm_out)        # (batch, 5, 1)
    attention = Flatten()(attention)                         # (batch, 5)
    attention = Activation('softmax')(attention)             # (batch, 5)
    # Broadcast each timestep weight across the 32 LSTM units.
    attention = RepeatVector(32)(attention)                  # (batch, 32, 5)
    attention = Permute([2, 1])(attention)                   # (batch, 5, 32)
    attention_mul = Multiply()([lstm_out, attention])        # (batch, 5, 32)
    attention_mul = Flatten()(attention_mul)                 # (batch, 160)
    main_output = (Dense(total_vocab, activation='softmax', name='main_output')(attention_mul))
    model = Model(inputs=main_input, outputs=main_output)
    model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='ADAM')
    return model


# Represent the integer class ids as one-hot rows.
def one_hot(seq,total_vocab):
    """Return a ``(len(seq), total_vocab)`` array with a 1 at ``[i, seq[i]]``.

    Args:
        seq: Sequence of integer class ids.
        total_vocab: Number of classes (width of each row).
    """
    encoded = np.zeros([len(seq), total_vocab])
    for row, class_id in enumerate(seq):
        encoded[row][class_id] = 1
    return encoded

# Fit the model on the training data and save it.
def model_fit(model,train_x,train_y,total_vocab):
    """Train ``model`` for 50 epochs (batch size 64) and save it to 'keras_model.h5'.

    Args:
        model: Compiled Keras model.
        train_x: Input array; its ``shape`` is printed and it is passed to ``fit``.
        train_y: Integer class ids; one-hot encoded here before fitting.
        total_vocab: Number of classes used for the one-hot encoding.

    Returns:
        The same ``model`` after training.
    """
    print("train_x.shape")
    print(train_x.shape)
    print("train_y shape")
    print(train_y.shape)

    # The model is compiled with categorical cross-entropy, so targets are one-hot rows.
    targets = one_hot(train_y, total_vocab)
    print("after one_hot train_y,total_vocab")
    print(targets.shape)

    print("model is building")
    model.fit(x=train_x, y=targets, batch_size=64, epochs=50)
    print("model building done")

    model.save('keras_model.h5')
    return model


# The number of output classes equals the vocabulary size. total_vocab must be
# defined before model_arch() runs because model_arch() reads it as a global.
total_vocab=len(entire_products)
model = model_arch()
model=model_fit(model,train_x_emb,train_y,total_vocab)

# Hit rate at 1 on test data
def hit_rate_at_1(prediction,actual):
    """Return the fraction of samples whose top-1 prediction equals the target.

    Args:
        prediction: Predicted class id per sample.
        actual: True class id per sample.
    """
    # accuracy_score's signature is (y_true, y_pred); pass by keyword so the
    # roles are explicit.
    return accuracy_score(y_true=actual, y_pred=prediction)


def _hit_rate_at_k(pred, actual, k):
    """Return the share of samples whose true label is among the k best scores.

    Args:
        pred: One row of class scores per sample; column j of row i is the
            score of class j for sample i.
        actual: True class id per sample.
        k: Number of highest-scoring classes that count as a hit.

    Returns:
        hits / len(actual), or 0.0 when ``actual`` is empty.
    """
    total = len(actual)
    if total == 0:
        # No samples: report 0.0 instead of dividing by zero.
        return 0.0
    hits = 0
    for scores, label in zip(pred, actual):
        # argsort is ascending, so the last k indices are the k best classes.
        if label in np.argsort(scores)[-k:]:
            hits += 1
    return hits / total


# Hit rate at 5 on test data: fraction of samples whose true class is among
# the 5 classes with the highest predicted probability.
def hit_rate_at_5(pred,actual):
    """Return the hit rate @5 of ``pred`` (score rows) against ``actual`` (class ids)."""
    return _hit_rate_at_k(pred, actual, 5)


# Hit rate at 10 on test data
def hit_rate_at_10(pred, actual):
    """Return the hit rate @10 of ``pred`` (score rows) against ``actual`` (class ids)."""
    return _hit_rate_at_k(pred, actual, 10)
    
# Prediction on test data
def model_predict(model,test_x,test_seq):
    """Predict on ``test_x`` and print hit rate @1, @5 and @10 against ``test_seq``.

    Args:
        model: Trained model exposing ``predict``.
        test_x: Input array passed to ``model.predict``.
        test_seq: True class id per sample.

    Returns:
        None; all results are printed.
    """
    class_scores = model.predict(x=test_x)
    # Highest-scoring class per sample, used for hit rate @1.
    best_class = np.argmax(class_scores, axis=1)

    print("pred.shape")
    print(class_scores.shape)
    print("preddy.shape")
    print(best_class.shape)

    print("hit_rate_at_1(preddy,test_seq)")
    print(hit_rate_at_1(best_class, test_seq))
    # @5 and @10 need the full score rows, not just the arg-max.
    print("hit_rate_at_5(preddy,test_seq)")
    print(hit_rate_at_5(class_scores, test_seq))
    print("hit_rate_at_10(preddy,test_seq)")
    print(hit_rate_at_10(class_scores, test_seq))

# Predict on the test data and print hit rate @1, @5 and @10.
model_predict(model,test_x_emb,test_y)
print("Done")

# Try the trained model on the first 10 rows of the dataset.
simulate_data = df[:10]
print("history app sequence:")
print(simulate_data['app_name'])

# input_sequences() expects a collection of app-name lists. Passing the
# 'app_name' Series directly made it treat every app name as a sequence of
# characters, so the rows are grouped into lists and padded first.
item_seq_simulate=simulate_data.groupby("hour_part")['app_name'].apply(list).values
item_simulate = min_six_len_seq(item_seq_simulate)

simulate_x,target_simulate=input_sequences(item_simulate)
print("simulate_x")
print(simulate_x)
print("target_simulate")
print(target_simulate)
simulate_y=num_products(product_label,target_simulate)
print(list(reverse_num_products(product_label,simulate_y)))

simulate_x_emb=w2v_data_extraction(simulate_x)
print("simulate_x_emb.shape")
print(simulate_x_emb.shape)

simulate_pred=model.predict(x=simulate_x_emb)

def simulate_hit_rate_at_5(pred, actual):
    """Print every top-5 hit and return the hit rate @5.

    Class ids are decoded to app names through ``product_label`` only. The
    ids come from the label encoder, so they must not be used as positions in
    ``entire_products``, whose order is the word2vec vocabulary order.

    Args:
        pred: One row of class scores per sample.
        actual: True class id per sample.

    Returns:
        hits / len(actual), or 0.0 when ``actual`` is empty.
    """
    # argsort is ascending, so the last 5 indices are the 5 best classes.
    top5_per_sample = [np.argsort(scores)[-5:] for scores in pred]
    count = 0
    for label, candidates in zip(actual, top5_per_sample):
        if label in candidates:
            count += 1
            print("target app is:")
            print(list(reverse_num_products(product_label,[label])))
            print("predict top5 app is:")
            for index in candidates:
                print(list(reverse_num_products(product_label,[index])))
    print("hit")
    print(count)
    print("sample numbers")
    print(len(actual))
    if len(actual) == 0:
        # No samples: report 0.0 instead of dividing by zero.
        return 0.0
    return count / len(actual)

# Print the hit report for the simulated samples (the returned rate is not stored).
simulate_hit_rate_at_5(simulate_pred,simulate_y)



In [23]:
# Simulate usage on a phone:
# the input is 5 previously used apps, the output is 1 target app.
topk = 3 # k used when computing recall@k
simulate_data = df[:6]
#simulate_data = simulate_data['app_name']
print("history app sequence:")
print(simulate_data)

# Group the rows by "hour_part" into app-name lists and pad each list to
# length 6, the same preparation applied to the training sequences.
item_seq_simulate=simulate_data.groupby("hour_part")['app_name'].apply(list).values
item_simulate = min_six_len_seq(item_seq_simulate)

simulate_x,target_simulate=input_sequences(item_simulate)
print("simulate_x")
print(simulate_x)
print("target_simulate")
print(target_simulate)
# Encode the target names as class ids, then decode them again as a round-trip check.
simulate_y=num_products(product_label,target_simulate)
print("target_simulate num")
print(simulate_y)
print("reverse target_simulate num")
print(list(reverse_num_products(product_label,simulate_y)))

simulate_x_emb=w2v_data_extraction(simulate_x)
print("simulate_x_emb.shape")
print(simulate_x_emb.shape)

simulate_pred=model.predict(x=simulate_x_emb)

def simulate_hit_rate_at_topk(topk, pred, actual):
    """Print every top-k hit and return the hit rate @k.

    Args:
        topk: Number of highest-scoring classes that count as a hit. Values
            <= 0 select no class, so nothing can hit.
        pred: One row of class scores per sample.
        actual: True class id per sample.

    Returns:
        hits / len(actual), or 0.0 when ``actual`` is empty.
    """
    top_per_sample = []
    for scores in pred:
        ranked = np.argsort(scores)  # ascending: the best classes come last
        # Slice from an explicit start index; ranked[-topk:] would return the
        # whole array for topk == 0.
        top_per_sample.append(ranked[max(len(ranked) - topk, 0):])

    count = 0
    for label, candidates in zip(actual, top_per_sample):
        if label in candidates:
            count += 1
            print("target app is:")
            print(list(reverse_num_products(product_label,[label])))
            # Report the k actually used instead of a hard-coded "top5".
            print("predict top%d app is:" % topk)
            for index in candidates:
                print(list(reverse_num_products(product_label,[index])))
    print("hit")
    print(count)
    print("sample numbers")
    print(len(actual))
    if len(actual) == 0:
        # No samples: report 0.0 instead of dividing by zero.
        return 0.0
    return count / len(actual)

recall_k = simulate_hit_rate_at_topk(topk,simulate_pred,simulate_y)
# Apply the format string with %: passing the values as extra print()
# arguments printed the raw "%d" / "%f" placeholders followed by the values.
print("recall@%d is %f" % (topk, recall_k))


history app sequence:
   week  hour_part       app_name
0     0          1          Gmail
1     0          1         Google
2     0          1      Instagram
3     0          1  Google Chrome
4     0          1  Google Chrome
5     0          1  Google Chrome
simulate_x
[['Gmail', 'Google', 'Instagram', 'Google Chrome', 'Google Chrome']]
target_simulate
['Google Chrome']
target_simulate num
[6]
reverse target_simulate num
['Google Chrome']
simulate_x_emb.shape
(1, 5, 10)
target app is:
['Google Chrome']
predict top5 app is:
['Facebook']
['Google']
['Google Chrome']
hit
1
sample numbers
1
recall@%d is %f 3 1.0


In [None]:
# Reload the vocabulary that was saved with np.save.
loadData = np.load('entire_products.npy')
print(loadData.shape)
print(loadData)

# Sanity check: print every position (and its value) where the reloaded array
# equals the in-memory vocabulary.
for i in range(0,len(entire_products)):
  if loadData[i] == entire_products[i]:
    print(i)
    print(loadData[i])

In [None]:
# Load the word2vec model that was saved after training.
w2v_model = Word2Vec.load('word2vec_model')
# One hand-written sample: 5 history apps and the app expected next.
mobile_test_x = ['Google Chrome','Yahoo Mail','Yahoo Mail','Google Chrome','Yahoo Mail']
mobile_test_y = ['Yahoo Mail']

# Fallback vector (length 10) for apps missing from the word2vec vocabulary.
unknown_item=np.random.random((10,))
def w2v_data_extraction(w2v_model,new_list):
    """Embed one flat sequence of app names with the given word2vec model.

    Args:
        w2v_model: Model whose ``wv`` attribute maps an app name to its vector
            and raises ``KeyError`` for unknown names.
        new_list: Sequence of app names.

    Returns:
        ``np.asarray`` of one embedding per name, in input order; unknown
        names are represented by ``unknown_item``.
    """
    vectors = w2v_model.wv  # use the model that was passed in, not a global
    w2v_data = []
    for name in new_list:
        try:
            embedding = vectors[name]
        except KeyError:
            embedding = unknown_item
        w2v_data.append(embedding)
    return np.asarray(w2v_data)
# Embed the hand-written history with the reloaded word2vec model.
mobile_test_x_emb = w2v_data_extraction(w2v_model,mobile_test_x)
print(mobile_test_x_emb)


In [None]:
#load lstm model
!pip install h5py==3.0.0
lstm_model = load_model('keras_model.h5')
# make predictions
yhat = lstm_model.predict(x = mobile_test_x_emb)
top5 = np.argsort(yhat[0])[-5:]
print(top5)

In [51]:
!pip list

Package                       Version
----------------------------- ------------------------------
absl-py                       1.1.0
alabaster                     0.7.12
albumentations                0.1.12
altair                        4.2.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arviz                         0.12.1
astor                         0.8.1
astropy                       4.3.1
astunparse                    1.6.3
atari-py                      0.2.9
atomicwrites                  1.4.1
attrs                         21.4.0
audioread                     2.1.9
autograd                      1.4
Babel                         2.10.3
backcall                      0.2.0
backports.tempfile            1.0
backports.weakref             1.0.post1
beautifulsoup4                4.6.3
bleach                        5.0.1
blis                          0.7.8
bokeh                         2.3.3
branca                        