Data processing

In [2]:
from itertools import chain 
import re
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
def get_stopwords():
    """Load the stop-word list from ``./data/stopwords.txt``.

    The file is read as UTF-8, split into lines, and de-duplicated.

    Returns:
        list: The distinct lines of the file, in arbitrary order.
    """
    with open('./data/stopwords.txt', encoding='utf-8') as handle:
        unique_lines = set(handle.read().splitlines())
    return list(unique_lines)

def get_punctuations():
    """Load the punctuation list from ``./data/punctuations.txt``.

    The file is read as UTF-8, split into lines, and de-duplicated.

    Returns:
        list: The distinct lines of the file, in arbitrary order.
    """
    with open('./data/punctuations.txt', encoding='utf-8') as handle:
        unique_lines = set(handle.read().splitlines())
    return list(unique_lines)

# Stop words / punctuation marks are loaded on first use and reused for every
# review; re-running this cell resets the cache.
_TEXT_FILTER_CACHE = {}


def _load_text_filters():
    """Return ``(stop_words, punctuations)``, reading the files only once."""
    if not _TEXT_FILTER_CACHE:
        # A frozenset gives O(1) membership tests during stop-word removal.
        _TEXT_FILTER_CACHE['stop_words'] = frozenset(get_stopwords())
        # Drop empty entries (e.g. a blank line in the file): replacing ''
        # would insert a space between every character of the text.
        _TEXT_FILTER_CACHE['punctuations'] = tuple(
            p for p in get_punctuations() if p)
    return _TEXT_FILTER_CACHE['stop_words'], _TEXT_FILTER_CACHE['punctuations']


def clean_text(text): # Clean review text
    """Normalise one review and return its tokens without stop words.

    Steps: replace every non-ASCII-letter character with a space, collapse
    runs of whitespace, strip and lower-case, blank out any configured
    punctuation strings, tokenize, and drop stop words.

    The contraction / punctuation substitutions that used to follow the first
    step were removed: once only letters and spaces remain they can never
    match, so the result is unchanged.

    Args:
        text (str): Raw review text.

    Returns:
        list: Lower-cased tokens with stop words removed.
    """
    text = re.sub(r"[^A-Za-z]", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.strip().lower()
    stop_words, punctuations = _load_text_filters()
    for p in punctuations:
        text = text.replace(p, ' ')  # Use spaces instead of punctuation marks
    word_list = WordPunctTokenizer().tokenize(text)
    return [word for word in word_list if word not in stop_words]
   
def user_item_reviews(x):
    """Attach the first 50 review words of the row's user and item.

    Looks up ``x["userID"]`` in the module-level ``user_reviews`` frame and
    ``x["itemID"]`` in the module-level ``item_reviews`` frame, flattens the
    token lists found there, and stores the first 50 tokens (space-joined)
    under ``x["user_reviews"]`` and ``x["item_reviews"]``.

    Args:
        x: A row-like mapping with ``"userID"`` and ``"itemID"`` entries.

    Returns:
        The same ``x`` object, with the two text fields added.
    """
    def flatten_words(rows):
        # rows -> cells (each a token list) -> individual tokens
        cells = chain.from_iterable(rows)
        return list(chain.from_iterable(cells))

    user_rows = user_reviews.loc[x["userID"]].values.tolist()
    item_rows = item_reviews.loc[x["itemID"]].values.tolist()
    user_words = flatten_words(user_rows)
    item_words = flatten_words(item_rows)
    x["user_reviews"] = " ".join(user_words[:50])
    x["item_reviews"] = " ".join(item_words[:50])
    return x

# Load the raw table and give its four columns fixed names.
qy_data_o = pd.read_csv('./data/Home_and_Kitchen.csv')
qy_data_o.columns = ['userID','itemID','ratings','reviews']

# Keep only rows whose review is a non-empty string (removes missing text).
has_text = qy_data_o['reviews'].map(
    lambda r: isinstance(r, str) and len(r) > 0).astype(bool)
qy_data_p = qy_data_o[has_text].copy()

# Review counts per user / item, indexed by the original ID.
# groupby(...).size() is used without as_index=False: with that flag recent
# pandas returns a DataFrame whose index is 0..n-1 instead of the IDs, which
# would make the ID lookups below fail with KeyError.
user_num_count = qy_data_p.groupby('userID').size()
item_num_count = qy_data_p.groupby('itemID').size()

user_num_unique = user_num_count.index#get userID
item_num_unique = item_num_count.index#get itemID
# Renumber IDs densely from 0 to n-1 and keep the mappings.
user_id_dict = {uid: i for i, uid in enumerate(user_num_unique)}
item_id_dict = {iid: i for i, iid in enumerate(item_num_unique)}
qy_data_p['userID'] = qy_data_p['userID'].map(user_id_dict)
qy_data_p['itemID'] = qy_data_p['itemID'].map(item_id_dict)
qy_data_p['reviews'] = qy_data_p['reviews'].apply(clean_text)#clean reviews

# Lookup tables of token lists keyed by (userID, itemID) and (itemID, userID).
# A sorted MultiIndex replaces pivot_table(aggfunc=lambda x: x), which is not
# a real aggregation and breaks on duplicate (user, item) pairs.
user_reviews = qy_data_p.set_index(["userID", "itemID"])[["reviews"]].sort_index()
item_reviews = qy_data_p.set_index(["itemID", "userID"])[["reviews"]].sort_index()

# Ratings table without the raw review column; user_item_reviews then adds
# the concatenated user-level and item-level review text to every row.
user_item_rating = qy_data_p[['userID', 'itemID', 'ratings']]
qy_data_cleaned = user_item_rating.apply(user_item_reviews, axis=1)
qy_data_cleaned.to_csv("./data/qy_data_cleaned.csv", index=False)#save


Model

In [2]:
from keras.models import Model
from keras.layers import Conv1D, MaxPooling1D, Flatten,Dropout
from keras.layers import Input, Dense
from keras.layers.merge import Concatenate
from keras.utils import to_categorical
from keras.layers.normalization import BatchNormalization
import keras

class CNNRecommend():
    """Rating classifier built from two parallel 1-D CNN branches.

    One branch consumes the embedded user-review sequence, the other the
    embedded item-review sequence; their outputs are concatenated and fed to
    a softmax layer with ``num_classes`` units.
    """

    def __init__(self,embedding_size,user_seq_len,item_seq_len,num_classes=6):
        """Create both CNN branches and the softmax head.

        Args:
            embedding_size: Length of each word vector.
            user_seq_len: Number of word vectors per user-review input.
            item_seq_len: Number of word vectors per item-review input.
            num_classes: Width of the softmax output and of the one-hot
                labels. Defaults to 6 (presumably integer ratings 0-5, with
                column 0 unused for 1-5 star data -- confirm against the data).
        """
        self.embedding_size = embedding_size
        self.num_classes = num_classes
        self.user_input_layer, self.user_model = self.build_cnn_model(user_seq_len)#user cnn
        self.item_input_layer, self.item_model = self.build_cnn_model(item_seq_len)#item CNN
        self.combine = Concatenate()([self.user_model, self.item_model])#CNN stitching users and items
        self.model_out = Dense(num_classes,activation='softmax')(self.combine)#The last layer,predict classification

    def build_cnn_model(self, max_seq_len):
        """Build one CNN branch.

        Args:
            max_seq_len: Number of time steps in the branch input.

        Returns:
            tuple: ``(input_layer, output_tensor)`` of the branch.
        """
        input_layer = Input(shape=(max_seq_len, self.embedding_size))
        model = Conv1D(128, 5, padding='same',activation="relu")(input_layer)
        model = MaxPooling1D(3, 3, padding='same')(model)
        # NOTE(review): this second convolution has no activation (linear);
        # left as-is to keep the trained architecture unchanged.
        model = Conv1D(64, 5, padding='same')(model)
        model = Flatten()(model)
        model = Dropout(0.5)(model)
        model = BatchNormalization()(model)
        model = Dense(128,activation="relu")(model)
        return input_layer, model

    def create_cnn_model(self):
        """Assemble the two-input model and compile it with Adam."""
        output = self.model_out
        self.model = Model(inputs=[self.user_input_layer, self.item_input_layer], outputs=[output])
        adam = keras.optimizers.Adam(lr = 0.005, beta_1=0.95, beta_2=0.999,epsilon=1e-08)# optimizer
        self.model.compile(optimizer=adam, loss='categorical_crossentropy',metrics=['accuracy'])

    def train(self,user_reviews,item_reviews,ratings_data, epochs=5):
        """(Re)compile the model and fit it.

        Args:
            user_reviews: Embedded user-review inputs.
            item_reviews: Embedded item-review inputs.
            ratings_data: Integer rating labels.
            epochs: Number of training epochs.
        """
        self.create_cnn_model()
        self.train_inputs = [user_reviews, item_reviews]
        # Fix the one-hot width so it always matches the softmax layer, even
        # when the highest rating is absent from this particular array.
        self.train_outputs = to_categorical(ratings_data, num_classes=self.num_classes)

        self.history = self.model.fit(self.train_inputs,
                                      self.train_outputs,
                                      validation_split=0.1,
                                      batch_size=32,
                                      epochs=epochs)

    def evaluate(self,user_reviews,item_reviews,ratings_data):
        """Evaluate the model, print the accuracy, and return the scores.

        Args:
            user_reviews: Embedded user-review inputs.
            item_reviews: Embedded item-review inputs.
            ratings_data: Integer rating labels.

        Returns:
            The value returned by ``self.model.evaluate`` (element 1 is
            printed as the accuracy).
        """
        self.test_inputs = [user_reviews, item_reviews]
        self.test_outputs = to_categorical(ratings_data, num_classes=self.num_classes)
        scores = self.model.evaluate(self.test_inputs, self.test_outputs, verbose=0)
        print('test accuracy:%.2f%%' % ( scores[1] * 100))
        return scores



Using TensorFlow backend.


In [3]:
import pandas as pd
import numpy as np
from model import CNNRecommend
from sklearn.model_selection import train_test_split

#Convert each row's review text into a fixed-length list of GloVe word vectors
def glove_and_pad(item_seq_len, user_seq_len, pad_value, glove_map):
    """Build a row transformer that embeds and pads both review fields.

    Args:
        item_seq_len: Output length for ``row["item_reviews"]``.
        user_seq_len: Output length for ``row["user_reviews"]``.
        pad_value: Vector used for unknown words and for padding.
        glove_map: Mapping from word to its vector.

    Returns:
        A function ``embed(row)`` that replaces ``row["user_reviews"]`` and
        ``row["item_reviews"]`` with lists of exactly ``user_seq_len`` /
        ``item_seq_len`` vectors and returns the row.
    """
    def vectorize(text, seq_len):
        # Missing text (None / float NaN, e.g. an empty CSV cell) has no
        # words; str(NaN) would otherwise be embedded as the token "nan".
        if text is None or (isinstance(text, float) and text != text):
            words = []
        else:
            words = str(text).split()[:seq_len]
        vectors = [glove_map.get(word, pad_value) for word in words]
        return vectors + [pad_value] * (seq_len - len(vectors))

    def embed(row):
        row["user_reviews"] = vectorize(row["user_reviews"], user_seq_len)
        row["item_reviews"] = vectorize(row["item_reviews"], item_seq_len)
        return row
    return embed
#GloVe word vector mapping, dictionary type
def glove_map(path='./data/glove.6B.50d.txt'):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace; the first field is the word
    and the remaining fields become a float32 array. Lines are processed one
    at a time so the file is never held in memory twice, and blank lines are
    skipped instead of raising ``IndexError``.

    Args:
        path: Location of the GloVe file (UTF-8 text).

    Returns:
        dict: Word -> ``numpy.ndarray`` of dtype float32.
    """
    vectors = {}
    with open(path, encoding='utf-8') as fs:
        for line in fs:
            fields = line.split()
            if not fields:
                continue
            vectors[fields[0]] = np.asarray(fields[1:], dtype="float32")
    return vectors
qy_data_cleaned = pd.read_csv("./data/qy_data_cleaned.csv")
# Token count per row. Empty reviews are read back from CSV as NaN; they are
# counted as zero tokens rather than as the single string "nan".
user_seq_sizes = qy_data_cleaned.loc[:, "user_reviews"].fillna("").astype(str).str.split().str.len()
item_seq_sizes = qy_data_cleaned.loc[:, "item_reviews"].fillna("").astype(str).str.split().str.len()
user_ptile = 40
item_ptile = 15
emb_size = 50
# Sequence lengths are taken as percentiles of the observed token counts;
# max(1, ...) keeps the model input from collapsing to zero time steps.
user_seq_len = max(1, int(np.percentile(user_seq_sizes, user_ptile)))#Get the length of the user review text sequence
item_seq_len = max(1, int(np.percentile(item_seq_sizes, item_ptile)))#Get the length of the item review text sequence

g2v_map = glove_map()#gloVe word vector dictionary mapping
glove_fn = glove_and_pad(item_seq_len, user_seq_len, np.array([0.0] * emb_size), g2v_map)#glove
# Split the frame loaded above (the previous name, hak_data_cleaned, is not
# defined anywhere in this notebook and fails on a fresh kernel).
train_data,test_data = train_test_split(qy_data_cleaned,test_size = 0.2,random_state=42)
train_data_p = train_data.apply(glove_fn, axis=1)

train_rating = train_data_p.loc[:, "ratings"].values#Y value of training set
train_user_reviews = np.array(list(train_data_p.loc[:, "user_reviews"]))#The user_reviews feature matrix of the training set
train_item_reviews = np.array(list(train_data_p.loc[:, "item_reviews"]))#The item_reviews feature matrix of the training set

test_data_p= test_data.apply(glove_fn, axis=1)#test
test_rating = test_data_p.loc[:, "ratings"].values#test ratings
test_user_reviews = np.array(list(test_data_p.loc[:, "user_reviews"]))#test user_reviews
test_item_reviews = np.array(list(test_data_p.loc[:, "item_reviews"]))#test item_reviews


# emb_size is passed instead of a second literal 50 so the model input width
# always matches the padding vectors. train() builds and compiles the model
# itself, so no separate create_cnn_model() call is needed.
cnn = CNNRecommend(emb_size, user_seq_len, item_seq_len)#Initialize cnn model
cnn.train(train_user_reviews,train_item_reviews, train_rating, epochs=5)#train
cnn.evaluate(test_user_reviews,test_item_reviews,test_rating)#evaluate






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 35993 samples, validate on 4000 samples
Epoch 1/5





Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
test accuracy:70.92%
