In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time

  from ._conv import register_converters as _register_converters


In [2]:

def processData(fraction = 1):
    buys_name = ['SessionID','TimeStamp','productSku','Price','Quantity']
    df_buys = pd.read_csv('../resource/yoochoose-data/yoochoose-buys.dat',names=buys_name,dtype={'productSku':int,'SessionID':int})
    df_buys = df_buys[df_buys['Quantity']!=0]

    sessionWithbuy = df_buys['SessionID'].unique()
    sessionWithbuy = sessionWithbuy[0 : int (fraction * len(sessionWithbuy))]
    clicks_names = ['SessionID','TimeStamp','productSku','productCategory']
    df_clicks = pd.read_csv('../resource/yoochoose-data/yoochoose-clicks.dat',names=clicks_names,dtype={'productSku':int,'productCategory':str,'SessionID':int})

    df_clicks = getDFBySessions(sessionWithbuy, df_clicks, 'count')
    df_buys = getDFBySessions(sessionWithbuy, df_buys, 'buy')
    df_buys['buy'] = 1

    skus = df_clicks.index.unique()
    inputholder = pd.Series(np.zeros(skus.size), index=skus)
    return df_clicks, df_buys, sessionWithbuy, inputholder


def getDFBySessions(sessions, df, new_colum):
    df_clicks_count = df[df['SessionID'].isin(sessions)]
    df_clicks_count = df_clicks_count.groupby(['SessionID','productSku']).size().reset_index(name=new_colum)
    df_clicks_count.set_index('productSku',drop=True,inplace=True)
    return df_clicks_count


def get_batch(clicks, buys, sessions, min_batch = 1):
    total_sess = len(sessions)
    # min_batch = total_sess if min_batch ==None else min_batch
    for click, buy in zip(clicks.groupby('SessionID'), buys.groupby('SessionID')):
        if click[0] != buy[0]:
            raise ('sessionId error')

        df_x, df_y = click[1], buy[1]
        left_x, right_x = inputholder.align(df_x['count'],join='left',fill_value = 0)
        left_y, right_y = inputholder.align(df_y['buy'], join='left', fill_value=0)
        x, y = right_x.values, right_y.values
        index = x.astype(bool)
        y = y[index]
        yield (np.array([x]), index, y)

In [4]:
begin =  time.time()
df_clicks, df_buys, sessions, inputholder = processData()
print('number of session used:{}'.format(len(sessions)))

train_session, test_session = train_test_split(sessions,test_size=0.2)
clicks_train, clicks_test = df_clicks[df_clicks['SessionID'].isin(train_session)], df_clicks[df_clicks['SessionID'].isin(test_session)]
buys_train, buys_test = df_buys[df_buys['SessionID'].isin(train_session)], df_buys[df_buys['SessionID'].isin(test_session)]

train = list(get_batch(clicks_train, buys_train,train_session))
test = list(get_batch(clicks_test, buys_test,test_session,min_batch=None))

end = time.time()
print('total time for process data:{}'.format(end-begin))

number of session used:234300
total time for process data:4138.011507034302


In [5]:
dimension = 5
input = tf.placeholder(dtype = tf.float32,shape=[None,len(inputholder)])
input_mask = tf.placeholder(dtype = tf.bool,shape=[None])
product_w = tf.Variable(initial_value=tf.truncated_normal(shape=[len(inputholder),dimension]))

hidden1 = tf.contrib.layers.fully_connected(inputs = input, num_outputs=50)
hidden2 = tf.contrib.layers.fully_connected(inputs = hidden1, num_outputs=50)
hidden3 = tf.contrib.layers.fully_connected(inputs = hidden2, num_outputs=50)
w_2 = tf.contrib.layers.fully_connected(inputs = hidden1, num_outputs=dimension)
w_0 = tf.Variable(initial_value=0.0)
select_product_w = tf.boolean_mask(product_w, input_mask)
output = tf.reduce_sum(w_2*select_product_w,axis=1) + w_0

true_y = tf.placeholder(dtype = tf.float32,shape=[None])
cost = tf.nn.relu(output) - output * true_y + tf.log(1 + tf.exp(-tf.abs(output)))
cost = tf.reduce_mean(cost)
optimizer = tf.train.AdamOptimizer()
loss = optimizer.minimize(cost)

n_elements = tf.cast(tf.shape(true_y)[0],dtype=tf.float32)
match = tf.reduce_sum(tf.abs(tf.round(tf.sigmoid(output)) - true_y))
accuracy = 1 - (match / n_elements)

In [6]:
epoch = 10
init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
init.run()
begin = time.time()
for i in range(epoch):
    average_loss =[]
    for clicks, index, buy in train:
        _, final_loss = sess.run([loss, cost],feed_dict={input:clicks, true_y: buy, input_mask : index})
        average_loss.append(final_loss)
end = time.time()
print('total training time:{}'.format(end-begin))
print('accuracy on training sets:{}'.format(1 - np.average(average_loss)))

total training time:4965.745871067047
accuracy on training sets:0.5454939007759094


In [7]:
average_accuracy = []
for clicks, index, buy in test:
    sess_accuracy, y_hat = sess.run([accuracy, output],feed_dict={input:clicks, true_y: buy, input_mask : index})
    average_loss.append(sess_accuracy)
print('accuracy on test sets:{}'.format(np.average(average_loss)))
sess.close()

accuracy on test sets:0.5050144791603088
