In [None]:
from keras.layers import Input, Dense, Embedding, Flatten, Dropout, add, Activation
from keras.models import Model
from keras.regularizers import l2

In [None]:
import matplotlib.pyplot
import random 
import pickle
import copy
import numpy               as np
np.random.seed(7)
import pandas              as pd
import scipy.sparse        as sparse
import scipy.sparse.linalg as linalg
from numpy              import mat
from datetime           import datetime
from tqdm               import tqdm
from numpy.core.numeric import zeros_like
from sklearn.metrics    import pairwise_distances
from numpy              import linalg as la
from datetime           import date   as dt

In [None]:
class dataProcessing:
    '''
    Turns raw transaction rows into the structures used for training and testing:
    a row-normalised user-item matrix built from each customer's earlier baskets,
    and a list of held-out baskets (one per customer) taken from the last visit.
    '''

    folder = '/Users/Yur/Desktop/Thesis/Data/TaFeng.csv'

    def __init__(self):
        self.usersTest = {}
        self.itemDict = {}
        self.saveUsersTest = 'usersTest.dict'
        self.saveItemDict = 'itemDict.dict'
        self.saveUI = 'sUI.mtx'
        self.saveBI = 'basketDict.dict'

    def readTaFeng(self, folder):
        '''
        The TaFeng dataset posseses the useful information in cols 0, 1 and 4.
        For other datasets, please change the usecols

        Args:
            folder: the path to the csv file
        Returns:
            df: DataFrame with the columns date_time, customer_id and subclass
        '''
        df = pd.read_csv(folder, usecols=[0, 1, 4])
        df.columns = ['date_time', 'customer_id', 'subclass']

        # Only the first 10 characters (the calendar date) are parsed.
        df['date_time'] = df['date_time'].apply(lambda x: datetime.strptime(x[:10], '%m/%d/%Y'))
        return df

    def addTransactionID(self, df):
        '''
        Number the baskets of every customer. Rows of the same customer with the
        same date_time share one id; ids start at 0 for the customer's earliest
        date and increase by one per distinct date.
        Assumes each customer makes at most one transaction per day.

        Args:
            df: DataFrame with (at least) the columns customer_id and date_time.
                It is not modified.
        Returns:
            df: copy of the input sorted by (customer_id, date_time) with an
                integer trans_id column added.
        '''
        # Sort on the real columns. Sorting on the concatenated string
        # str(customer_id) + str(date) can interleave customers (e.g. customer 1
        # and customer 12000), which resets the per-customer counter mid-way.
        df = df.sort_values(['customer_id', 'date_time'])

        same_customer = df['customer_id'].eq(df['customer_id'].shift())
        new_day = df['date_time'].ne(df['date_time'].shift())
        # 1 on the first row of every basket except the customer's first one.
        starts_next_basket = (same_customer & new_day).astype(int)
        # Group positionally (plain array key) so a non-unique index cannot misalign.
        df['trans_id'] = starts_next_basket.groupby(df['customer_id'].values).cumsum()
        return df

    def nTransactionsCustomers(self, new_df):
        '''
        Split the transactions into a training part and a test part, using each
        customer's last basket (highest trans_id) as the test basket.

        Args:
            new_df: DataFrame which includes the per-customer trans_id column
        Returns:
            train_set: all rows that do not belong to the customer's last basket
            test_set: the rows of the last basket, restricted to customers that
                      shopped at least twice (trans_id > 0). Customers with a
                      single basket therefore appear in neither set.
            Both frames carry an extra nb_purchases column holding the
            customer's highest trans_id.
        '''
        df = new_df.sort_values(['customer_id', 'date_time', 'trans_id'],
                                ascending=[True, False, False])
        # Highest basket id per customer, broadcast to every row of that customer.
        df['nb_purchases'] = df.groupby('customer_id')['trans_id'].transform('max')

        in_last_basket = df['trans_id'] == df['nb_purchases']
        test_set = df.loc[in_last_basket]
        train_set = df.loc[~in_last_basket]
        return train_set, test_set.loc[test_set['trans_id'] > 0]

    def aggregateUserProcessing(self, dataset):
        '''
        This method reads a training data set and converts it into a UI dataframe

        Args:
            dataset: the training set (train_set from preceding function)
        Returns:
            UI: the User-Item DataFrame (index: customer ids, columns: items).
                Each cell is log(purchase count + 1), and every row is divided
                by its row sum.
            ItemDict: dictionary mapping each item to its column position
        '''
        records = dataset.loc[:, ['date_time', 'customer_id', 'subclass']].values.tolist()

        print('Counting user and item...')
        userDict = {}
        for _, user, item in records:
            userDict.setdefault(int(user), len(userDict))
            # Continue numbering after the items already known, so a repeated
            # call cannot hand out a column position twice.
            self.itemDict.setdefault(item, len(self.itemDict))

        print('Populating aggregate matrix')
        counts = np.zeros((len(userDict), len(self.itemDict)))
        user_rows = np.asarray([userDict[int(user)] for _, user, _ in records], dtype=np.intp)
        item_cols = np.asarray([self.itemDict[item] for _, _, item in records], dtype=np.intp)
        # add.at accumulates repeated (row, col) pairs, unlike counts[rows, cols] += 1.
        np.add.at(counts, (user_rows, item_cols), 1)

        print('Frequence normalisation')
        # log(0 + 1) is exactly 0, so empty cells stay empty.
        weights = np.log(counts + 1)

        self.UI = pd.DataFrame(data=weights,
                               index=list(userDict),
                               columns=list(self.itemDict))

        self.UI = self.UI.div(self.UI.sum(axis=1), axis=0)  # normalize rows

        return self.UI, self.itemDict

    def aggregateTransactionProcessing(self, dataset, minLengthBasket):
        '''
        This method reads a test set and turns it into one basket per customer.
        Columns are read by position: column 1 is the customer, column 2 the item.
        Items that are not in self.itemDict are ignored.

        Args:
            dataset: test data
            minLengthBasket: baskets with fewer distinct known items are dropped
        returns:
            self.BasketItemList: list of integer arrays, one per kept basket,
                                 holding the distinct items ordered by their
                                 position in self.itemDict
            self.usersTest: dict mapping basket number -> customer. The basket
                            numbers are assigned before filtering, so they are
                            not consecutive once baskets have been dropped.
        '''
        print('Create test baskets dictionnary')
        rows = dataset.values.tolist()
        basketDict = {}
        for row in rows:
            basketDict.setdefault(row[1], len(basketDict))

        print('Create basket array')
        # One set of item positions per basket instead of a dense
        # baskets x items matrix.
        positions_per_basket = [set() for _ in basketDict]
        for row in rows:
            position = self.itemDict.get(row[2])
            if position is not None:
                positions_per_basket[basketDict[row[1]]].add(position)

        placeToProductDict = {place: product for product, place in self.itemDict.items()}
        all_baskets = [
            np.array([placeToProductDict[place] for place in sorted(positions)], dtype=int)
            for positions in positions_per_basket
        ]

        # Keep only the baskets with at least minLengthBasket distinct products.
        customers = list(basketDict)
        self.BasketItemList = []
        self.usersTest = {}
        for basket_number, basket in enumerate(all_baskets):
            if len(basket) >= minLengthBasket:
                self.BasketItemList.append(basket)
                self.usersTest[basket_number] = customers[basket_number]

        return self.BasketItemList, self.usersTest

In [None]:
def prepareCDAE(UI, baskets, usersTest, itemDict):
    '''
    Build the dense input matrices for the autoencoder.

    :param UI: user-item DataFrame; its values are used as training input as-is
    :param baskets: iterable of baskets (each an iterable of items) to encode
    :param usersTest: unused; kept so existing callers keep working
    :param itemDict: dict whose key order defines the column of every item
    :return: (train_x, test_x) where test_x[i, j] is 1 when the j-th key of
             itemDict occurs in baskets[i] and 0 otherwise
    '''
    train_x = UI.values
    test_x = np.zeros(shape=(len(baskets), UI.shape[1]))

    # Column = position of the item in itemDict's key order. One dict lookup
    # per basket item replaces a scan of every basket for every known item.
    column_of = {item: column for column, item in enumerate(itemDict)}
    for row, basket in enumerate(baskets):
        for item in basket:
            column = column_of.get(item)
            if column is not None:  # items unknown to itemDict are ignored
                test_x[row, column] = 1

    return train_x, test_x

In [None]:
def create(I, U, K, hidden_activation, output_activation, q=0.2, l=0.01):
    '''
    create model: input -> dropout -> dense(K) -> [activation] -> dense(I)

    :param I: number of items (size of the input and of the output layer)
    :param U: number of users (currently unused by the network; kept for callers)
    :param K: number of units in hidden layer
    :param hidden_activation: activation function of hidden layer; when falsy
                              (None / '') the hidden layer stays linear
    :param output_activation: activation function of output layer
    :param q: drop probability
    :param l: regularization parameter of L2 regularization
    :return: an uncompiled keras Model mapping the item vector to its reconstruction
    '''
    x_item = Input((I,), name='x_item')
    h = Dropout(q)(x_item)
    h = Dense(K, kernel_regularizer=l2(l), bias_regularizer=l2(l))(h)

    # Only wrap the hidden layer when an activation is requested; without this
    # fallback the output layer had no input tensor to attach to.
    if hidden_activation:
        h = Activation(hidden_activation)(h)
    y = Dense(I, activation=output_activation)(h)

    # Keras 2 keyword names (the Keras 1 spelling was input= / output=).
    return Model(inputs=x_item, outputs=y)

In [None]:
# Data preparation pipeline: load the transactions, number the baskets per
# customer, split off each customer's last basket, then build the user-item
# matrix (training) and the list of held-out baskets (testing).
folder = 'TaFeng.csv'
     
data   = dataProcessing()
df     = data.readTaFeng(folder)
new_df = data.addTransactionID(df)

train, test_unf = data.nTransactionsCustomers(new_df)
UI, itemDict = data.aggregateUserProcessing(train)
# Keep only held-out baskets with at least 4 distinct known items.
baskets, usersTest = data.aggregateTransactionProcessing(test_unf, 4)

# NOTE(review): RecommendationModels is not defined in this part of the
# notebook; presumably it comes from another cell or module - verify.
modelRND = RecommendationModels(UI, baskets)
modelPOP = RecommendationModels(UI, baskets)

# itemPrior is assigned twice; only the value from modelPOP survives.
itemPoprnd, itemPrior = modelRND.itemPopularity()
itemPoppop, itemPrior = modelPOP.itemPopularity()

# Each split gets its own deep copy of the baskets (presumably because
# splitTargetEvidence modifies its input - TODO confirm).
copyTest1 = copy.deepcopy(baskets)
copyTest2 = copy.deepcopy(baskets)
n = 3
# Two target/evidence splits of the same baskets, selected by the rnd / pop flags.
targetsRND, evidencesRND = modelRND.splitTargetEvidence(itemPoprnd, n, copyTest1, itemDict, rnd=True, pop=False)
targetsPOP, evidencesPOP = modelPOP.splitTargetEvidence(itemPoppop, n, copyTest2, itemDict, rnd=False, pop=True)

In [None]:
# Encode the evidence part of every test basket as a 0/1 item vector, once for
# the pop=True split and once for the rnd=True split. The training matrix is
# UI.values in both calls, so train_x and train_x_rnd hold the same data.
train_x, test_x = prepareCDAE(UI, evidencesPOP, usersTest, itemDict)
train_x_rnd, test_x_rnd = prepareCDAE(UI, evidencesRND, usersTest, itemDict)

In [None]:
# Train one autoencoder with 250 hidden units to reconstruct the user-item rows,
# holding out the last 20% of the rows for validation, then score the test baskets.
model = create(I=train_x.shape[1], U=len(train_x), K=250, hidden_activation='relu', output_activation='linear', q=0.01, l=0.01)
model.compile(loss='mean_squared_error', optimizer='Adam')
model.summary()

# `epochs` is the Keras 2 name of the former `nb_epoch` argument.
history = model.fit(x=train_x, y=train_x,
                   batch_size=64, epochs=100, verbose=2,
                   validation_split=0.2)
pred = model.predict(x=test_x)

In [None]:
def randomRecommend(Rhat, evidences, targets, k=3):
    '''
    Hit rate of the top-k recommendations.

    :param Rhat: predicted scores, one row per basket and one column per item
    :param evidences: per basket, the column positions of the items already in
                      the basket; these are never recommended
    :param targets: per basket, the column positions of the held-out items
    :param k: number of items recommended per basket
    :return: fraction of baskets for which at least one of the k recommended
             items is a target; 0.0 when there are no baskets
    '''
    hits = []
    for row in range(len(evidences)):
        # Work on a copy so the caller's prediction matrix is left untouched.
        scores = np.array(Rhat[row], dtype=float)
        # -inf rather than 0: with a linear output layer scores can be negative,
        # and a 0 would still outrank them and leak evidence items into the top-k.
        scores[evidences[row]] = -np.inf
        recommendations = np.argsort(scores)[-k:]

        hit = any(item in targets[row] for item in recommendations)
        hits.append(1 if hit else 0)

    if not hits:
        return 0.0
    return sum(hits) / len(hits)


def modelBaskToRecoBask(evidences, targets):
    '''
    Translate baskets of items into baskets of column positions using the
    module-level itemDict. The inputs are not modified.

    :param evidences: list of evidence baskets (items)
    :param targets: list of target baskets (items)
    :return: (recoTargets, recoEvidences) - note the order: targets first
    '''
    def to_positions(item_baskets):
        # Deep copy first so each basket keeps its container type and the
        # caller's baskets stay untouched.
        converted = copy.deepcopy(item_baskets)
        for basket_number, basket in enumerate(item_baskets):
            for slot in range(len(basket)):
                converted[basket_number][slot] = itemDict[basket[slot]]
        return converted

    return to_positions(targets), to_positions(evidences)


# The converted baskets do not depend on the model, so build them once.
recoTargets, recoEvidences = modelBaskToRecoBask(evidencesPOP, targetsPOP)
recoTargetsRND, recoEvidencesRND = modelBaskToRecoBask(evidencesRND, targetsRND)

batch_size = 128

# Sweep over the hidden layer size and report the hit rate on both splits.
for i in [50, 100, 150, 200, 250]:

    model = create(I=train_x.shape[1], U=len(train_x), K=i, hidden_activation='relu', output_activation='linear', q=0.01, l=0.01)
    model.compile(loss='mean_squared_error', optimizer='Adam')
    model.summary()

    # `epochs` is the Keras 2 name of the former `nb_epoch` argument.
    history = model.fit(x=train_x, y=train_x,
                        batch_size=batch_size, epochs=100, verbose=0,
                        validation_split=0.2)
    pred = model.predict(x=test_x)
    pred_rnd = model.predict(x=test_x_rnd)

    hitRateRND = randomRecommend(pred_rnd, recoEvidencesRND, recoTargetsRND)
    hitRatePOP = randomRecommend(pred, recoEvidences, recoTargets)
    print(hitRatePOP*100)
    print(hitRateRND*100)
    print('Size hidden layer: ', i)
    # Previously printed an undefined name `j`; report the batch size actually used.
    print('Batch size: ', batch_size)