In [1]:
import pandas as pd
import datetime

In [3]:
# pred_tmp = pd.read_csv('../data/predictions.csv')
# pred = pd.DataFrame(pred_tmp['visitorid'])
# pred[(pred['visitorid'].isin(events['visitorid'])==True)].shape

In [4]:
def get_events(filter_flag=False):    
    events = pd.read_csv('../data/events.csv')
    if(filter_flag==True):
        events = events[events['event']=='transaction']
    else:
        events = events
    print("EVENTS:",events.shape[0])
    return events

def get_categories():
    category_tree = pd.read_csv('../data/category_tree.csv')
    category_tree[category_tree['parentid'].isnull()==True] = 0
    category_tree['parentid'] = category_tree['parentid'].apply(lambda x:round(x))
    print("CATEGORIES:",category_tree.shape[0])
    return category_tree

def get_unique_list(df, user_column):
    return np.sort(df[user_column].unique())

def make_splits(df):
    #First convert to datetime format
    df = df.assign(timestamp=pd.Series(datetime.datetime.fromtimestamp(i/1000).date() for i in df.timestamp))
    df = df.sort_values('timestamp').reset_index(drop=True)

    #Train
    train = df[(df.timestamp < datetime.datetime.strptime('2015-08-01', '%Y-%m-%d').date())]
    #Eval
    test = df[
        (df.timestamp < datetime.datetime.strptime('2015-09-01', '%Y-%m-%d').date())
        &         (df.timestamp > datetime.datetime.strptime('2015-07-31', '%Y-%m-%d').date())
    ]
    #Filtered
    fltr = df[
        (df.timestamp > datetime.datetime.strptime('2015-08-31', '%Y-%m-%d').date())
    ]
    print("Train rows are:",train.shape[0])
    print("Test rows are:",test.shape[0])
    print("Filtered out Interactions are:",fltr.shape[0])
    
    return train,test

def get_predictors():
    pred = pd.read_csv('../data/predictions.csv')

events = get_events(False)
train_events,test_events = make_splits(events)
category_tree = get_categories()

EVENTS: 2756101
Train rows are: 1896804
Test rows are: 553373
Filtered out Interactions are: 305924
CATEGORIES: 1669


In [5]:
import datetime

def get_items(events):
    items1 = pd.read_csv('../data/item_properties_part1.csv')
    items2 = pd.read_csv('../data/item_properties_part2.csv')
    items = pd.concat([items1, items2])
    print("ITEMS:",items.shape[0])
    return items

items_df = get_items(events)
items = get_unique_list(items_df,'itemid')


ITEMS: 20275902


In [6]:
users = get_unique_list(events,'visitorid')

In [25]:
#remove items that are not in events
#remove event items that are not in items
items_df = items_df[items_df.itemid.isin(events.itemid)==True]
events = events[events.itemid.isin(items_df.itemid)==True]
print("Events:",events.shape[0])
train_events = train_events[train_events.itemid.isin(items_df.itemid)==True]
test_events = test_events[test_events.itemid.isin(items_df.itemid)==True]
print("Train Events:",train_events.shape[0])
print("Test Events:",test_events.shape[0])
print("Items:",items_df.shape[0])

Events: 2500516
Train Events: 1725857
Test Events: 499399
Items: 10180153


In [28]:
def get_item_feature_interaction(items_df, category_tree_df):
    items_to_cat = items_df[(items_df.property == 'categoryid')][['itemid','value']].drop_duplicates()
    items_to_cat['value'] = items_to_cat['value'].astype(int)
    item_feature_df = pd.merge(items_to_cat, category_tree_df.rename(columns={'categoryid':'value'}), on='value',  how='left')

    item_category_df = item_feature_df[["itemid", "value"]].rename(columns = {"value" : "feature"})
    item_category_df["feature_count"] = 1 # adding weight to category feature
    item_parent_df = item_feature_df[["itemid", "parentid"]].rename(columns = {"parentid" : "feature"})
    item_parent_df["feature_count"] = 1 # adding weight to department feature

    item_feature_df_sub = pd.concat([item_category_df, item_parent_df], ignore_index=True)

    # saving some memory
    del item_feature_df
    del item_category_df
    del item_parent_df

    # grouping for summing over feature_count
    item_feature_df = item_feature_df_sub.groupby(["itemid", "feature"], as_index = False)["feature_count"].sum()
    return item_feature_df

item_to_feature = get_item_feature_interaction(items_df, category_tree)
item_to_feature.shape[0]

396660

In [29]:
def id_mappings(user_list, item_list):
    """
    
    Create id mappings to convert user_id, item_id, and feature_id
    
    """
    user_to_index_mapping = {}
    index_to_user_mapping = {}
    for user_index, user_id in enumerate(user_list):
        user_to_index_mapping[user_id] = user_index
        index_to_user_mapping[user_index] = user_id
        
    item_to_index_mapping = {}
    index_to_item_mapping = {}
    for item_index, item_id in enumerate(item_list):
        item_to_index_mapping[item_id] = item_index
        index_to_item_mapping[item_index] = item_id
                
    return user_to_index_mapping, index_to_user_mapping, \
           item_to_index_mapping, index_to_item_mapping

# generate mapping, LightFM library can't read other than (integer) index
user_to_index_mapping, \
index_to_user_mapping, \
item_to_index_mapping, \
index_to_item_mapping = id_mappings(users, items)


In [30]:
def get_user_product_interaction(events):
    
    # creating a dataframe consists of TWO columns user_id, and product_name (product bought by the user) for the train data
    user_to_item_df = events[['visitorid','itemid']]
    user_to_item_df["item_count"] = 1
    user_to_item_rating = user_to_item_df.groupby(["visitorid", "itemid"], as_index = False)["item_count"].sum()
    return user_to_item_rating

# convert to the user, item, feature lists into indexes.
# interaction matrices can only consume indexes
user_to_item_train = get_user_product_interaction(train_events)
user_to_item_test = get_user_product_interaction(test_events)
user_to_item_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,visitorid,itemid,item_count
0,5,61396,1
1,7,139394,1
2,7,164941,1
3,7,226353,1
4,9,222422,1


In [31]:
item_to_feature.head()

Unnamed: 0,itemid,feature,feature_count
0,3,938.0,1
1,3,1171.0,1
2,4,1038.0,1
3,4,1174.0,1
4,6,573.0,1


In [32]:
def get_interaction_matrix(df, df_column_as_row, df_column_as_col, df_column_as_value, row_indexing_map, 
                          col_indexing_map):
    
    row = df[df_column_as_row].apply(lambda x: row_indexing_map[x]).values
    col = df[df_column_as_col].apply(lambda x: col_indexing_map[x]).values
    value = df[df_column_as_value].values
    
    return coo_matrix((value, (row, col)), shape = (len(row_indexing_map), len(col_indexing_map)))

In [34]:
from scipy.sparse import coo_matrix # for constructing sparse matrix
# generate user_item_interaction_matrix for train data
train_user_to_item_interaction = get_interaction_matrix(user_to_item_train
                                                  , "visitorid"
                                                  , "itemid"
                                                  , "item_count"
                                                  , user_to_index_mapping
                                                  , item_to_index_mapping)

test_user_to_item_interaction = get_interaction_matrix(user_to_item_test
                                                  , "visitorid"
                                                  , "itemid"
                                                  , "item_count"
                                                  , user_to_index_mapping
                                                  , item_to_index_mapping)

# generate item_to_feature interaction
item_to_feature_interaction = get_interaction_matrix(item_to_feature
                                                           , "itemid"
                                                           , "feature"
                                                           , "feature_count"
                                                           , item_to_index_mapping
                                                           , index_to_item_mapping)

In [39]:
assert(test_user_to_item_interaction.shape[0]==train_user_to_item_interaction.shape[0])
print(test_user_to_item_interaction.shape,train_user_to_item_interaction.shape)

(1407580, 417053) (1407580, 417053)


In [40]:
# lightfm 
from lightfm import LightFM # model
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k

# initialising model with warp loss function
model_without_features = LightFM(loss = "warp")



In [42]:
import time
# fitting into user to product interaction matrix only / pure collaborative filtering factor

start = time.time()
#===================

model_without_features.fit(train_user_to_item_interaction,
          user_features=None, 
          item_features=None, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)

#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

# auc metric score (ranging from 0 to 1)

start = time.time()
#===================

auc_without_features = auc_score(model = model_without_features, 
                        test_interactions = test_user_to_item_interaction,
                        num_threads = 4, check_intersections = False)

precision_without_features = precision_at_k(model = model_without_features, 
                        test_interactions = test_user_to_item_interaction,
                        num_threads = 4, check_intersections = False)

recall_without_features = recall_at_k(model = model_without_features, 
                        test_interactions = test_user_to_item_interaction,
                        num_threads = 4, check_intersections = False)
#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))
print("average AUC without adding item-feature interaction = {0:.{1}f}".format(auc_without_features.mean(), 2))
print("average Precision without adding item-feature interaction = {0:.{1}f}".format(precision_without_features.mean(), 2))
print("average Recall without adding item-feature interaction = {0:.{1}f}".format(recall_without_features.mean(), 2))

time taken = 11179.65 seconds
average AUC without adding item-feature interaction = 0.88
average Precision without adding item-feature interaction = 0.00
average Recall without adding item-feature interaction = 0.01


In [49]:
# initialising model with warp loss function
model_with_features = LightFM(loss = "warp")

# fitting the model with hybrid collaborative filtering + content based (product + features)
start = time.time()
#===================


model_with_features.fit(train_user_to_item_interaction,
          user_features=None, 
          item_features=item_to_feature_interaction, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)

#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

start = time.time()
#===================
auc_with_features = auc_score(model = model_with_features, 
                        test_interactions = test_user_to_item_interaction,
                        item_features = item_to_feature_interaction,
                        num_threads = 4, check_intersections=False)

precision_with_features = precision_at_k(model = model_with_features, 
                        test_interactions = test_user_to_item_interaction,
                        num_threads = 4, check_intersections = False)

recall_with_features = recall_at_k(model = model_with_features, 
                        test_interactions = test_user_to_item_interaction,
                        num_threads = 4, check_intersections = False)
#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

print("average AUC without adding item-feature interaction = {0:.{1}f}".format(auc_with_features.mean(), 2))
print("average Precision without adding item-feature interaction = {0:.{1}f}".format(precision_with_features.mean(), 2))
print("average Recall without adding item-feature interaction = {0:.{1}f}".format(recall_with_features.mean(), 2))

average AUC without adding item-feature interaction = 0.85
average Precision without adding item-feature interaction = 0.00
average Recall without adding item-feature interaction = 0.00


In [None]:
def combined_train_test(train, test):
    """
    
    test set is the more recent rating/number_of_order of users.
    train set is the previous rating/number_of_order of users.
    non-zero value in the test set will replace the elements in 
    the train set matrices

    """
    # initialising train dict
    train_dict = {}
    for train_row, train_col, train_data in zip(train.row, train.col, train.data):
        train_dict[(train_row, train_col)] = train_data
        
    # replacing with the test set
    
    for test_row, test_col, test_data in zip(test.row, test.col, test.data):
        train_dict[(test_row, test_col)] = max(test_data, train_dict.get((test_row, test_col), 0))
        
    
    # converting to the row
    row_element = []
    col_element = []
    data_element = []
    for row, col in train_dict:
        row_element.append(row)
        col_element.append(col)
        data_element.append(train_dict[(row, col)])
        
    # converting to np array
    
    row_element = np.array(row_element)
    col_element = np.array(col_element)
    data_element = np.array(data_element)
    
    return coo_matrix((data_element, (row_element, col_element)), shape = (train.shape[0], train.shape[1]))