In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from datetime import datetime as dt
from datetime import date

Inspired by: http://www.scielo.org.za/pdf/sajie/v31n3/08.pdf

In [2]:
items = pd.read_csv("../../Data/old/items.csv", sep="|")
orders = pd.read_csv("../../Data/old/orders_before_dec.csv", sep="|", parse_dates=[0])
orders.head()

Unnamed: 0,date,userID,itemID,order
0,2020-06-01,38769,3477,1
1,2020-06-01,42535,30474,1
2,2020-06-01,42535,15833,1
3,2020-06-01,42535,20131,1
4,2020-06-01,42535,4325,1


In [3]:
def get_average_order_amount_user_item(_orders):
    res = _orders[['userID', 'itemID', 'order']].groupby(['userID','itemID']).mean().reset_index()
    res.rename(columns={'order':'avg_order_user_item'}, inplace=True)
    return res


def get_average_order_amount_item(_orders):
    res = _orders[['itemID', 'order']].groupby(['itemID']).mean().reset_index()
    res.rename(columns={'order':'avg_order_item'}, inplace=True)
    return res

def get_average_order_amount_user(_orders):
    res = _orders[['userID', 'order']].groupby(['userID']).mean().reset_index()
    res.rename(columns={'order':'avg_order_user'}, inplace=True)
    return res

def number_orders_user_item(_orders):
    return _orders[['userID', 'itemID']].groupby(['userID', 'itemID']).size().reset_index().rename(columns={0:'num_orders_user_item'})


def feature_avg_lifespan(_orders):
    workcopy = _orders[['date','userID', 'itemID']].copy()
    workcopy.sort_values(['userID', 'itemID', 'date'], inplace=True)
    workcopy['diffs'] = workcopy.groupby(['userID', 'itemID'])['date'].diff()
    workcopy = workcopy[pd.notnull(workcopy['diffs'])]
    workcopy['sum'] = 1
    workcopy['diffs'] = workcopy.diffs.dt.days
    workcopy = workcopy[['userID', 'itemID', 'diffs', 'sum']].groupby(['itemID', 'userID']).sum().reset_index()
    workcopy = workcopy.drop(columns=['userID']).groupby('itemID').sum().reset_index()
    workcopy['avg_lifespan'] = workcopy['diffs'] / workcopy['sum']
    return workcopy.drop(columns=['sum', 'diffs'])

def lifespan_variance_item(_orders):
    workcopy = _orders[['date','userID', 'itemID']].copy()
    workcopy.sort_values(['userID', 'itemID', 'date'], inplace=True)
    workcopy['diffs'] = workcopy.groupby(['userID', 'itemID'])['date'].diff()
    workcopy['diffs'] = workcopy.diffs.dt.days
    workcopy = workcopy.groupby(['userID', 'itemID']).var().reset_index()
    workcopy = workcopy[['itemID', 'diffs']].groupby(['itemID']).mean().reset_index()
    workcopy.rename(columns={'diffs':'item_lifespan_variance'}, inplace=True)
    return workcopy

def lastpurchase(_orders, date):
    date = pd.Timestamp(date)
    res = _orders[['userID', 'itemID', 'date']].groupby(['userID', 'itemID']).max().reset_index()
    res['last_purchased'] = res['date'].apply(lambda d: (date - d))
    res['last_purchased'] = res.last_purchased.dt.days
    res.drop(columns=['date'], inplace=True)
    return res

def average_lifespan_item_user(_orders):
    res = _orders[['userID', 'itemID', 'date']].copy()
    res['date'] = res[['date', 'userID', 'itemID']].groupby(['userID', 'itemID']).diff()
    res['date'] = res.date.dt.days
    res = res.groupby(['userID', 'itemID']).mean().reset_index()
    res.rename(columns={'date':'avg_lifespan_user_item'}, inplace=True)
    return res

def lifespan_item_user_variance(_orders):
    res = _orders[['userID', 'itemID', 'date']].copy()
    res['date'] = res[['date', 'userID', 'itemID']].groupby(['userID', 'itemID']).diff()
    res['date'] = res.date.dt.days
    res = res.groupby(['userID', 'itemID']).var().reset_index()
    res.rename(columns={'date':'lifespan_user_item_variance'}, inplace=True)
    return res

def create_train_labels(X, combinations,month):
    #set month in each example
    X['month'] = X['date'].dt.month

    X = X[X['month'] == month]
    X = X.merge(combinations, how='inner')
    #set week in month. We want at most 4
    X['week'] = X.date.apply(lambda d: int(min((d.day-1)//7 + 1, 4)))
    X = X.drop(columns=['date', 'order', 'month'])
    combinations = combinations.copy()
    combinations['week'] = 5
    res = pd.concat([combinations, X])
    res = res.groupby(['userID', 'itemID']).min().reset_index()
    res['week'] = res.week.apply(lambda w: w if w < 5 else 0)
    return res

def create_trainset(_orders, cutoffdate):
    date = pd.Timestamp(cutoffdate)
    orders_before_cutoff = _orders[_orders['date'] < date]
    relevant_user_item_combs = orders_before_cutoff[['userID', 'itemID']].groupby(['userID', 'itemID'])
    relevant_user_item_combs = relevant_user_item_combs.filter(lambda x: len(x)>=2).groupby(['userID', 'itemID']).all().reset_index()
    labels = create_train_labels(_orders, relevant_user_item_combs, date.month)
    relevant_orders = orders.merge(relevant_user_item_combs, how='inner')
    train_features = relevant_user_item_combs
    for (f, args) in [(get_average_order_amount_user_item, None), (number_orders_user_item, None), (feature_avg_lifespan, None), (lastpurchase, date), (average_lifespan_item_user,None)]:
        if args is not None:
            train_features = train_features.merge(f(orders_before_cutoff, args), how='left')
        else:
            train_features = train_features.merge(f(orders_before_cutoff), how='left')
            
    train_data = labels.merge(train_features, how='left', on=['userID', 'itemID'])
    train_data = train_data.drop(columns=['userID', 'itemID'])
    return (train_data.drop(columns='week'), train_data['week'])

def create_test_features(_orders, user_item, cutoff, drop_user_item=True):
    date = pd.Timestamp(cutoff)
    test_features = user_item.copy()
    for (f, args) in [(get_average_order_amount_user_item, None), (number_orders_user_item, None), (feature_avg_lifespan, None), (lastpurchase, date), (average_lifespan_item_user,None)]:
        if args is not None:
            test_features = test_features.merge(f(_orders, args), how='left')
        else:
            test_features = test_features.merge(f(_orders), how='left')
    if drop_user_item: return test_features.drop(columns=['userID', 'itemID'])
    else: return test_features
    


In [21]:
orders_before_dec = pd.read_csv('../../Data/old/orders_before_dec.csv', sep='|', parse_dates=[0])
orders_before_jan = pd.read_csv('../../Data/old/orders_before_jan.csv', sep='|', parse_dates=[0])

user_item_features_before_dec = average_lifespan_item_user(orders_before_dec)
user_item_features_before_jan = average_lifespan_item_user(orders_before_jan)

user_item_features_before_dec=pd.merge(user_item_features_before_dec, lastpurchase(orders_before_dec, '2020-12-01'), on=['userID', 'itemID'])
user_item_features_before_jan=pd.merge(user_item_features_before_jan, lastpurchase(orders_before_jan, '2021-01-01'), on=['userID', 'itemID'])

user_item_features_before_dec=pd.merge(user_item_features_before_dec, number_orders_user_item(orders_before_dec), on=['userID', 'itemID'], how='left')
user_item_features_before_jan=pd.merge(user_item_features_before_jan, number_orders_user_item(orders_before_jan), on=['userID', 'itemID'], how='left')

user_item_features_before_dec=pd.merge(user_item_features_before_dec, lifespan_item_user_variance(orders_before_dec), on=['userID', 'itemID'], how='left')
user_item_features_before_jan=pd.merge(user_item_features_before_jan, lifespan_item_user_variance(orders_before_jan), on=['userID', 'itemID'], how='left')

item_features_before_dec = feature_avg_lifespan(orders_before_dec)
item_features_before_jan = feature_avg_lifespan(orders_before_jan)

item_features_before_dec = pd.merge(item_features_before_dec, lifespan_variance_item(orders_before_dec),how='left', on='itemID') 
item_features_before_jan = pd.merge(item_features_before_jan, lifespan_variance_item(orders_before_jan),how='left', on='itemID') 

In [25]:
for i in [user_item_features_before_dec, user_item_features_before_jan, item_features_before_dec, item_features_before_jan]:
    i.fillna(0, inplace=True)

user_item_features_before_dec.to_csv('../../Data/Feature CSVs/user_item_features_dec_marc.csv', index=False)
user_item_features_before_jan.to_csv('../../Data/Feature CSVs/user_item_features_jan_marc.csv', index=False)

item_features_before_dec.to_csv('../../Data/Feature CSVs/item_avg_lifetime_dec.csv', index=False)
item_features_before_jan.to_csv('../../Data/Feature CSVs/item_avg_lifetime_jan.csv', index=False)

In [19]:
X,y = create_trainset(orders, '2020-11-01')
X2, y2 = create_trainset(orders, '2020-10-01')
X = pd.concat([X, X2])
y = pd.concat([y,y2])
X2, y2 = create_trainset(orders, '2020-09-01')
X = pd.concat([X, X2])
y = pd.concat([y, y2])
display(X.head())


Unnamed: 0,avg_order_user_item,num_orders_user_item,avg_lifespan,last_purchased,avg_lifespan_user_item
0,1.0,2,79.0625,23,126.0
1,1.0,2,65.857143,39,86.0
2,1.0,2,80.333333,13,112.0
3,1.0,2,55.411765,12,61.0
4,1.0,2,48.333333,22,51.0


In [20]:
y.value_counts()

0    87855
4     7642
2     5642
3     5063
1     4014
Name: week, dtype: int64

In [5]:
testorders = pd.read_csv('../../../Data/orders_before_dec.csv', sep='|', parse_dates=[0])
testcombs = pd.read_csv('../../../Data/submission_dec.csv', sep='|')[['userID', 'itemID']]
test_features = create_test_features(testorders, testcombs, '2020-12-01', True)
test_features.fillna(999)
test_features.head()

FileNotFoundError: [Errno 2] No such file or directory: '../../../Data/orders_before_dec.csv'