In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt

Inspired by: http://www.scielo.org.za/pdf/sajie/v31n3/08.pdf

In [31]:
# Load the pipe-delimited item and order tables; the orders' "date" column
# is converted to datetime as part of the same load step.
items = pd.read_csv("../../Data/items.csv", sep="|")
orders = pd.read_csv("../../Data/orders_before_dec.csv", sep="|").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

### Feature average order amount by item-user

In [16]:
def get_average_order_amount_user_item(_orders):
    """Mean of the ``order`` column for every (userID, itemID) pair.

    Parameters
    ----------
    _orders : pandas.DataFrame
        Must contain the columns ``userID``, ``itemID`` and ``order``.

    Returns
    -------
    pandas.DataFrame
        One row per (userID, itemID) pair with the columns ``userID``,
        ``itemID`` and ``order`` (the group mean), on a fresh integer index.
    """
    pair_keys = ['userID', 'itemID']
    mean_per_pair = _orders.groupby(pair_keys)['order'].mean()
    return mean_per_pair.reset_index()

In [17]:
# Preview the first rows of the per-(userID, itemID) mean of `order`.
get_average_order_amount_user_item(orders).head()

Unnamed: 0,userID,itemID,order
0,0,1505,1.0
1,0,9325,1.0
2,0,12468,1.0
3,0,12505,1.0
4,0,15083,1.0


### Feature average order amount by item

In [32]:
def get_average_order_amount_item(_orders):
    """Mean of the ``order`` column for every item.

    Parameters
    ----------
    _orders : pandas.DataFrame
        Must contain the columns ``itemID`` and ``order``.

    Returns
    -------
    pandas.DataFrame
        One row per ``itemID`` with the columns ``itemID`` and ``order``
        (the group mean), on a fresh integer index.
    """
    item_orders = _orders.loc[:, ['itemID', 'order']]
    mean_per_item = item_orders.groupby('itemID').mean()
    return mean_per_item.reset_index()

In [34]:
# Preview the first rows of the per-item mean of `order`.
get_average_order_amount_item(orders).head()

Unnamed: 0,itemID,order
0,0,1.0
1,1,1.75
2,2,1.0
3,3,2.333333
4,4,5.0


### Feature average order amount by user

In [37]:
def get_average_order_amount_user(_orders):
    """Mean of the ``order`` column for every user.

    Parameters
    ----------
    _orders : pandas.DataFrame
        Must contain the columns ``userID`` and ``order``.

    Returns
    -------
    pandas.DataFrame
        One row per ``userID`` with the columns ``userID`` and ``order``
        (the group mean), on a fresh integer index.
    """
    # Aggregate the frame that was passed in, not a notebook-level global, so the
    # function works on any orders subset and on a fresh kernel.
    return _orders[['userID', 'order']].groupby(['userID']).mean().reset_index()

In [38]:
# Preview the first rows of the per-user mean of `order`.
get_average_order_amount_user(orders).head()

Unnamed: 0,userID,order
0,0,1.0
1,1,1.3
2,2,1.166667
3,3,1.290323
4,4,1.705882


### Feature number of orders by user-item

In [63]:
def number_orders_user_item(_orders):
    """Count the order rows recorded for every (userID, itemID) pair.

    Parameters
    ----------
    _orders : pandas.DataFrame
        Must contain the columns ``userID`` and ``itemID``.

    Returns
    -------
    pandas.DataFrame
        One row per (userID, itemID) pair with the columns ``userID``,
        ``itemID`` and ``num_orders_user_item`` (number of rows in the group).
    """
    rows_per_pair = _orders.groupby(['userID', 'itemID']).size()
    # Name the count column while turning the group keys back into columns.
    return rows_per_pair.reset_index(name='num_orders_user_item')

In [65]:
# Preview the first rows of the per-(userID, itemID) order counts.
number_orders_user_item(orders).head()

Unnamed: 0,userID,itemID,num_orders_user_item
0,0,1505,1
1,0,9325,1
2,0,12468,1
3,0,12505,1
4,0,15083,1


### Feature average product lifespan (mean days between a user's repeat orders of an item)

In [68]:
# Read the pipe-delimited file 'test.csv' (relative path), parsing its first column as dates.
# NOTE(review): `test_csv` is not referenced by the cells visible in this section — confirm it is still needed.
test_csv = pd.read_csv('test.csv', sep='|', parse_dates=[0])

In [125]:
def feature_avg_lifespan(_orders):
    """Average gap, in whole days, between consecutive orders of an item by the same user.

    For every (userID, itemID) pair the orders are put in date order and the
    difference between each order and the previous one is taken. Those gaps are
    then pooled per item and averaged (total gap days / number of gaps).

    Parameters
    ----------
    _orders : pandas.DataFrame
        Must contain the columns ``date`` (datetime), ``userID`` and ``itemID``.
        The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``itemID`` and ``avg_lifespan``. Items that no user ordered
        more than once produce no gap and are therefore absent.
    """
    pair_keys = ['userID', 'itemID']
    purchases = _orders[['date'] + pair_keys].sort_values(pair_keys + ['date'])

    # Time since the previous order of the same item by the same user;
    # the first order of each pair has no predecessor and yields NaT.
    gaps = purchases.groupby(pair_keys)['date'].diff()
    has_gap = gaps.notna()

    # One row per repeat order: the gap in whole days plus a unit counter.
    repeats = purchases.loc[has_gap, ['itemID']].assign(
        diffs=gaps[has_gap].dt.days,
        sum=1,
    )

    # Summing integers per item directly equals summing per (item, user) first.
    per_item = repeats.groupby('itemID').sum().reset_index()
    per_item['avg_lifespan'] = per_item['diffs'] / per_item['sum']
    return per_item[['itemID', 'avg_lifespan']]

In [126]:
# Display the per-item average lifespan table computed from the loaded orders.
feature_avg_lifespan(orders)

Unnamed: 0,itemID,avg_lifespan
0,1,90.000000
1,6,64.615385
2,9,58.076923
3,12,84.000000
4,21,53.000000
...,...,...
11040,32764,18.000000
11041,32766,83.500000
11042,32768,81.222222
11043,32769,105.000000
