In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# Prepare the dataset

To build the buyer features we use the log dataset, which contains the behaviour of around 5,000 frequent users over 6 months.

In [8]:
# Load the behaviour log into a DataFrame.
log_path = 'log.csv'
df = pd.read_csv(log_path)

# train test split

In [9]:
# 'YYYY-MM' month key: the first seven characters of the timestamp string.
# This relies on vtime still being text when the cell runs.
df['vmonth'] = df['vtime'].str.slice(stop=7)

# NOTE: df_train is another name for the same object as df, not a copy;
# anything written through df_train is visible through df as well.
df_train = df

In [5]:
# Parse the timestamp strings into real datetimes.
# pd.to_datetime on the whole column is vectorised (the previous
# .apply(pd.to_datetime) parsed one element at a time), and plain column
# assignment replaces the column so it ends up with a datetime dtype; on
# pandas >= 2.0 `.loc[:, col] = ...` writes into the existing column and can
# leave it as object dtype.
df_train['vtime'] = pd.to_datetime(df_train['vtime'])

In [10]:
# Hold out 2013-09 as the outcome month; every other month is used to build
# the features. Both halves get a fresh 0..n-1 index.
outcome_month = '2013-09'
train = df_train.loc[df_train['vmonth'] != outcome_month].reset_index(drop=True)
out = df_train.loc[df_train['vmonth'] == outcome_month].reset_index(drop=True)

# buyer data

## action-based features

In [22]:
# `b` is the final dataframe of buyer features: one row per distinct user_id
# found anywhere in the log (training and outcome months alike).
# Series.unique() keeps first-appearance order, so the row order is the same
# on every run; the previous list(set(...)) order was arbitrary.
b = pd.DataFrame({'user_id': df['user_id'].unique()})

In [23]:
# Per-user, per-action event counts that the historical-action features are
# read from. Each frame has the columns user_id, action and item_id, where
# item_id holds the number of non-null item_id rows in that group.
group_keys = ['user_id', 'action']

# all behaviour in the training window
buyer = train.groupby(group_keys)[['item_id']].count().reset_index()

# last month: rows whose vmonth is '2013-08'
in_last_month = train.vmonth == '2013-08'
buyer_last_month = train[in_last_month].groupby(group_keys)[['item_id']].count().reset_index()

# recent behaviour: rows timestamped strictly after 2013-08-20 00:00
recent_cutoff = dt.datetime(2013, 8, 20)
is_recent = train.vtime > recent_cutoff
buyer_recent = train[is_recent].groupby(group_keys)[['item_id']].count().reset_index()

In [24]:
# Per-user action counts over three look-back windows.
#
# For each (action, window) pair the matching per-user count is left-joined
# onto `b` under its own column name; users without such an event get NaN at
# this point. The resulting columns, in order, are:
#   purchase, purchase_last_m, purchase_recent,
#   click,    click_last_m,    click_recent,
#   cart,     cart_last_m,     cart_recent


def add_action_count(features, counts, action, column):
    """Left-join the per-user count of one action onto the feature table.

    Parameters
    ----------
    features : pd.DataFrame
        Feature table with a ``user_id`` column.
    counts : pd.DataFrame
        Frame with ``user_id``, ``action`` and ``item_id`` columns, where
        ``item_id`` holds the event count for that (user, action) pair.
    action : str
        Value of ``counts.action`` to select (e.g. ``'alipay'``).
    column : str
        Name the count column gets in the returned frame.

    Returns
    -------
    pd.DataFrame
        ``features`` with one extra column ``column``; NaN where the user has
        no row for ``action`` in ``counts``.
    """
    per_user = counts.loc[counts.action == action, ['user_id', 'item_id']]
    per_user = per_user.rename(columns={'item_id': column})
    return features.merge(per_user, how='left', on='user_id')


# (value in the `action` column, feature-name prefix)
ACTION_PREFIXES = [('alipay', 'purchase'), ('click', 'click'), ('cart', 'cart')]
# (count frame for the window, feature-name suffix)
WINDOW_SUFFIXES = [(buyer, ''), (buyer_last_month, '_last_m'), (buyer_recent, '_recent')]

for action, prefix in ACTION_PREFIXES:
    for counts, suffix in WINDOW_SUFFIXES:
        b = add_action_count(b, counts, action, prefix + suffix)

In [33]:
# Conversion ratios: purchases per click and purchases per cart event, for
# each of the three windows (whole history, last month, recent).
# A ratio is NaN whenever either count is missing for the user.
for suffix in ('', '_last_m', '_recent'):
    purchases = b['purchase' + suffix]
    b['click_buy_ratio' + suffix] = purchases / b['click' + suffix]
    b['cart_buy_ratio' + suffix] = purchases / b['cart' + suffix]

In [34]:
# Missing counts and ratios become 0, then user_id becomes the row index.
b = b.fillna(0).set_index('user_id')

## action-based features (2)

In [38]:
# last purchase: whole days between each user's most recent 'alipay' event in
# the training window and 2013-09-01; NaN for users with no purchase.
df2 = train[train.action == 'alipay']

# One grouped max replaces the per-user loop, which re-filtered df2 for every
# user and relied on a ValueError from max() to detect "no purchase".
last_buy_time = pd.to_datetime(df2.vtime).groupby(df2.user_id).max()
days_since_buy = (dt.datetime(2013, 9, 1) - last_buy_time).dt.days

# user_id is the index of `b` after set_index('user_id'), but it turns into an
# ordinary column once a merge on 'user_id' has run; accept either layout.
user_ids = b['user_id'] if 'user_id' in b.columns else b.index.to_series()
b['last_buy'] = user_ids.map(days_since_buy)

In [55]:
# last click: whole days between each user's most recent click in the
# training window and 2013-09-01; NaN for users with no click.
#
# `df3` was referenced here without ever being defined in the notebook. By
# analogy with df2 ('alipay') and df4 ('cart') it is taken to be the click
# events -- TODO confirm this matches the frame used originally.
df3 = train[train.action == 'click']

last_click_time = pd.to_datetime(df3.vtime).groupby(df3.user_id).max()
to_merge = (
    (dt.datetime(2013, 9, 1) - last_click_time)
    .dt.days
    .rename('last_click')
    .reset_index()
)

# Left merge keeps every row of `b`. If user_id is currently the index of
# `b`, this merge returns a frame with a default integer index and user_id
# as a regular column.
b = b.merge(to_merge[['user_id', 'last_click']], how='left', on='user_id')

In [39]:
# last day of cart: whole days between each user's most recent 'cart' event in
# the training window and 2013-09-01; NaN for users with no cart event.
df4 = train[train.action == 'cart']

# One grouped max replaces the per-user loop, which re-filtered df4 for every
# user and relied on a ValueError from max() to detect "no cart event".
last_cart_time = pd.to_datetime(df4.vtime).groupby(df4.user_id).max()
days_since_cart = (dt.datetime(2013, 9, 1) - last_cart_time).dt.days

# When the cells run top to bottom, the last-click merge has already moved
# user_id out of the index, so iterating b.index would yield row numbers
# rather than user ids. Look the ids up from wherever they currently live.
user_ids = b['user_id'] if 'user_id' in b.columns else b.index.to_series()
b['last_cart'] = user_ids.map(days_since_cart)

In [75]:
# Flags for activity after the last purchase: 1 when the last click (or cart
# event) is fewer days back than the last purchase, otherwise 0.
# A NaN on either side makes the comparison False, so the flag is 0.
b['click_after_purchase'] = b.last_buy.sub(b.last_click).gt(0).astype(int)
b['cart_after_purchase'] = b.last_buy.sub(b.last_cart).gt(0).astype(int)

## outcome

In [76]:
# outcome: per-user, per-action event counts within the held-out month,
# flattened back to ordinary columns (user_id, action, item_id).
g = out.groupby(['user_id', 'action'])[['item_id']].count().reset_index()

In [77]:
# Users with at least one 'alipay' row in the outcome month get the label 1.0
# (stored in a column still called item_id at this point).
# .assign() builds a new frame, so the label is no longer written into a
# filtered slice of `g`, which triggered pandas' SettingWithCopyWarning.
to_merge = g.loc[g.action == 'alipay', ['user_id']].assign(item_id=1.0)
b = b.merge(to_merge, how='left', on='user_id')

In [78]:
# Name the label column 'outcome'; users without a merged label get 0.
b = b.rename(columns={'item_id': 'outcome'})
b = b.assign(outcome=b['outcome'].fillna(0))

In [79]:
# Write the finished buyer table (features plus outcome) to CSV, index included.
output_path = 'buyer_test.csv'
b.to_csv(output_path)