In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# Prepare the dataset

To prepare for the buyer features, we have to use the log dataset, which contains the behaviours of around 5000 frequent usersover 6 months.

In [3]:
df = pd.read_csv('log.csv')

# train test split

In [4]:
# ignore 2013-09, which serves as outcomes in test dataset
df['vmonth'] = df.vtime.str[: 7]
df_train = df[df.vmonth != '2013-09']

In [5]:
df_train.loc[:, 'vtime'] = df_train.vtime.apply(pd.to_datetime)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.loc[:, 'vtime'] = df_train.vtime.apply(pd.to_datetime)


In [6]:
# keep 2013-08 as outcome, and use data before that to create features
train = df_train[df_train.vmonth != '2013-08'].reset_index(drop=True)
out = df_train[df_train.vmonth == '2013-08'].reset_index(drop=True)

# buyer data

## action-based features

In [37]:
# this is the final dataframe of buyer features
buyer = list(set(df.user_id))
b = pd.DataFrame(index=buyer)
b = b.reset_index()
b = b.rename(columns={'index': 'user_id'})

In [38]:
# this is where we extract features related to historical actions from

# all behaviour
buyer = train.groupby(['user_id', 'action']).count()[['item_id']].reset_index()

# last_month behaviour
buyer_last_month = train[train.vmonth == '2013-07'].groupby(['user_id', 'action']).count()[['item_id']].reset_index()

# last ten days behaviour
buyer_recent = train[train.vtime > dt.datetime(2013, 7, 20)].groupby(['user_id', 'action']).count()[['item_id']].reset_index()

In [39]:
# purchase
to_merge = buyer[buyer.action == 'alipay'][['user_id', 'item_id']]
b = b.merge(to_merge, how='left', on='user_id')
b = b.rename(columns={'item_id': 'purchase'})

In [40]:
# purchase last month
to_merge = buyer_last_month[buyer_last_month.action == 'alipay'][['user_id', 'item_id']]
b = b.merge(to_merge, how='left', on='user_id')
b = b.rename(columns={'item_id': 'purchase_last_m'})

In [41]:
# purchase last ten day
to_merge = buyer_recent[buyer_recent.action == 'alipay'][['user_id', 'item_id']]
b = b.merge(to_merge, how='left', on='user_id')
b = b.rename(columns={'item_id': 'purchase_recent'})

In [42]:
# click
to_merge = buyer[buyer.action == 'click'][['user_id', 'item_id']]
b = b.merge(to_merge, how='left', on='user_id')
b = b.rename(columns={'item_id': 'click'})

In [43]:
# click last month
to_merge = buyer_last_month[buyer_last_month.action == 'click'][['user_id', 'item_id']]
b = b.merge(to_merge, how='left', on='user_id')
b = b.rename(columns={'item_id': 'click_last_m'})

In [44]:
# click last ten day
to_merge = buyer_recent[buyer_recent.action == 'click'][['user_id', 'item_id']]
b = b.merge(to_merge, how='left', on='user_id')
b = b.rename(columns={'item_id': 'click_recent'})

In [45]:
# cart
to_merge = buyer[buyer.action == 'cart'][['user_id', 'item_id']]
b = b.merge(to_merge, how='left', on='user_id')
b = b.rename(columns={'item_id': 'cart'})

In [46]:
# cart last month
to_merge = buyer_last_month[buyer_last_month.action == 'cart'][['user_id', 'item_id']]
b = b.merge(to_merge, how='left', on='user_id')
b = b.rename(columns={'item_id': 'cart_last_m'})

In [47]:
# click last ten day
to_merge = buyer_recent[buyer_recent.action == 'cart'][['user_id', 'item_id']]
b = b.merge(to_merge, how='left', on='user_id')
b = b.rename(columns={'item_id': 'cart_recent'})

In [48]:
# ratio
b['click_buy_ratio'] = b.purchase / b.click
b['cart_buy_ratio'] = b.purchase / b.cart

b['click_buy_ratio_last_m'] = b.purchase_last_m / b.click_last_m
b['cart_buy_ratio_last_m'] = b.purchase_last_m / b.cart_last_m

b['click_buy_ratio_recent'] = b.purchase_recent / b.click_recent
b['cart_buy_ratio_recent'] = b.purchase_recent / b.cart_recent

In [49]:
b = b.fillna(0)
b = b.set_index('user_id')

## action-based features (2)

In [50]:
# last purchase
df2 = train[train.action == 'alipay']
for id_ in b.index:
    try:
        b.loc[id_, 'last_buy'] = (dt.datetime(2013, 8, 1) - max(df2[df2.user_id == id_].vtime)).days
    except ValueError:
        b.loc[id_, 'last_buy'] = np.nan

In [51]:
# last click
df3 = train[train.action == 'click']
for id_ in b.index:
    try:
        b.loc[id_, 'last_click'] = (dt.datetime(2013, 8, 1) - max(df3[df3.user_id == id_].vtime)).days
    except ValueError:
        b.loc[id_, 'last_click'] = np.nan

In [52]:
# last day of cart
df4 = train[train.action == 'cart']
for id_ in b.index:
    try:
        b.loc[id_, 'last_cart'] = (dt.datetime(2013, 8, 1) - max(df4[df4.user_id == id_].vtime)).days
    except ValueError:
        b.loc[id_, 'last_cart'] = np.nan

In [53]:
# actions after last purchase
b['click_after_purchase'] = (b.last_buy - b.last_click > 0).astype(int)
b['cart_after_purchase'] = (b.last_buy - b.last_cart > 0).astype(int)

## outcome

In [58]:
#outcome
g = out.groupby(['user_id', 'action']).count()[['item_id']]
g = g.reset_index()

In [59]:
to_merge = g[g.action == 'alipay'][['user_id', 'item_id']]
to_merge['item_id'] = np.ones(len(to_merge))
b = b.merge(to_merge, how='left', on='user_id')

In [60]:
b = b.rename(columns={'item_id': 'outcome'})
b['outcome'] = b.outcome.fillna(0)

In [63]:
b.to_csv('buyer_train.csv')

# seller to user data

In [None]:
%store -r seller_id
%store -r user_id
user_id = list(user_id)
seller_id = list(seller_id)

In [None]:
b = pd.read_csv('buyer_train.csv')

In [None]:
train = train.reset_index(drop=True) 
train = train[train.user_id.isin(user_id)]
train = train[train.seller_id.isin(seller_id)]

In [None]:
# create index based on the first state prediction
index = pd.MultiIndex.from_product([user_id, seller_id], names=['user_id', 'seller_id'])
b_s = pd.DataFrame(index=index)

In [None]:
group = train.groupby(['user_id', 'seller_id', 'action']).count()
group = group.reset_index()

In [None]:
to_merge = group[group.action == 'alipay'][['user_id', 'seller_id', 'item_id']]
b_s = b_s.merge(to_merge, how='left', on=['user_id', 'seller_id'])

In [None]:
to_merge = group[group.action == 'click'][['user_id', 'seller_id', 'item_id']]
b_s = b_s.merge(to_merge, how='left', on=['user_id', 'seller_id'])

In [None]:
b_s = b_s.rename(columns={'item_id_x': 'purchase', 'item_id_y': 'click'})

In [None]:
to_merge = group[group.action == 'cart'][['user_id', 'seller_id', 'item_id']]
b_s = b_s.merge(to_merge, how='left', on=['user_id', 'seller_id'])
b_s = b_s.rename(columns={'item_id': 'cart'})

### last month

In [None]:
last_month = train[train.vmonth == '2013-07'].groupby(['user_id', 'seller_id', 'action']).count()[['item_id']]
last_month = last_month.reset_index()

In [None]:
to_merge = last_month[last_month.action == 'purchase'][['user_id', 'seller_id', 'item_id']]
b_s = b_s.merge(to_merge, how='left', on=['user_id', 'seller_id'])
b_s = b_s.rename(columns={'item_id': 'recent_purchase'})

In [None]:
to_merge = last_month[last_month.action == 'click'][['user_id', 'seller_id', 'item_id']]
b_s = b_s.merge(to_merge, how='left', on=['user_id', 'seller_id'])
b_s = b_s.rename(columns={'item_id': 'recent_click'})

In [None]:
to_merge = last_month[last_month.action == 'cart'][['user_id', 'seller_id', 'item_id']]
b_s = b_s.merge(to_merge, how='left', on=['user_id', 'seller_id'])
b_s = b_s.rename(columns={'item_id': 'recent_cart'})

### outcome

In [None]:
g = out.groupby(['user_id', 'seller_id', 'action']).count()
g = g.reset_index()
to_merge = g[g.action == 'alipay'][['user_id', 'seller_id', 'item_id']]
to_merge['item_id'] = np.ones(len(to_merge))

In [None]:
b_s = b_s.merge(to_merge, how='left', on=['user_id', 'seller_id'])
b_s = b_s.rename(columns={'item_id': 'outcome'})

In [None]:
b_s= b_s.fillna(0)

In [None]:
b_s.to_csv('b_s_train.csv', index=False)