In [None]:
import numpy as np
import pandas as pd
from time import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [None]:
# Raw inputs: the full user-event log, a partial event log used for the
# day-level analysis, and the catalogue of items to recommend.
all_events=pd.read_csv('raw_data/tianchi_fresh_comp_train_user.csv')
# Later cells read t.year/t.month/t.day from part_events.time and compare it with
# pd.Timestamp values, so the column must be parsed as datetimes at load time.
part_events=pd.read_csv('raw_data/partial_events.csv',parse_dates=['time'])
all_items=pd.read_csv('raw_data/tianchi_fresh_comp_train_item.csv')
# A set gives constant-time "is this item a recommendation target" checks.
target_items=set(all_items.item_id)
# Display names assigned positionally to the behavior_type pivot columns below;
# 'bought' corresponds to behavior_type==4 (presumably 1..3 are the others — confirm).
action_types=['browsed','collected','carted','bought']

In [None]:
# Size of the event log: number of rows and number of distinct users.
'total events count %d, total users %d' % (all_events.shape[0], all_events['user_id'].unique().size)

In [None]:
# Number of distinct (user, item) combinations in the event log.
'total user item pair %d' % all_events.groupby(['user_id','item_id']).ngroups

In [None]:
# Number of distinct (user, item) combinations with at least one behavior_type==4 event.
'total buy events %d' % all_events.loc[all_events['behavior_type']==4].groupby(['user_id','item_id']).ngroups

## sample users

In [None]:
# Use the first 500 distinct users (in order of first appearance in the log)
# as a small working sample and keep only their events.
all_users=all_events.user_id.unique()
sample_users=all_users[:500]
# Vectorised membership test: the previous row-wise `uid in sample_users` lambda
# scanned the whole array once per event. `.copy()` makes the sample an independent
# frame, so the later assignment to its `time` column does not write into a slice.
sample_events=all_events[all_events.user_id.isin(sample_users)].copy()
len(sample_events)

In [None]:
# Number of distinct items touched by the sampled users.
sample_events['item_id'].unique().size

In [None]:
# For each (user, item) pair, count the events of every behavior type;
# a pair with no event of a given type gets 0.
behaves=(
    sample_events[['user_id','item_id','behavior_type','time']]
    .pivot_table(index=['user_id','item_id'],columns='behavior_type',aggfunc='count')
    .fillna(0)
)
# Positional rename: relies on the pivot producing exactly four behavior columns.
behaves.columns=action_types
# Pairs that ended in at least one purchase.
buy_behaves=behaves.loc[behaves['bought']>0]
len(behaves),len(buy_behaves)

In [None]:
# (user, item) pairs that never ended in a purchase.
nbuy_behaves=behaves.loc[behaves['bought'].eq(0)]

In [None]:
# Distinct users that appear in buy_behaves. `index.levels[0]` is not pruned by the
# boolean filter that built buy_behaves, so it would still list every user present in
# `behaves`; reading the level values of the remaining rows gives only the buyers.
buy_behaves.index.get_level_values(0).unique()

### Do other events on an item precede its purchase within n days?

In [None]:
# Parse the sample's timestamps, then split it into purchases (behavior_type 4) and
# all other events. New frames are built with assign/.copy() rather than writing
# into filtered slices, which avoids pandas' SettingWithCopyWarning.
sample_events=sample_events.assign(time=pd.to_datetime(sample_events.time))
buy_events=sample_events[sample_events.behavior_type==4].copy()
nbuy_events=sample_events[sample_events.behavior_type!=4]

# NOTE(review): n_days is never used below — before_n counts *all* earlier non-buy
# events, not only those within n_days of the purchase. Confirm the intended window.
n_days=1
# For every purchase, count the same user's non-buy events on the same item that
# happened strictly earlier than the purchase.
buy_events['before_n']=buy_events.apply(lambda e:int(((nbuy_events.user_id==e.user_id)&(nbuy_events.item_id==e.item_id)&(nbuy_events.time<e.time)).sum()), axis=1)
# (number of purchases, number of purchases with no earlier non-buy event)
len(buy_events),len(buy_events[buy_events.before_n==0])

It seems that about half of the non-buy events happen on the same day as the buy event.

## items

In [None]:
# Catalogue size, plus the number of distinct categories and geohashes in it
# (unique() keeps a missing value as one distinct entry).
item_size=all_items.shape[0]
cat_size=all_items['item_category'].unique().size
geo_size=all_items['item_geohash'].unique().size
'recommend items %d, types %d, geo %d' % (item_size,cat_size,geo_size)

In [None]:
# Share of catalogue rows whose item_geohash is missing.
'geo missing %.4f'%(1-all_items['item_geohash'].notna().sum()/all_items.shape[0])

Is every category the same size?

In [None]:
# Items per category, largest category first (single-column frame named item_id).
cate_summ=all_items.groupby('item_category')[['item_id']].count().sort_values(by='item_id',ascending=False)

In [None]:
# Cumulative, normalised histogram of category sizes (items per category).
# `density=True` replaces the `normed` keyword, which newer Matplotlib releases removed.
cate_summ.hist(cumulative=True,density=True,figsize=(20,5),bins=300)

In [None]:
# Number of categories that contain exactly one item.
'only has 1 item categories %d' % int((cate_summ['item_id']==1).sum())

In [None]:
# Average and median category size. The original line had unbalanced parentheses
# (SyntaxError) and passed a whole row to %d; take the scalar item count instead.
# cate_summ is sorted by size, so its middle row holds the median category size.
'cate size avg %d, mid %d' % (item_size//cat_size, cate_summ['item_id'].iloc[len(cate_summ)//2])

In [None]:
# Share of all items that fall into the 10 / 100 largest categories.
# Summing the item_id column yields scalars; np.sum over the frame returned a
# Series, which %.2f could only format through implicit float(Series) conversion.
'top 10 items ratio %.2f, top 100 %.2f' % (cate_summ['item_id'].iloc[:10].sum()/item_size, cate_summ['item_id'].iloc[:100].sum()/item_size)

In [None]:
# Fraction of target items that occur at least once in the training events.
train_items=set(all_events.item_id.unique())
'target_items_in_train / target_items: %.2f ' % (len(target_items & train_items)/len(target_items))

## user_events

In [None]:
# Number of distinct items that occur in the training events.
'%d items in train events ' % all_events['item_id'].unique().size

### Is the number of events the same every day?

In [None]:
# Calendar day (midnight timestamp) of every event. Vectorised replacement for the
# per-row pd.Timestamp(year, month, day) construction; pd.to_datetime also accepts
# timestamps that are still strings, on which the `t.year` lambda would fail.
part_events['day']=pd.to_datetime(part_events.time).dt.normalize()

In [None]:
# Bar chart of the number of events recorded on each day.
part_events.groupby('day')[['time']].count().plot.bar(figsize=(20,5))

it seems the last day's data is normal

### Are all bought items in the target items?

In [None]:
# All purchase events (behavior_type 4) in the partial log.
# Note: this rebinds buy_events, which an earlier cell derived from sample_events.
buy_events=part_events.loc[part_events['behavior_type']==4]

# Share of those purchases whose item is one of the target items.
'in target ratio %.2f' % (int(buy_events['item_id'].isin(target_items).sum()) / len(buy_events))

Target items make up only part of the items bought each day.

### Before a user buys an item, do they view similar items in the days before?

In [None]:
def count_sim_visits(during,obs_day=pd.Timestamp(year=2014,month=11,day=19)):
    """Print how purchases on the check day relate to earlier same-category visits.

    Reads the module-level ``part_events`` and ``target_items``.

    The observation window is ``[obs_day, obs_day + during days)``; the check day is
    the single day right after it. A "similar visit" is an observation-window event
    whose (user_id, item_category) matches a target-item purchase on the check day.

    Parameters
    ----------
    during : int
        Length of the observation window in days.
    obs_day : pd.Timestamp, optional
        First day of the observation window (defaults to 2014-11-19).

    Prints the purchase count, the similar-visit count and purchases/visits
    (``nan`` when there are no similar visits).
    """
    chk_day=obs_day+pd.Timedelta(during,'d')
    next_day=chk_day+pd.Timedelta(1,'d')

    obs_events=part_events[(part_events.time>=obs_day)&(part_events.time<chk_day)]
    bought=part_events[(part_events.behavior_type==4)&(part_events.time>=chk_day)&(part_events.time<next_day)]
    buy_events=bought[bought.item_id.isin(target_items)]

    buy_times=len(buy_events)
    # One pass over each frame with a set lookup, instead of filtering buy_events
    # again for every observed event.
    bought_pairs=set(zip(buy_events.user_id,buy_events.item_category))
    sim_visits=sum(pair in bought_pairs for pair in zip(obs_events.user_id,obs_events.item_category))

    # Guard against a window without any similar visit (previously ZeroDivisionError).
    ratio=buy_times/sim_visits if sim_visits else float('nan')
    print('%ddays, buy %d, similar visits %d, ratio %.2f' % (during, buy_times, sim_visits, ratio))

# Compare observation windows of 1, 2 and 3 days.
for d in range(1,4):
    count_sim_visits(d)

About 25% of similar items will be bought the next day.

### Before buying an item, is the user active in the days before?

In [None]:
def count_active_users(during,obs_day=pd.Timestamp(year=2014,month=11,day=19)):
    """Print the number of check-day buyers relative to users active beforehand.

    Reads the module-level ``part_events``.

    The observation window is ``[obs_day, obs_day + during days)``; the check day is
    the single day right after it.

    Parameters
    ----------
    during : int
        Length of the observation window in days.
    obs_day : pd.Timestamp, optional
        First day of the observation window (defaults to 2014-11-19).

    Prints the distinct buyers on the check day, the distinct users with any event
    in the window, and buyers/active users (``nan`` when nobody was active).
    """
    chk_day=obs_day+pd.Timedelta(during,'d')
    next_day=chk_day+pd.Timedelta(1,'d')

    obs_events=part_events[(part_events.time>=obs_day)&(part_events.time<chk_day)]
    buy_events=part_events[(part_events.behavior_type==4)&(part_events.time>=chk_day)&(part_events.time<next_day)]

    # NOTE(review): buyers are not restricted to users seen in the observation window,
    # so the ratio is "buyers per active user", not "share of active users who buy".
    buy_users=len(buy_events.user_id.unique())
    active_users=len(obs_events.user_id.unique())

    # Guard against an empty observation window (previously ZeroDivisionError).
    ratio=buy_users/active_users if active_users else float('nan')
    print('%ddays, buy users %d, active users %d, ratio %.2f' % (during, buy_users, active_users, ratio))

# Compare observation windows of 1, 2 and 3 days.
for d in range(1,4):
    count_active_users(d)

About 20% of active users will buy on the next day.

### action ratios

In [None]:
# Percentage of all events that belongs to each behavior type.
all_evts_num=all_events.shape[0]
action_nums=all_events.groupby('behavior_type')['item_id'].count()
'action ratios %s ' % (action_nums/all_evts_num*100)

In [None]:
# For each (item, category) pair, count the events of every behavior type;
# a pair with no event of a given type gets 0.
item_actions=(
    all_events[['item_id','item_category','behavior_type','user_id']]
    .pivot_table(index=['item_id','item_category'],columns='behavior_type',aggfunc='count')
    .fillna(0)
)
# Positional rename: relies on the pivot producing exactly four behavior columns.
item_actions.columns=action_types

In [None]:
# The 20 most-browsed items.
item_actions.sort_values(by='browsed',ascending=False).head(20)

In [None]:
# The 20 least-browsed items (tail of the descending sort).
item_actions.sort_values(by='browsed',ascending=False).tail(20)

In [None]:
# Items bought at least once, and "unpopular" ones: bought exactly once and never browsed.
'total buy items %d, unpop items %d' %(int((item_actions['bought']>0).sum()),int(((item_actions['bought']==1)&(item_actions['browsed']==0)).sum()))

## predict active users

Split users into 2 classes: buy or not-buy.

Resample the not-buy users to build negative samples.

Summarize each user's actions as features.

Use logistic regression (LR) to predict.

In [None]:
# obs_day=pd.Timestamp(year=2014,month=11,day=19)
# buy_day=pd.Timestamp(year=2014,month=11,day=20)
# test_day=pd.Timestamp(year=2014,month=11,day=21)

# obs_events=part_events[(part_events.time>=obs_day)&(part_events.time<buy_day)]
# obs_users=obs_events.user_id.unique()
# buy_events=part_events[(part_events.behavior_type==4)&(part_events.time>=buy_day)&(part_events.time<test_day)]
# buy_users=buy_events.user_id.unique()


# obs_data=obs_events[['user_id','behavior_type','item_id']].pivot_table(index='user_id',columns='behavior_type',aggfunc='count').fillna(0)
# obs_data['label']=[int(u in buy_users) for u in obs_data.index]

# buy_data=obs_data[obs_data.label==1]
# not_buy_data=obs_data[obs_data.label==0]

# np.random.seed(1)

# scores=[]
# lr=LogisticRegression()
# for i in range(3):
#     not_buy_data=not_buy_data.loc[np.random.choice(not_buy_data.index,len(buy_data),replace=False)]

#     train_data=pd.concat([buy_data,not_buy_data])
#     X_train=train_data.drop('label',axis=1)
#     y_train=train_data['label']

    
#     scores+=list(cross_val_score(lr,X_train,y_train,cv=5))
    
# np.mean(scores),np.std(scores)

In [None]:
# lr.fit(X_train,y_train)

In [None]:
# lr.coef_

test lr

In [None]:
# obs_day=pd.Timestamp(year=2014,month=11,day=21)
# buy_day=pd.Timestamp(year=2014,month=11,day=22)

# obs_events=part_events[(part_events.time>=obs_day)&(part_events.time<buy_day)]
# buy_events=part_events[(part_events.behavior_type==4)&(part_events.time>=buy_day)&(part_events.time<buy_day+pd.Timedelta(1,'d'))]


# obs_data=obs_events[['user_id','behavior_type','item_id']].pivot_table(index='user_id',columns='behavior_type',aggfunc='count').fillna(0)
# labels=lr.predict(obs_data)
# probs=lr.predict_proba(obs_data)[:,1]
# obs_data['label']=labels
# obs_data['prob']=probs
# len(obs_data),np.sum(obs_data.label)

In [None]:
# len(buy_events),len(buy_events.user_id.unique())

In [None]:
def score(y_true,y_pred):
    """Set-based F1, precision and recall of predicted ids against true ids.

    Parameters
    ----------
    y_true : iterable of hashable
        Ids that are actually positive.
    y_pred : iterable of hashable
        Ids predicted as positive.

    Returns
    -------
    tuple of float
        ``(f1, precision, recall)``. Each value is 0.0 when it would otherwise
        require dividing by zero (empty input or no overlap).
    """
    true_vals=set(y_true)
    pred_vals=set(y_pred)
    hits=len(true_vals&pred_vals)
    # Hits are counted over distinct ids, so the denominators use the distinct
    # counts as well; duplicates in the inputs no longer distort the ratios.
    prec=hits/len(pred_vals) if pred_vals else 0.0
    rec=hits/len(true_vals) if true_vals else 0.0
    if hits==0:
        # prec+rec is 0 here; F1 is defined as 0 rather than raising ZeroDivisionError.
        return 0.0,prec,rec
    return 2*prec*rec/(prec+rec),prec,rec

# Score all users predicted as buyers against the users who actually bought.
# NOTE(review): in this view of the notebook obs_data is only created inside
# commented-out cells, so on a fresh kernel this call raises NameError — confirm
# whether the "test lr" cell should be restored.
score(buy_events.user_id.unique(),obs_data[obs_data.label==1].index)

In [None]:
# Same scoring, restricted to the 200 predicted buyers with the highest probability.
# NOTE(review): depends on obs_data (with 'label' and 'prob' columns), which in this
# view of the notebook is only created inside commented-out cells.
score(buy_events.user_id.unique(),obs_data[obs_data.label==1].sort_values('prob',ascending=False)[:200].index)

About 30% of the predicted buy users are correct.

In [None]:
# tmp_tab=part_events[['item_id','behavior_type','time']]. \
#     pivot_table(index=['item_id'],columns=['behavior_type'],aggfunc='count').fillna(0).head()

In [None]:
# tmp_tab.columns=action_types
# tmp_tab['item_category']=[lambda i:all_items]

## buy actions

In [None]:
# Show the column names of the full event log.
all_events.columns

In [None]:
# Per (user, item) pair, count the events of each behavior type (0 when none).
# The previous pivot selected only the key columns, leaving nothing to aggregate,
# and used aggfunc='sum' rather than a count; groupby/size counts rows directly.
all_behaves=part_events.groupby(['user_id','item_id','behavior_type']).size().unstack(fill_value=0)
all_behaves.head()