# thoughts

- First, predict which users will buy on the target day.
- Get the item categories those users viewed.
- Keep only the target categories.
- For each category, recommend its most-bought item.

In [None]:
import numpy as np
import pandas as pd
from time import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [None]:
# Seed the global NumPy RNG so the random down-sampling of non-buyers
# (np.random.choice in prepare) is repeatable between runs.
np.random.seed(1)

# User behaviour log; `time` is converted to datetimes so it can be compared
# against pd.Timestamp window boundaries.
part_events=pd.read_csv('raw_data/partial_events.csv')
part_events['time']=pd.DatetimeIndex(part_events.time)

# Item table: the ids and categories that predictions are evaluated on.
all_items=pd.read_csv('raw_data/tianchi_fresh_comp_train_item.csv')
target_items=set(all_items.item_id)
target_cates=set(all_items.item_category)
# item_id -> item_category lookup, built in one pass instead of a per-row
# .loc loop; for a repeated item_id the last row wins, as before.
target_item_cates=dict(zip(all_items.item_id,all_items.item_category))

# Column names given to the per-behaviour-type count columns, in the sorted
# order of the behavior_type codes.
action_types=['browsed','collected','carted','bought']

In [None]:
def _pair_keys(frame):
    """Return the set of distinct 'user_item' string keys in *frame*."""
    if len(frame)==0:
        # An empty frame may not even have the id columns.
        return set()
    return set(frame.user_id.map(str)+'_'+frame.item_id.map(str))


def score(y_true,y_pred):
    """Compute F1, precision and recall over distinct (user_id, item_id) pairs.

    Args:
        y_true: DataFrame with `user_id` and `item_id` columns (actual pairs).
        y_pred: DataFrame with `user_id` and `item_id` columns (predicted pairs).

    Returns:
        Tuple `(f1, precision, recall)`. All three are 0.0 when no predicted
        pair is correct (including when either input is empty).
    """
    true_vals=_pair_keys(y_true)
    pred_vals=_pair_keys(y_pred)
    hits=len(true_vals&pred_vals)
    if hits==0:
        # Avoids dividing by zero for empty inputs or a zero prec+rec.
        return 0.0,0.0,0.0
    # Hits are counted on distinct pairs, so the denominators must be the
    # distinct-pair counts too; using row counts would let duplicate rows
    # (e.g. the same item bought twice) deflate the metrics.
    prec=hits/len(pred_vals)
    rec=hits/len(true_vals)
    return 2*prec*rec/(prec+rec),prec,rec

## train

In [None]:
def prepare(events,obs_day,buy_day):
    """Build a per-user feature table and buy labels from an observation window.

    Every user with at least one event in [obs_day, buy_day) gets one row
    holding the number of events of each behaviour type in that window. The
    label is 1 when the user has a purchase event (behavior_type 4) within
    the 24 hours starting at buy_day. Non-buyers are randomly down-sampled
    with the global NumPy RNG to the number of buyers.

    Args:
        events: DataFrame with `user_id`, `item_id`, `behavior_type` and
            `time` columns.
        obs_day: Timestamp, inclusive start of the observation window.
        buy_day: Timestamp, exclusive end of the observation window and start
            of the labelled day.

    Returns:
        `(X_train, y_train)`: the count features (columns named after
        `action_types`) and the 0/1 labels, buyers first.
    """
    in_window=(events.time>=obs_day)&(events.time<buy_day)
    obs_events=events[in_window]

    on_buy_day=(events.time>=buy_day)&(events.time<buy_day+pd.Timedelta(days=1))
    buy_users=set(events.loc[(events.behavior_type==4)&on_buy_day,'user_id'])

    obs_data=obs_events.pivot_table(index='user_id',columns='behavior_type',
                                    values='item_id',aggfunc='count')
    # Force exactly one column per behaviour code so the rename below cannot
    # fail when a code is absent from this window.
    # Assumes the codes are 1..len(action_types) in the order of action_types
    # (4 == bought, as used above) -- TODO confirm against the data set.
    obs_data=obs_data.reindex(columns=range(1,len(action_types)+1)).fillna(0)
    obs_data.columns=action_types
    obs_data['label']=obs_data.index.isin(buy_users).astype(int)

    buy_data=obs_data[obs_data.label==1]
    not_buy_data=obs_data[obs_data.label==0]
    # Balance the classes; never ask for more non-buyers than exist, which
    # would make np.random.choice(..., replace=False) raise.
    n_neg=min(len(buy_data),len(not_buy_data))
    if n_neg:
        sampled=np.random.choice(not_buy_data.index,n_neg,replace=False)
        not_buy_data=not_buy_data.loc[sampled]
    else:
        not_buy_data=not_buy_data.iloc[0:0]

    train_data=pd.concat([buy_data,not_buy_data])
    X_train=train_data.drop('label',axis=1)
    y_train=train_data['label']

    return X_train,y_train
    

In [None]:
# Training split: observe behaviour on 2014-12-14, label purchases made on
# 2014-12-15.
obs_day,buy_day=pd.Timestamp('2014-12-14'),pd.Timestamp('2014-12-15')

X_train,y_train=prepare(part_events,obs_day,buy_day)

# Buy / no-buy classifier over the per-user action counts.
lr=LogisticRegression()
lr.fit(X_train,y_train)



In [None]:
# Dev split: observe behaviour on 2014-12-16, evaluate purchases made on
# 2014-12-17.
obs_day,buy_day=pd.Timestamp('2014-12-16'),pd.Timestamp('2014-12-17')

# NOTE(review): prepare() down-samples non-buyers, so only the sampled users
# receive a prediction here -- confirm this is intended for evaluation.
X_dev,y_dev=prepare(part_events,obs_day,buy_day)

# Attach the predicted buy / no-buy label to each user's feature row.
X_dev=X_dev.assign(label=lr.predict(X_dev))


In [None]:
def get_truth(events,buy_day):
    """Return the purchase events of target items on the day starting at buy_day.

    Args:
        events: DataFrame with `behavior_type`, `time` and `item_id` columns.
        buy_day: Timestamp marking the start of the 24-hour evaluation day.

    Returns:
        The rows of `events` with behavior_type 4, a time in
        [buy_day, buy_day + 1 day) and an item_id in `target_items`.
    """
    day_end=buy_day+pd.Timedelta(days=1)
    # One boolean mask built with isin: unlike an element-wise apply, it stays
    # a boolean mask when no row matches the day, so all columns are kept.
    wanted=((events.behavior_type==4)
            &(events.time>=buy_day)
            &(events.time<day_end)
            &events.item_id.isin(target_items))
    return events[wanted]

In [None]:
# Users the classifier expects to buy on buy_day.
pred_users=X_dev[X_dev.label==1].index

# Only events on items of a target category are relevant.
events=part_events[part_events.item_category.isin(target_cates)]

# Per-item action counts used to rank items inside a category.
#  * Restricted to events before buy_day: counting purchases made on the
#    evaluation day itself would leak the answer into the ranking.
#  * Restricted to target items: only those can appear in the truth, and
#    only those have an entry in target_item_cates.
history=events[(events.time<buy_day)&events.item_id.isin(target_items)]
items_table=history.pivot_table(index='item_id',columns='behavior_type',
                                values='time',aggfunc='count')
# One column per behaviour code even if a code never occurs in the history.
# Assumes codes 1..len(action_types) in action_types order -- TODO confirm.
items_table=items_table.reindex(columns=range(1,len(action_types)+1)).fillna(0)
items_table.columns=action_types
items_table['item_category']=[target_item_cates[i] for i in items_table.index]

# Distinct (user, category) pairs the predicted buyers touched in the
# observation window, computed in one pass instead of filtering all events
# once per user.
obs_events=events[(events.time>=obs_day)&(events.time<buy_day)]
user_cates=obs_events.loc[obs_events.user_id.isin(pred_users),
                          ['user_id','item_category']].drop_duplicates()

# Most-bought item of each category that is actually needed: after sorting
# by purchase count, the first row seen for a category is its top item.
ranked=items_table.sort_values('bought',ascending=False)
top_items=ranked[~ranked.item_category.duplicated()]
seen_cates=set(user_cates.item_category)
most_pop_cate_items={cate:item
                     for cate,item in zip(top_items.item_category,top_items.index)
                     if cate in seen_cates}

# Recommend each category's top item to every predicted buyer who touched
# that category; categories without any target-item history are skipped.
pred_user_items=[{'user_id':u,'item_id':most_pop_cate_items[ic]}
                 for u,ic in zip(user_cates.user_id,user_cates.item_category)
                 if ic in most_pop_cate_items]

# Explicit columns keep y_pred well-formed even when nothing is predicted.
y_pred=pd.DataFrame(pred_user_items,columns=['user_id','item_id'])



In [None]:
# Actual target-item purchases on the evaluation day.
y_true=get_truth(part_events,buy_day)

# Cell output: number of truth rows, number of predicted rows, and the
# (F1, precision, recall) tuple.
(
    len(y_true),
    len(y_pred),
    score(y_true,y_pred),
)