In [None]:
import numpy as np
import pandas as pd
from time import time
%matplotlib inline

In [None]:
# Item table; its item_id values form the set that predictions are restricted to.
all_items = pd.read_csv('raw_data/tianchi_fresh_comp_train_item.csv')
target_items = set(all_items['item_id'])

# User behaviour logs: the complete log, plus a second events file
# (presumably a smaller sample of the same log for fast iteration -- confirm).
all_events = pd.read_csv('raw_data/tianchi_fresh_comp_train_user.csv')
part_events = pd.read_csv('raw_data/partial_events.csv')

# Position i in this list names behavior_type code i+1 (code 4 is a purchase).
action_types = ['browsed', 'collected', 'carted', 'bought']

In [None]:
def score(y_true,y_pred):
    """Compute F1, precision and recall over unique (user_id, item_id) pairs.

    Parameters
    ----------
    y_true : DataFrame with ``user_id`` and ``item_id`` columns (reference pairs).
    y_pred : DataFrame with ``user_id`` and ``item_id`` columns (predicted pairs).

    Returns
    -------
    (f1, precision, recall) as floats. Duplicate rows in either frame are
    counted once, and every metric is 0.0 when it would otherwise require a
    division by zero (empty frame or no hits).
    """
    def _pairs(frame):
        # Compare ids by their string form so int and str ids still match.
        return set(zip(frame.user_id.astype(str),frame.item_id.astype(str)))

    true_pairs=_pairs(y_true)
    pred_pairs=_pairs(y_pred)
    hits=len(true_pairs & pred_pairs)

    # Denominators are the number of *unique* pairs, consistent with `hits`.
    prec=hits/len(pred_pairs) if pred_pairs else 0.0
    rec=hits/len(true_pairs) if true_pairs else 0.0
    if hits==0:
        return 0.0,prec,rec
    return 2*prec*rec/(prec+rec),prec,rec

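Start from the first day of the log.

Use the events of the observation window (a single day in the simplest setting) as the observed features.

Label each (user, item) pair by whether it was bought on the day right after that window.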

In [None]:
# Model inputs: one count column per behaviour type (same list object as action_types).
feature_cols=action_types

def prepare_train(train_events,beg_day,during=3):
    """Build a balanced training sample of (user, item) pairs.

    Events in ``[beg_day, beg_day + during days)`` provide the features; purchases
    (``behavior_type == 4``) on the following day provide the labels.

    Parameters
    ----------
    train_events : DataFrame with ``user_id``, ``item_id``, ``behavior_type`` and
        ``time`` columns. Its ``time`` column is converted to datetime IN PLACE.
    beg_day : pd.Timestamp, first instant of the observation window.
    during : length of the observation window in days.

    Returns
    -------
    x_train : DataFrame with ``user_id``, ``item_id`` and one count column per
        entry of ``action_types``. Every pair appears exactly once, in order of
        first appearance in the window.
    y_train : list of 0/1 labels aligned with ``x_train``. Positives are all pairs
        seen in the window and bought on the label day; negatives are the first
        other pairs seen in the window, at most as many as there are positives.
    y_val : purchase events of the label day restricted to ``target_items``.
    """
    pair_cols=['user_id','item_id']
    columns=pair_cols+list(action_types)

    # Deliberately in place: prepare_test reads train_events.time as datetimes.
    train_events['time']=pd.DatetimeIndex(train_events.time)
    end_day=beg_day+pd.Timedelta(during,'d')
    label_end=end_day+pd.Timedelta(1,'d')

    x_events=train_events[(train_events.time>=beg_day) & (train_events.time<end_day)]
    y_events=train_events[(train_events.time>=end_day) & (train_events.time<label_end)]

    buy_events=y_events[y_events.behavior_type==4]
    y_val=buy_events[buy_events.item_id.isin(target_items)]

    # One candidate row per pair seen in the window; the left merge keeps the
    # first-appearance order and gives the result a fresh 0..n-1 index.
    seen_pairs=x_events[pair_cols].drop_duplicates()
    bought_pairs=buy_events[pair_cols].drop_duplicates().assign(label=1)
    candidates=seen_pairs.merge(bought_pairs,on=pair_cols,how='left')
    candidates['label']=candidates['label'].fillna(0).astype(int)

    # Balance the classes: keep every positive and the same number of negatives.
    positives=candidates[candidates.label==1]
    negatives=candidates[candidates.label==0].head(len(positives))
    sample=pd.concat([positives,negatives]).sort_index()

    if sample.empty:
        return pd.DataFrame(columns=columns),[],y_val

    # Count events per (pair, behaviour type) in a single pass over the window.
    counts=(x_events.groupby(pair_cols+['behavior_type']).size()
            .unstack('behavior_type',fill_value=0)
            .reindex(columns=list(range(1,len(action_types)+1)),fill_value=0))
    counts.columns=list(action_types)

    x_train=sample[pair_cols].merge(counts.reset_index(),on=pair_cols,how='left')
    return x_train[columns],sample['label'].tolist(),y_val

def prepare_test(train_events,during=3):
    """Build the feature table for the last ``during`` days of the log.

    Parameters
    ----------
    train_events : DataFrame with ``user_id``, ``item_id``, ``behavior_type`` and
        ``time`` columns. ``time`` may hold datetimes or parseable strings; the
        frame is not modified.
    during : length of the window, counted back from the latest event time.

    Returns
    -------
    DataFrame with one row per (user_id, item_id) pair seen in the window, in
    order of first appearance, and one count column per entry of
    ``action_types``. Empty (but with the same columns) when there are no events.
    """
    pair_cols=['user_id','item_id']
    columns=pair_cols+list(action_types)

    # Parse locally so this works even if prepare_train never converted the column.
    times=pd.DatetimeIndex(train_events.time)
    if len(times)==0:
        return pd.DataFrame(columns=columns)

    # max() ignores missing timestamps, so one NaT cannot empty the window.
    beg_day=times.max()-pd.Timedelta(during,'d')
    test_events=train_events[times>=beg_day]
    if test_events.empty:
        return pd.DataFrame(columns=columns)

    # Count events per (pair, behaviour type) in a single pass over the window.
    counts=(test_events.groupby(pair_cols+['behavior_type']).size()
            .unstack('behavior_type',fill_value=0)
            .reindex(columns=list(range(1,len(action_types)+1)),fill_value=0))
    counts.columns=list(action_types)

    # The left merge keeps pairs in first-appearance order.
    pairs=test_events[pair_cols].drop_duplicates()
    test_data=pairs.merge(counts.reset_index(),on=pair_cols,how='left')
    return test_data[columns]




In [None]:
# x_train is otherwise first assigned in the model-fitting cell further down,
# so build it here to keep this summary working on a fresh kernel (Run All).
x_train=prepare_train(part_events,pd.Timestamp(year=2014,month=11,day=18))[0]
x_train.describe()

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline on the partial log: default 3-day feature window starting 2014-11-18.
# The cell shows (training accuracy, elapsed seconds).
ti = time()

x_train, y_train, _ = prepare_train(part_events, pd.Timestamp('2014-11-18'))

# The id columns identify the pair only; they are not model inputs.
lr = LogisticRegression()
lr.fit(x_train.drop(columns=['user_id', 'item_id']), y_train)
(lr.score(x_train.drop(columns=['user_id', 'item_id']), y_train), time() - ti)

In [None]:
# Held-out check: same sampling scheme on a later window (starting 2014-12-15),
# scored with the model fitted on the earlier window.
x_dev, y_dev, y_val = prepare_train(part_events, pd.Timestamp('2014-12-15'))
lr.score(x_dev.drop(columns=['user_id', 'item_id']), y_dev)

In [None]:
# F1 / precision / recall of the positively labelled dev pairs against y_val.
# NOTE(review): 'label' is the ground-truth y_dev returned by prepare_train, not
# lr.predict output, so this looks like a ceiling for the candidate set rather
# than a model score -- confirm intent.
x_dev['label'] = y_dev
score(y_val, x_dev.loc[x_dev['label'] == 1])

In [None]:
# Rebuild train and dev samples from the complete log with a 1-day feature
# window; the cell shows the elapsed seconds for both preparations.
ti = time()

x_train, y_train, _ = prepare_train(all_events, pd.Timestamp('2014-11-18'), during=1)

x_dev, y_dev, y_val = prepare_train(all_events, pd.Timestamp('2014-12-10'), during=1)

time() - ti

In [None]:
# Refit on the full-log sample using only the behaviour-count columns,
# then show the training accuracy.
lr = LogisticRegression().fit(x_train[feature_cols], y_train)
lr.score(x_train[feature_cols], y_train)

In [None]:
# x_dev / y_dev / y_val for the 2014-12-10 window were already built in the
# timed preparation cell above; prepare_train is deterministic, so reuse them
# instead of repeating the expensive pass over the full log.
lr.score(x_dev[feature_cols],y_dev)

In [None]:
# F1 / precision / recall of the positively labelled dev pairs against y_val.
# NOTE(review): 'label' is the ground-truth y_dev returned by prepare_train, not
# lr.predict output, so this looks like a ceiling for the candidate set rather
# than a model score -- confirm intent.
x_dev['label'] = y_dev
score(y_val, x_dev.loc[x_dev['label'] == 1])

In [None]:
# Score every (user, item) pair seen at the end of the log and show the elapsed seconds.
ti=time()
# The current model was fitted on 1-day feature windows (during=1), so the test
# counts must be aggregated over the same window length to be comparable.
test_data=prepare_test(all_events,during=1)
test_data['prob']=lr.predict_proba(test_data[feature_cols])[:,1]

time()-ti

In [None]:
# Rank all scored pairs by purchase probability (highest first), then keep
# only the pairs whose item is in the target item set.
preds = test_data.sort_values('prob', ascending=False)
preds = preds[preds.item_id.isin(target_items)]

In [None]:
# Lowest probability among the rows that are exported (the first 1000 of preds).
# Slicing instead of indexing row 1000 avoids reading the first excluded row
# and does not fail when there are fewer than 1001 candidates.
preds.prob.iloc[:1000].min()

In [None]:
# Hard 0/1 predictions from the fitted model; the pair shown counts the
# predicted positives in two ways (row count and sum of the labels).
test_data['label'] = lr.predict(test_data[feature_cols])
(len(test_data.loc[test_data['label'] == 1]), np.sum(test_data['label']))

In [None]:
# Write the 1000 highest-ranked (user_id, item_id) pairs, without the index column.
preds[['user_id', 'item_id']].iloc[:1000].to_csv('output/lr_1day_top1000.csv', index=False)