In [28]:
import numpy as np
import pandas as pd
import time

In [37]:
def make_data_small(data):
    #後ろの1週間を除いて100ユーザー分のデータを取得
    former = data[data['time_stamp'] < '2017-04-24 00:00:00.000']
    users_small = former['user_id'].unique()[0:100]
    data_small = former[np.in1d(former['user_id'], users_small)]
    products_small = former['product_id'].unique()
    
    #後ろの1週間から同じユーザーとプロダクトのものを評価用に取得
    latter = data[data['time_stamp'] >= '2017-04-24 00:00:00.000']
    latter = latter[np.in1d(latter['user_id'], users_small)]
    latter = latter[np.in1d(latter['product_id'], products_small)]
    
    test_small = pd.DataFrame(latter['user_id'].unique())
    test_small.columns = ['user_id']
    test_small_ans = latter
    
    return data_small, test_small, test_small_ans

In [38]:
#小さいデータセットを作成

filename = 'data/train/train_C.tsv'
train = pd.read_table(filename)
train_small, test_small, test_small_ans = make_data_small(train)

In [39]:
#データサイズ確認

users_small = train_small['user_id'].unique()
products_small = train_small['product_id'].unique()
print("event: " + str(len(train_small)))
print("user: " + str(len(users_small)))
print("product: " + str(len(products_small)))

event: 7979
user: 100
product: 4329


In [40]:
def make_crossmat(data, users = None, products = None):
    if users == None:
        users = data['user_id'].unique()
    if products == None:
        products = data['product_id'].unique()
    
    #ユーザーとプロダクトを行列のインデックスに変換
    data['user_id_int'] = data['user_id'].map(lambda x: np.where(users == x)[0][0])
    data['product_id_int'] = data['product_id'].map(lambda x: np.where(products == x)[0][0])
    
    #各イベントごとにカウント
    mats = np.zeros((4, len(users), len(products)))
    def count_event(event):
        mats[event['event_type'], event['user_id_int'], event['product_id_int']] += 1
        return 0
    train_small.apply(count_event, axis=1)
    
    #スコアの重みをかけて足す
    scores = np.array([
        3, #0カート
        1, #1閲覧
        2, #2クリック
        4  #3コンバージェンス
    ])
    crossmat = np.einsum('ijk,i', mats, scores)
    
    return mats, crossmat

In [41]:
mats, mat = make_crossmat(train_small, users_small, products_small)
mat

  
  after removing the cwd from sys.path.


array([[ 2.,  1.,  3., ...,  0.,  0.,  0.],
       [ 1.,  0.,  1., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  1.,  1.]])

In [42]:
def get_rating_error(r, p, q):
    return r - np.dot(p, q)


def get_error(R, P, Q, beta):
    error = 0.0
    for i in xrange(len(R)):
        for j in xrange(len(R[i])):
            if R[i][j] == 0:
                continue
            error += pow(get_rating_error(R[i][j], P[:,i], Q[:,j]), 2)
    error += beta/2.0 * (np.linalg.norm(P) + np.linalg.norm(Q))
    return error


def matrix_factorization(R, K, steps=5000, alpha=0.0002, beta=0.02, threshold=0.001):
    P = np.random.rand(K, len(R))
    Q = np.random.rand(K, len(R[0]))
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] == 0:
                    continue
                err = get_rating_error(R[i][j], P[:, i], Q[:, j])
                for k in xrange(K):
                    P[k][i] += alpha * (2 * err * Q[k][j])
                    Q[k][j] += alpha * (2 * err * P[k][i])
        error = get_error(R, P, Q, beta)
        if step % 100 == 0:
            print("step: "+str(step)+" error: "+str(error))
        if error < threshold:
            break
    return P, Q

In [43]:
#開始時刻
print(str(time.ctime()))
t1 = time.time()

nP, nQ = matrix_factorization(mat, 5, threshold=1.0)

#計算時間
elapsed_time = time.time()-t1
print("time: " + str(elapsed_time) + "秒")

Thu Oct 12 00:44:58 2017
step: 0 error: 21927.5517497
step: 100 error: 9632.21040739
step: 200 error: 5254.89164591
step: 300 error: 3334.92025031
step: 400 error: 2252.21955561
step: 500 error: 1654.25761732
step: 600 error: 1292.12965834
step: 700 error: 1039.41479227
step: 800 error: 850.810727962
step: 900 error: 706.788029699
step: 1000 error: 595.536645374
step: 1100 error: 508.580995798
step: 1200 error: 439.621487301
step: 1300 error: 384.022453874
step: 1400 error: 338.42264092
step: 1500 error: 300.406465542
step: 1600 error: 268.241128404
step: 1700 error: 240.677659789
step: 1800 error: 216.805905724
step: 1900 error: 195.951703094
step: 2000 error: 177.605571165
step: 2100 error: 161.374047097
step: 2200 error: 146.94671412
step: 2300 error: 134.0737983
step: 2400 error: 122.550773972
step: 2500 error: 112.20761956
step: 2600 error: 102.901202071
step: 2700 error: 94.5098056677
step: 2800 error: 86.9291467827
step: 2900 error: 80.0694158535
step: 3000 error: 73.8530125512


In [44]:
mat_estimate = np.dot(nP.T,nQ)
mat_estimate

array([[ 1.97264077,  1.15129266,  3.00391891, ...,  2.30613722,
         2.22167091,  0.89014958],
       [ 0.98644468,  1.47325226,  1.08595404, ...,  1.4083217 ,
         1.59953347,  0.61334467],
       [ 1.06610832,  0.94214787,  0.98962035, ...,  0.80067206,
         0.84199397,  0.65687644],
       ..., 
       [ 0.99573948,  1.40151279,  1.05123497, ...,  1.19564329,
         1.31646804,  0.682822  ],
       [ 3.05013593,  2.1842155 ,  3.77185138, ...,  3.52659804,
         3.78918116,  1.28321123],
       [ 1.33358234,  0.95620264,  1.65346208, ...,  0.98053529,
         1.00884638,  1.06962064]])

In [45]:
def make_recommend(users_test, mats, mat, users, products):
    recommend_df = pd.DataFrame([[],[]]).T
    for user_id in users_test:
        user_int = np.where(users == user_id)[0][0]
        scores = mat[user_int,:]
        ranking = np.argsort(scores)
        recommends = []
        for r in ranking:
            if mats[3,user_int,r] == 0: #未購入からレコメンド
                product_id = products[r]
                recommends.append(product_id)
                if len(recommends) >= 22:
                    break
        k = len(recommends)
        add = pd.DataFrame([[user_id] * k, recommends, range(k)]).T
        recommend_df = pd.concat([recommend_df, add], axis = 0)
    recommend_df.index = range(recommend_df.shape[0])
    return recommend_df

In [47]:
submit_df = make_recommend(test_small['user_id'], mats, mat, users_small, products_small)
submit_df

Unnamed: 0,0,1,2
0,0014532_C,00312053_c,0
1,0014532_C,00172957_c,1
2,0014532_C,00094364_c,2
3,0014532_C,00008654_c,3
4,0014532_C,00281776_c,4
5,0014532_C,00216330_c,5
6,0014532_C,00197283_c,6
7,0014532_C,00128007_c,7
8,0014532_C,00233491_c,8
9,0014532_C,00310719_c,9


In [108]:
def evaluate(recommend_df, data_ans):
    rels = [0, 1, 3, 7]
    data_ans['rel'] = data_ans['event_type'].map(lambda x: rels[x])
    i = 0
    scores = []
    for user_id in recommend_df[0].unique():
        a = data_ans[data_ans['user_id'] ==user_id]
        r = recommend_df[recommend_df[0] ==user_id]
        
        a_rel = a.sort_values(by = 'rel', ascending = False)
        a_rel.drop_duplicates('product_id')
        a_rel = a_rel['rel']
        l = min(len(a_rel), 22)
        idcg = 0
        for j in xrange(l):
            idcg += a_rel.values[j] / np.log2(j+2)
        #print("idcg:"+str(idcg))
        
        dcg = 0
        for r_e in r.iterrows():
            j = r_e[1][2]
            a_list = a[a['product_id'] == r_e[1][1]]['rel'].sort_values(ascending = False)
            r_e_rel = 0
            if a_list.size > 0:
                dcg += a_list.values[0] / np.log2(j+2)
        #print("dcg:"+str(dcg))
        
        scores.append(dcg / idcg)
        #i += 1
        #if i > 5:
        #    break
    return np.mean(scores)

In [109]:
evaluate(submit_df, test_small_ans)

0.0024983686112535377