In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
def make_data_small(data, usercount = 100):
    #後ろの1週間を除いて100ユーザー分のデータを取得
    former = data[data['time_stamp'] < '2017-04-24 00:00:00.000']
    users_small = former['user_id'].unique()[0 : usercount]
    data_small = former[np.in1d(former['user_id'], users_small)]
    products_small = former['product_id'].unique()
    
    #後ろの1週間から同じユーザーとプロダクトのものを評価用に取得
    latter = data[data['time_stamp'] >= '2017-04-24 00:00:00.000']
    latter = latter[np.in1d(latter['user_id'], users_small)]
    latter = latter[np.in1d(latter['product_id'], products_small)]
    
    test_small = pd.DataFrame(latter['user_id'].unique())
    test_small.columns = ['user_id']
    test_small_ans = latter
    
    return data_small, test_small, test_small_ans

In [3]:
#小さいデータセットを作成

filename = 'data/train/train_C.tsv'
train = pd.read_table(filename)
train_small, test_small, test_small_ans = make_data_small(train)

In [4]:
#データサイズ確認

users_small = train_small['user_id'].unique()
products_small = train_small['product_id'].unique()
print("event: " + str(len(train_small)))
print("user: " + str(len(users_small)))
print("product: " + str(len(products_small)))

event: 7979
user: 100
product: 4329


In [5]:
def make_crossmat(data, users = [], products = []):
    if len(users) == 0:
        users = data['user_id'].unique()
    if len(products) == 0:
        products = data['product_id'].unique()
    
    #ユーザーとプロダクトを行列のインデックスに変換
    data['user_id_int'] = data['user_id'].map(lambda x: np.where(users == x)[0][0])
    data['product_id_int'] = data['product_id'].map(lambda x: np.where(products == x)[0][0])
    
    #各イベントごとにカウント
    mats = np.zeros((4, len(users), len(products)))
    def count_event(event):
        mats[event['event_type'], event['user_id_int'], event['product_id_int']] += 1
        return 0
    train_small.apply(count_event, axis=1)
    
    #スコアの重みをかけて足す
    scores = np.array([
        3, #0カート
        1, #1閲覧
        2, #2クリック
        4  #3コンバージェンス
    ])
    crossmat = np.einsum('ijk,i', mats, scores)
    
    return mats, crossmat

In [6]:
mats, mat = make_crossmat(train_small, users_small, products_small)
mat

array([[ 2.,  1.,  3., ...,  0.,  0.,  0.],
       [ 1.,  0.,  1., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  1.,  1.]])

In [7]:
def get_rating_error(r, p, q):
    return r - np.dot(p, q)


def get_error(R, P, Q, beta):
    error = 0.0
    for i in xrange(len(R)):
        for j in xrange(len(R[i])):
            if R[i][j] == 0:
                continue
            error += pow(get_rating_error(R[i][j], P[:,i], Q[:,j]), 2)
    error += beta/2.0 * (np.linalg.norm(P) + np.linalg.norm(Q))
    return error


def matrix_factorization(R, K, steps=5000, alpha=0.0002, beta=0.02, threshold=0.001):
    np.random.seed(1234)
    P = np.random.rand(K, len(R))
    Q = np.random.rand(K, len(R[0]))
    t1 = time.time()
    step = 0
    while True:
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] == 0:
                    continue
                err = get_rating_error(R[i][j], P[:, i], Q[:, j])
                for k in xrange(K):
                    P[k][i] += alpha * (2 * err * Q[k][j])
                    Q[k][j] += alpha * (2 * err * P[k][i])
        error = get_error(R, P, Q, beta)
        if step % 100 == 0:
            time_spent = time.time()-t1
            print("step: " + str(step) + " error: " + str(error) + " time: " + str(time_spent) + "秒")
        step += 1
        if error < threshold or step >= steps:
            time_spent = time.time()-t1
            print("step: " + str(step) + " error: " + str(error) + " time: " + str(time_spent) + "秒")
            break
    return P, Q

In [8]:
nP, nQ = matrix_factorization(mat, 5, threshold=1.0)

step: 5000 error: 17.7040307378 time: 3123.33389711秒

In [12]:
mat_estimate = np.dot(nP.T,nQ)
mat_estimate

array([[ 0.48960361,  0.24479165,  0.73449984, ...,  0.42149506,
         0.25749318,  0.42740348],
       [ 0.24474319,  0.19169394,  0.24456702, ...,  0.21380435,
         0.22078379,  0.21511228],
       [ 0.24474148,  0.11807646,  0.35285627, ...,  0.16172407,
         0.13754109,  0.13558938],
       ..., 
       [ 0.24822521,  0.26545699,  0.14285855, ...,  0.26502793,
         0.35489819,  0.25984083],
       [ 0.22603871,  0.09455061,  0.36258429, ...,  0.16552911,
         0.13860357,  0.12880359],
       [ 0.23258835,  0.17573105,  0.22217366, ...,  0.21318364,
         0.21318285,  0.21318441]])

In [10]:
def make_recommend(test, mat, exclude_mat, users, products):
    recommend_df = pd.DataFrame([[],[]]).T
    for user_id in test['user_id']:
        user_int = np.where(users == user_id)[0][0]
        scores = mat[user_int,:]
        ranking = np.argsort(scores)
        recommends = []
        for r in ranking:
            if not exclude_mat[user_int,r]:
                product_id = products[r]
                recommends.append(product_id)
                if len(recommends) >= 22:
                    break
        k = len(recommends)
        add = pd.DataFrame([[user_id] * k, recommends, range(k)]).T
        recommend_df = pd.concat([recommend_df, add], axis = 0)
    recommend_df.index = range(recommend_df.shape[0])
    return recommend_df

In [11]:
#購入済み以外からレコメンド
exclude_mat = (mats[3] != 0)

submit_df = make_recommend(test_small, mat, exclude_mat, users_small, products_small)
submit_df

Unnamed: 0,0,1,2
0,0014532_C,00312053_c,0
1,0014532_C,00172957_c,1
2,0014532_C,00094364_c,2
3,0014532_C,00008654_c,3
4,0014532_C,00281776_c,4
5,0014532_C,00216330_c,5
6,0014532_C,00197283_c,6
7,0014532_C,00128007_c,7
8,0014532_C,00233491_c,8
9,0014532_C,00310719_c,9


In [12]:
def evaluate(recommend_df, data_ans):
    rels = [0, 1, 3, 7]
    data_ans['rel'] = data_ans['event_type'].map(lambda x: rels[x])
    i = 0
    scores = []
    for user_id in recommend_df[0].unique():
        a = data_ans[data_ans['user_id'] ==user_id]
        r = recommend_df[recommend_df[0] ==user_id]
        
        a_rel = a.sort_values(by = 'rel', ascending = False)
        a_rel.drop_duplicates('product_id')
        a_rel = a_rel['rel']
        l = min(len(a_rel), 22)
        idcg = 0
        for j in xrange(l):
            idcg += a_rel.values[j] / np.log2(j+2)
        #print("idcg:"+str(idcg))
        
        dcg = 0
        for r_e in r.iterrows():
            j = r_e[1][2]
            a_list = a[a['product_id'] == r_e[1][1]]['rel'].sort_values(ascending = False)
            r_e_rel = 0
            if a_list.size > 0:
                dcg += a_list.values[0] / np.log2(j+2)
        #print("dcg:"+str(dcg))
        
        scores.append(dcg / idcg)
        #i += 1
        #if i > 5:
        #    break
    return np.mean(scores)

In [15]:
evaluate(submit_df, test_small_ans)

0.0024983686112535377

In [13]:
def nmf_fill0(R, K, steps=5000, beta=0.02, threshold=0.001):
    isvalue = (R != 0)
    np.random.seed(1234)
    P = np.random.rand(K, len(R))
    Q = np.random.rand(K, len(R[0]))
    P = np.transpose(P)
    t1 = time.time()
    step = 0
    while True:
        PQzero = np.multiply(np.dot(P, Q), isvalue)
        
        Qn = np.dot(P.T, R)
        Qd = np.dot(P.T, PQzero)
        #Q = np.matrix(np.array(Q) * np.array(Qn) / np.array(Qd))
        Q = Q * Qn / Qd
        
        Pn = np.dot(R, Q.T)
        Pd = np.dot(PQzero, Q.T)
        #P = np.matrix(np.array(P) * np.array(Pn) / np.array(Pd))
        P = P * Pn / Pd
        
        error = get_error(R, P.T, Q, beta)
        if step % 100 == 0:
            time_spent = time.time()-t1
            print("step: " + str(step) + " error: " + str(error) + " time: " + str(time_spent) + "秒")
        step += 1
        if error < threshold or step >= steps:
            time_spent = time.time()-t1
            print("step: " + str(step) + " error: " + str(error) + " time: " + str(time_spent) + "秒")
            break
    return P.T, Q

In [None]:
#0対応、行列形式NMF
nP2, nQ2 = nmf_fill0(mat, 5, threshold=1.0)

step: 0 error: 1169324.06956 time: 0.308763027191秒
step: 100 error: 471945.5076 time: 30.5211920738秒
step: 200 error: 412217.4166 time: 60.2459409237秒
step: 300 error: 387868.564395 time: 89.432352066秒
step: 400 error: 373586.313671 time: 118.117511034秒
step: 500 error: 363830.64249 time: 146.011734962秒
step: 600 error: 356710.757644 time: 173.600703955秒
step: 700 error: 351338.596722 time: 201.169296026秒
step: 800 error: 347219.320388 time: 228.901756048秒
step: 900 error: 344041.435828 time: 260.118957996秒
