## Personalized recommendation system using a random-walk-based Markov model

In [61]:
import numpy as np
import pandas as pd
import os
import collections
import itertools

In [369]:
## related functions
def filter_split_data(data, n_freq):
    """Keep frequent users and split their events into train/validation sets.

    Users with more than ``n_freq`` rows in ``data`` are kept (this reduces
    sparsity). For every kept user the last 3 rows, in the row order of
    ``data``, form the validation set and the remaining rows form the
    training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with at least a ``'UserId'`` column.
    n_freq : int
        Frequency threshold; a user is kept only when it has strictly more
        than ``n_freq`` rows.

    Returns
    -------
    (train_data, val_data) : tuple of pandas.DataFrame
        Rows are grouped per user, users ordered by first appearance in
        ``data``. The index is reset twice without ``drop=True``, so the
        previous indexes are kept as extra leading columns.
    """
    n_holdout = 3  # number of trailing events per user held out for validation

    # Counter keeps first-seen order, so users are processed in the order in
    # which they first appear in ``data``.
    user_freq = collections.Counter(data['UserId'])
    user_sel = [uid for uid, count in user_freq.items() if count > n_freq]

    data_filtered = data[data['UserId'].isin(user_sel)].reset_index()

    # Single pass over the filtered rows to collect each user's row positions
    # (avoids re-scanning the whole frame once per user).
    positions = collections.defaultdict(list)
    for pos, uid in enumerate(data_filtered['UserId']):
        positions[uid].append(pos)

    train_index = []
    val_index = []
    for uid in user_sel:
        rows = positions[uid]
        # rows[:-n_holdout] is empty when a user has <= n_holdout rows, so a
        # row can never land in both splits even for small thresholds.
        train_index.extend(rows[:-n_holdout])
        val_index.extend(rows[-n_holdout:])

    # data_filtered has a fresh 0..n-1 index, so positions and labels agree.
    train_data = data_filtered.iloc[train_index].reset_index()
    val_data = data_filtered.iloc[val_index].reset_index()
    return train_data, val_data

## calculate transition probability matrix from datasets
def transition_prob(data):
    """Build the item-to-item transition probability matrix.

    For every user, each pair of consecutive rows (in the row order of
    ``data``) counts as one transition from the first row's item to the
    second row's item. Counts are then normalised per source item.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with ``'UserId'`` and ``'ItemId'`` columns.
        NOTE(review): assumes neither column contains missing values.

    Returns
    -------
    numpy.ndarray
        Square matrix of shape ``(n_items, n_items)``; rows and columns
        follow the order of ``data['ItemId'].unique()``. Each row sums to 1,
        except rows of items with no outgoing transition, which stay all-zero.
    """
    items = data['ItemId'].unique()
    n_items = len(items)
    t = np.zeros((n_items, n_items))
    if n_items == 0:
        return t

    # Translate every ItemId into its row/column position once up front.
    item_codes = pd.Series(pd.Index(items).get_indexer(data['ItemId']))
    user_keys = data['UserId'].to_numpy()

    for _, codes in item_codes.groupby(user_keys, sort=False):
        seq = codes.to_numpy()
        # np.add.at accumulates repeated (from, to) pairs correctly, which a
        # plain fancy-indexed ``+=`` would not.
        np.add.at(t, (seq[:-1], seq[1:]), 1)

    # Normalise only rows that have outgoing transitions; this avoids the
    # 0/0 division (and its RuntimeWarning) for terminal items.
    row_sums = t.sum(axis=1, keepdims=True)
    np.divide(t, row_sums, out=t, where=row_sums > 0)
    return t

## generate user behavior matrix
def user_spec_vec(data):
    """Build the user-by-item interaction matrix and its row-normalised form.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with ``'UserId'`` and ``'ItemId'`` columns.
        NOTE(review): assumes neither column contains missing values.

    Returns
    -------
    u : numpy.ndarray
        Shape ``(n_users, n_items)``; ``u[i, j]`` is the number of rows in
        ``data`` where user ``i`` appears with item ``j``. Rows follow
        ``data['UserId'].unique()``, columns ``data['ItemId'].unique()``.
    norm_u : numpy.ndarray
        ``u`` with every row divided by its sum; all-zero rows stay zero.
    """
    items = data['ItemId'].unique()
    uids = data['UserId'].unique()

    u = np.zeros((len(uids), len(items)))

    # Map each event to its (user row, item column) position in one pass
    # instead of filtering the frame once per (user, item) pair.
    row_codes = pd.Index(uids).get_indexer(data['UserId'])
    col_codes = pd.Index(items).get_indexer(data['ItemId'])
    # get_indexer reports unmatched values as -1; drop them so they cannot
    # silently index the last row/column.
    matched = (row_codes >= 0) & (col_codes >= 0)
    # np.add.at accumulates repeated (user, item) pairs correctly.
    np.add.at(u, (row_codes[matched], col_codes[matched]), 1)

    row_sums = u.sum(axis=1, keepdims=True)
    norm_u = np.divide(u, row_sums, out=np.zeros_like(u), where=row_sums > 0)
    return u, norm_u

## calculate user-specific recommendation matrix
def walk_alg(u, T, alpha, steps=100):
    """Iterate a random walk with restart to get per-user item scores.

    Starting from ``p = u``, each step computes
    ``p = alpha * p @ T + (1 - alpha) * u`` and then rescales every row of
    ``p`` to sum to 1.

    Parameters
    ----------
    u : array-like, shape (n_users, n_items)
        Per-user restart vectors.
    T : array-like, shape (n_items, n_items)
        Item-to-item transition matrix.
    alpha : float
        Weight of the walk step; ``1 - alpha`` is the weight of the restart
        vector ``u``.
    steps : int, optional
        Number of iterations to run (default 100).

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``(n_users, n_items)`` after ``steps`` iterations.
        Rows whose sum is zero are left as zeros rather than becoming NaN.
    """
    u = np.asarray(u, dtype=float)
    T = np.asarray(T, dtype=float)

    p = u  # recommendation matrix, dim: n_users x n_items
    for _ in range(steps):
        p = alpha * np.dot(p, T) + (1 - alpha) * u
        row_sums = p.sum(axis=1, keepdims=True)
        # Skip the division for empty rows: 0/0 would produce NaN, and that
        # NaN would then persist through every following iteration.
        p = np.divide(p, row_sums, out=np.zeros_like(p), where=row_sums != 0)

    return p

## get ntop recommendations list for each user
def user_recommendation(p, items, usid, ntop=3):
    """Return the ``ntop`` highest-scoring items for every user.

    Parameters
    ----------
    p : array-like, shape (n_users, n_items)
        Score matrix; a higher score ranks an item earlier.
    items : array-like, shape (n_items,)
        Item identifiers, aligned with the columns of ``p``.
    usid : array-like, shape (n_users,)
        User identifiers, aligned with the rows of ``p``.
    ntop : int, optional
        Number of recommendations per user (default 3). If it exceeds the
        number of items, all items are returned instead of raising.

    Returns
    -------
    pandas.DataFrame
        One row per user; columns ``0 .. k-1`` hold item ids in descending
        score order and column ``'usid'`` holds the user id.
    """
    scores = np.asarray(p)
    item_ids = np.asarray(items)

    # Negate so that argsort's ascending order ranks the best score first.
    ranked = np.argsort(-scores, axis=1)
    # Slicing clamps to the available number of columns.
    top = ranked[:, :max(ntop, 0)]

    df = pd.DataFrame(item_ids[top])
    df['usid'] = usid
    return df

In [374]:
## load the tab-separated event log
data = pd.read_csv("datasets/events_train_full.0.txt", sep="\t")
data.head()

# frequency threshold handed to filter_split_data, which drops
# low-activity users and returns the train / validation split
n_freq = 8
train_data, val_data = filter_split_data(data, n_freq)

## work on a small subset: the first 1000 training rows
data_sel = train_data.iloc[:1000]

# distinct users and items of the subset, in order of first appearance
uids = data_sel['UserId'].unique()
items = data_sel['ItemId'].unique()

# user behavior matrix (raw counts and row-normalized) and
# item-to-item transition probability matrix for the subset
u, u_norm = user_spec_vec(data_sel)
T = transition_prob(data_sel)



In [375]:
# display the item-to-item transition matrix returned by transition_prob
T

array([[0.8       , 0.2       , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.97368421, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [376]:
# display the row-normalized user behavior matrix returned by user_spec_vec
u_norm

array([[0.83333333, 0.16666667, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.79166667, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.08333333, 0.08333333,
        0.08333333]])

In [378]:
# generate recommendation matrix
# walk_alg mixes alpha * (p @ T) with (1 - alpha) * u_norm at every step, so
# with alpha=0.01 the result stays close to each user's own profile u_norm
p = walk_alg(u_norm, T, alpha=0.01, steps=50)
p

array([[0.83305518, 0.16694482, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.79145628, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.0834036 , 0.0834036 ,
        0.0834036 ]])

In [380]:
# top-3 items per user; columns 0..2 are item ids, 'usid' is the user id
user_recommendation(p, items, uids, ntop=3)

Unnamed: 0,0,1,2,usid
0,407485,133814,152249,1230
1,381314,175292,407485,1722
2,29863,252596,335661,1879
3,259884,78268,342264,2114
4,195958,152249,135900,2270
...,...,...,...,...
64,188169,165161,164027,24757
65,102176,197270,166138,25635
66,248742,133431,410698,25748
67,280342,187208,150877,25784
