In [1]:
import numpy as np
import pandas as pd

In [8]:
names = ['user_id', 'talk_id', 'view_time']
train_user_views_df = pd.read_csv('c:/Users/Ayush/Documents/train_user_view.txt', sep='\t', names=names)
train_user_views_df.head()

Unnamed: 0,user_id,talk_id,view_time
0,1,10474,07/27/2016 12:27:35
1,1,10488,08/05/2016 12:34:01
2,1,10506,08/07/2016 15:55:57
3,1,10446,08/07/2016 15:58:09
4,1,10506,08/07/2016 16:06:26


In [9]:
new_df = train_user_views_df.sort_values(['user_id','talk_id','view_time'],ascending=[True,True,True])
new_df.head()

Unnamed: 0,user_id,talk_id,view_time
3,1,10446,08/07/2016 15:58:09
5,1,10446,08/08/2016 18:18:46
6,1,10446,08/09/2016 10:38:38
7,1,10446,08/09/2016 10:39:33
8,1,10446,08/09/2016 16:33:56


In [16]:
n_users = new_df.user_id.unique().shape[0]
n_talks = new_df.talk_id.unique().shape[0]
print str(n_users) + ' users'
print str(n_talks) + ' items'

37 users
165 items


In [17]:
comments = np.zeros((n_users, n_items))
ctr = 0
ctr2 = 0
temp = 0
for row in new_df.itertuples():
    if (row[1]-1)!=temp:
        ctr = 0
        ctr2 = ctr2+1
    comments[ctr2, ctr] = row[2]
    if (ctr+1) < n_talks:
        ctr += 1
    else:
        ctr = 0
    temp = row[1]-1
comments

array([[ 10446.,  10446.,  10446., ...,      0.,      0.,      0.],
       [ 10636.,  10636.,  10636., ...,  10634.,  10636.,  10636.],
       [ 10516.,  10517.,  10532., ...,      0.,      0.,      0.],
       ..., 
       [ 10634.,      0.,      0., ...,      0.,      0.,      0.],
       [ 10666.,  10666.,  10666., ...,      0.,      0.,      0.],
       [ 10583.,  10591.,  10702., ...,      0.,      0.,      0.]])

In [18]:
sparsity = float(len(comments.nonzero()[0]))
sparsity /= (comments.shape[0] * comments.shape[1])
sparsity *= 100
print 'Sparsity: {:4.2f}%'.format(sparsity)

Sparsity: 20.44%


In [24]:
def train_test_split(comments):
    test = np.zeros(comments.shape)
    train = comments.copy()
    for user in xrange(comments.shape[0]):
        test_talks = np.random.choice(comments[user, :].nonzero()[0], 
                                        size=5, 
                                        replace=True)
        train[user, test_talks] = 0.
        test[user, test_talks] = comments[user, test_talks]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

def valid_test_split(comments):
    valid = np.zeros(comments.shape)
    train = comments.copy()
    for user in xrange(comments.shape[0]):
        valid_talks = np.random.choice(comments[user, :].nonzero()[0], 
                                        size=5, 
                                        replace=True)
        train[user, valid_talks] = 0.
        valid[user, valid_talks] = comments[user, valid_talks]
        
    # Test and training are truly disjoint
    assert(np.all((train * valid) == 0)) 
    return valid

In [25]:
train, test = train_test_split(comments)
valid = valid_test_split(comments)

In [26]:
def slow_similarity(ratings, kind='user'):
    if kind == 'user':
        axmax = 0
        axmin = 1
    elif kind == 'talk':
        axmax = 1
        axmin = 0
    sim = np.zeros((comments.shape[axmax], comments.shape[axmax]))
    for u in xrange(comments.shape[axmax]):
        for uprime in xrange(comments.shape[axmax]):
            rui_sqrd = 0.
            ruprimei_sqrd = 0.
            for i in xrange(comments.shape[axmin]):
                sim[u, uprime] = comments[u, i] * comments[uprime, i]
                rui_sqrd += comments[u, i] ** 2
                ruprimei_sqrd += comments[uprime, i] ** 2
            sim[u, uprime] /= rui_sqrd * ruprimei_sqrd
    return sim

def fast_similarity(comments, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = comments.dot(comments.T) + epsilon
    elif kind == 'talk':
        sim = comments.T.dot(comments) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [27]:
%timeit fast_similarity(train, kind='user')

The slowest run took 420.60 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 39.1 µs per loop


In [28]:
user_similarity = fast_similarity(train, kind='user')
talk_similarity = fast_similarity(train, kind='talk')
print talk_similarity[:4, :4]

[[ 1.          0.81261905  0.73618814  0.7702859 ]
 [ 0.81261905  1.          0.71551963  0.79329073]
 [ 0.73618814  0.71551963  1.          0.61499078]
 [ 0.7702859   0.79329073  0.61499078  1.        ]]


In [31]:
def predict_slow_simple(comments, similarity, kind='user'):
    pred = np.zeros(comments.shape)
    if kind == 'user':
        for i in xrange(comments.shape[0]):
            for j in xrange(comments.shape[1]):
                pred[i, j] = similarity[i, :].dot(comments[:, j])\
                             /np.sum(np.abs(similarity[i, :]))
        return pred
    elif kind == 'talk':
        for i in xrange(comments.shape[0]):
            for j in xrange(comments.shape[1]):
                pred[i, j] = similarity[j, :].dot(comments[i, :].T)\
                             /np.sum(np.abs(similarity[j, :]))

        return pred

def predict_fast_simple(comments, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(comments) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'talk':
        return comments.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [32]:
%timeit predict_slow_simple(train, user_similarity, kind='user')

10 loops, best of 3: 32.2 ms per loop


In [33]:
%timeit predict_fast_simple(train, user_similarity, kind='user')

The slowest run took 46.97 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 44.4 µs per loop


In [34]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [41]:
talk_prediction = predict_fast_simple(train, talk_similarity, kind='talk')
user_prediction = predict_fast_simple(train, user_similarity, kind='user')

print 'User-based test set CF MSE: ' + str(get_mse(user_prediction, valid))
print 'Talk-based test set CF MSE: ' + str(get_mse(talk_prediction, valid))

print 'User-based test set CF MSE: ' + str(get_mse(user_prediction, test))
print 'Talk-based test set CF MSE: ' + str(get_mse(talk_prediction, test))

User-based test set CF MSE: 31744514.671
Talk-based test set CF MSE: 69828631.3198
User-based test set CF MSE: 38728878.637
Talk-based test set CF MSE: 72659716.5328
