In [1]:
from os.path import join
import math
from math import sqrt
import pandas as pd
import numpy as np
import sklearn
import scipy.stats
from sklearn.cross_validation import train_test_split
from scipy.spatial.distance import squareform, pdist
from sklearn.metrics.pairwise import cosine_distances
from scipy.stats.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
import time



In [2]:
#read data path
BASE_PATH = "/media/sf_졸프"
FILE_NAME = "userData.txt"

watchHistory = pd.read_csv(join(BASE_PATH, FILE_NAME), sep="\t")
watchMatrix = pd.crosstab(watchHistory.VIDEO_NUM, watchHistory.USER_NUM).replace(0, np.nan)

#read data the first process
N_USERS = watchHistory.USER_NUM.unique().shape[0]
N_ITEMS = watchHistory.VIDEO_NUM.unique().shape[0]

history = np.zeros((N_USERS, N_ITEMS))

"""유사도 구하기 전에 가중치 두기 추가 test 해보장..."""
#if user watch video, mark 1
for row in watchHistory.itertuples():
    history[row[1]-1, row[2]-1] = row[3]
    
history = history.T
print(history.shape)

(20, 41)


In [3]:
#jaccard similarity u2u
def jaccard_prediction(data) :
    dist_jac = pdist(data, metric="jaccard")
    s_dist = squareform(dist_jac)
    np.fill_diagonal(s_dist, 0)
    origin_jaccard = 1 - s_dist
    pred_jaccard = origin_jaccard.dot(data) / np.array([np.abs(origin_jaccard).sum(axis=1)]).T
    return pred_jaccard

In [4]:
#cosine similarity u2u
def cosine_prediction(data) :
    dist_cos = 1 - cosine_distances(data)
    pred_cos = dist_cos.dot(data) / np.array([np.abs(dist_cos).sum(axis=1)]).T
    return pred_cos

In [5]:
#calculate similarity between items to items using pearson correlation
def pearson_prediction(data) :
    pearson_corr = np.zeros((len(data), len(data)))
    for item_i in range(len(data)):
        for item_j in range(len(data)):
            if not math.isnan(scipy.stats.pearsonr(data[item_i], data[item_j])[0]) :
                pearson_corr[item_i][item_j] = scipy.stats.pearsonr(data[item_i], data[item_j])[0]
            else:
                pearson_corr[item_i][item_j] = 0
                
    pred_pearson = pearson_corr.dot(data) / np.array([np.abs(pearson_corr).sum(axis=1)]).T
    print(pred_pearson)
    return pred_pearson


In [6]:
#Religious Spiritual Mystical Experience (RSME) : root(mean squre deviation)

#ignore not-zero
def expanded_rmse(predict, target) :
    predict = predict[target.nonzero()].flatten()
    target = target[target.nonzero()].flatten()
    return mean_squared_error(predict, target)

def rmse(predict, target) : #true rmse
    return sqrt(mean_squared_error(predict, target))

print(rmse(cosine_prediction(history), history))
print(rmse(jaccard_prediction(history), history))
print(rmse(pearson_prediction(history), history))

#optimized for cosine_similarity

0.37623047665098874
0.3338387223378005
[[ 0.12219197  0.38350018 -0.08610844 -0.05053952 -0.19256687 -0.17788567
  -0.08768433 -0.30304482 -0.0789318  -0.10274183  0.28019046  0.11971293
  -0.21330955  0.30553491 -0.1073601   0.41553931 -0.0200145  -0.29952344
   0.2838249  -0.32568579  0.44031331 -0.15662998  0.14186806 -0.20265968
   0.00286089 -0.07589378  0.22078764 -0.18727164 -0.19162558 -0.26733045
  -0.17676904  0.26657412 -0.23358218  0.24801032 -0.16632489  0.3133926
  -0.14546627 -0.11350876 -0.20239766 -0.05494509  0.22090222]
 [ 0.18205426 -0.28870071 -0.1389324  -0.2231556   0.22887132 -0.04066064
  -0.0854514   0.30970505 -0.13920262 -0.28958928 -0.19421518  0.18091475
  -0.00617861 -0.17894201 -0.11624873 -0.24053794 -0.08566885  0.30025864
  -0.14547447  0.36206202 -0.19361215  0.30151886 -0.14682588  0.32608827
  -0.14999053  0.24585654  0.19270889 -0.12435987 -0.06496298 -0.03021963
   0.39801945 -0.00959696 -0.11198279 -0.02206352 -0.09682148 -0.22703698
  -0.113477

In [7]:
#data : watch_history
def recommend_item(user_id, data, metric, count) :
    recommend = pd.DataFrame(data=[], index=range(user_id+1), columns=range(N_ITEMS))

    predict_matrix = np.zeros((N_ITEMS, N_USERS))

    if metric == 'cosine':
        predict_matrix = cosine_prediction(data)
    elif metric == 'jaccard':
        predict_matrix = jaccard_prediction(data)
    elif metric == 'pearson' :
        predict_matrix = pearson_prediction(data)
        
    for i in range(N_ITEMS) :
        if(data[i][user_id] == 1) : 
            continue;
        else :
            recommend.ix[0][i] = predict_matrix[i][user_id]
    recommend = recommend.dropna(axis='columns')
    recommended_item = recommend.sort_values(by=user_id, ascending=False, axis=1)
    recommended_item = recommended_item.iloc[user_id][:count]
    return recommended_item

start = time.time()
print(recommend_item(0, history, 'cosine', 5))
print("time of prediction by IBCF : " + str(time.time() - start) + "\n")

2     0.359207
16     0.30374
18    0.290353
19    0.289251
5      0.28581
Name: 0, dtype: object
time of prediction by IBCF : 0.02208423614501953



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
