In [1]:
#library import

from os.path import join
import math
from math import sqrt
import pandas as pd
import numpy as np
import sklearn
import scipy.stats
from sklearn.cross_validation import train_test_split
from scipy.spatial.distance import squareform, pdist
from sklearn.metrics.pairwise import cosine_distances
from scipy.stats.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors



In [2]:
#read data path
BASE_PATH = "/media/sf_졸프"
FILE_NAME = "userData.txt"

watchHistory = pd.read_csv(join(BASE_PATH, FILE_NAME), sep="\t")
watchMatrix = pd.crosstab(watchHistory.VIDEO_NUM, watchHistory.USER_NUM).replace(0, np.nan)

In [4]:
#read data the first process
N_USERS = watchHistory.USER_NUM.unique().shape[0]
N_ITEMS = watchHistory.VIDEO_NUM.unique().shape[0]

history = np.zeros((N_USERS, N_ITEMS))

"""유사도 구하기 전에 가중치 두기 추가 test 해보장..."""
#if user watch video, mark 1
for row in watchHistory.itertuples():
    history[row[1]-1, row[2]-1] = row[3]
    
print(history)

#몇 퍼센트의 데이터가 정보를 갖고 있는지!
# sparsity = float(len(history.nonzero()[0]))
# sparsity /= (history.shape[0] * history.shape[1])
# sparsity *= 100
# print("Sparsity : {:4.2f}%" .format(sparsity))

#don't need
history_train, history_test = train_test_split(history, test_size = 0.33, random_state=42)

#predict user_i's rating if they didn't watch item.

[[1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1.]
 [0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1.]
 [0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1.]
 [1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1.

In [5]:
#jaccard similarity u2u
def jaccard_prediction(data) :
    dist_jac = pdist(data, metric="jaccard")
    s_dist = squareform(dist_jac)
    np.fill_diagonal(s_dist, 0)
    origin_jaccard = 1 - s_dist
    pred_jaccard = origin_jaccard.dot(data) / np.array([np.abs(origin_jaccard).sum(axis=1)]).T
    return pred_jaccard

In [6]:
#cosine similarity u2u
def cosine_prediction(data) :
    dist_cos = 1 - cosine_distances(data)
    pred_cos = dist_cos.dot(data) / np.array([np.abs(dist_cos).sum(axis=1)]).T
    return pred_cos

In [14]:
#pearson correlation u2u
#history_train : sample data
"""
pearsonCorr means,

-1 < r < 0.7 : 강한 음의 선형관계
-0.7 < r < -0.3 : 뚜렷한 음의 선형관계

-0.3 < r < -0.1 : 약한 음의 선형관계
-0.1 < r < 0.1 : 거의 무시될 수 있는 선형관계

0.1 < r < 0.3 : 약한 양의 상관관계
0.3 < r < 0.7 : 뚜렷한 양의 선형관계
0.7 < r < 1 : 강한 양의 선형관계
"""
def pearson_prediction(data) :
    pearson_corr = np.zeros((len(data), len(data)))
    for user_i in range(len(data)):
        for user_j in range(len(data)):
            if not math.isnan(scipy.stats.pearsonr(data[user_i], data[user_j])[0]) :
                pearson_corr[user_i][user_j] = scipy.stats.pearsonr(data[user_i], data[user_j])[0]
            else:
                pearson_corr[user_i][user_j] = 0
                
    pred_pearson = pearson_corr.dot(data) / np.array([np.abs(pearson_corr).sum(axis=1)]).T
    print(pred_pearson)
    return pred_pearson

In [15]:
#Religious Spiritual Mystical Experience (RSME) : root(mean squre deviation)

#ignore not-zero
def expanded_rmse(predict, target) :
    predict = predict[target.nonzero()].flatten()
    target = target[target.nonzero()].flatten()
    return mean_squared_error(predict, target)

def rmse(predict, target) : #true rmse
    return sqrt(mean_squared_error(predict, target))

print(rmse(cosine_prediction(history), history))
print(rmse(jaccard_prediction(history), history))
print(rmse(pearson_prediction(history), history))

#optimized for cosine_similarity

0.37565950210385957
0.35139707227930866
[[ 1.96076491e-01  2.56285340e-01  1.32484115e-01 -7.03979736e-02
  -4.45695899e-02  5.29936462e-03 -4.39011505e-02  4.16104072e-01
   3.10116780e-01 -7.03979736e-02  3.83099031e-01 -8.92128593e-02
  -1.20008093e-01 -1.29730483e-01  4.08051076e-01 -8.09967028e-02
   5.49094843e-02 -6.88917400e-02  1.67744331e-02  1.67744331e-02]
 [ 3.83415040e-01 -1.78286916e-01 -4.90513763e-02 -4.06352838e-02
  -8.74456525e-02 -1.17723303e-01 -1.14212348e-01 -8.08385540e-02
  -5.63128658e-02 -1.63263725e-01  3.83415040e-01  6.74376573e-02
  -1.01180534e-02  3.45908420e-01 -1.47154129e-02 -1.24022624e-01
   3.52282253e-01  3.83415040e-01  1.39584861e-01  4.09367431e-01]
 [-1.86233007e-02 -1.24650060e-01 -8.48699036e-02  2.90387808e-01
   2.04804229e-01  2.60267704e-01  3.46967744e-01 -9.32459094e-02
  -8.37600577e-03 -4.90918067e-02  1.81055794e-01 -6.86095983e-02
   1.33203240e-02 -1.38509858e-01  3.96059550e-02 -1.73567665e-01
  -8.66934249e-02 -1.03493204e-01 

In [18]:
#data : watch_history
def recommend_item(user_id, data, metric, count) :
    recommend = pd.DataFrame(data=[], index=range(user_id+1), columns=range(N_ITEMS))

    predict_matrix = np.zeros((N_USERS, N_ITEMS))

    if metric == 'cosine':
        predict_matrix = cosine_prediction(data)
    elif metric == 'jaccard':
        predict_matrix = jaccard_prediction(data)
    elif metric == 'pearson' :
        predict_matrix = pearson_prediction(data)
        
    for i in range(N_ITEMS) :
        if(data[user_id][i] == 1) : 
            continue;
        else :
            recommend.ix[0][i] = predict_matrix[user_id][i]
    recommend = recommend.dropna(axis='columns')
    recommended_item = recommend.sort_values(by=user_id, ascending=False, axis=1)
    recommended_item = recommended_item.iloc[user_id][:count]
    return recommended_item

recommend_item(0, history, 'cosine', 5)
print(history)

[[1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1.]
 [0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1.]
 [0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1.]
 [1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1.

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [12]:
#find nearest user with 5
k = 5
#metric : correlation(pearson), cosine, jaccard
neighbor = NearestNeighbors(k, metric='correlation', algorithm='auto')
neighbor.fit(history)
top_k_distances, top_k_users = neighbor.kneighbors(history_train, return_distance=True)

print(top_k_users)

[[15 20  1 39 18]
 [32 29 28 14 37]
 [16 37  4  2 25]
 [35  1 18 24  9]
 [34 10 17 36 27]
 [ 0 21 26 28 14]
 [27  5 11  9 19]
 [ 5 27 11 22 34]
 [11 27  5 19 13]
 [ 1 15 35 20 18]
 [30 38 39 25 26]
 [21  0 38 13 25]
 [ 2 37 27 16  3]
 [31 13 23 10 20]
 [37  2 22 16 32]
 [ 3 39 35 36 24]
 [36 35 39 27 30]
 [23  4 27 36 34]
 [33 25 26 13 10]
 [10 34 36  1 33]
 [22 37 11  5 17]
 [18 35 15  1 32]
 [20 15  1 10 26]
 [ 7 19 17 12 23]
 [14 24 40 28 12]
 [28  8 32 29 14]
 [38 30 26 23 25]]
