# User based recommendations

In [1]:
import pandas as pd
import scipy
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

In [2]:
df = pd.read_csv('../data/ratings_pivot.csv')

In [3]:
df = df.set_index('Cust_Id')

In [4]:
df

Unnamed: 0_level_0,30,58,77,143,252,257,258,331,341,362,...,17343,17383,17424,17479,17515,17563,17611,17621,17697,17725
Cust_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,3.0,0.0,5.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,4.0,3.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,3.0,0.0,0.0,4.0,4.0,4.0,0.0,5.0,0.0,4.0,...,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65529,4.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,3.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
65530,4.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0
65532,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
65533,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,5.0,0.0,0.0


### Need to research best way to handle very sparse matrices
- https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
    - "It is computationally expensive to represent and work with sparse matrices as though they are dense, and much improvement in performance can be achieved by using representations and operations that specifically handle the matrix sparsity."
- There are alternative data structures that would be more memory efficient for these type of matrices
- https://pandas.pydata.org/docs/user_guide/sparse.html

### Sparse Data Structure for efficiency

In [5]:
sparse_data = scipy.sparse.csr_matrix(normalize(df.values))

In [6]:
print(sparse_data.toarray())

[[0.0835593  0.         0.         ... 0.         0.         0.        ]
 [0.07265392 0.         0.         ... 0.         0.         0.        ]
 [0.09441212 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.10069467 0.         0.10069467 ... 0.12586834 0.         0.        ]
 [0.13137724 0.         0.         ... 0.16422155 0.         0.        ]
 [0.14210511 0.         0.         ... 0.         0.08526306 0.        ]]


In [7]:
cosine_similarity(sparse_data[0], sparse_data[1])

array([[0.58820381]])

In [8]:
#!pip install sortedcollections

In [9]:
#dictionary for id -> index pairs
id_index_dic = {}
for i, j in enumerate(df.index):
    id_index_dic[j] = i

In [10]:
id_index_dic[5]

3

In [49]:
from sortedcontainers import SortedList
'''
    Get top n similar users based on a given user
    
    params:
        user: user index of interest
        data: user ratings matrix
        n: number of similar users to return 
        
    returns:
        top_scores: 2d list of score and id 
'''
def get_similar_users(user: int, data, n: int):
    top_scores = SortedList(key = lambda x: -x[0])
    for u in range(len(data.toarray())):
        score = cosine_similarity(data[user], data[u])
        top_scores.add([score, u])
    return top_scores[:n+1] #n+1 since current index would be included
    

In [12]:
%%time
# get similar users of user 0 
results = get_similar_users(0, sparse_data, 5)

CPU times: user 40 s, sys: 116 ms, total: 40.1 s
Wall time: 40.1 s


40 second runtime for a single user :(

In [13]:
results

[[array([[1.]]), 0],
 [array([[0.72543739]]), 38299],
 [array([[0.71573383]]), 51704],
 [array([[0.71272392]]), 49760],
 [array([[0.70688714]]), 1303],
 [array([[0.69327405]]), 5284]]

I think creating a local DB and storing this stuff would be better

In [47]:
def proportion_similar(u1, u2):
    user1 = set(df.loc[u1][df.loc[u1] > 0].index)
    user2 = set(df.loc[u2][df.loc[u2] > 0].index) 
    return (len(user1.intersection(user2)) / len(user1)), (len(user2))

In [48]:
for i in results:
    r, l = proportion_similar(results[0][1], i[1])
    print(r, i[1], l)

1.0 0 98
0.336734693877551 38299 51
0.7448979591836735 51704 153
0.35714285714285715 49760 63
0.6428571428571429 1303 93
0.4897959183673469 5284 82


### Get movie intersection of all users

In [54]:
results.pop(0)

[array([[1.]]), 0]

In [65]:
results

[[array([[0.72543739]]), 38299],
 [array([[0.71573383]]), 51704],
 [array([[0.71272392]]), 49760],
 [array([[0.70688714]]), 1303],
 [array([[0.69327405]]), 5284]]

In [71]:
type(df.loc[results[0][1]][df.loc[results[0][1]] > 0].index)

pandas.core.indexes.base.Index

In [None]:
list(set.intersection(*map(set, )))

TypeError: unhashable type: 'numpy.ndarray'

In [75]:
def movie_intersection(r, df):
    users = []
    for i in r:
        users.append(set(df.loc[i[1]][df.loc[i[1]] > 0].index))
        
    result = users[0]
    for i in range(1, len(users)):
        result = set(result) & set(users[i])
                     
    return result

In [82]:
similar_user_movies = movie_intersection(results, df)

In [83]:
user1_movies = set(df.loc[0][df.loc[0] > 0].index)

In [84]:
similar_user_movies - user1_movies

{'16954', '9728'}

The above results is the only movies that all of user 0's similar users have seen that they havent

This is useful for existing users, but I would like to figure out a way to incorporate this into recommendations based off movies instead of a user