In [158]:
import pandas as pd
import numpy as np

data = pd.read_csv('ml-latest-small/ratings.csv')
data['userId'] = data['userId'].astype('str')
data['movieId'] = data['movieId'].astype('str')

users = data['userId'].unique() #list of all users
movies = data['movieId'].unique() #list of all movies

print("Number of users", len(users))
print("Number of movies", len(movies))

data.head()

Number of users 610
Number of movies 9724


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


#### https://heartbeat.fritz.ai/recommender-systems-with-python-part-iii-collaborative-filtering-singular-value-decomposition-5b5dcb3f242b

In [1]:
import pandas as pd
import numpy as np
import os

data_path = 'ml-latest-small'
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'
df_movies = pd.read_csv(
    os.path.join(data_path, movies_filename),
#     movies_filename,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'})
df_ratings = pd.read_csv(
    os.path.join(data_path, ratings_filename),
#     ratings_filename,
    usecols=['userId', 'movieId', 'rating'],
    
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [2]:
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [5]:
df_movies.size

19484

In [3]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
df_ratings.size

302508

In [7]:
# Pivot Ratings into Movie Features

df_ratings=df_ratings[:2000000]
df_movie_features = df_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

In [13]:
df_movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
R = df_movie_features.values

In [34]:
R.shape

(610, 9724)

In [28]:
R

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]], dtype=float32)

In [29]:
user_ratings_mean = np.mean(R, axis = 1)

In [35]:
user_ratings_mean.shape

(610,)

In [30]:
user_ratings_mean

array([0.10417524, 0.01177499, 0.00976964, 0.07897984, 0.01645413,
       0.11281366, 0.05049362, 0.01727684, 0.01542575, 0.0472028 ,
       0.02488688, 0.01444879, 0.01162073, 0.01676265, 0.04787125,
       0.037536  , 0.04545455, 0.19266763, 0.18850267, 0.08936652,
       0.14854997, 0.03146853, 0.04540313, 0.04128959, 0.01285479,
       0.00699301, 0.04925957, 0.1770362 , 0.03450226, 0.01655697,
       0.02015631, 0.03938708, 0.06077746, 0.03023447, 0.0096668 ,
       0.01624846, 0.00894694, 0.02581242, 0.04113533, 0.03990128,
       0.07260387, 0.16135335, 0.0533731 , 0.01655697, 0.15903948,
       0.01727684, 0.04396339, 0.0136775 , 0.00920403, 0.08864665,
       0.13939737, 0.05985191, 0.01028383, 0.01028383, 0.00730152,
       0.01799671, 0.16608392, 0.04494035, 0.04792267, 0.00843274,
       0.01624846, 0.15364048, 0.10119292, 0.2003805 , 0.01408885,
       0.14263678, 0.01470588, 0.4190148 , 0.02067051, 0.02756068,
       0.01295763, 0.01923077, 0.08016249, 0.07774578, 0.02293

In [39]:
user_ratings_mean.reshape(-1, 1)

array([[0.10417524],
       [0.01177499],
       [0.00976964],
       [0.07897984],
       [0.01645413],
       [0.11281366],
       [0.05049362],
       [0.01727684],
       [0.01542575],
       [0.0472028 ],
       [0.02488688],
       [0.01444879],
       [0.01162073],
       [0.01676265],
       [0.04787125],
       [0.037536  ],
       [0.04545455],
       [0.19266763],
       [0.18850267],
       [0.08936652],
       [0.14854997],
       [0.03146853],
       [0.04540313],
       [0.04128959],
       [0.01285479],
       [0.00699301],
       [0.04925957],
       [0.1770362 ],
       [0.03450226],
       [0.01655697],
       [0.02015631],
       [0.03938708],
       [0.06077746],
       [0.03023447],
       [0.0096668 ],
       [0.01624846],
       [0.00894694],
       [0.02581242],
       [0.04113533],
       [0.03990128],
       [0.07260387],
       [0.16135335],
       [0.0533731 ],
       [0.01655697],
       [0.15903948],
       [0.01727684],
       [0.04396339],
       [0.013

In [31]:
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [32]:
R_demeaned

array([[ 3.8958247 , -0.10417524,  3.8958247 , ..., -0.10417524,
        -0.10417524, -0.10417524],
       [-0.01177499, -0.01177499, -0.01177499, ..., -0.01177499,
        -0.01177499, -0.01177499],
       [-0.00976964, -0.00976964, -0.00976964, ..., -0.00976964,
        -0.00976964, -0.00976964],
       ...,
       [ 2.2321575 ,  1.7321576 ,  1.7321576 , ..., -0.26784244,
        -0.26784244, -0.26784244],
       [ 2.9875565 , -0.01244344, -0.01244344, ..., -0.01244344,
        -0.01244344, -0.01244344],
       [ 4.506119  , -0.4938811 , -0.4938811 , ..., -0.4938811 ,
        -0.4938811 , -0.4938811 ]], dtype=float32)

In [36]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50)

In [38]:
sigma.size

50

In [37]:
sigma

array([ 67.8663  ,  68.19671 ,  69.02678 ,  69.417046,  69.91864 ,
        70.02093 ,  70.1941  ,  71.67445 ,  72.43372 ,  73.21883 ,
        73.437614,  74.02645 ,  74.28977 ,  74.92086 ,  75.17529 ,
        75.59326 ,  76.7023  ,  77.35721 ,  78.39407 ,  79.04347 ,
        79.21218 ,  80.56748 ,  81.546745,  82.19738 ,  83.04449 ,
        85.11692 ,  85.74874 ,  86.51714 ,  87.915474,  90.33577 ,
        90.93405 ,  92.26269 ,  93.39979 ,  97.10069 ,  99.28906 ,
        99.82361 , 101.84787 , 105.97361 , 107.04782 , 109.20838 ,
       112.80841 , 120.61546 , 122.64724 , 134.58719 , 139.63737 ,
       153.93103 , 163.7309  , 184.86186 , 231.22456 , 474.20605 ],
      dtype=float32)

In [11]:
# that the Sigma$ returned is just the values instead of a diagonal matrix. 
# This is useful, but since I'm going to leverage matrix multiplication to get predictions 
# I'll convert it to the diagonal matrix form.
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [12]:
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = df_movie_features.columns)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.167359,0.402757,0.840174,-0.076284,-0.551335,2.504077,-0.890116,-0.026441,0.196975,1.593262,...,-0.023453,-0.019968,-0.026939,-0.026939,-0.023453,-0.026939,-0.023453,-0.023453,-0.023453,-0.058732
1,0.211461,0.006658,0.033455,0.017419,0.18343,-0.06247,0.083036,0.024158,0.049331,-0.152531,...,0.019498,0.016777,0.022219,0.022219,0.019498,0.022219,0.019498,0.019498,0.019498,0.032281
2,0.003589,0.030517,0.046393,0.008176,-0.006246,0.10733,-0.012416,0.003779,0.007297,-0.059364,...,0.005909,0.006209,0.00561,0.00561,0.005909,0.00561,0.005909,0.005909,0.005909,0.008004
3,2.051538,-0.387103,-0.2522,0.087563,0.130465,0.270202,0.477836,0.040313,0.025857,-0.017364,...,0.004836,0.004172,0.005499,0.005499,0.004836,0.005499,0.004836,0.004836,0.004836,-0.023311
4,1.344738,0.778512,0.065747,0.111744,0.273143,0.584421,0.25493,0.128789,-0.085542,1.023455,...,-0.008042,-0.007419,-0.008664,-0.008664,-0.008042,-0.008664,-0.008042,-0.008042,-0.008042,-0.010127


In [75]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False) # UserID starts at 1

    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.userId == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

#     print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
#     print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[~movies_df['movieId'].isin(user_full['movieId'])]).merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left', left_on = 'movieId',
               right_on = 'movieId').rename(columns = {user_row_number: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :-1]
                    
    return user_full, recommendations


In [76]:
already_rated, predictions = recommend_movies(preds_df, 330, df_movies, df_ratings, 10)

In [17]:
already_rated.head(10)

Unnamed: 0,userId,movieId,rating,title
246,330,79702,5.0,Scott Pilgrim vs. the World (2010)
155,330,2324,5.0,Life Is Beautiful (La Vita è bella) (1997)
208,330,4886,5.0,"Monsters, Inc. (2001)"
204,330,4226,5.0,Memento (2000)
202,330,4022,5.0,Cast Away (2000)
183,330,3052,5.0,Dogma (1999)
55,330,551,5.0,"Nightmare Before Christmas, The (1993)"
57,330,555,5.0,True Romance (1993)
65,330,593,5.0,"Silence of the Lambs, The (1991)"
168,330,2712,5.0,Eyes Wide Shut (1999)


In [18]:
predictions

Unnamed: 0,movieId,title
2807,4027,"O Brother, Where Art Thou? (2000)"
1424,2115,Indiana Jones and the Temple of Doom (1984)
4200,6539,Pirates of the Caribbean: The Curse of the Bla...
1147,1704,Good Will Hunting (1997)
2171,3114,Toy Story 2 (1999)
817,1220,"Blues Brothers, The (1980)"
3427,4995,"Beautiful Mind, A (2001)"
707,1036,Die Hard (1988)
408,527,Schindler's List (1993)
3416,4979,"Royal Tenenbaums, The (2001)"


In [19]:
# LU decomposition
from numpy import array
from scipy.linalg import lu
# define a square matrix
A = array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(A)
# LU decomposition
P, L, U = lu(A) 
print(P)
print(L)
print(U)

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
[[1.         0.         0.        ]
 [0.14285714 1.         0.        ]
 [0.57142857 0.5        1.        ]]
[[7.         8.         9.        ]
 [0.         0.85714286 1.71428571]
 [0.         0.         0.        ]]


In [21]:
# QR decomposition
from numpy import array
from numpy.linalg import qr
# define a 3x2 matrix
A = array([[1, 2], [3, 4], [5, 6]])
print(A)
# QR decomposition
Q, R = qr(A, 'complete')
print(Q)
print(R)
# reconstruct
B = Q.dot(R)
print(B)

[[1 2]
 [3 4]
 [5 6]]
[[-0.16903085  0.89708523  0.40824829]
 [-0.50709255  0.27602622 -0.81649658]
 [-0.84515425 -0.34503278  0.40824829]]
[[-5.91607978 -7.43735744]
 [ 0.          0.82807867]
 [ 0.          0.        ]]
[[1. 2.]
 [3. 4.]
 [5. 6.]]


In [22]:
# Cholesky decomposition
from numpy import array
from numpy.linalg import cholesky
# define a 3x3 matrix
A = array([[2, 1, 1], [1, 2, 1], [1, 1, 2]])
print(A)
# Cholesky decomposition
L = cholesky(A)
print(L)
# reconstruct
B = L.dot(L.T)
print(B)

[[2 1 1]
 [1 2 1]
 [1 1 2]]
[[1.41421356 0.         0.        ]
 [0.70710678 1.22474487 0.        ]
 [0.70710678 0.40824829 1.15470054]]
[[2. 1. 1.]
 [1. 2. 1.]
 [1. 1. 2.]]


In [2]:
import pandas as pd
d = {'USER_ID': ['USER 1', 'USER 2', 'USER 1', 'USER 3'],
     "Category": ['Green', 'Blue', 'Red', 'Green'],
     'Height': ['172cm', '169cm', '153cm', '172cm'],
     'Weight': ['69kg', '61kg', '41kg', '59kg']}
df = pd.DataFrame(data=d)

In [3]:
df

Unnamed: 0,USER_ID,Category,Height,Weight
0,USER 1,Green,172cm,69kg
1,USER 2,Blue,169cm,61kg
2,USER 1,Red,153cm,41kg
3,USER 3,Green,172cm,59kg


In [5]:
A = df.pivot(index='USER_ID', columns='Category')

In [11]:
A

Unnamed: 0_level_0,Height,Height,Height,Weight,Weight,Weight
Category,Blue,Green,Red,Blue,Green,Red
USER_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
USER 1,,172cm,153cm,,69kg,41kg
USER 2,169cm,,,61kg,,
USER 3,,172cm,,,59kg,


In [13]:
A['Height',['Blue', 'Green']]

TypeError: '('Height', ['Blue', 'Green'])' is an invalid key