In [64]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise import NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, NMF

In [6]:
movie_df = pd.read_csv('movies.dat', sep = '::', header = None, names = ['movieId', 'Title', 'Genre'])
movie_df.head()

  if __name__ == '__main__':


Unnamed: 0,movieId,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
user_df = pd.read_csv('users.dat', sep = '::', header=None, names = ['userId', 'Gender', 'Age', 'Occupation', 'zipCode'])
user_df.head()

  if __name__ == '__main__':


Unnamed: 0,userId,Gender,Age,Occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [10]:
ratings_df = pd.read_csv('ratings.dat', sep='::', header=None, names = ['userId', 'movieId', 'rating', 'timestamp'])
ratings_df.head()

  if __name__ == '__main__':


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [35]:
print(ratings_df.userId.unique().shape)
print(user_df.userId.unique().shape)
print(sum(ratings_df.groupby(['userId']).size() >=20))
print(ratings_df.movieId.unique().shape)
print(movie_df.movieId.unique().shape) #more movies here than in the ratings list
ratings_df.head()

(6040,)
(6040,)
6040
(3706,)
(3883,)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [38]:
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


In [39]:
reader = Reader(rating_scale = (1, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

In [41]:
cross_validate(NormalPredictor(), data, cv=3, verbose=True)
cross_validate(BaselineOnly(), data, cv=3, verbose=True)
cross_validate(SVD(), data, cv=3, verbose=True)
cross_validate(NMF(), data, cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.5059  1.5030  1.5060  1.5050  0.0014  
MAE (testset)     1.2066  1.2055  1.2079  1.2067  0.0010  
Fit time          0.90    1.28    1.07    1.08    0.16    
Test time         4.27    4.36    4.84    4.49    0.25    
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9094  0.9099  0.9104  0.9099  0.0004  
MAE (testset)     0.7201  0.7206  0.7213  0.7206  0.0005  
Fit time          2.12    2.27    2.54    2.31    0.17    
Test time         3.52    3.90    3.91    3.78    0.18    
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8863  0.8840  0.8869  0.8857  0.0013  
MAE (testset)   

{'fit_time': (48.47248291969299, 52.07418489456177, 47.94432616233826),
 'test_mae': array([0.72700391, 0.726688  , 0.72666178]),
 'test_rmse': array([0.92032959, 0.92024836, 0.9198904 ]),
 'test_time': (3.6545698642730713, 3.9595420360565186, 4.272303819656372)}

In [65]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=0.25)
for n_factors in [20, 50, 100]:
    algo = SVD(n_factors=n_factors)
    algo.fit(trainset)
    pred = algo.test(testset)
    print(accuracy.mae(pred))

MAE:  0.6903
0.6902851869518045
MAE:  0.6875
0.6874756750278672
MAE:  0.6908
0.6908261177088539


In [66]:
trainset = data.build_full_trainset()
algo = SVD(n_factors=50)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10eb9c5f8>

In [67]:
def get_recommend(userId, algo):
    """
    return both the movies a user liked
    as well as
    movies that a user might like
    in two df
    """
    from collections import defaultdict
    user_recom = defaultdict(list)
    for movieId in ratings_df.movieId.unique():
        user_recom['movieId'].append(movieId)
        user_recom['rating'].append(algo.predict(userId, movieId).est)
    user_recom = pd.DataFrame(user_recom)
    user_like = ratings_df[ratings_df.userId == userId].sort_values('rating', ascending=False)
    user_recom = user_recom[~user_recom.movieId.isin(user_like.movieId)]
    user_recom = user_recom.sort_values('rating', ascending = False)
    user_like = user_like.merge(movie_df, how='left', on ='movieId')
    user_recom = user_recom.merge(movie_df, how='left', on = 'movieId')
    return user_like, user_recom

In [68]:
user_like, user_recom = get_recommend(837, algo)

In [69]:
user_like.head(20)

Unnamed: 0,userId,movieId,rating,timestamp,Title,Genre
0,837,858,5,975360036,"Godfather, The (1972)",Action|Crime|Drama
1,837,1387,5,975360036,Jaws (1975),Action|Horror
2,837,2028,5,975360089,Saving Private Ryan (1998),Action|Drama|War
3,837,1221,5,975360036,"Godfather: Part II, The (1974)",Action|Crime|Drama
4,837,913,5,975359921,"Maltese Falcon, The (1941)",Film-Noir|Mystery
5,837,3417,5,975360893,"Crimson Pirate, The (1952)",Adventure|Comedy|Sci-Fi
6,837,2186,4,975359955,Strangers on a Train (1951),Film-Noir|Thriller
7,837,2791,4,975360893,Airplane! (1980),Comedy
8,837,1188,4,975360920,Strictly Ballroom (1992),Comedy|Romance
9,837,1304,4,975360058,Butch Cassidy and the Sundance Kid (1969),Action|Comedy|Western


In [70]:
user_recom.head(20)

Unnamed: 0,movieId,rating,Title,Genre
0,527,4.028868,Schindler's List (1993),Drama|War
1,912,3.973781,Casablanca (1942),Drama|Romance|War
2,953,3.972036,It's a Wonderful Life (1946),Drama
3,1236,3.91591,Trust (1990),Comedy|Drama
4,1287,3.896967,Ben-Hur (1959),Action|Adventure|Drama
5,904,3.887683,Rear Window (1954),Mystery|Thriller
6,2905,3.882226,Sanjuro (1962),Action|Adventure
7,1272,3.840225,Patton (1970),Drama|War
8,3469,3.822542,Inherit the Wind (1960),Drama
9,1254,3.817188,"Treasure of the Sierra Madre, The (1948)",Adventure


3653