In [1]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [116]:
# Reading ratings file
ratings = pd.read_csv('data/ratings.dat', sep='::', encoding='latin-1', names=['userId','MovieID','rating','timestamp'])

  


In [117]:
# Reading movies file
movies = pd.read_csv('data/movies.dat', sep=',', header=None, encoding='latin-1', names = ['MovieID', 'Title', 'Genres'])

In [118]:
df_movies = movies 
df_ratings = ratings

In [119]:
df_movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [107]:
print("Shape of frames: \n"+ " Rating DataFrame"+ str(df_ratings.shape)+"\n Movies DataFrame"+ str(df_movies.shape))

Shape of frames: 
 Rating DataFrame(1000209, 4)
 Movies DataFrame(3883, 3)


In [120]:
merge_ratings_movies = pd.merge(df_movies, df_ratings, on='MovieID', how='inner')

In [121]:
merge_ratings_movies.head()

Unnamed: 0,MovieID,Title,Genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474


In [122]:
merge_ratings_movies.shape

(1000209, 6)

In [123]:
merge_ratings_movies = merge_ratings_movies.drop('timestamp', axis=1)

In [124]:
merge_ratings_movies.shape

(1000209, 5)

In [131]:
# Define a TF-IDF Vectorizer Object.
tfidf_movies_genres = TfidfVectorizer(token_pattern = '[a-zA-Z0-9\-]+')

#Replace NaN with an empty string
df_movies['Genres'] = df_movies['Genres'].replace(to_replace="(no genres listed)", value="")

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_movies_genres_matrix = tfidf_movies_genres.fit_transform(df_movies['Genres'])
# print(tfidf_movies_genres.get_feature_names())
# Compute the cosine similarity matrix
# print(tfidf_movies_genres_matrix.shape)
# print(tfidf_movies_genres_matrix.dtype)
cosine_sim_movies = linear_kernel(tfidf_movies_genres_matrix, tfidf_movies_genres_matrix)
# print(cosine_sim_movies)

In [134]:
def get_recommendations_based_on_genres(movie_title, cosine_sim_movies=cosine_sim_movies):
    """
    Calculates top 2 movies to recommend based on given movie titles genres. 
    :param movie_title: title of movie to be taken for base of recommendation
    :param cosine_sim_movies: cosine similarity between movies 
    :return: Titles of movies recommended to user
    """
    # Get the index of the movie that matches the title
    idx_movie = df_movies.loc[df_movies['Title'].isin([movie_title])]
    idx_movie = idx_movie.index
    
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores_movies = list(enumerate(cosine_sim_movies[idx_movie][0]))
    
    # Sort the movies based on the similarity scores
    sim_scores_movies = sorted(sim_scores_movies, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores_movies = sim_scores_movies[1:3]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores_movies]
    
    # Return the top 2 most similar movies
    return df_movies['Title'].iloc[movie_indices]

In [135]:
get_recommendations_based_on_genres("Father of the Bride Part II (1995)")

18    Ace Ventura: When Nature Calls (1995)
37                      It Takes Two (1995)
Name: Title, dtype: object

In [138]:
def get_recommendation_content_model(userId):
    """
    Calculates top movies to be recommended to user based on movie user has watched.  
    :param userId: userid of user
    :return: Titles of movies recommended to user
    """
    recommended_movie_list = []
    movie_list = []
    df_rating_filtered = df_ratings[df_ratings["userId"]== userId]
    for key, row in df_rating_filtered.iterrows():
        movie_list.append((df_movies["Title"][row["MovieID"]==df_movies["MovieID"]]).values) 
    for index, movie in enumerate(movie_list):
        for key, movie_recommended in get_recommendations_based_on_genres(movie[0]).iteritems():
            recommended_movie_list.append(movie_recommended)

    # removing already watched movie from recommended list    
    for movie_title in recommended_movie_list:
        if movie_title in movie_list:
            recommended_movie_list.remove(movie_title)
    
    return set(recommended_movie_list)
get_recommendation_content_model(1)

{'Ace Ventura: When Nature Calls (1995)',
 'Age of Innocence',
 'Aladdin and the King of Thieves (1996)',
 'All Dogs Go to Heaven 2 (1996)',
 'Amateur (1994)',
 'Beavis and Butt-head Do America (1996)',
 'Beetlejuice (1988)',
 'Believers',
 'Beyond Rangoon (1995)',
 'Birdcage',
 'Bogus (1996)',
 'Breaks',
 'Butcher Boy',
 'Carrington (1995)',
 'Coneheads (1993)',
 'Cool Dry Place',
 'Corruptor',
 'Dirty Dancing (1987)',
 'FairyTale: A True Story (1997)',
 'Full Metal Jacket (1987)',
 'Goofy Movie',
 'Great Race',
 'Gumby: The Movie (1995)',
 'Heaven & Earth (1993)',
 'Horse Whisperer',
 'I Married A Strange Person (1997)',
 'It Takes Two (1995)',
 'Jack and Sarah (1995)',
 "Kid in King Arthur's Court",
 'Kiss of Death (1995)',
 'Knightriders (1981)',
 'Lady and the Tramp (1955)',
 'Last Emperor',
 'Leaving Las Vegas (1995)',
 'Madame Butterfly (1995)',
 'Man with Two Brains',
 'Mortal Kombat (1995)',
 'Muppet Treasure Island (1996)',
 'Mystery Science Theater 3000: The Movie (1996)',
 

In [139]:
from sklearn.neighbors import KNeighborsClassifier  
def get_movie_label(movie_id):
    """
    Get the cluster label to which movie belongs by KNN algorithm.  
    :param movie_id: movie id
    :return: genres label to movie belong
    """
    classifier = KNeighborsClassifier(n_neighbors=5)
    x= tfidf_movies_genres_matrix
    y = df_movies.iloc[:,-1]
    classifier.fit(x, y)
    y_pred = classifier.predict(tfidf_movies_genres_matrix[movie_id])
    return y_pred

In [141]:
true_count = 0
false_count = 0
def evaluate_content_based_model():
    """
    Evaluate content based model.  
    """
    for key, colums in df_movies.iterrows():
        movies_recommended_by_model = get_recommendations_based_on_genres(colums["Title"])
        predicted_genres  = get_movie_label(movies_recommended_by_model.index)
        for predicted_genre in predicted_genres:
            global true_count, false_count
            if predicted_genre == colums["Genres"]:
                true_count = true_count+1
            else:
#                 print(colums["genres"])
#                 print(predicted_genre)
                false_count = false_count +1
evaluate_content_based_model()
total = true_count + false_count
print("Hit:"+ str(true_count/total))
print("Fault:" + str(false_count/total))

Hit:0.8906773113571981
Fault:0.10932268864280195


In [142]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [143]:
df_movies = movies 
df_ratings = ratings

In [144]:

df_movies_ratings=pd.merge(df_movies, df_ratings)

In [145]:
df_movies_ratings.head()

Unnamed: 0,MovieID,Title,Genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474


In [147]:
ratings_matrix_items = df_movies_ratings.pivot_table(index=['MovieID'],columns=['userId'],values='rating').reset_index(drop=True)
ratings_matrix_items.fillna( 0, inplace = True )
ratings_matrix_items.shape

(3706, 6040)

In [148]:
ratings_matrix_items

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
movie_similarity = 1 - pairwise_distances( ratings_matrix_items.as_matrix(), metric="cosine" )
np.fill_diagonal( movie_similarity, 0 ) #Filling diagonals with 0s for future use when sorting is done
ratings_matrix_items = pd.DataFrame( movie_similarity )
ratings_matrix_items

  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,0.000000,0.390349,0.267943,0.178789,0.256569,0.347373,0.301490,0.125709,0.106620,0.377459,...,0.099502,0.020966,0.084105,0.081826,0.045949,0.309676,0.186633,0.093479,0.042829,0.182691
1,0.390349,0.000000,0.240946,0.155457,0.249970,0.244827,0.262772,0.196521,0.158469,0.386200,...,0.061819,0.015209,0.075310,0.095573,0.074271,0.213650,0.140781,0.087013,0.026063,0.122185
2,0.267943,0.240946,0.000000,0.192788,0.308290,0.187020,0.292230,0.092122,0.128378,0.245601,...,0.038492,0.065507,0.049512,0.087377,0.050985,0.190575,0.104837,0.062258,0.010073,0.097786
3,0.178789,0.155457,0.192788,0.000000,0.271990,0.125170,0.220024,0.049554,0.060334,0.133707,...,0.055486,0.053300,0.002227,0.025278,0.025204,0.118902,0.096318,0.022588,0.024769,0.095154
4,0.256569,0.249970,0.308290,0.271990,0.000000,0.148114,0.305107,0.095512,0.138392,0.237681,...,0.026632,0.083898,0.046399,0.047542,0.016156,0.174554,0.092403,0.051633,0.010750,0.112835
5,0.347373,0.244827,0.187020,0.125170,0.148114,0.000000,0.184966,0.055532,0.172145,0.418485,...,0.089106,0.025354,0.017274,0.112076,0.087213,0.236447,0.201419,0.115331,0.029136,0.222836
6,0.301490,0.262772,0.292230,0.220024,0.305107,0.184966,0.000000,0.049023,0.083145,0.248029,...,0.066875,0.051497,0.037842,0.065268,0.051835,0.191689,0.117660,0.059262,0.036102,0.138879
7,0.125709,0.196521,0.092122,0.049554,0.095512,0.055532,0.049023,0.000000,0.045263,0.107235,...,0.028519,0.072446,0.064868,0.059819,0.066350,0.090387,0.080523,0.084976,0.072141,0.045523
8,0.106620,0.158469,0.128378,0.060334,0.138392,0.172145,0.083145,0.045263,0.000000,0.216823,...,0.046188,0.014033,0.020523,0.103986,0.049767,0.092347,0.099554,0.004956,0.000000,0.057881
9,0.377459,0.386200,0.245601,0.133707,0.237681,0.418485,0.248029,0.107235,0.216823,0.000000,...,0.072576,0.049577,0.041950,0.121969,0.090955,0.237227,0.136374,0.097170,0.018359,0.161396


In [153]:
def item_similarity(movieName): 
    """
    recomendates similar movies
   :param data: name of the movie 
   """
    try:
        #user_inp=input('Enter the reference movie title based on which recommendations are to be made: ')
        user_inp=movieName
        inp=df_movies[df_movies['Title']==user_inp].index.tolist()
        inp=inp[0]

        df_movies['similarity'] = ratings_matrix_items.iloc[inp]
        df_movies.columns = ['MovieID', 'Title', 'release_date','similarity']
    except:
        print("Sorry, the movie is not in the database!")

In [166]:
def recommendedMoviesAsperItemSimilarity(user_id):
    """
     Recommending movie which user hasn't watched as per Item Similarity
    :param user_id: user_id to whom movie needs to be recommended
    :return: movieIds to user 
    """
    user_movie= df_movies_ratings[(df_movies_ratings.userId==user_id) & df_movies_ratings.rating.isin([5,4.5])][['Title']]
    user_movie=user_movie.iloc[0,0]
    item_similarity(user_movie)
    sorted_movies_as_per_userChoice=df_movies.sort_values( ["similarity"], ascending = False )
    sorted_movies_as_per_userChoice=sorted_movies_as_per_userChoice[sorted_movies_as_per_userChoice['similarity'] >=0.45]['MovieID']
    recommended_movies=list()
    df_recommended_item=pd.DataFrame()
    user2Movies= df_ratings[df_ratings['userId']== user_id]['MovieID']
    for MovieID in sorted_movies_as_per_userChoice:
            if MovieID not in user2Movies:
                df_new= df_ratings[(df_ratings.MovieID==MovieID)]
                df_recommended_item=pd.concat([df_recommended_item,df_new])
            best10=df_recommended_item.sort_values(["rating"], ascending = False )[1:10] 
    return best10['MovieID']

In [167]:
def movieIdToTitle(listMovieIDs):
    """
     Converting movieId to titles
    :param user_id: List of movies
    :return: movie titles
    """
    movie_titles= list()
    for id in listMovieIDs:
        movie_titles.append(df_movies[df_movies['MovieID']==id]['Title'])
    return movie_titles

In [168]:
user_id=215
print("Recommended movies:\n",movieIdToTitle(recommendedMoviesAsperItemSimilarity(user_id)))

Recommended movies,:
 [1120    Monty Python and the Holy Grail (1974)
Name: Title, dtype: object, 49    Usual Suspects
Name: Title, dtype: object, 1120    Monty Python and the Holy Grail (1974)
Name: Title, dtype: object, 49    Usual Suspects
Name: Title, dtype: object, 49    Usual Suspects
Name: Title, dtype: object, 1120    Monty Python and the Holy Grail (1974)
Name: Title, dtype: object, 1120    Monty Python and the Holy Grail (1974)
Name: Title, dtype: object, 49    Usual Suspects
Name: Title, dtype: object, 1120    Monty Python and the Holy Grail (1974)
Name: Title, dtype: object]


In [169]:
ratings_matrix_users = df_movies_ratings.pivot_table(index=['userId'],columns=['MovieID'],values='rating').reset_index(drop=True)
ratings_matrix_users.fillna( 0, inplace = True )
movie_similarity = 1 - pairwise_distances( ratings_matrix_users.as_matrix(), metric="cosine" )
np.fill_diagonal( movie_similarity, 0 ) #Filling diagonals with 0s for future use when sorting is done
ratings_matrix_users = pd.DataFrame( movie_similarity )
ratings_matrix_users

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6030,6031,6032,6033,6034,6035,6036,6037,6038,6039
0,0.000000,0.096382,0.120610,0.132455,0.090158,0.179222,0.059678,0.138241,0.226148,0.255288,...,0.170588,0.082006,0.069807,0.033663,0.114877,0.186329,0.135979,0.000000,0.174604,0.133590
1,0.096382,0.000000,0.151479,0.171176,0.114394,0.100865,0.305787,0.203337,0.190198,0.226861,...,0.112503,0.091222,0.268565,0.014286,0.183384,0.228241,0.206274,0.066118,0.066457,0.218276
2,0.120610,0.151479,0.000000,0.151227,0.062907,0.074603,0.138332,0.077656,0.126457,0.213655,...,0.092960,0.125864,0.161507,0.000000,0.097308,0.143264,0.107744,0.120234,0.094675,0.133144
3,0.132455,0.171176,0.151227,0.000000,0.045094,0.013529,0.130339,0.100856,0.093651,0.120738,...,0.163629,0.093041,0.382803,0.000000,0.082097,0.170583,0.127464,0.062907,0.064634,0.137968
4,0.090158,0.114394,0.062907,0.045094,0.000000,0.047449,0.126257,0.220817,0.261330,0.117052,...,0.100652,0.035732,0.061806,0.054151,0.179083,0.293365,0.172686,0.020459,0.027689,0.241437
5,0.179222,0.100865,0.074603,0.013529,0.047449,0.000000,0.049982,0.075234,0.111123,0.204938,...,0.037084,0.080180,0.023334,0.000000,0.053685,0.093583,0.065788,0.065711,0.167303,0.083436
6,0.059678,0.305787,0.138332,0.130339,0.126257,0.049982,0.000000,0.237550,0.162306,0.092559,...,0.112287,0.018864,0.349259,0.000000,0.116922,0.122441,0.111673,0.000000,0.014977,0.080680
7,0.138241,0.203337,0.077656,0.100856,0.220817,0.075234,0.237550,0.000000,0.291369,0.154112,...,0.080165,0.029838,0.133244,0.017670,0.275391,0.227400,0.144395,0.019242,0.044660,0.148123
8,0.226148,0.190198,0.126457,0.093651,0.261330,0.111123,0.162306,0.291369,0.000000,0.239249,...,0.175427,0.023395,0.101233,0.047537,0.234626,0.239607,0.225055,0.093470,0.046434,0.215819
9,0.255288,0.226861,0.213655,0.120738,0.117052,0.204938,0.092559,0.154112,0.239249,0.000000,...,0.179099,0.223739,0.161627,0.061829,0.314958,0.338072,0.246902,0.113789,0.296776,0.255793


In [170]:
ratings_matrix_users.idxmax(axis=1)

0       5342
1       3107
2       2999
3       4142
4       1483
5       1091
6       1848
7        367
8       1772
9       1119
10      3279
11      5258
12      3554
13      5543
14      4216
15      4243
16      2012
17      2179
18      2456
19      1383
20      1352
21      2014
22      4731
23      1039
24      4011
25      5255
26      1445
27      5733
28      3905
29      1549
        ... 
6010    5981
6011    5966
6012     499
6013    1372
6014    2028
6015    5642
6016    5788
6017     948
6018    4718
6019    5841
6020    5930
6021    3169
6022    2185
6023     603
6024     725
6025    5777
6026    3552
6027    2145
6028    5780
6029    5337
6030    2688
6031    2139
6032    5841
6033    5998
6034     880
6035    1014
6036    2063
6037    2208
6038     930
6039    1631
Length: 6040, dtype: int64

In [171]:
ratings_matrix_users.idxmax(axis=1).sample( 10, random_state = 10 )

5720    5776
6011    5966
290     3750
1985    1255
1065    2744
413      436
3600    3270
4635    4845
1969    1904
5698    4883
dtype: int64

In [172]:
similar_user_series= ratings_matrix_users.idxmax(axis=1)
df_similar_user= similar_user_series.to_frame()

In [173]:
df_similar_user.columns=['similarUser']

In [181]:
movieId_recommended=list()
def getRecommendedMoviesAsperUserSimilarity(userId):
    """
     Recommending movies which user hasn't watched as per User Similarity
    :param user_id: user_id to whom movie needs to be recommended
    :return: movieIds to user 
    """
    user2Movies= df_ratings[df_ratings['userId']== userId]['MovieID']
    sim_user=df_similar_user.iloc[0,0]
    df_recommended=pd.DataFrame(columns=['MovieID','Title','Genres','userId','rating','timestamp'])
    for MovieID in df_ratings[df_ratings['userId']== sim_user]['MovieID']:
        if MovieID not in user2Movies:
            df_new= df_movies_ratings[(df_movies_ratings.userId==sim_user) & (df_movies_ratings.MovieID==MovieID)]
            df_recommended=pd.concat([df_recommended,df_new])
        best10=df_recommended.sort_values(['rating'], ascending = False )[1:10]  
    return best10['MovieID']

In [182]:
user_id=215
recommend_movies= movieIdToTitle(getRecommendedMoviesAsperUserSimilarity(user_id))
print("Movies you should watch are:\n")
print(recommend_movies)

Movies you should watch are:

[1366    Jaws (1975)
Name: Title, dtype: object, 1885    Rocky (1976)
Name: Title, dtype: object, 585    Terminator 2: Judgment Day (1991)
Name: Title, dtype: object, 3458    Predator (1987)
Name: Title, dtype: object, 3439    Outlaw Josey Wales
Name: Title, dtype: object, 1212    Right Stuff
Name: Title, dtype: object, 1250    Back to the Future (1985)
Name: Title, dtype: object, 1255    Highlander (1986)
Name: Title, dtype: object, 1568    Hunt for Red October
Name: Title, dtype: object]


In [183]:
def get_user_similar_movies( user1, user2 ):
    
    """
     Returning common movies and ratings of same for both the users
    :param user1,user2: user ids of 2 users need to compare
    :return: movieIds to user 
    """
    common_movies = df_movies_ratings[df_movies_ratings.userId == user1].merge(
      df_movies_ratings[df_movies_ratings.userId == user2],
      on = "MovieID",
      how = "inner" )
    common_movies.drop(['MovieID','Genres_x','Genres_y', 'timestamp_x','timestamp_y','Title_y'],axis=1,inplace=True)
    return common_movies

In [184]:
get_user_similar_movies(587,511)

Unnamed: 0,Title_x,userId_x,rating_x,userId_y,rating_y
0,Maltese Falcon,587,2,511,4
1,Doors,587,2,511,3
2,Austin Powers: The Spy Who Shagged Me (1999),587,4,511,5
3,Blair Witch Project,587,3,511,3
4,Ghostbusters (1984),587,3,511,4
5,Mosquito Coast,587,3,511,3
6,Being John Malkovich (1999),587,3,511,5
7,Do the Right Thing (1989),587,4,511,3
8,Diner (1982),587,2,511,3


In [187]:
from scipy.sparse.linalg import svds

In [190]:
from surprise import Reader, Dataset, SVD, evaluate
from surprise.model_selection import cross_validate

# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['userId', 'MovieID', 'rating']], reader)

# Split the dataset for 5-fold evaluation
data.split(n_folds=5)

In [191]:
svd = SVD()

In [192]:
trainset = data.build_full_trainset()
svd.train(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1fb8696cef0>

In [193]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVDpp
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

In [196]:
# Use movielens-1m data
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.15)
type(data)

surprise.dataset.DatasetAutoFolds

In [197]:
algo_svdpp = SVDpp(n_factors=160, n_epochs=10, lr_all=0.005, reg_all=0.1)
algo_svdpp.fit(trainset)
test_pred = algo_svdpp.test(testset)
print("SVDpp : Test Set")
accuracy.rmse(test_pred, verbose=True)

SVDpp : Test Set
RMSE: 0.9508


0.9507849826474443

In [200]:
def evaluation_collaborative_svd_model(userId,userOrItem):
    """
    hydrid the functionality of Collaborative based and svd based model to see if ratings of predicted movies 
    :param userId: userId of user, userOrItem is a boolean value if True it is User-User and if false Item-Item
    :return: dataframe of movies and ratings
    """ 
    movieIdsList= list()
    movieRatingList=list()
    movieIdRating= pd.DataFrame(columns=['MovieID','rating'])
    if userOrItem== True:
        movieIdsList=getRecommendedMoviesAsperUserSimilarity(userId)
    else:
        movieIdsList=recommendedMoviesAsperItemSimilarity(user_id)
    for MovieID in movieIdsList:
        predict = svd.predict(userId, MovieID)
        movieRatingList.append([MovieID,predict.est])
        movieIdRating = pd.DataFrame(np.array(movieRatingList), columns=['MovieID','rating'])
        count=movieIdRating[(movieIdRating['rating'])>=3]['MovieID'].count()
        total=movieIdRating.shape[0]
        hit_ratio= count/total
    return hit_ratio

In [201]:
print("Hit ratio of User-user collaborative filtering")
print(evaluation_collaborative_svd_model(user_id,True))
print("Hit ratio of Item-Item collaborative filtering")
print(evaluation_collaborative_svd_model(user_id,False))

Hit ratio of User-user collaborative filtering
1.0
Hit ratio of Item-Item collaborative filtering
1.0


In [202]:
df_movies=movies
def hybrid_content_svd_model(userId):
    """
    hydrid the functionality of content based and svd based model to recommend user top 10 movies. 
    :param userId: userId of user
    :return: list of movies recommended with rating given by svd model
    """
    recommended_movies_by_content_model = get_recommendation_content_model(userId)
    recommended_movies_by_content_model = df_movies[df_movies.apply(lambda movie: movie["Title"] in recommended_movies_by_content_model, axis=1)]
    for key, columns in recommended_movies_by_content_model.iterrows():
        predict = svd.predict(userId, columns["MovieID"])
        recommended_movies_by_content_model.loc[key, "svd_rating"] = predict.est
#         if(predict.est < 2):
#             recommended_movies_by_content_model = recommended_movies_by_content_model.drop([key])
    return recommended_movies_by_content_model.sort_values("svd_rating", ascending=False).iloc[0:11]
        
hybrid_content_svd_model(user_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,MovieID,Title,release_date,similarity,svd_rating
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,0.570125,4.989634
3045,3114,Toy Story 2 (1999),Animation|Children's|Comedy,0.283602,4.856262
1281,1301,Forbidden Planet (1956),Sci-Fi,0.386306,4.69692
930,942,Laura (1944),Crime|Film-Noir|Mystery,0.032268,4.584068
1891,1960,Last Emperor,The (1987),0.230735,4.576631
731,741,Ghost in the Shell (Kokaku kidotai) (1995),Animation|Sci-Fi,0.03931,4.572241
25,26,Othello (1995),Drama,0.13079,4.543656
1385,1408,Last of the Mohicans,The (1992),0.053745,4.53269
408,412,Age of Innocence,The (1993),0.11201,4.421448
3232,3301,Whole Nine Yards,The (2000),0.033632,4.411258
