In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate
from surprise import SVD

In [2]:
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
X = ratings.copy()
y = ratings['userId']
#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state=42)

In [3]:
# read the ratings data from the CSV file
def read_ratings_user(ratings_df):
    
    # pivot the DataFrame to create a matrix of user ratings
    ratings_matrix = ratings_df.pivot_table(index='userId', columns='movieId', values='rating')
    # fill missing values with zeros
    ratings_matrix = ratings_matrix.fillna(0)

    return ratings_matrix

def read_ratings_item(ratings_df):

    # pivot the DataFrame to create a matrix of user ratings
    ratings_matrix = ratings_df.pivot_table(index='movieId', columns='userId', values='rating')
    # fill missing values with zeros
    ratings_matrix = ratings_matrix.fillna(0)

    return ratings_matrix


def cf(user_id, k, ratings_df, movie_id):
    user_ratings = ratings_df.loc[user_id]

    # filter the ratings data to include only the users who have rated the movie
    movie_ratings = ratings_df[ratings_df[movie_id] != 0]

    # calculate the cosine similarity between user_id and the users who have rated the movie
    similarity_matrix = cosine_similarity(user_ratings.values.reshape(1, -1), movie_ratings.values)

    # create a dataframe of (user_id, similarity) pairs
    user_similarity_df = pd.DataFrame(index = movie_ratings.index, data = similarity_matrix[0], columns=['similarity'])

    # user_similarity_df

    # sort the users by their similarity to user_id
    sorted_users = user_similarity_df.sort_values(by='similarity', ascending=False)

    # # get the top-k most similar users
    top_k_users = sorted_users.iloc[1:k+1]

    num = 0
    den = 0
    numerator = 0
    denominator = 0
    for top_user_id in top_k_users.index:
        # print('a')
        # print(top_user_id)
        numerator = user_similarity_df.loc[user_similarity_df.index == top_user_id, 'similarity'] * ratings_df.loc[top_user_id, movie_id]
        num = num + numerator.values
        denominator = user_similarity_df.loc[user_similarity_df.index == top_user_id, 'similarity']
        den = den + denominator.values

    if den == 0 :
        den = 1
        
    predicted_rating = num/den

    return predicted_rating




In [4]:
# testing 
movie = 62
user = 45
k = 10
ratings_user = read_ratings_user(ratings)
ratings_item = read_ratings_item(ratings)
ratings_df_user_train = read_ratings_user(X_train)
ratings_df_item_train = read_ratings_item(X_train)
ratings_df_user_test = read_ratings_user(X_test)
ratings_df_item_test = read_ratings_item(X_test)

prediction_user = cf(user,10,ratings_df_user_train, movie )
prediction_item = cf(movie,10,ratings_df_item_train,user)
# ratings_df.head()
print(prediction_user)
print(prediction_item)

[3.20979664]
[3.57883378]


In [5]:
# calculate average rmse
def accuracy(df):

    # squared diffs of each column
    total_sd = 0
    count = 0

    for col_index in df.columns:

        # filter out zero cells
        filtered_series = df[col_index][df[col_index] != 0]

        # sum the squared differences of the column
        squared_diff = 0
        for row_index, actual in filtered_series.iteritems():
            predicted = cf(row_index, 10, df, col_index)
            squared_diff += (predicted - actual)**2
            count += 1

        total_sd += squared_diff
    
    # return the rmse
    return np.sqrt(total_sd / count)

    

In [7]:
acc_df_user_train = accuracy(ratings_df_user_train)
acc_df_item_train = accuracy(ratings_df_item_train)
acc_df_user_test = accuracy(ratings_df_user_test)
acc_df_item_test = accuracy(ratings_df_item_test)

In [2]:
acc_df_user_train = [1.17638217]
acc_df_item_train = [0.85908481]
acc_df_user_test = [1.43586912]
acc_df_item_test = [0.95552221]

In [3]:
print(acc_df_user_train,acc_df_item_train,acc_df_user_test,acc_df_item_test)

[1.17638217] [0.85908481] [1.43586912] [0.95552221]


In [9]:
print(acc_df_user_train,acc_df_item_train,acc_df_user_test,acc_df_item_test)

[1.17638217]
[0.85908481]
[1.43586912]
[0.95552221]


In [13]:
# accuracy(ratings_df_user_test.iloc[1:5,])

array([3.61062119])

In [14]:
# accuracy(ratings_df_item_test.iloc[1:5,])

array([3.01202034])

In [6]:
# best recommendation for a user

def recommendation_user(user_id, ratings_df,movies, k):
    zero_indexes = np.where(ratings_df.loc[user_id] == 0)[0]
    column_names = ratings_df.iloc[:, zero_indexes].columns
    # prediction_user = []

    col_names = ['movieId', 'prediction']

    # Create an empty DataFrame with the defined column names
    prediction_df = pd.DataFrame(columns=col_names)

    # zero_indexes
    for movie in column_names:
        # print(movie)
        pred = cf(user_id,k,ratings_df, movie)
        # print(pred)
        row_dict = {'movieId': movie, 'prediction': pred}
        prediction_df = pd.concat([prediction_df, pd.DataFrame([row_dict])])
        # pd.concat(prediction_df,row_dict, axis= 0 ,ignore_index=True)

    prediction_df = shuffle(prediction_df)
    prediction_df = prediction_df.sort_values("prediction", ascending=False).head(k)

    movie_recommended = pd.merge(prediction_df, movies, on='movieId', how='left').drop(columns=['movieId', 'prediction'])
    
    return (movie_recommended)

def recommendation_item(user_id, ratings_df,movies, k):
    zero_indexes = np.where(ratings_df[user_id] == 0)[0]
    # zero_indexes
    movie_names = ratings_df.iloc[zero_indexes,]
    movie_index = movie_names.index
    # movie_index
    col_names = ['movieId', 'prediction']

    # Create an empty DataFrame with the defined column names
    prediction_df = pd.DataFrame(columns=col_names)

    for movie in movie_index:
        # print(movie)
        pred = cf(movie,k,ratings_df,user_id)
        # print(pred)
        row_dict = {'movieId': movie, 'prediction': pred}
        prediction_df = pd.concat([prediction_df, pd.DataFrame([row_dict])])
    prediction_df = shuffle(prediction_df)    
    prediction_df = prediction_df.sort_values("prediction", ascending=False).head(k)
    movie_recommended = pd.merge(prediction_df, movies, on='movieId', how='left').drop(columns=['movieId', 'prediction'])

    return(movie_recommended)




In [16]:
a = recommendation_item(6,ratings_df_item_train,movies,10)
a

Unnamed: 0,title,genres
0,Taxi 3 (2003),Action|Comedy
1,Lee Daniels' The Butler (2013),Drama
2,"Tale of Princess Kaguya, The (Kaguyahime no mo...",Animation|Drama|Fantasy
3,Breathe (2014),Drama
4,"Private Lives of Pippa Lee, The (2009)",Drama
5,Tooth Fairy 2 (2012),Children|Comedy
6,Play Time (a.k.a. Playtime) (1967),Comedy
7,Boy A (2007),Crime|Drama
8,Race (2016),Drama
9,Still Walking (Aruitemo aruitemo) (2008),Drama


In [17]:
b = recommendation_user(6,ratings_df_user_train,movies,10)
b

Unnamed: 0,title,genres
0,"Trial, The (Procès, Le) (1962)",Drama
1,"Hush... Hush, Sweet Charlotte (1964)",Horror|Thriller
2,"With a Friend Like Harry... (Harry, un ami qui...",Drama|Thriller
3,They Call Me Trinity (1971),Comedy|Western
4,Written on the Wind (1956),Drama
5,Dolls (2002),Drama|Romance
6,"Tree of Life, The (2011)",Drama
7,"Awful Truth, The (1937)",Comedy|Romance
8,Police Story (Ging chaat goo si) (1985),Action|Comedy|Crime|Thriller
9,"Hard Way, The (1991)",Action|Comedy


In [18]:
# Combining both dataframe and selecting random 10 choices for recommendation 
final_recommendation = pd.concat([a,b])
final_recommendation = final_recommendation.drop_duplicates()
final_recommendation = final_recommendation.sample(n=10)
final_recommendation

Unnamed: 0,title,genres
2,"With a Friend Like Harry... (Harry, un ami qui...",Drama|Thriller
4,Written on the Wind (1956),Drama
1,Lee Daniels' The Butler (2013),Drama
0,"Trial, The (Procès, Le) (1962)",Drama
9,Still Walking (Aruitemo aruitemo) (2008),Drama
8,Race (2016),Drama
6,"Tree of Life, The (2011)",Drama
3,Breathe (2014),Drama
4,"Private Lives of Pippa Lee, The (2009)",Drama
8,Police Story (Ging chaat goo si) (1985),Action|Comedy|Crime|Thriller


In [19]:
ratings = ratings.drop(columns='timestamp')
reader = Reader()
#dataset creation
data = Dataset.load_from_df(ratings, reader)
#model
knn = KNNBasic()
#Evaluating the performance in terms of RMSE
cross_validate(knn, data, measures=['RMSE', 'mae'], cv = 3)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.95804097, 0.95393686, 0.95986445]),
 'test_mae': array([0.73680026, 0.73299765, 0.735571  ]),
 'fit_time': (0.22442865371704102, 0.20461082458496094, 0.2236475944519043),
 'test_time': (3.532680034637451, 2.9211630821228027, 3.161729335784912)}

In [20]:
#Define the SVD algorithm object
svd = SVD()
#Evaluate the performance in terms of RMSE
cross_validate(svd, data, measures=['RMSE'], cv = 3)

{'test_rmse': array([0.88334761, 0.87721976, 0.87618597]),
 'fit_time': (1.868929386138916, 1.3211796283721924, 1.223247766494751),
 'test_time': (0.30908727645874023, 0.5051188468933105, 0.3965482711791992)}

In [21]:
trainset = data.build_full_trainset()
svd.fit(trainset)
ratings[ratings['userId'] == 6]
svd.predict(3, 110)

Prediction(uid=3, iid=110, r_ui=None, est=2.9626149569657874, details={'was_impossible': False})

In [22]:
# calculate average rmse
def recommend_model(user_id,df,model,movies,k):
    zero_indexes = np.where(df.loc[user_id] == 0)[0]
    column_names = df.iloc[:, zero_indexes].columns
    
    # prediction_user = []

    col_names = ['movieId', 'prediction']

    # Create an empty DataFrame with the defined column names
    prediction_df = pd.DataFrame(columns=col_names)

    for movie in column_names:
        # print(movie)
        pred = model.predict(user_id, movie).est
        # print(pred)
        row_dict = {'movieId': movie, 'prediction': pred}
        prediction_df = pd.concat([prediction_df, pd.DataFrame([row_dict])])
        # pd.concat(prediction_df,row_dict, axis= 0 ,ignore_index=True)
    prediction_df = shuffle(prediction_df)
    prediction_df = prediction_df.sort_values("prediction", ascending=False).head(k)

    movie_recommended = pd.merge(prediction_df, movies, on='movieId', how='left').drop(columns=['movieId', 'prediction'])
    
    return (movie_recommended)




In [23]:
a = recommend_model(1,ratings_item,knn,movies,10)
a

Unnamed: 0,title,genres
0,What Happened Was... (1994),Comedy|Drama|Romance|Thriller
1,Lamerica (1994),Adventure|Drama
2,Live Nude Girls (1995),Comedy
3,"Awfully Big Adventure, An (1995)",Drama
4,Chungking Express (Chung Hing sam lam) (1994),Drama|Mystery|Romance
5,Heidi Fleiss: Hollywood Madam (1995),Documentary
6,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
7,"Shawshank Redemption, The (1994)",Crime|Drama
8,Schindler's List (1993),Drama|War
9,Seven (a.k.a. Se7en) (1995),Mystery|Thriller


In [24]:
a = recommend_model(1,ratings_item,svd,movies,10)
a

Unnamed: 0,title,genres
0,In the Name of the Father (1993),Drama
1,Hoop Dreams (1994),Documentary
2,"Shawshank Redemption, The (1994)",Crime|Drama
3,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
4,Terminator 2: Judgment Day (1991),Action|Sci-Fi
5,Living in Oblivion (1995),Comedy
6,Three Colors: White (Trzy kolory: Bialy) (1994),Comedy|Drama
7,"Fugitive, The (1993)",Thriller
8,Tombstone (1993),Action|Drama|Western
9,Dazed and Confused (1993),Comedy


In [25]:
a = recommend_model(1,ratings_user,knn,movies,10)
a

Unnamed: 0,title,genres
0,Obsession (1965),Comedy
1,Colourful (Karafuru) (2010),Animation|Drama|Fantasy|Mystery
2,"Chorus Line, A (1985)",Comedy|Drama|Musical
3,Children of the Corn IV: The Gathering (1996),Horror
4,"Trial, The (Procès, Le) (1962)",Drama
5,Tenchi Muyô! In Love (1996),Animation|Comedy
6,Hudson Hawk (1991),Action|Adventure|Comedy
7,"Monster Squad, The (1987)",Adventure|Comedy|Horror
8,Investigation Held by Kolobki (1986),Animation
9,Three from Prostokvashino (1978),Animation


In [26]:
b = recommend_model(1,ratings_user,svd,movies,10)
b

Unnamed: 0,title,genres
0,Rosemary's Baby (1968),Drama|Horror|Thriller
1,North by Northwest (1959),Action|Adventure|Mystery|Romance|Thriller
2,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
3,Hoop Dreams (1994),Documentary
4,"Shawshank Redemption, The (1994)",Crime|Drama
5,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi
6,Amadeus (1984),Drama
7,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
8,"Great Escape, The (1963)",Action|Adventure|Drama|War
9,"Godfather: Part II, The (1974)",Crime|Drama
