In [3]:
import numpy as np
import pandas as pd

df = pd.read_csv("PreprocessedData_ml_latest_year_small.csv", index_col = 0)
df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995


In [4]:
df["title"] = df["title"].str.lower()
df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,toy story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,toy story,Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,toy story,Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,toy story,Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,toy story,Adventure|Animation|Children|Comedy|Fantasy,1995


### Collaborative Filtering

The Collaborative Filtering Recommender is entirely based on the past behavior and not on the context.

### User Input

In [5]:
def AskForUserInput():
    fav_movie=input("Enter your Favorite Movie: ").lower()
    n=0
    movies=df["title"].unique()
    while fav_movie not in movies and n<5:
        print("The Movie ", fav_movie," does not exist in our database.")
        fav_movie = input("Please enter another favourite Movie: ").lower()
        n+=1
    return fav_movie

### 1. Create a Pivot Matrix using only the train data
We pivot the dataframe to have userId as rows and movieId as columns.

We do not specify a special feature. We consider all features (rating, dummitized genres and year)

In [6]:
df_movie_pivot = df.pivot(
    index="movieId",
    columns="userId",
    values="rating"
).fillna(0)
df_movie_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2. Sparse matrix
A lot of values n the above matrix are zero, i.e. the data is extremely sparse.

>A sparse matrix is a matrix that is comprised of mostly zero values.

>Convert into an Array Matrix by using csr_matrix

In [7]:
from scipy.sparse import csr_matrix
feature_matrix = csr_matrix(df_movie_pivot.values)

## 3. Build the Model

In [8]:
from sklearn.neighbors import NearestNeighbors
#We only consider 10 nearest neighbors
n_neigh = 10
model_knn_rating = NearestNeighbors(metric="cosine", n_neighbors=n_neigh)
model_knn_rating.fit(feature_matrix)

### 4. Create The Function to recommend movies

In [9]:
def recommend_movie_by_rating(query_title):
    
    #Find the movie ID w.r.t the the title
    query_index=df[df['title']==query_title]['movieId'].unique()[0]
    
    distances,indices=model_knn_rating.kneighbors(
        df_movie_pivot.iloc[query_index,:].values.reshape(1,-1)
    )
    
    MovieIds=[]
    CosDistance=[]
    ratedBy=[]
    
    for i in range(0,len(distances.flatten())):
        movieId=df_movie_pivot.index[indices.flatten()[i]]
        MovieIds.append(movieId)
    
        CosDistance.append(distances.flatten()[i])
        ratedBy.append(len(df[df['movieId']==movieId]['rating']))
        
    df_out=df[df.movieId.isin(MovieIds)].drop(["userId"],axis=1)
    
    df_out=df_out.groupby(['movieId','title',"year","genres"]).mean()
    df_out['average rating']=df_out['rating'].round(2)
    df_out=df_out.drop(['rating'],axis=1)
    df_out['number of ratings']=ratedBy
    df_out['Cosine Distance, ML Model']=CosDistance
    df_out['Cosine Distance, ML Model']=df_out['Cosine Distance, ML Model'].round(2)
    print("Because You like the movie",query_title,"we'd recommend you to watch:")
    display(df_out.head(10))

    #Results are automatically sorted as in the pivot table the indices are the movie ids

In [10]:
df_out.head()

NameError: name 'df_out' is not defined

### 5. Test

In [None]:
input_movie=AskForUserInput()
recommend_movie_by_rating(input_movie)

### 6. Save your model

In [11]:
import pickle
filename = "Model_1_NearestNeighbours_ratingbased.sav"
pickle.dump(model_knn_rating, open(filename, "wb"))

## 2. Model 2: Model-based Collaborative Filtering, user-based, K-means


In this approach, CF models are developed using machine learning algorithms to predict user’s rating of unrated items.

>We will use Surprise
>>Surprise is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.

### 1. Load Surprise Data from DataFrame, Train-Test Split

In [12]:
from surprise import accuracy
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise.model_selection import train_test_split,cross_validate,RandomizedSearchCV
from surprise import KNNWithMeans
from surprise import KNNBasic
reader = Reader()

#delete unnecessary columns away
df_imp=df.drop(['title','year','genres'],axis=1)

surprise_data = Dataset.load_from_df(df_imp, reader)
trainset, testset = train_test_split(surprise_data, test_size=.25,random_state=10)

In [13]:
def find_best_model(model, parameters,data):
    clf = RandomizedSearchCV(model, parameters, n_jobs=-1, measures=['rmse'])
    clf.fit(data)             
    print(clf.best_score)
    print(clf.best_params)
    print(clf.best_estimator)
    return clf

### 2. Find Optimal k using RandomizedSearchCV

In [14]:
sim_options = {
    "name": ["msd", "cosine", "pearson", "pearson_baseline"],"user_based": [True]
}
params = { 'k': range(2,25,1),'sim_options': sim_options}
clf = find_best_model(KNNWithMeans, params, surprise_data)

{'rmse': 0.8960994039645082}
{'rmse': {'k': 24, 'sim_options': {'name': 'msd', 'user_based': True}}}
{'rmse': <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7fa533ac7790>}


### 3. Build Model using best parameters

In [15]:
knnwithmeans = clf.best_estimator['rmse']

### Recommender System Class
(Inspired from https://www.kaggle.com/code/rangarajansaranathan/collaborative-filtering-based-recommender-system)

In [16]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        #cross-relate other information from the fulldf
        movieName=df[df['movieId']==iid]['title'].unique()[0]
        movieYear=df[df['movieId']==iid]['year'].unique()[0]
        genres=df[df['movieId']==iid]['genres'].unique()[0]
        avgRat=df[df['movieId']==iid]['rating'].mean().round(2)
        ratedBy=len(df[df['movieId']==iid]['rating'])
        
        top_n[uid].append((iid, movieName, movieYear, genres, avgRat, ratedBy))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def rated_already(uid):
    df_out=df[df['userId']==uid]
    return df_out.title.unique()
    
class collab_filtering_Kmeans_Model():
    def __init__(self, model, trainset, testset, fulldf, data):
        self.model = model
        self.trainset = trainset
        self.testset = testset
        self.data = data
        self.pred_test = None
        self.recommendations = None
        self.top_n = None
        self.recommenddf = None

    def fit_and_predict(self):        
        print('**Fitting the train data...**')
        self.model.fit(self.trainset)       

        print('**Predicting the test data...**')
        self.pred_test = self.model.test(self.testset)        
        rmse = round(accuracy.rmse(self.pred_test), 3)
        print('**RMSE for the predicted result is ' + str(rmse) + '**')   
        
        #display(self.pred_test)
        self.top_n = get_top_n(self.pred_test)
      
        self.recommenddf = pd.DataFrame(columns=['userId', 'movieId', 'title', 'year', 'genres', 'average rating','number of ratings'])
        
        for item in self.top_n:
            subdf = pd.DataFrame(self.top_n[item], columns=['movieId','title',  'year', 'genres', 'average rating','number of ratings'])
            subdf['userId'] = item
            
            cols = subdf.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            subdf = subdf[cols]        
            self.recommenddf = pd.concat([self.recommenddf, subdf], axis = 0)        
        return rmse
        
    def cross_validate(self):
        print('**Cross Validating the data...**')
        cv_result = cross_validate(self.model, self.data, n_jobs=-1)
        cv_result = round(cv_result['test_rmse'].mean(),3)
        print('**Mean CV RMSE is ' + str(cv_result)  + '**')
        return cv_result

    def recommend(self, user_id, n=5):
        List_already=rated_already(user_id)
        print('The User ',user_id,"has already rated", len(List_already),"other movies.")
        #print(List_already)
        
        print('Recommending top ',n,' movies for userid ',user_id,':')
        df = self.recommenddf[self.recommenddf.userId == user_id].head(n)
        
        #Just to ensure that you are not recommending something already rated by the user
        #common=list(set(List_already).intersection(df.title))
        #print("Common movies:", common)
        #One thing I do not understand how the model knows to recommend  movies not already rated?
        return df

In [17]:
CF_knnwithmeans = collab_filtering_Kmeans_Model(knnwithmeans, trainset, testset, df, surprise_data)

### 5. Fit, predict and cross-validate

In [18]:
knnwithmeans_rmse=CF_knnwithmeans.fit_and_predict()

**Fitting the train data...**
Computing the msd similarity matrix...
Done computing similarity matrix.
**Predicting the test data...**
RMSE: 0.9019
**RMSE for the predicted result is 0.902**


In [19]:
knnwithmeans_cv_rmse_cv = CF_knnwithmeans.cross_validate()

**Cross Validating the data...**
**Mean CV RMSE is 0.898**


### 6. Recommend (User based)

In [20]:
def AskForUserInput_userId():
    inp_id = int(input('Enter your UserId: '))
    n= 0
    
    while inp_id not in df.userId.unique() and n<5:# maximum 5 prompts
        print("The UserId ", inp_id, "does not exist in the database.") 
        inp_id = input('Please enter UserId: ')
        n +=1
    return inp_id

inp_id=AskForUserInput_userId()
result_knn_user1 = (CF_knnwithmeans.recommend(user_id=inp_id, n=10)).drop(['userId'],axis=1)
display(result_knn_user1)

Enter your UserId: 22
The User  22 has already rated 119 other movies.
Recommending top  10  movies for userid  22 :


Unnamed: 0,movieId,title,year,genres,average rating,number of ratings
0,60069,wall·e,2008,Adventure|Animation|Children|Romance|Sci-Fi,4.06,104
1,60950,vicky cristina barcelona,2008,Comedy|Drama|Romance,3.09,16
2,2968,time bandits,1981,Adventure|Comedy|Fantasy|Sci-Fi,3.73,41
3,1042,that thing you do!,1996,Comedy|Drama,3.27,42
4,68536,stanley kubrick: a life in pictures,2001,Documentary,4.0,1
5,4306,shrek,2001,Adventure|Animation|Children|Comedy|Fantasy|Ro...,3.87,170
6,318,"shawshank redemption, the",1994,Crime|Drama,4.43,317
7,5617,secretary,2002,Comedy|Drama|Romance,3.9,25
8,5464,road to perdition,2002,Crime|Drama,3.52,49
9,3949,requiem for a dream,2000,Drama,3.92,96


### 7. Save Your Model

In [21]:
import pickle
filename = 'Model_2_KNN_Means_ratingbased.sav'
pickle.dump(CF_knnwithmeans, open(filename, 'wb'))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimatin