### TF IDF MODEL

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer #obtaining tdf vectors
from itertools import combinations #finding combination of genres for a given movie
from sklearn.metrics.pairwise import cosine_similarity #To compute the cosine similarities between all tf-idf vectors


movies = pd.read_csv("../../Data/ml-latest-small/movies.csv",index_col=0)

In [2]:
tf = TfidfVectorizer(stop_words = 'english', analyzer = 'word')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [3]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])
#print('Shape:', cosine_sim_df.shape)
#find a sample of the result below
cosine_sim_df.sample(5, axis=1).round(2) 

title,Sunflower (Xiang ri kui) (2005),Hamlet (1990),"Irony of Fate, or Enjoy Your Bath! (Ironiya sudby, ili S legkim parom!) (1975)",Sahara (2005),Cheaper by the Dozen 2 (2005)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Toy Story (1995),0.00,0.00,0.14,0.40,0.50
Jumanji (1995),0.00,0.00,0.00,0.35,0.43
Grumpier Old Men (1995),0.00,0.00,0.88,0.25,0.31
Waiting to Exhale (1995),0.47,0.47,1.00,0.22,0.27
Father of the Bride Part II (1995),0.00,0.00,0.51,0.43,0.54
...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),0.00,0.00,0.16,0.40,0.17
No Game No Life: Zero (2017),0.00,0.00,0.18,0.15,0.19
Flint (2017),1.00,1.00,0.47,0.00,0.00
Bungo Stray Dogs: Dead Apple (2018),0.00,0.00,0.00,0.34,0.00


In [4]:
def genre_recommendations(i, M, items, k=10):
    """
    Recommends movies based on a similarity dataframe

    Parameters
    ----------
    i : str
        Movie (index of the similarity dataframe)
    M : pd.DataFrame
        Similarity dataframe, symmetric, with movies as indices and columns
    items : pd.DataFrame
        Contains both the title and some other features used to define similarity
    k : int
        Amount of recommendations to return

    """
    ix = M.loc[:,i].to_numpy().argpartition(range(-1,-k,-1))
    closest = M.columns[ix[-1:-(k+2):-1]]
    closest = closest.drop(i, errors='ignore')
    return pd.DataFrame(closest).merge(items).head(k)

In [5]:
# for example, find a harry potter movie
movies[movies["title"].str.contains('Harry')]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1307,When Harry Met Sally... (1989),Comedy|Romance
1701,Deconstructing Harry (1997),Comedy|Drama
2184,"Trouble with Harry, The (1955)",Comedy|Mystery
3387,Who's Harry Crumb? (1989),Comedy|Mystery
3388,Harry and the Hendersons (1987),Children|Comedy
3389,Let's Get Harry (1986),Action|Adventure
4077,"With a Friend Like Harry... (Harry, un ami qui...",Drama|Thriller
4855,Dirty Harry (1971),Action|Crime|Thriller
4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy
5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy


In [6]:
genre_recommendations('Harry Potter and the Order of the Phoenix (2007)', cosine_sim_df, movies[['title', 'genres']])

Unnamed: 0,title,genres
0,Jack the Giant Slayer (2013),Adventure|Fantasy|IMAX
1,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
2,"Hobbit: An Unexpected Journey, The (2012)",Adventure|Fantasy|IMAX
3,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy|IMAX
4,Alice in Wonderland (2010),Adventure|Fantasy|IMAX
5,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX
6,"Twilight Saga: Breaking Dawn - Part 2, The (2012)",Adventure|Drama|Fantasy|Romance|IMAX
7,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
8,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX
9,Man of Steel (2013),Action|Adventure|Fantasy|Sci-Fi|IMAX


### KNN MODEL

In [7]:
df=pd.read_csv("../../Data/ml-latest-small/PreprocessedData_ml_latest_year_small.csv",index_col=0)
df["title"] = df["title"].str.lower()

In [8]:
def AskForUserInput():
    fav_movie=input("Enter your Favorite Movie: ").lower()
    n=0
    movies=df["title"].unique()
    while fav_movie not in movies and n<5:
        print("The Movie ", fav_movie," does not exist in our database.")
        fav_movie = input("Please enter another favourite Movie: ").lower()
        n+=1
    return fav_movie

In [9]:
from surprise import accuracy
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise.model_selection import train_test_split,cross_validate,RandomizedSearchCV
from surprise import KNNWithMeans
from surprise import KNNBasic
reader = Reader()

#delete unnecessary columns away
df_imp=df.drop(['title','year','genres'],axis=1)

surprise_data = Dataset.load_from_df(df_imp, reader)
trainset, testset = train_test_split(surprise_data, test_size=.25,random_state=10)

In [10]:
def find_best_model(model, parameters,data):
    clf = RandomizedSearchCV(model, parameters, n_jobs=-1, measures=['rmse'])
    clf.fit(data)             
    print(clf.best_score)
    print(clf.best_params)
    print(clf.best_estimator)
    return clf

In [11]:
sim_options = {
    "name": ["msd", "cosine", "pearson", "pearson_baseline"],"user_based": [True]
}
params = { 'k': range(2,25,1),'sim_options': sim_options}
clf = find_best_model(KNNWithMeans, params, surprise_data)

{'rmse': 0.8975729221519952}
{'rmse': {'k': 19, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}}}
{'rmse': <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7fc64609eb90>}


In [12]:
knnwithmeans = clf.best_estimator['rmse']

In [13]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        #cross-relate other information from the fulldf
        movieName=df[df['movieId']==iid]['title'].unique()[0]
        movieYear=df[df['movieId']==iid]['year'].unique()[0]
        genres=df[df['movieId']==iid]['genres'].unique()[0]
        avgRat=df[df['movieId']==iid]['rating'].mean().round(2)
        ratedBy=len(df[df['movieId']==iid]['rating'])
        
        top_n[uid].append((iid, movieName, movieYear, genres, avgRat, ratedBy))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def rated_already(uid):
    df_out=df[df['userId']==uid]
    return df_out.title.unique()
    
class collab_filtering_Kmeans_Model():
    def __init__(self, model, trainset, testset, fulldf, data):
        self.model = model
        self.trainset = trainset
        self.testset = testset
        self.data = data
        self.pred_test = None
        self.recommendations = None
        self.top_n = None
        self.recommenddf = None

    def fit_and_predict(self):        
        print('**Fitting the train data...**')
        self.model.fit(self.trainset)       

        print('**Predicting the test data...**')
        self.pred_test = self.model.test(self.testset)        
        rmse = round(accuracy.rmse(self.pred_test), 3)
        print('**RMSE for the predicted result is ' + str(rmse) + '**')   
        
        #display(self.pred_test)
        self.top_n = get_top_n(self.pred_test)
      
        self.recommenddf = pd.DataFrame(columns=['userId', 'movieId', 'title', 'year', 'genres', 'average rating','number of ratings'])
        
        for item in self.top_n:
            subdf = pd.DataFrame(self.top_n[item], columns=['movieId','title',  'year', 'genres', 'average rating','number of ratings'])
            subdf['userId'] = item
            
            cols = subdf.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            subdf = subdf[cols]        
            self.recommenddf = pd.concat([self.recommenddf, subdf], axis = 0)        
        return rmse
        
    def cross_validate(self):
        print('**Cross Validating the data...**')
        cv_result = cross_validate(self.model, self.data, n_jobs=-1,cv=5,verbose = True)
        
        return cv_result

    def recommend_foruser(self, user_id, n=5):
        List_already=rated_already(user_id)
        print('The User ',user_id,"has already rated", len(List_already),"other movies.")
        #print(List_already)
        
        print('Recommending top ',n,' movies for userid ',user_id,':')
        df_out = self.recommenddf[self.recommenddf.userId == user_id].head(n)
        
        #Just to ensure that you are not recommending something already rated by the user
        #common=list(set(List_already).intersection(df.title))
        #print("Common movies:", common)
        #One thing I do not understand how the model knows to recommend  movies not already rated?
        return df_out
    
    def recommend_similar_items(self,movie_title,n=5):
        movieId=df[df['title']==movie_title]['movieId'].unique()[0]
        movie_neighbours=self.model.get_neighbors(movieId,n)
        
        df_out=df[df.movieId.isin(movie_neighbours)].drop(["userId"],axis=1)
        df_out=df_out.groupby(['movieId','title',"year","genres"]).mean()
        df_out['average rating']=df_out['rating'].round(2)
        df_out=df_out.drop(['rating'],axis=1)
        df_out['number of ratings']=len(df[df.movieId.isin(movie_neighbours)]['rating'])
        
        return df_out 

In [14]:
CF_knnwithmeans = collab_filtering_Kmeans_Model(knnwithmeans, trainset, testset, df, surprise_data)

In [15]:
knnwithmeans_rmse=CF_knnwithmeans.fit_and_predict()

**Fitting the train data...**
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
**Predicting the test data...**
RMSE: 0.9008
**RMSE for the predicted result is 0.901**


In [16]:
knnwithmeans_cv_rmse_cv = CF_knnwithmeans.cross_validate()
mean_cv_RMSE_result = round(knnwithmeans_cv_rmse_cv['test_rmse'].mean(),3)
print('**Mean CV RMSE is ' + str(mean_cv_RMSE_result)  + '**')

**Cross Validating the data...**
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8925  0.8953  0.8943  0.8965  0.9003  0.8958  0.0026  
MAE (testset)     0.6766  0.6808  0.6792  0.6785  0.6801  0.6790  0.0014  
Fit time          0.24    0.26    0.27    0.24    0.23    0.25    0.01    
Test time         1.07    1.09    1.09    1.06    1.06    1.08    0.01    
**Mean CV RMSE is 0.896**


In [17]:
inp_movie=AskForUserInput()
df_out=CF_knnwithmeans.recommend_similar_items(inp_movie, n=10)
print("Because You like the movie",inp_movie,"we'd recommend you to watch:")
display(df_out.head(10))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,average rating,number of ratings
movieId,title,year,genres,Unnamed: 4_level_1,Unnamed: 5_level_1
13,balto,1995,Adventure|Animation|Children,3.12,471
16,casino,1995,Crime|Drama,3.93,471
17,sense and sensibility,1995,Drama|Romance,3.78,471
36,dead man walking,1995,Crime|Drama,3.84,471
60,"indian in the cupboard, the",1995,Adventure|Children|Fantasy,3.24,471
76,screamers,1995,Action|Sci-Fi|Thriller,3.4,471
137,man of the year,1995,Documentary,3.0,471
242,farinelli: il castrato,1994,Drama|Musical,3.6,471
352,crooklyn,1994,Comedy|Drama,2.5,471
457,"fugitive, the",1993,Thriller,3.99,471


In [18]:
class HybridModel:
    def __init__(self, content_model, cf_model):
        self.content_model = content_model
        self.cf_model = cf_model

    def recommend_movies(self, user_input, n=10):
        # Get content-based recommendations
        content_recs = self.content_model.recommend_similar_items(user_input, n=n)
        content_recs = content_recs.reset_index()
        content_recs.rename(columns={"title": "movie_title"}, inplace=True)

        # Get CF-based recommendations
        cf_recs = self.cf_model.recommend_similar_items(user_input, n=n)
        cf_recs = cf_recs.reset_index()

        # Merge the two recommendation dataframes
        merged_recs = pd.merge(content_recs, cf_recs, on=["movieId", "year"], how="outer")
        merged_recs["average rating"] = merged_recs[["average rating_x", "average rating_y"]].mean(axis=1)
        merged_recs["number of ratings"] = merged_recs[["number of ratings_x", "number of ratings_y"]].mean(axis=1)
        merged_recs = merged_recs[["movie_title", "year", "genres_x", "average rating", "number of ratings"]]
        merged_recs.rename(columns={"genres_x": "genres"}, inplace=True)

        return merged_recs

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer #obtaining tdf vectors
from itertools import combinations #finding combination of genres for a given movie
from sklearn.metrics.pairwise import cosine_similarity #To compute the cosine similarities between all tf-idf vectors

from surprise import accuracy
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from surprise import KNNWithMeans
from surprise import KNNBasic

# Read in preprocessed movie data
movies_df = pd.read_csv('../../Data/ml-latest-small/PreprocessedData_ml_latest_year_small.csv', index_col=0)


# Content-based model
# Create a TF-IDF vectorizer to convert the text data into vectors
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(movies_df['genres'])

# Compute cosine similarity between all movie pairs
cosine_sim = cosine_similarity(tfidf)

# Collaborative filtering model
# Define the reader
reader = Reader()

# Load in the ratings data
ratings_df = pd.read_csv('../../Data/ml-latest-small/ratings.csv')

# Create the Surprise dataset
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25)

# Train the model
#cf_model = KNNWithMeans(k=20, sim_options={'name':["msd", "cosine", "pearson"], 'user_based': True})
cf_model = KNNWithMeans(k=20, sim_options={'name':["pearson"], 'user_based': True})
cf_model.fit(trainset)



In [None]:
# Hybrid model
class HybridModel:
    def __init__(self, content_model, cf_model):
        self.content_model = content_model
        self.cf_model = cf_model
        
    def recommend_movies(self, user_input, n=10):
        # Find the top 10 similar movies based on the content-based model
        movie_idx = movies_df[movies_df['title'] == user_input].index[0]
        similar_movies = list(enumerate(cosine_sim[movie_idx]))
        similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
        similar_movies = similar_movies[1:n+1]
        
        # Get the predicted ratings for each similar movie based on the collaborative filtering model
        similar_movie_ids = [movies_df.iloc[i[0]]['movieId'] for i in similar_movies]
        similar_movie_ratings = []
        for movie_id in similar_movie_ids:
            prediction = cf_model.predict(1, movie_id)
            similar_movie_ratings.append((prediction.iid, prediction.est))
        
        # Sort the movies by predicted rating and return the top 10
        similar_movie_ratings = sorted(similar_movie_ratings, key=lambda x: x[1], reverse=True)
        top_movies = similar_movie_ratings[:n]
        
        return top_movies

In [None]:
# Hybrid model
class HybridModel:
    def __init__(self, cosine_sim_df, CF_knnwithmeans):
        self.cosine_sim_df = cosine_sim_df
        self.CF_knnwithmeans = CF_knnwithmeans
        
    def recommend_movies(self, user_input, n=10):
        # Find the top 10 similar movies based on the content-based model
        movie_idx = movies[movies['title'] == user_input].index[0]
        similar_movies = list(enumerate(cosine_sim[movie_idx]))
        similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
        similar_movies = similar_movies[1:n+1]
        
        # Get the predicted ratings for each similar movie based on the collaborative filtering model
        similar_movie_ids = [movies.iloc[i[0]]['movieId'] for i in similar_movies]
        similar_movie_ratings = []
        for movie_id in similar_movie_ids:
            prediction = CF_knnwithmeans.predict(1, movie_id)
            similar_movie_ratings.append((prediction.iid, prediction.est))
        
        # Sort the movies by predicted rating and return the top 10
        similar_movie_ratings = sorted(similar_movie_ratings, key=lambda x: x[1], reverse=True)
        top_movies = similar_movie_ratings[:n]
        
        return top_movies

In [None]:
# Create an instance of the hybrid model
hybrid_model = HybridModel(cosine_sim, cf_model)

# Get recommendations based on the user input
user_input = 'Jumanji'
hybrid_recs = hybrid_model.recommend_movies(user_input, n=10)

# Print out the recommendations
print(f"Recommended movies based on {user_input}:")
for movie_id, rating in hybrid_recs:
    movie_title = movies_df[movies_df['movieId'] == movie_id]['title'].iloc[0]
    print(f"{movie_title} ({rating:.2f})")

In [None]:
# Create an instance of the hybrid model
hybrid_model = HybridModel(cosine_sim_df, CF_knnwithmeans)

# Get recommendations based on the user input
user_input = 'Jumanji'
hybrid_recs = hybrid_model.recommend_movies(user_input, n=10)

# Print out the recommendations
print(f"Recommended movies based on {user_input}:")
for movie_id, rating in hybrid_recs:
    movie_title = movies_df[movies['movieId'] == movie_id]['title'].iloc[0]
    print(f"{movie_title} ({rating:.2f})")