In [13]:
# Imports

import pandas as pd
import numpy as np
import re
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [23]:
# Global objects

ratings=pd.read_csv('ratings.csv')
ratings.drop('timestamp', inplace=True, axis=1)
movies=pd.read_csv('movies.csv')
movies.drop('genres',inplace=True,axis=1)
ratings=pd.merge(movies,ratings)
movieRatings=ratings.pivot_table(index=['userId'],columns=['title'],values='rating')
corrMatrix=movieRatings.corr(method='pearson', min_periods=100)


In [24]:
corrMatrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,


In [25]:
#Function to return top 10 movies based on a given user's watch history using item based collaborative filtering:

def CollaborativeBasedFiltering(user_id):

    myRatings=movieRatings.loc[user_id].dropna()
    simCandidates=pd.Series(dtype="float64")
    same=[]
    for i in range(len(myRatings.index)):
        sims=corrMatrix[myRatings.index[i]].dropna()
        sims=sims.map(lambda x:x*myRatings[i])
        simCandidates=simCandidates.append(sims)
    simCandidates.sort_values(inplace=True, ascending=False)
    simCandidates=simCandidates.groupby(simCandidates.index).sum()
    simCandidates.sort_values(inplace=True, ascending=False)
    
    same=[]
    for movie in myRatings.index:
        if(movie in simCandidates.index):
            same.append(movie)
            
    ddf=pd.DataFrame(simCandidates)
    filteredCandidates=ddf
    for movie in same:
        filteredCandidates=filteredCandidates.drop(movie)
    filteredCandidates.columns=['Rating']

    return filteredCandidates


In [26]:
def UserMovies(user_id):
    myRatings=movieRatings.loc[user_id].dropna()
    print('Top Movies rated by user are :')
    print(myRatings.sort_values(ascending=False).head(10))
    print()
    print('Lowest rated movies by user are :')
    print(myRatings.sort_values(ascending=True).head(10))
    print()
    

In [35]:
def ContentBasedFiltering(movie):
    df = pd.read_csv("Content_movie_dataset.csv")
    features = ['keywords','cast','genres','director']
    
    def combine_features(row):
        return row['keywords']+" "+row['cast']+" "+row['genres']+" "+row['director']
    
    def get_title_from_index(index):
        return df[df.index == index]["title"].values[0]
    
    def get_index_from_title(title):
        return df[df.title == title]["index"].values[0]
    
    for feature in features:
        df[feature] = df[feature].fillna('') #filling all NaNs with blank string

    df["combined_features"] = df.apply(combine_features,axis=1)
    
    for feature in features:
        df[feature] = df[feature].fillna('')
        
    cv = CountVectorizer() #creating new CountVectorizer() object
    count_matrix = cv.fit_transform(df["combined_features"])
    cosine_sim = cosine_similarity(count_matrix)
    
    movie_user_likes=movie
    movie_index = get_index_from_title(movie_user_likes)
    #accessing the row corresponding to given movie to find all the similarity scores 
    #for that movie and then enumerating over it
    similar_movies = list(enumerate(cosine_sim[movie_index])) 
    sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]
    print("Top similar movie to "+movie_user_likes+" is:")
    print(get_title_from_index(sorted_similar_movies[0][0]),"\n")
    return get_title_from_index(sorted_similar_movies[0][0])


In [38]:
def HybridRecommender(userid):
    
    filteredCandidates=CollaborativeBasedFiltering(userid)
    
    FinalRecommendations = []
    for movie in filteredCandidates.index[0:10]:

        # removing text in brackets
        movie=re.sub(r"\([^()]*\)", "", movie)
        # pre processing movie title : "Shawshank Redemption, The" -> "The Shawshank Redemption"
        if(", " in movie):
            movie=movie.split(", ")[1] +""+movie.split(", ")[0]
        try:

            recom = ContentBasedFiltering(movie)
            FinalRecommendations.append(recom)

        except:
            print(f"No other similar movies found for :{movie}")
            print()
            FinalRecommendations.append(movie)
            
    Hybrid_Recomm = set(FinalRecommendations).union(set(filteredCandidates.index[0:10])) 

    
    UserMovies(userid)
    print("------------------------------------------Recommended Movies---------------------------------------------")
    print(Hybrid_Recomm)

    

In [39]:
HybridRecommender(69)

No other similar movies found for :Saving Private Ryan 

Top similar movie to The Godfather is:
The Godfather: Part II 

No other similar movies found for :Raiders of the Lost Ark  

No other similar movies found for :Braveheart 

No other similar movies found for :Fight Club 

No other similar movies found for :Jurassic Park 

No other similar movies found for :Schindler's List 

No other similar movies found for :Gladiator 

No other similar movies found for :Back to the Future 

Top similar movie to The Lord of the Rings: The Two Towers is:
The Lord of the Rings: The Return of the King 

Top Movies rated by user are :
title
One Flew Over the Cuckoo's Nest (1975)                    5.0
Sixth Sense, The (1999)                                   5.0
Blade Runner (1982)                                       5.0
Blazing Saddles (1974)                                    5.0
Caddyshack (1980)                                         5.0
Fistful of Dollars, A (Per un pugno di dollari) (1964) 