In [1]:
import os
os.chdir("../")
%pwd

'c:\\Users\\abhis\\Desktop\\MLProjects\\Movie-Recommendation-Sysytem'

In [2]:
import pandas as pd
from scipy.sparse import load_npz
import json
import numpy as np
import pickle
from surprise.dataset import Trainset


In [4]:

class RecommendationPipeline:
    def __init__(self):
        self.movies_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","movies.csv"))[['title','movieId','tmdbId','genres','poster_path']]
        self.movies_df["year"] = (self.movies_df["title"].str.extract(r"\((\d+)\)").fillna(-1))
        self.ratings_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","ratings.csv"))
        self.unique_genres = json.load(open(os.path.join("artifacts","data_preparation","final_data","unique_categories.json"),'rb'))
        self.cosine_sim = load_npz(os.path.join("artifacts","content_based_model","content_matrix.npz"))
        self.svd_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_model.pkl"),'rb'))
        self.svd_user_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_user_indices.pkl"),'rb'))
        self.svd_item_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_item_indices.pkl"),'rb'))
        self.indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","nn_item_indices.pkl"),'rb'))
        self.user_movie_matrix = load_npz(os.path.join("artifacts","collaborative_filtering_model","user_movie_matrix.npz"))
        self.nn_model = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","nearest_neighbors_movie.pkl"),'rb'))


    def create_ranked_df(self,movies, reviews,min_rating=10):
        '''
        INPUT
        movies - the movies dataframe
        reviews - the reviews dataframe
        
        OUTPUT
        ranked_movies - a dataframe with movies that are sorted by highest avg rating, more reviews, 
                        then time, and must have more than 4 ratings
        '''
        
        # Pull the average ratings and number of ratings for each movie
        C = reviews["rating"].mean()
        movie_ratings = reviews.groupby('movieId')['rating'] 
        avg_ratings = movie_ratings.mean() # R
        num_ratings = movie_ratings.count() # v
        m = min_rating
        weighted_rating = ((avg_ratings*num_ratings)/(num_ratings+m))+((C*m)/(num_ratings+m))

        rating_count_df = pd.DataFrame({'num_ratings':num_ratings,'weighted_rating': weighted_rating}).reset_index()


        # merge with the movies dataset	
        movie_recs = movies.merge(rating_count_df, on = 'movieId')

        # filter out the movies that qualify for the chart
        ratings_filtered=movie_recs[movie_recs['num_ratings']>m]


        # sort by top avg rating and number of ratings
        ranked_movies = ratings_filtered.sort_values(['weighted_rating', 'num_ratings'], ascending=False)
        
        return ranked_movies
    def watched_movies_by_user(self,user_id):
        
        trainset: Trainset = self.svd_model.trainset
        
        user_ratings = trainset.ur[user_id]  # Get the user's ratings from the trainset.ur attribute
        watched_movies = [item_id for item_id, _ in user_ratings]  # Extract the item IDs
        watched_movies = [key for key, value in self.svd_item_indices.items() for item_id in watched_movies if value == item_id]
        return watched_movies


    def get_user_profile(self,user_id):
        
        for genre in self.unique_genres:
            self.movies_df[genre] = self.movies_df['genres'].apply(lambda x: 1 if genre in x else 0)

        # do the above outside the function
        watched_movies = self.watched_movies_by_user(user_id)
        watched_movie_genres = self.movies_df[self.unique_genres][self.movies_df['title'].isin(watched_movies)].sum(axis=0).to_dict()
        return watched_movie_genres
        
        
    def get_avg_ratings(self,movie_lists):
        
        # Get the index of the movie that matches the title
        ratings = []
        for movie in movie_lists:
            idx = self.indices[movie]
            ratings.append(np.round(np.mean(self.user_movie_matrix.getrow(idx).data),2))
        return ratings

    def fetch_poster_url(self,movie_lists):
        poster_path_url = 'https://image.tmdb.org/t/p/w500'
        urls = []
        for movie in movie_lists:
            urls.append(poster_path_url+self.movies_df['poster_path'][self.movies_df['title'] == movie].iloc[0])
        return urls
    
    
    def popular_recs_filtered(self, n_top = 10, years=None, genres=None):
        '''
        REDO THIS DOC STRING
        
        INPUT:
        user_id - the user_id (str) of the individual you are making recommendations for
        n_top - an integer of the number recommendations you want back
        ranked_movies - a pandas dataframe of the already ranked movies based on avg rating, count, and time
        years - a list of strings with years of movies
        genres - a list of strings with genres of movies
        
        OUTPUT:
        top_movies - a list of the n_top recommended movies by movie title in order best to worst
        '''
    
        ranked_movies = self.create_ranked_df(self.movies_df, self.ratings_df)
        ranked_movies['year'] = ranked_movies['title'].str.extract(r'\((\d+)\)').fillna(-1)


        # Create new columns based on the number of unique genres
        genre_columns = []
        for genre in self.unique_genres:
            genre_columns.append(ranked_movies['genres'].apply(lambda x: int(genre in x)).rename(f'{genre}'))

        # Concatenate the genre columns with the movies_df DataFrame
        df_concatenated = pd.concat([ranked_movies] + genre_columns, axis=1)

        # Filter movies based on years and genres if provided
        if years is not None and genres is not None:
            filtered_movies = df_concatenated[(df_concatenated['year'].isin(years)) & (df_concatenated[genres].sum(axis=1) > 0)]
        elif years is not None:
            filtered_movies = df_concatenated[df_concatenated['year'].isin(years)]
        elif genres is not None:
            filtered_movies = df_concatenated[df_concatenated[genres].sum(axis=1) > 0]
        else:
            filtered_movies = df_concatenated.copy()
        
        # Sort the filtered movies by rank and select the top n_top movies
        top_movies = filtered_movies['title'].head(n_top)
        poster_url = self.fetch_poster_url(top_movies)
        avg_rating =  self.get_avg_ratings(top_movies)
        return pd.DataFrame({'movie':top_movies, 'poster_path': poster_url,'rating': avg_rating})
    
    # Function that takes in movie title as input and outputs most similar movies
    def content_recommendations(self,movie_name, n_top = 10):

        movie_data = self.create_ranked_df(self.movies_df, self.ratings_df)
        # cosine_sim = pd.DataFrame(cosine_sim.todense())

        indices=pd.Series(data=list(self.movies_df.index), index = self.movies_df['title'] )
        
        # Get the index of the movie that matches the title
        idx = indices[movie_name]
        
        # Get the row vector of cosine similarity scores
        similarity_scores = self.cosine_sim[idx, :]

        # Convert the row vector to a dense array
        sim_scores_dense = similarity_scores.toarray()[0]

        # Enumerate the similarity scores with their indices
        sim_scores = list(enumerate(sim_scores_dense))

        # Sort the movies based on the similarity scores
        sim_scores.sort(key=lambda x: x[1], reverse=True)

        # Get the scores of the 10 most similar movies
        sim_scores=sim_scores[1: (n_top + 1)]
        
        # Get the movie indices
        ind=[x[0] for x in sim_scores]

        movies_list = self.movies_df.iloc[ind]['title'].tolist()
        movie_data = movie_data[movie_data['title'].isin(movies_list)]


        poster_url = self.fetch_poster_url(movies_list)
        print (poster_url)
        # movie_data['poster_path'] = poster_url
        movie_data = movie_data.rename(columns={'weighted_rating': 'rating', 'title': 'movie'})
        movie_data['rating'] = movie_data['rating'].apply(lambda x: np.round(x, 2))
        return movie_data[['movie', 'rating']]#movie_data[['movie', 'poster_path', 'rating']]
    
    
    def recommend_movie_neighbour(self,movie_name, n_top = 10):
        

        # Get the index of the movie that matches the title
        idx = self.indices[movie_name]

        movie_list = []
        # movie_id = np.where(user_movie_matrix.tocoo().row == movie_name)[0][0]
        distance, suggestion = self.nn_model.kneighbors(self.user_movie_matrix[idx],n_neighbors = int(n_top)+1)

        for movies_id in suggestion[0]:
            movie = next(key for key, val in self.indices.items() if val == movies_id) # since we know the value is present
            movie_list.append(movie)

        poster_url = self.fetch_poster_url(movie_list)
        avg_rating =  self.get_avg_ratings(movie_list)

        return pd.DataFrame({'movie':movie_list[1:], 'poster_path': poster_url[1:],'rating': avg_rating[1:]})
    
    def recommend_top_movie_user(self,user_id, n_top = 10):
        
        watched_movies = self.watched_movies_by_user(user_id)
        unwatched_movies = [key for key, _ in self.svd_item_indices.items() if key not in watched_movies]
        predicted_ratings = {item_id: self.svd_model.predict(user_id, item_id).est for item_id in unwatched_movies}
        sorted_movies_dict = {item_id: predicted_ratings[item_id] for item_id in sorted(unwatched_movies, key=lambda x: predicted_ratings[x], reverse=True)}
        
        top_movies = list(sorted_movies_dict.items())[:n_top]

        # Separate the values into two lists
        movies, pred_rating = zip(*top_movies)
        poster_url = self.fetch_poster_url(movies)
        # avg_rating =  self.get_avg_ratings(movies)
        return pd.DataFrame({'movie': movies, 'poster_path': poster_url,'pred_rating': pred_rating}) 
    
    def recommend_similar_movie_user(self,user_id, movie, topn = 10):
    
        watched_movies = self.watched_movies_by_user(user_id)
        unwatched_movies = [key for key, _ in self.svd_item_indices.items() if key not in watched_movies]
        # Step 3: Determine the number of similar movies to consider
        num_similar_movies = max(len(watched_movies) * 1.5, 20)

        # Step 1: Get recommendations from recommend_movie_neighbour
        recommendations = self.recommend_movie_neighbour(movie, n_top=num_similar_movies)['movie'].to_list()
        recommended_unwatched_movies = set(unwatched_movies) & set(recommendations)
        # Step 6: Predict ratings for unwatched movies
        predicted_ratings = {movie: self.svd_model.predict(user_id, movie).est for movie in recommended_unwatched_movies}
        sorted_movies_dict = {movie: predicted_ratings[movie] for movie in sorted(recommended_unwatched_movies, key=lambda x: predicted_ratings[x], reverse=True)}
        
        top_movies = list(sorted_movies_dict.items())[:topn]

        # Separate the values into two lists
        movies, pred_rating = zip(*top_movies)
        poster_url = self.fetch_poster_url(movies)
        # avg_rating =  self.get_avg_ratings(movies)
        return pd.DataFrame({'movie': movies, 'poster_path': poster_url,'pred_rating': pred_rating}) 


In [4]:
make_reco = RecommendationPipeline()

In [5]:
make_reco.content_recommendations('Avengers, The (2012)')  

Unnamed: 0,movie,poster_path,rating
26130,Avengers: Age of Ultron (2015),https://image.tmdb.org/t/p/w500/gKzYx79y0AQTL4...,3.59
3649,X-Men (2000),https://image.tmdb.org/t/p/w500/gNGq4u0cymxZqG...,3.56
12649,"Incredible Hulk, The (2008)",https://image.tmdb.org/t/p/w500/4ssDuvEDkSArWE...,3.2
1971,"Rocketeer, The (1991)",https://image.tmdb.org/t/p/w500/AuEzNLF8yvzd16...,3.05
5016,"Time Machine, The (2002)",https://image.tmdb.org/t/p/w500/mXlS41qbd0bzhO...,2.86
3297,Teenage Mutant Ninja Turtles (1990),https://image.tmdb.org/t/p/w500/n4Pvcc669wrLCk...,2.85
23376,Ra.One (2011),https://image.tmdb.org/t/p/w500/shfAU6xIIEAEts...,2.82
40594,Max Steel (2016),https://image.tmdb.org/t/p/w500/pARvZxEWxFa6u7...,2.66
20742,Captain America (1979),https://image.tmdb.org/t/p/w500/bRDAc4GogyS9ci...,2.56
3298,Teenage Mutant Ninja Turtles II: The Secret of...,https://image.tmdb.org/t/p/w500/9QB6wIc6XOtoi0...,2.55


In [6]:
make_reco.recommend_movie_neighbour('Avengers, The (2012)')

Unnamed: 0,movie,poster_path,rating
0,Guardians of the Galaxy (2014),https://image.tmdb.org/t/p/w500/r7vmZjiyZw9rpJ...,3.85
1,Captain America: The First Avenger (2011),https://image.tmdb.org/t/p/w500/vSNxAJTlD0r02V...,3.4
2,X-Men: First Class (2011),https://image.tmdb.org/t/p/w500/vUvlOY575rztBu...,3.65
3,Iron Man 2 (2010),https://image.tmdb.org/t/p/w500/6WBeq4fCfn7AN0...,3.36
4,Thor (2011),https://image.tmdb.org/t/p/w500/prSfAi1xGrhLQN...,3.29
5,Captain America: The Winter Soldier (2014),https://image.tmdb.org/t/p/w500/tVFRpFw3xTedgP...,3.58
6,Iron Man 3 (2013),https://image.tmdb.org/t/p/w500/qhPtAc1TKbMPqN...,3.39
7,"Dark Knight Rises, The (2012)",https://image.tmdb.org/t/p/w500/85cWkCVftiVs0B...,3.84
8,Avengers: Age of Ultron (2015),https://image.tmdb.org/t/p/w500/4ssDuvEDkSArWE...,3.43
9,X-Men: Days of Future Past (2014),https://image.tmdb.org/t/p/w500/tYfijzolzgoMOt...,3.65


In [7]:
make_reco.popular_recs_filtered(n_top = 5,years=['2015','2016'])

Unnamed: 0,movie,poster_path,rating
38964,Piper (2016),https://image.tmdb.org/t/p/w500/rfEkkVzmrMYqGe...,4.1
39836,Your Name. (2016),https://image.tmdb.org/t/p/w500/q719jXXEzOoYap...,4.03
33128,Human (2015),https://image.tmdb.org/t/p/w500/vdZgH8cr73DJTL...,4.24
32737,Spotlight (2015),https://image.tmdb.org/t/p/w500/gWkgMnIsd8Od7i...,4.03
38286,The Handmaiden (2016),https://image.tmdb.org/t/p/w500/x2lZKoKPqVodhY...,4.02


In [8]:
make_reco.recommend_similar_movie_user(user_id = 4, movie = 'Avengers, The (2012)')

Unnamed: 0,movie,poster_path,pred_rating
0,Iron Man (2008),https://image.tmdb.org/t/p/w500/78lPtwv72eTNqF...,5.0
1,Inception (2010),https://image.tmdb.org/t/p/w500/edv5CZvWj09upO...,4.942769
2,"Dark Knight Rises, The (2012)",https://image.tmdb.org/t/p/w500/85cWkCVftiVs0B...,4.793086
3,Deadpool (2016),https://image.tmdb.org/t/p/w500/fSRb7vyIP8rQpL...,4.719584
4,The Martian (2015),https://image.tmdb.org/t/p/w500/5BHuvQ6p9kfc09...,4.708432
5,Ip Man (2008),https://image.tmdb.org/t/p/w500/ggTTUXZg7trvAh...,4.696023
6,Limitless (2011),https://image.tmdb.org/t/p/w500/hv5JMCrMVLvV6H...,4.646333
7,Taken (2008),https://image.tmdb.org/t/p/w500/y5Va1WXDX6nZEl...,4.611688
8,Guardians of the Galaxy (2014),https://image.tmdb.org/t/p/w500/r7vmZjiyZw9rpJ...,4.606628
9,Red (2010),https://image.tmdb.org/t/p/w500/8eeK3OB5PeSRQD...,4.588877


In [9]:
make_reco.recommend_similar_movie_user(user_id = 42, movie = 'Avengers, The (2012)')

Unnamed: 0,movie,poster_path,pred_rating
0,Whiplash (2014),https://image.tmdb.org/t/p/w500/7fn624j5lj3xTm...,4.613118
1,"Grand Budapest Hotel, The (2014)",https://image.tmdb.org/t/p/w500/eWdyYQreja6JGC...,4.376862
2,Ex Machina (2015),https://image.tmdb.org/t/p/w500/dmJW8IAKHKxFNi...,4.282013
3,Star Wars: Episode VII - The Force Awakens (2015),https://image.tmdb.org/t/p/w500/wqnLdwVXoBjKib...,4.280724
4,Birdman: Or (The Unexpected Virtue of Ignoranc...,https://image.tmdb.org/t/p/w500/rHUg2AuIuLSIYM...,4.266851
5,Blade Runner 2049 (2017),https://image.tmdb.org/t/p/w500/gajva2L0rPYkEW...,4.263572
6,La La Land (2016),https://image.tmdb.org/t/p/w500/uDO8zWDhfWwoFd...,4.256999
7,Her (2013),https://image.tmdb.org/t/p/w500/eCOtqtfvn7mxGl...,4.237284
8,"Big Short, The (2015)",https://image.tmdb.org/t/p/w500/isuQWbJPbjybBE...,4.204046
9,Django Unchained (2012),https://image.tmdb.org/t/p/w500/7oWY8VDWW7thTz...,4.174808


In [12]:
make_reco.get_user_profile(4)

{'Documentary': 0,
 'History': 10,
 'Drama': 94,
 'Romance': 39,
 'Thriller': 43,
 'Western': 6,
 'Family': 75,
 'Horror': 15,
 'Mystery': 10,
 'Action': 92,
 'War': 8,
 'Music': 7,
 'Science Fiction': 70,
 'TV Movie': 0,
 'Comedy': 83,
 'Adventure': 110,
 'Fantasy': 60,
 'Animation': 68,
 'Crime': 39}

In [13]:
make_reco.get_user_profile(42)

{'Documentary': 6,
 'History': 14,
 'Drama': 200,
 'Romance': 61,
 'Thriller': 94,
 'Western': 3,
 'Family': 33,
 'Horror': 15,
 'Mystery': 37,
 'Action': 78,
 'War': 19,
 'Music': 5,
 'Science Fiction': 50,
 'TV Movie': 0,
 'Comedy': 66,
 'Adventure': 67,
 'Fantasy': 50,
 'Animation': 34,
 'Crime': 67}

In [1]:
categories = ['a', 'b', 'c', 'd']

In [3]:
categories + [None]

['a', 'b', 'c', 'd', None]

In [4]:
make_reco.ratings_df

NameError: name 'make_reco' is not defined

In [5]:

movies_df = pd.read_csv(os.path.join("artifacts","data_preparation","final_data","movies.csv"))[['title','movieId','tmdbId','genres','poster_path']]
movies_df["year"] = (movies_df["title"].str.extract(r"\((\d+)\)").fillna(-1))

In [11]:
movies_df['year'].unique()

array(['1995', '1994', '1996', '1976', '1992', '1988', '1967', '1993',
       '1964', '1977', '1965', '1982', '1985', '1990', '1991', '1989',
       '1937', '1940', '1969', '1981', '1973', '1970', '1960', '1955',
       '1959', '1968', '1980', '1975', '1948', '1943', '1950', '1987',
       '1997', '1974', '1956', '1958', '1949', '1972', '1953', '1998',
       '06', '1933', '2010', '1952', '1951', '1957', '1961', '1954',
       '1934', '1944', '1963', '1942', '1941', '1939', '1947', '1946',
       '1945', '1938', '1935', '1936', '1926', '1932', '1979', '1971',
       '1986', '1978', '1966', '1962', '1983', '1984', '1931', '1922',
       '1999', '1927', '1929', '1930', '1928', '1925', '2012', '2000',
       '1919', '1923', '1920', '1918', '1921', '2001', '1924', '2013',
       '2002', '2003', '1915', '2004', '1916', '1917', '2005', '2006',
       '1902', -1, '2011', '1903', '2007', '2008', '2009', '1914', '500',
       '1912', '1913', '1898', '1894', '1909', '1910', '1893', '1896',
     

In [8]:
data = movies_df.loc[0]

In [11]:
data['title']

'Toy Story (1995)'

In [12]:
movies_df['title'][0]

'Toy Story (1995)'

In [13]:
indices = pickle.load(open(os.path.join(
                    "artifacts", "collaborative_filtering_model", "nn_item_indices.pkl"
                ),
                "rb",
            )
        )

In [18]:
list(indices.keys())

['"Great Performances" Cats (1998)',
 "'71 (2014)",
 "'Round Midnight (1986)",
 "'Til There Was You (1997)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)',
 '+1 (2013)',
 '...And Justice for All (1979)',
 '1-900 (06) (1994)',
 '10 (1979)',
 '10 Cloverfield Lane (2016)',
 '10 Items or Less (2006)',
 '10 Rillington Place (1971)',
 '10 Things I Hate About You (1999)',
 '10 Years (2011)',
 '10 to Midnight (1983)',
 '10,000 BC (2008)',
 '100 Feet (2008)',
 '100 Girls (2000)',
 '1000 Eyes of Dr. Mabuse, The (Die 1000 Augen des Dr. Mabuse) (1960)',
 '101 Dalmatians (1996)',
 '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
 "101 Dalmatians II: Patch's London Adventure (2003)",
 '101 Reykjavik (101 Reykjavík) (2000)',
 '102 Dalmatians (2000)',
 '10th & Wolf (2006)',
 '10th Victim, The (La decima vittima) (1965)',
 '11\'09"01 - September 11 (2002)',
 '11:14 (2003)',
 '11th Hour, The (2007)',
 '12 (2007)',
 '12 Angry Me

In [5]:
make_reco = RecommendationPipeline()

In [6]:


# Example data
categories = make_reco.unique_genres
years = make_reco.movies_df["year"].astype(str).unique()


movies_list = list(make_reco.indices.keys())


content_reco_df = make_reco.content_recommendations(movies_list[0])
print(content_reco_df)


['https://image.tmdb.org/t/p/w500/gQM8tVWVRoYbyDe0qWyu5luixjf.jpg', 'https://image.tmdb.org/t/p/w500/xouK7Q74DzkHMDpQYhy2vK62sOv.jpg', 'https://image.tmdb.org/t/p/w500/cwMYG1qtk4sZbwKvb4zPSsS5BzS.jpg', 'https://image.tmdb.org/t/p/w500/qhjcjiLcPhB0kniRlQfIoGiILbn.jpg', 'https://image.tmdb.org/t/p/w500/sf3HuKkvmkQqZkee39MpcT3UeYi.jpg', 'https://image.tmdb.org/t/p/w500/71Q9f7byPqH8trF4BvZOlXnhO2D.jpg', 'https://image.tmdb.org/t/p/w500/s549IYn0XDUlFHJpOLIrWE1uYXv.jpg', 'https://image.tmdb.org/t/p/w500/eCyWdrWp6uTmXq0ZeFFxzWPNByN.jpg', 'https://image.tmdb.org/t/p/w500/qj7owFXzCtIbZC51O8NwLe4Kde4.jpg', 'https://image.tmdb.org/t/p/w500/dmDLyOAZ203SPRYPfG4uUIa8ea8.jpg']
                                                   movie  rating
7364                              Animal Crackers (1930)    3.93
7830                               Into the Woods (1991)    3.90
4272                             Entertainer, The (1960)    3.71
12269  Lemonade Joe (Limonádový Joe aneb Konská opera...    3.68
2741

In [10]:
movies_list[0]

'"Great Performances" Cats (1998)'

In [8]:
movie_data = make_reco.create_ranked_df(make_reco.movies_df, make_reco.ratings_df)
# cosine_sim = pd.DataFrame(cosine_sim.todense())

indices=pd.Series(data=list(make_reco.movies_df.index), index = make_reco.movies_df['title'] )

# Get the index of the movie that matches the title
idx = indices[movies_list[0]]

# Get the row vector of cosine similarity scores
similarity_scores = make_reco.cosine_sim[idx, :]

# Convert the row vector to a dense array
sim_scores_dense = similarity_scores.toarray()[0]

# Enumerate the similarity scores with their indices
sim_scores = list(enumerate(sim_scores_dense))

# Sort the movies based on the similarity scores
sim_scores.sort(key=lambda x: x[1], reverse=True)

# Get the scores of the 10 most similar movies
sim_scores=sim_scores[1: (10 + 1)]

# Get the movie indices
ind=[x[0] for x in sim_scores]

movies_list = make_reco.movies_df.iloc[ind]['title'].tolist()
movie_data = movie_data[movie_data['title'].isin(movies_list)]


poster_url = make_reco.fetch_poster_url(movies_list)
print (poster_url)
# movie_data['poster_path'] = poster_url
movie_data = movie_data.rename(columns={'weighted_rating': 'rating', 'title': 'movie'})
movie_data['rating'] = movie_data['rating'].apply(lambda x: np.round(x, 2))

['https://image.tmdb.org/t/p/w500/gQM8tVWVRoYbyDe0qWyu5luixjf.jpg', 'https://image.tmdb.org/t/p/w500/xouK7Q74DzkHMDpQYhy2vK62sOv.jpg', 'https://image.tmdb.org/t/p/w500/cwMYG1qtk4sZbwKvb4zPSsS5BzS.jpg', 'https://image.tmdb.org/t/p/w500/qhjcjiLcPhB0kniRlQfIoGiILbn.jpg', 'https://image.tmdb.org/t/p/w500/sf3HuKkvmkQqZkee39MpcT3UeYi.jpg', 'https://image.tmdb.org/t/p/w500/71Q9f7byPqH8trF4BvZOlXnhO2D.jpg', 'https://image.tmdb.org/t/p/w500/s549IYn0XDUlFHJpOLIrWE1uYXv.jpg', 'https://image.tmdb.org/t/p/w500/eCyWdrWp6uTmXq0ZeFFxzWPNByN.jpg', 'https://image.tmdb.org/t/p/w500/qj7owFXzCtIbZC51O8NwLe4Kde4.jpg', 'https://image.tmdb.org/t/p/w500/dmDLyOAZ203SPRYPfG4uUIa8ea8.jpg']


In [11]:
movies_list

['Lemonade Joe (Limonádový Joe aneb Konská opera) (1964)',
 'Jacques Brel Is Alive and Well and Living in Paris (1975)',
 'No, No, Nanette (1940)',
 'Godspell: A Musical Based on the Gospel According to St. Matthew (1973)',
 'Baroque (1989)',
 'Pajama Game, The (1957)',
 'Entertainer, The (1960)',
 'Chorus Line, A (1985)',
 'Animal Crackers (1930)',
 'Into the Woods (1991)']

In [12]:
movie_data

Unnamed: 0,movie,movieId,tmdbId,genres,poster_path,year,num_ratings,rating
7364,Animal Crackers (1930),7706,13913,"['Comedy', 'Music']",/qj7owFXzCtIbZC51O8NwLe4Kde4.jpg,1930,1043,3.93
7830,Into the Woods (1991),8580,23378,"['Drama', 'Music']",/dmDLyOAZ203SPRYPfG4uUIa8ea8.jpg,1991,527,3.9
4272,"Entertainer, The (1960)",4423,18929,"['Drama', 'Music']",/s549IYn0XDUlFHJpOLIrWE1uYXv.jpg,1960,144,3.71
12269,Lemonade Joe (Limonádový Joe aneb Konská opera...,57209,20629,"['Comedy', 'Western', 'Music']",/gQM8tVWVRoYbyDe0qWyu5luixjf.jpg,1964,16,3.68
2741,"Pajama Game, The (1957)",2874,40867,"['Comedy', 'Music']",/71Q9f7byPqH8trF4BvZOlXnhO2D.jpg,1957,492,3.37
6186,"Chorus Line, A (1985)",6345,1816,"['Drama', 'Music']",/eCyWdrWp6uTmXq0ZeFFxzWPNByN.jpg,1985,524,3.22


In [22]:
movie_data = make_reco.create_ranked_df(make_reco.movies_df, make_reco.ratings_df,min_rating=0)

In [23]:
movie_data[movie_data['title'].isin(movies_list)]

Unnamed: 0,title,movieId,tmdbId,genres,poster_path,year,num_ratings,weighted_rating
7364,Animal Crackers (1930),7706,13913,"['Comedy', 'Music']",/qj7owFXzCtIbZC51O8NwLe4Kde4.jpg,1930,1043,3.936721
7830,Into the Woods (1991),8580,23378,"['Drama', 'Music']",/dmDLyOAZ203SPRYPfG4uUIa8ea8.jpg,1991,527,3.903226
12269,Lemonade Joe (Limonádový Joe aneb Konská opera...,57209,20629,"['Comedy', 'Western', 'Music']",/gQM8tVWVRoYbyDe0qWyu5luixjf.jpg,1964,16,3.78125
42964,Godspell: A Musical Based on the Gospel Accord...,171361,43158,"['Comedy', 'Family', 'Music']",/qhjcjiLcPhB0kniRlQfIoGiILbn.jpg,1973,8,3.75
4272,"Entertainer, The (1960)",4423,18929,"['Drama', 'Music']",/s549IYn0XDUlFHJpOLIrWE1uYXv.jpg,1960,144,3.71875
2741,"Pajama Game, The (1957)",2874,40867,"['Comedy', 'Music']",/71Q9f7byPqH8trF4BvZOlXnhO2D.jpg,1957,492,3.36687
6186,"Chorus Line, A (1985)",6345,1816,"['Drama', 'Music']",/eCyWdrWp6uTmXq0ZeFFxzWPNByN.jpg,1985,524,3.211832
16363,Jacques Brel Is Alive and Well and Living in P...,83389,198317,"['Drama', 'Music']",/xouK7Q74DzkHMDpQYhy2vK62sOv.jpg,1975,2,3.0
48313,Baroque (1989),184899,296933,"['Documentary', 'Drama', 'Music', 'History']",/sf3HuKkvmkQqZkee39MpcT3UeYi.jpg,1989,2,2.0
39867,"No, No, Nanette (1940)",163298,413232,"['Music', 'Comedy']",/cwMYG1qtk4sZbwKvb4zPSsS5BzS.jpg,1940,1,1.0


In [15]:
movie_data.shape

(22448, 8)

In [17]:
make_reco.movies_df.iloc[ind]

Unnamed: 0,title,movieId,tmdbId,genres,poster_path,year
12275,Lemonade Joe (Limonádový Joe aneb Konská opera...,57209,20629,"['Comedy', 'Western', 'Music']",/gQM8tVWVRoYbyDe0qWyu5luixjf.jpg,1964
16386,Jacques Brel Is Alive and Well and Living in P...,83389,198317,"['Drama', 'Music']",/xouK7Q74DzkHMDpQYhy2vK62sOv.jpg,1975
42812,"No, No, Nanette (1940)",163298,413232,"['Music', 'Comedy']",/cwMYG1qtk4sZbwKvb4zPSsS5BzS.jpg,1940
46285,Godspell: A Musical Based on the Gospel Accord...,171361,43158,"['Comedy', 'Family', 'Music']",/qhjcjiLcPhB0kniRlQfIoGiILbn.jpg,1973
52060,Baroque (1989),184899,296933,"['Documentary', 'Drama', 'Music', 'History']",/sf3HuKkvmkQqZkee39MpcT3UeYi.jpg,1989
2741,"Pajama Game, The (1957)",2874,40867,"['Comedy', 'Music']",/71Q9f7byPqH8trF4BvZOlXnhO2D.jpg,1957
4272,"Entertainer, The (1960)",4423,18929,"['Drama', 'Music']",/s549IYn0XDUlFHJpOLIrWE1uYXv.jpg,1960
6186,"Chorus Line, A (1985)",6345,1816,"['Drama', 'Music']",/eCyWdrWp6uTmXq0ZeFFxzWPNByN.jpg,1985
7366,Animal Crackers (1930),7706,13913,"['Comedy', 'Music']",/qj7owFXzCtIbZC51O8NwLe4Kde4.jpg,1930
7832,Into the Woods (1991),8580,23378,"['Drama', 'Music']",/dmDLyOAZ203SPRYPfG4uUIa8ea8.jpg,1991


In [37]:
pattern = r"\((\d{4})\)"
string = '(500) Days of Summer (2009)'

In [38]:
string = '(500) Days of Summer (2009)'
df = pd.DataFrame({'title': [string]})
df["year"] = df["title"].str.extract(pattern)

In [40]:
d = {'a': 2,'b':3,'y':5}

In [42]:
list(d.values())

[2, 3, 5]

In [43]:
svd_user_indices = pickle.load(open(os.path.join("artifacts","collaborative_filtering_model","svd_user_indices.pkl"),'rb'))

In [45]:
svd_user_indices.keys()

dict_keys([4, 42, 43, 51, 55, 56, 71, 73, 79, 100, 114, 134, 147, 160, 180, 183, 196, 214, 235, 239, 268, 294, 295, 313, 321, 339, 343, 345, 357, 363, 374, 378, 382, 384, 402, 408, 450, 458, 465, 471, 491, 502, 505, 540, 549, 553, 561, 572, 590, 593, 601, 603, 605, 627, 667, 670, 697, 698, 716, 719, 738, 758, 776, 781, 788, 804, 807, 814, 815, 818, 828, 830, 842, 854, 856, 861, 864, 867, 890, 907, 911, 926, 928, 930, 942, 953, 972, 995, 996, 1000, 1010, 1024, 1028, 1030, 1035, 1040, 1051, 1064, 1068, 1073, 1075, 1089, 1093, 1117, 1120, 1132, 1134, 1158, 1165, 1171, 1179, 1180, 1185, 1186, 1191, 1197, 1200, 1212, 1226, 1228, 1242, 1263, 1272, 1281, 1293, 1327, 1337, 1367, 1372, 1383, 1393, 1400, 1413, 1419, 1423, 1440, 1464, 1466, 1471, 1488, 1507, 1513, 1519, 1523, 1534, 1546, 1549, 1563, 1567, 1570, 1599, 1603, 1636, 1654, 1672, 1695, 1752, 1770, 1776, 1781, 1803, 1823, 1827, 1832, 1846, 1854, 1858, 1860, 1866, 1874, 1880, 1881, 1911, 1914, 1916, 1927, 1931, 1957, 1980, 1991, 2000, 20