## Content Based Recommendation System


In [74]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv("movie_dataset.csv")

In [75]:
df = pd.read_csv("movie_dataset.csv")
#If you visualize the dataset, you will see that it has many extra info about a movie.
#We don’t need all of them. So, we choose keywords, cast, 
#genres and director column to use as our feature set(the so called “content” of the movie)

In [76]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [77]:
df.columns,df.dtypes


(Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
        'original_language', 'original_title', 'overview', 'popularity',
        'production_companies', 'production_countries', 'release_date',
        'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
        'vote_average', 'vote_count', 'cast', 'crew', 'director'],
       dtype='object'), index                     int64
 budget                    int64
 genres                   object
 homepage                 object
 id                        int64
 keywords                 object
 original_language        object
 original_title           object
 overview                 object
 popularity              float64
 production_companies     object
 production_countries     object
 release_date             object
 revenue                   int64
 runtime                 float64
 spoken_languages         object
 status                   object
 tagline                  object
 title                  

In [78]:
features = ['keywords','cast','genres','director']

In [79]:
#Our next task is to create a function for combining the values of these columns into a single string

In [80]:
def combine_features(row):
    return row['keywords']+" "+row['cast']+" "+row['genres']+" "+row['director']
#Takes all these attributes from  a row and combine it as string 
#To do so for all the rows do the below cell

In [81]:
#Now, we need to call this function over each row of our dataframe. 
#But, before doing that, we need to clean and preprocess the data for our use.
#We will fill all the NaN values with blank string in the dataframe

In [82]:
for feature in features:
    df[feature] = df[feature].fillna('') #filling all NaNs with blank string in features

df["combined_features"] = df.apply(combine_features,axis=1)#forms new  column

#applying combined_features() method over each rows of dataframe and storing the combined 
#string in "combined_features" column

In [83]:
df.iloc[0].combined_features#Printing value at key 0

'culture clash future space war space colony society Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez Action Adventure Fantasy Science Fiction James Cameron'

In [84]:
#text = ["London Paris London","Paris Paris London"]
#Now, we need to find a way to represent these texts as vectors. 
#The CountVectorizer() class from sklearn.feature_extraction.text library can do this for us. 
#We need to import this library before we can create a new CountVectorizer() object.

In [85]:
cosine_sim = cosine_similarity(count_matrix)#calculates similarity score
#[[1.  0.8]
#[0.8 1. ]]
cosine_sim

array([[1.        , 0.10540926, 0.12038585, ..., 0.        , 0.        ,
        0.        ],
       [0.10540926, 1.        , 0.0761387 , ..., 0.03651484, 0.        ,
        0.        ],
       [0.12038585, 0.0761387 , 1.        , ..., 0.        , 0.11145564,
        0.        ],
       ...,
       [0.        , 0.03651484, 0.        , ..., 1.        , 0.        ,
        0.04264014],
       [0.        , 0.        , 0.11145564, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04264014, 0.        ,
        1.        ]])

In [86]:
#Now, we will define two helper functions to get movie title from movie index and vice-versa.
df["title"].head(),df["index"].head()

(0                                      Avatar
 1    Pirates of the Caribbean: At World's End
 2                                     Spectre
 3                       The Dark Knight Rises
 4                                 John Carter
 Name: title, dtype: object, 0    0
 1    1
 2    2
 3    3
 4    4
 Name: index, dtype: int64)

In [87]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]#values are accessed by .value[key]
def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

In [88]:
#Our next step is to get the title of the movie that the user currently likes.
#Then we will find the index of that movie.
#After that, we will access the row corresponding to this movie in the similarity matrix.
#Thus, we will get the similarity scores of all other movies from the current movie.
#Then we will enumerate through all the similarity scores of that movie to make a tuple of movie index 
#and similarity score. This will convert a row of similarity scores like this-
#[1 0.5 0.2 0.9] to this- [(0, 1) (1, 0.5) (2, 0.2) (3, 0.9)] . 
#Here, each item is in this form- (movie index, similarity score).

In [89]:
movie_user_likes = "Alien"
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))#cosine_sim relates it with the index
#and store it in similar movies
#accessing the row corresponding to given movie to find all the similarity scores 
#for that movie and then enumerating over it
movie_index,similar_movies


(3158,
 [(0, 0.33333333333333337),
  (1, 0.03513641844631533),
  (2, 0.040128617695256406),
  (3, 0.11322770341445959),
  (4, 0.23094010767585033),
  (5, 0.03849001794597506),
  (6, 0.0),
  (7, 0.10540925533894599),
  (8, 0.041030496993110906),
  (9, 0.03928371006591931),
  (10, 0.11547005383792516),
  (11, 0.07698003589195011),
  (12, 0.040128617695256406),
  (13, 0.03849001794597506),
  (14, 0.10910894511799618),
  (15, 0.0),
  (16, 0.10369516947304253),
  (17, 0.041030496993110906),
  (18, 0.10540925533894599),
  (19, 0.041030496993110906),
  (20, 0.07856742013183862),
  (21, 0.15713484026367724),
  (22, 0.0),
  (23, 0.0),
  (24, 0.034020690871988585),
  (25, 0.04303314829119352),
  (26, 0.10910894511799618),
  (27, 0.15713484026367724),
  (28, 0.15713484026367724),
  (29, 0.08206099398622181),
  (30, 0.03774256780481986),
  (31, 0.11547005383792516),
  (32, 0.0),
  (33, 0.14547859349066158),
  (34, 0.041030496993110906),
  (35, 0.12598815766974242),
  (36, 0.11785113019775793),
  (

In [90]:
#Now comes the most vital point.
#We will sort the list similar_movies according to similarity scores in descending order.
#Since the most similar movie to a given movie will be itself, 
#we will discard the first element after sorting the movies.

In [91]:
#Sorting that similar movies list to get the most similar first
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]
sorted_similar_movies

[(2403, 0.4151682458530185),
 (838, 0.39283710065919303),
 (1531, 0.35737084494593163),
 (278, 0.35136418446315326),
 (0, 0.33333333333333337),
 (239, 0.3110855084191276),
 (2696, 0.30792014356780045),
 (4401, 0.28867513459481287),
 (740, 0.28721347895177635),
 (1318, 0.28090032386679487),
 (1053, 0.2809003238667948),
 (3361, 0.28005601680560194),
 (1914, 0.2749859704614352),
 (94, 0.2694301256218254),
 (2015, 0.2694301256218254),
 (4332, 0.2694301256218254),
 (300, 0.26419797463373906),
 (1473, 0.26419797463373906),
 (541, 0.264197974633739),
 (3730, 0.2592592592592593),
 (1650, 0.2501595914621521),
 (222, 0.24618298195866545),
 (770, 0.24618298195866545),
 (228, 0.24077170617153845),
 (1275, 0.24077170617153845),
 (3014, 0.24077170617153845),
 (4225, 0.24077170617153845),
 (539, 0.24077170617153842),
 (577, 0.23570226039551587),
 (1990, 0.23570226039551587),
 (335, 0.23570226039551584),
 (487, 0.23570226039551584),
 (1213, 0.23570226039551584),
 (2198, 0.23570226039551584),
 (2964, 0

In [92]:
#Now, we will run a loop to print first 5 entries from sorted_similar_movies list

In [93]:
i=0
print("Top 5 similar movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>5:
        break

Top 5 similar movies to Alien are:

Aliens
Alien³
Moonraker
Planet of the Apes
Avatar
Gravity
