The Entertainment Company, which is an online movie watching platform, wants to improve its   
collection of movies and showcase those that are highly rated and recommend those movies to its   
customer by their movie watching footprint. For this, the company has collected the data and shared   
it with you to provide some analytical insights and also to come up with a recommendation algorithm so   
that it can automate its process for effective recommendations. The ratings are between -9 and +9.  

In [18]:
import pandas as pd
# import Dataset 
ent = pd.read_csv("Entertainment.csv", encoding = 'utf8')

In [19]:
ent.shape # shape

(51, 4)

In [20]:
ent.columns

Index(['Id', 'Titles', 'Category', 'Reviews'], dtype='object')

In [21]:
ent.sample(8)

Unnamed: 0,Id,Titles,Category,Reviews
32,6590,Babe (1995),"Comedy, Drama, Shounen, Sports",-1.41
47,7464,Mighty Aphrodite (1995),"Psychological, Seinen, Sports",-7.86
2,9702,Grumpier Old Men (1995),"Action, Comedy, Historical, Parody, Samurai, S...",99.0
6,1803,Sabrina (1995),"Action, Adventure, Shounen, Super Power",99.0
28,4647,City of Lost Children (1995),"Adventure, Fantasy, Historical, Mystery, Seine...",99.0
34,7547,It Takes Two (1995),"Drama, Fantasy, Shoujo, Slice of Life, Superna...",5.92
18,9679,Ace Ventura: When Nature Calls (1995),"Fantasy, Slice of Life",99.0
29,5871,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,"Action, Adventure, Comedy, Mecha, Sci-Fi",99.0


In [22]:
ent['Category'].head()

0                 Drama, Romance, School, Supernatural
1    Action, Adventure, Drama, Fantasy, Magic, Mili...
2    Action, Comedy, Historical, Parody, Samurai, S...
3                                     Sci-Fi, Thriller
4    Action, Comedy, Historical, Parody, Samurai, S...
Name: Category, dtype: object

In [23]:
#check NaN values 
ent['Category'].isnull().sum() 

0

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer #term frequencey- inverse document frequncy is a numerical statistic that is intended to reflect how important a word is to document in a collecion or corpus

# Creating a Tfidf Vectorizer to remove all stop words
tfidf = TfidfVectorizer(stop_words = "english")    # taking stop words from tfid vectorizer 

# Preparing the Tfidf matrix by fitting and transforming
tfidf_matrix = tfidf.fit_transform(ent.Category)   #Transform a count matrix to a normalized tf or tf-idf representation
tfidf_matrix.shape 

(51, 34)

In [25]:
#compute similarities pairwise
from sklearn.metrics.pairwise import linear_kernel

# Computing the cosine similarity on Tfidf matrix
cosine_sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)



In [26]:
cosine_sim_matrix[0]

array([1.        , 0.09421367, 0.        , 0.        , 0.        ,
       0.42359591, 0.        , 0.10686286, 0.        , 0.        ,
       0.54177871, 0.58035645, 0.        , 0.08421361, 0.42359591,
       0.43279274, 0.58230377, 0.12767481, 0.        , 0.17811026,
       0.15484034, 0.26510856, 0.10403276, 0.11097268, 0.        ,
       0.62594265, 0.39703131, 0.12767481, 0.12767481, 0.        ,
       0.39430628, 0.25267036, 0.15484034, 0.12767481, 0.25267036,
       0.474735  , 0.17942927, 0.1620291 , 0.08147575, 0.51531804,
       0.12938023, 0.        , 0.34279311, 0.42359591, 0.15484034,
       0.52359466, 0.25267036, 0.        , 0.12767481, 0.16772551,
       0.31295101])

In [27]:
# creating a mapping of anime name to index number 
ent_index = pd.Series(ent.index, index = ent['Titles']).drop_duplicates()
ent_id = ent_index['To Die For (1995)']
ent_id


41

In [28]:
def get_recommendations(Name, topN):    
    # topN = 10
    # Getting the movie index using its title 
    ent_id = ent_index[Name]
    
    # Getting the pair wise similarity score for all the anime's with that 
    cosine_scores = list(enumerate(cosine_sim_matrix[ent_id]))
    
    # Sorting the cosine_similarity scores based on scores 
    cosine_scores = sorted(cosine_scores, key=lambda x:x[1], reverse = True)
    
    # Get the scores of top N most similar movies 
    cosine_scores_N = cosine_scores[0: topN+1]
    
    # Getting the movie index 
    ent_idx  =  [i[0] for i in cosine_scores_N]
    ent_scores =  [i[1] for i in cosine_scores_N]
    
    # Similar movies and scores
    ent_similar_show = pd.DataFrame(columns=["Titles", "scores"])
    ent_similar_show["Titles"] = ent.loc[ent_idx, "Titles"]
    ent_similar_show["scores"] = ent_scores
    ent_similar_show.reset_index(inplace = True)  
    return (ent_similar_show)

In [29]:
# Enter your movie name and number of movie's to be recommended 
get_recommendations('To Die For (1995)', topN = 10)


Unnamed: 0,index,Titles,scores
0,41,To Die For (1995),1.0
1,2,Grumpier Old Men (1995),0.509671
2,4,Father of the Bride Part II (1995),0.509671
3,8,Sudden Death (1995),0.509671
4,9,GoldenEye (1995),0.509671
5,12,Balto (1995),0.509671
6,23,Powder (1995),0.445853
7,39,Restoration (1995),0.254424
8,42,How to Make an American Quilt (1995),0.247151
9,20,Get Shorty (1995),0.242524
