In [None]:
#################################
#IMPORTS
#################################
import pandas as pd

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics.pairwise import cosine_similarity


In [None]:
#################################
#HELPER FUNCTIONS
################################
def get_title_from_index(index, frame=None):
    """Return the title of the movie whose row label equals ``index``.

    Parameters
    ----------
    index : hashable
        Row label to look up in the frame's index.
    frame : pandas.DataFrame, optional
        Frame containing a "title" column. Defaults to the notebook-level
        ``df`` so existing single-argument calls keep working.

    Returns
    -------
    The first matching value of the "title" column.

    Raises
    ------
    IndexError
        If no row carries the requested label (same exception type as the
        original ``.values[0]`` lookup, but with an explanatory message).
    """
    source = df if frame is None else frame
    titles = source.loc[source.index == index, "title"]
    if titles.empty:
        raise IndexError("no movie found with index {!r}".format(index))
    return titles.values[0]

def get_index_from_title(title, frame=None):
    """Return the "index" column value of the first movie named ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up (the comparison is case-sensitive equality).
    frame : pandas.DataFrame, optional
        Frame containing "title" and "index" columns. Defaults to the
        notebook-level ``df`` so existing single-argument calls keep working.

    Returns
    -------
    The value stored in the "index" column of the first matching row.

    Raises
    ------
    IndexError
        If no row has the requested title (same exception type as the
        original ``.values[0]`` lookup, but with an explanatory message).
    """
    source = df if frame is None else frame
    matches = source.loc[source["title"] == title, "index"]
    if matches.empty:
        raise IndexError("no movie titled {!r} in the dataset".format(title))
    return matches.values[0]

In [None]:
# Step 1: Read the CSV file
# Load the movie table from "movie_dataset.csv" (path is relative to the
# notebook's working directory). The helper functions read its "title" and
# "index" columns through this global `df`.
df = pd.read_csv("movie_dataset.csv")
# Show the first rows as a quick sanity check of the load.
df.head()
# Tip: evaluate `df.columns` to list the columns available as features.

In [None]:
# Step 2: Select features
# Columns whose missing values are filled with '' in Step 3 before the text
# of each movie is combined.
features = ['keywords','cast', 'genres','director']

In [None]:
# Step 3: Create a new column which combines all the selected features

# Replace missing values in every selected column with an empty string so the
# columns can be joined as text. Note: this overwrites the columns of `df`
# in place.
for feature in features:
    df[feature] = df[feature].fillna('')

def combine_features(row, columns=('keywords', 'cast', 'genres', 'director')):
    """Join the selected feature values of one movie into a single string.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must support ``row[column]`` for every name in ``columns``.
    columns : sequence of str, optional
        Columns to combine, in order. The default reproduces the original
        hard-coded order: keywords, cast, genres, director.

    Returns
    -------
    str
        The values separated by single spaces. Missing values (None / NaN)
        contribute an empty string; other non-string values are converted
        with ``str``.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the columns. The previous bare ``except``
        printed the row and returned None, which silently put None into the
        combined column; failing here points at the real cause.
    """
    parts = []
    for column in columns:
        value = row[column]
        # Treat missing cells as empty text instead of failing on str + NaN.
        parts.append('' if pd.isna(value) else str(value))
    return " ".join(parts)
        
# axis=1 makes apply call combine_features once per row; the returned values
# are stored in the new "combined_features" column.
df["combined_features"] = df.apply(combine_features, axis = 1)
# Display the new column for a visual check.
df.combined_features

In [None]:
# Step 4: Create a count matrix from the new combined column

# CountVectorizer with default settings; kept in `vectorizer` after fitting.
vectorizer = CountVectorizer()

# Fit on the combined text and build one row of token counts per movie.
count_matrix = vectorizer.fit_transform(df["combined_features"])                                          

# Array copy of the count matrix; in the visible notebook it is only printed
# in Step 5. NOTE(review): presumably a dense copy of a sparse matrix, which
# can be memory-heavy for a large vocabulary — confirm it is still needed.
count_matrix_array = count_matrix.toarray()


In [None]:
# Step 5: Compute the cosine similarity based on the count matrix
# cosine_sim[i] is later read as the similarity scores of movie i against
# every movie (see Step 6).
cosine_sim = cosine_similarity(count_matrix)

# Print both matrices for inspection.
print("\nTheir term-document matrix in array representation\n", count_matrix_array)

print("\nThe cosine similarity score:\n", cosine_sim)

# Title of the movie to base the recommendations on; it is looked up with an
# exact equality match in get_index_from_title.
movie_user_likes = "Avatar"


In [None]:
# Step 6: Get the index of this movie from its title
# NOTE(review): the value of the "index" column is used below as a positional
# row index into cosine_sim — this assumes the column matches the row order of
# df; TODO confirm for this dataset.
movie_index = get_index_from_title(movie_user_likes)
# Pair every similarity score with its position: [(position, score), ...].
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [None]:
# Step 7: Rank the (position, score) pairs from most to least similar
# Work on a copy so `similar_movies` keeps its original order.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Step 8: Print a list of 50 movies
# Slice to exactly 50 entries: the previous manual counter tested `i > 50`
# after printing, so it emitted 51 titles. Slicing also copes with lists
# shorter than 50.
# NOTE(review): the liked movie itself normally ranks near the top because
# its similarity to itself is maximal — skip it here if it should not be
# recommended.
for movie_position, _score in sorted_similar_movies[:50]:
    print(get_title_from_index(movie_position))