In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv("movie_dataset.csv")

In [2]:
features = ['keywords','cast','genres','director']

In [3]:

def combine_features(row):
    return row['keywords']+" "+row['cast']+" "+row['genres']+" "+row['director']

In [4]:
for feature in features:
    df[feature] = df[feature].fillna('') #filling all NaNs with blank string

df["combined_features"] = df.apply(combine_features,axis=1) 

In [5]:
df.iloc[0].combined_features

'culture clash future space war space colony society Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez Action Adventure Fantasy Science Fiction James Cameron'

In [6]:
cv = CountVectorizer() #creating new CountVectorizer() object
count_matrix = cv.fit_transform(df["combined_features"])

In [7]:
cosine_sim = cosine_similarity(count_matrix)

In [13]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]
def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

In [None]:
#you can change the movie according to your choice and get a list of top five movies

In [20]:
movie_user_likes = "Iron Man"
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [21]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]

In [22]:
i=0
print("Top 5 similar movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>5:
        break

Top 5 similar movies to Iron Man are:

Iron Man 2
Iron Man 3
Avengers: Age of Ultron
The Avengers
Captain America: Civil War
Captain America: The Winter Soldier


In [23]:
df["vote_average"].unique()

array([ 7.2,  6.9,  6.3,  7.6,  6.1,  5.9,  7.4,  7.3,  5.7,  5.4,  7. ,
        6.5,  6.4,  6.2,  7.1,  5.8,  6.6,  7.5,  5.5,  6.7,  6.8,  6. ,
        5.1,  7.8,  5.6,  5.2,  8.2,  7.7,  5.3,  8. ,  4.8,  4.9,  7.9,
        8.1,  4.7,  5. ,  4.2,  4.4,  4.1,  3.7,  3.6,  3. ,  3.9,  4.3,
        4.5,  3.4,  4.6,  8.3,  3.5,  4. ,  2.3,  3.2,  0. ,  3.8,  2.9,
        8.5,  1.9,  3.1,  3.3,  2.2,  0.5,  9.3,  8.4,  2.7, 10. ,  1. ,
        2. ,  2.8,  9.5,  2.6,  2.4])

In [24]:
sort_by_average_vote = sorted(sorted_similar_movies,key=lambda x:df["vote_average"][x[0]],reverse=True)
print(sort_by_average_vote)

[(3519, 0.0), (4045, 0.0), (4247, 0.0), (4662, 0.0), (3992, 0.0), (2386, 0.043643578047198484), (1881, 0.0), (2970, 0.0), (2796, 0.08164965809277262), (3337, 0.03779644730092272), (2731, 0.08164965809277262), (3232, 0.044721359549995794), (2294, 0.04170288281141496), (662, 0.0), (1818, 0.0), (3865, 0.0), (4755, 0.0), (1990, 0.16329931618554525), (4535, 0.044721359549995794), (1987, 0.043643578047198484), (2247, 0.04264014327112209), (1663, 0.04170288281141496), (1847, 0.04170288281141496), (809, 0.04082482904638631), (65, 0.03922322702763681), (690, 0.0), (2170, 0.0), (2947, 0.0), (3057, 0.0), (3719, 0.0), (3723, 0.0), (4602, 0.0), (2912, 0.17056057308448835), (96, 0.15689290811054724), (1553, 0.12792042981336627), (329, 0.12247448713915893), (95, 0.11547005383792516), (2453, 0.04588314677411235), (2091, 0.0), (2284, 0.0), (2760, 0.0), (3041, 0.0), (3454, 0.0), (3573, 0.0), (3622, 0.0), (3788, 0.0), (3866, 0.0), (3906, 0.0), (4238, 0.0), (4302, 0.0), (2285, 0.16329931618554525), (3208,

In [27]:

i=0
print("Suggesting top 5 movies in order of Average Votes:\n")
for element in sort_by_average_vote:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>5:
        break

Suggesting top 5 movies in order of Average Votes:

Stiff Upper Lips
Dancer, Texas Pop. 81
Me You and Five Bucks
Little Big Top
Sardaarji
One Man's Hero
