## Importing essential libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Reading csv file

In [2]:
# Load the movie metadata into a DataFrame; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv("movie_dataset.csv")

## Taking the movie name based on which recommendation should be shown

In [4]:
# Ask which movie the recommendations should be based on. Surrounding whitespace is
# stripped so an accidental leading/trailing space does not defeat the exact
# comparison against df.title later in the notebook.
movie_name = input("Enter movie name: ").strip()

Enter movie name:  Hellboy


In [5]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [7]:
# List every column name so the ones used as features can be picked.
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

#### Features are the attributes used to compare the movies; movies are recommended based on how similar these attributes are.

In [12]:
# Columns whose values are combined into one text string per movie.
# Note: 'popularity' is numeric (float64 in df.info()), unlike the text columns.
features = ['genres', 'director', 'keywords','cast', 'popularity']

In [13]:
# Replace every missing value in the feature columns with an empty string,
# handling all of the selected columns in a single call.
df[features] = df[features].fillna('')

In [16]:
def combineFeatures(row):
    """Join one movie's feature values into a single space-separated string.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide the keys 'genres', 'director', 'keywords', 'cast' and
        'popularity'.

    Returns
    -------
    str
        The five values in that order, separated by single spaces. Missing
        values (None / NaN) contribute an empty string, so the function no
        longer raises TypeError when it runs before the null-filling step.
    """
    columns = ('genres', 'director', 'keywords', 'cast', 'popularity')
    parts = []
    for column in columns:
        value = row[column]
        # str() leaves text unchanged and converts the numeric popularity;
        # missing values become '' instead of breaking the concatenation.
        parts.append('' if pd.isna(value) else str(value))
    return " ".join(parts)

# Run combineFeatures on every row (axis=1) and store the resulting text in a new column.
df['combineFeatures'] = df.apply(combineFeatures, axis = 1)

#### The combineFeatures column is vectorized with CountVectorizer, and the resulting vectors are compared using cosine similarity; the movies with the highest similarity values will be recommended.

In [17]:
# Display the combined feature text (pandas abbreviates long Series when rendering).
df['combineFeatures']

0       Action Adventure Fantasy Science Fiction James...
1       Adventure Fantasy Action Gore Verbinski ocean ...
2       Action Adventure Crime Sam Mendes spy based on...
3       Action Crime Drama Thriller Christopher Nolan ...
4       Action Adventure Science Fiction Andrew Stanto...
                              ...                        
4798    Action Crime Thriller Robert Rodriguez united ...
4799    Comedy Romance Edward Burns  Edward Burns Kerr...
4800    Comedy Drama Romance TV Movie Scott Smith date...
4801     Daniel Hsia  Daniel Henney Eliza Coupe Bill P...
4802    Documentary Brian Herzlinger obsession camcord...
Name: combineFeatures, Length: 4803, dtype: object

In [20]:
# Turn each movie's combined text into a vector of token counts.
# NOTE(review): popularity was stringified into this text, so its digits are tokenized
# as words too - confirm that this is intended.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combineFeatures'])
# Called with a single matrix, cosine_similarity compares every row with every other
# row, so cosine_sim[i][j] is the similarity between movie i and movie j.
cosine_sim = cosine_similarity(count_matrix)

#### cosine_sim is a matrix of pairwise similarity values computed from the features. Movies whose values are closer to 1 are the most likely to be recommended to the user.

In [24]:
# Inspect the similarity matrix (NumPy abbreviates large arrays when rendering).
cosine_sim

array([[1.        , 0.09847982, 0.1114172 , ..., 0.        , 0.        ,
        0.        ],
       [0.09847982, 1.        , 0.07071068, ..., 0.03466876, 0.        ,
        0.        ],
       [0.1114172 , 0.07071068, 1.        , ..., 0.        , 0.10327956,
        0.        ],
       ...,
       [0.        , 0.03466876, 0.        , ..., 1.        , 0.        ,
        0.04089304],
       [0.        , 0.        , 0.10327956, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04089304, 0.        ,
        1.        ]])

In [25]:
# Find the row position of the requested movie. Row i of cosine_sim corresponds to
# row i of df, so the position is used rather than the value of the "index" column.
title_matches = (df.title == movie_name).to_numpy().nonzero()[0]
if len(title_matches) == 0:
    # No exact match: retry ignoring letter case and surrounding whitespace.
    normalized_titles = df.title.str.strip().str.lower()
    title_matches = (normalized_titles == movie_name.strip().lower()).to_numpy().nonzero()[0]
if len(title_matches) == 0:
    # Fail with an explicit message instead of an opaque IndexError from an empty array.
    raise ValueError("Movie {!r} was not found in the dataset".format(movie_name))
movie_index = title_matches[0]

# Pair every row position with its similarity to the chosen movie, then order the
# pairs from most to least similar (sorted() is stable, so ties keep row order).
similar_movies = list(enumerate(cosine_sim[movie_index]))
sorted_similar_movies = sorted(similar_movies, key=lambda pair: pair[1], reverse=True)

#### sorted_similar_movies lists all the movies in descending order of similarity. Using the indexes, we can now look up the movie titles.

In [33]:
# Show the ten best (row position, similarity) pairs. A plain loop avoids building a
# throwaway list of None values, and slicing copes with fewer than ten entries.
for scored_movie in sorted_similar_movies[:10]:
    print(scored_movie)

(728, 1.0000000000000007)
(420, 0.5773502691896258)
(864, 0.31426968052735443)
(51, 0.2810913475705226)
(4401, 0.2581988897471611)
(1428, 0.23094010767585035)
(1230, 0.22222222222222224)
(1932, 0.214422506967559)
(1286, 0.21081851067789195)
(1192, 0.20100756305184245)


[None, None, None, None, None, None, None, None, None, None]

In [42]:
def getTitle(index):
    """Return the title of the movie stored at row position ``index`` of ``df``.

    Parameters
    ----------
    index : int
        Zero-based row position, e.g. the first element of the
        (position, similarity) pairs produced by enumerate().

    Returns
    -------
    The value of the 'title' column for that row.

    Raises
    ------
    IndexError
        If ``index`` is beyond the last row of ``df``.
    """
    # Direct positional access instead of building a boolean mask over the
    # whole frame for every single lookup.
    return df['title'].iloc[index]

In [60]:
# Ask how many recommendations to print; int() raises ValueError for non-numeric input.
n = int(input("Enter the number of movies you want to be recommended: "))

Enter the number of movies you want to be recommended:  10


In [61]:
# Print the titles of the n most similar movies. Slicing leaves n unchanged, so the
# cell gives the same result when re-run, and a zero or negative n prints nothing
# rather than every title in the dataset.
# Note: the top entry is normally the queried movie itself (similarity 1.0).
for element in sorted_similar_movies[:max(n, 0)]:
    print(getTitle(element[0]))

Hellboy
Hellboy II: The Golden Army
Blade II
Pacific Rim
The Helix... Loaded
Superhero Movie
The Shadow
Sheena
Snowpiercer
Spawn
