# Importing all the necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Reading the file and adding a column named 'Movie_ID'

In [2]:
# Load the IMDB dataset from the CSV file in the working directory.
df=pd.read_csv('IMDB-Movie-Data.csv')
# Movie_ID is each row's position in the file. Derive it from the actual row
# count so the assignment does not fail when the CSV is not exactly 1000 rows.
df['Movie_ID']=range(len(df))
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_ID
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4


# Highlighting only necessary columns

In [3]:
# Columns whose text is combined to describe each movie.
imp_col = [
    'Title',
    'Genre',
    'Actors',
    'Director',
]
df.loc[:, imp_col].head()

Unnamed: 0,Title,Genre,Actors,Director
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi","Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn
1,Prometheus,"Adventure,Mystery,Sci-Fi","Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott
2,Split,"Horror,Thriller","James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan
3,Sing,"Animation,Comedy,Family","Matthew McConaughey,Reese Witherspoon, Seth Ma...",Christophe Lourdelet
4,Suicide Squad,"Action,Adventure,Fantasy","Will Smith, Jared Leto, Margot Robbie, Viola D...",David Ayer


# Checking for null values

In [4]:
# Count missing values per selected column.
df.loc[:, imp_col].isna().sum()

Title       0
Genre       0
Actors      0
Director    0
dtype: int64

# Function to merge all the important features

In [5]:
def get_imp_feat(data):
    """Combine the descriptive text columns of every row into one string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with string columns 'Title', 'Genre', 'Actors' and 'Director'.

    Returns
    -------
    list of str
        One entry per row, in the frame's current row order, with the four
        fields joined by two spaces.

    Raises
    ------
    TypeError
        If any of the four fields in a row is not a string (e.g. NaN).
    """
    columns = (data['Title'], data['Genre'], data['Actors'], data['Director'])
    # Iterate the columns positionally rather than by index label, so the
    # result lines up with the rows even when the index is not 0..n-1
    # (filtered or reordered frames).
    return ['  '.join(fields) for fields in zip(*columns)]

# Create a column to hold combined strings

In [6]:
# Build the combined text for every row, then attach it as a new column.
combined_text = get_imp_feat(df)
df['Important_Features'] = combined_text
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_ID,Important_Features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0,"Guardians of the Galaxy Action,Adventure,Sci-..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1,"Prometheus Adventure,Mystery,Sci-Fi Noomi Ra..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2,"Split Horror,Thriller James McAvoy, Anya Tay..."
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3,"Sing Animation,Comedy,Family Matthew McConau..."
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4,"Suicide Squad Action,Adventure,Fantasy Will ..."


# Convert text to a matrix of token counts

In [7]:
# Fit the vectorizer on the combined text and produce the token-count matrix.
vectorizer = CountVectorizer()
cm = vectorizer.fit_transform(df['Important_Features'])

# Get the cosine similarity matrix from the Count Matrix

In [8]:
# Pairwise cosine similarity between the rows of the count matrix;
# cs[i][j] is the similarity between row i and row j.
cs=cosine_similarity(cm)
print(cs)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


# Function to print all the movies' titles

In [9]:
def titles(data):
    """Print a numbered, alphabetically sorted list of the movie titles.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Title' column. It is not modified.

    Returns
    -------
    None
        The list is written to standard output.
    """
    # Sort a copy: sorting in place would reorder the caller's frame as a
    # hidden side effect of merely printing the titles.
    ordered = data.sort_values("Title")
    print("Choose from following:\n")
    for position, title in enumerate(ordered['Title'], start=1):
        print(f'{position}.', title)
# Show the user the numbered list of titles available in the dataset.
titles(df)

Choose from following:

1. (500) Days of Summer
2. 10 Cloverfield Lane
3. 10 Years
4. 12 Years a Slave
5. 127 Hours
6. 13 Hours
7. 1408
8. 17 Again
9. 2012
10. 20th Century Women
11. 21
12. 21 Jump Street
13. 22 Jump Street
14. 2307: Winter's Dream
15. 28 Weeks Later
16. 3 Days to Kill
17. 3 Idiots
18. 300
19. 300: Rise of an Empire
20. 31
21. 42
22. 5- 25- 77
23. 50/50
24. A Bigger Splash
25. A Cure for Wellness
26. A Dark Song
27. A Good Year
28. A Hologram for the King
29. A Kind of Murder
30. A Million Ways to Die in the West
31. A Monster Calls
32. A Quiet Passion
33. A Street Cat Named Bob
34. A United Kingdom
35. A Walk Among the Tombstones
36. About Time
37. Absolutely Anything
38. Across the Universe
39. Adoration
40. After Earth
41. Ah-ga-ssi
42. Alexander and the Terrible, Horrible, No Good, Very Bad Day
43. Alice Through the Looking Glass
44. Alice in Wonderland
45. Aliens vs Predator - Requiem
46. All Good Things
47. All We Had
48. Allegiant
49. Allied
50. Amateur Night
51

# Get the title of the movie that the user likes

In [10]:
# Keep asking until the entered text matches a title in the dataset.
while True:
    query = input("Insert movie's title:\n").strip()
    # Compare case-insensitively. str.capitalize() lowercases everything after
    # the first character, so multi-word titles such as "Suicide Squad" could
    # never match the dataset's spelling.
    matches = df[df['Title'].str.lower() == query.lower()]
    if not matches.empty:
        # Keep the dataset's own spelling of the title for later display,
        # and the id of the first matching row.
        title = matches['Title'].values[0]
        movie_id = matches['Movie_ID'].values[0]
        break
    # Checking for "no match" explicitly instead of using a bare `except:`
    # means KeyboardInterrupt and real errors are no longer swallowed.
    print("\nWrong name, no such name in a dataset.\nPlease Try again.\n")
    titles(df.sort_values("Title"))

Insert movie's title:
15

Wrong name, no such name in a dataset.
Please Try again.

Choose from following:

1. (500) Days of Summer
2. 10 Cloverfield Lane
3. 10 Years
4. 12 Years a Slave
5. 127 Hours
6. 13 Hours
7. 1408
8. 17 Again
9. 2012
10. 20th Century Women
11. 21
12. 21 Jump Street
13. 22 Jump Street
14. 2307: Winter's Dream
15. 28 Weeks Later
16. 3 Days to Kill
17. 3 Idiots
18. 300
19. 300: Rise of an Empire
20. 31
21. 42
22. 5- 25- 77
23. 50/50
24. A Bigger Splash
25. A Cure for Wellness
26. A Dark Song
27. A Good Year
28. A Hologram for the King
29. A Kind of Murder
30. A Million Ways to Die in the West
31. A Monster Calls
32. A Quiet Passion
33. A Street Cat Named Bob
34. A United Kingdom
35. A Walk Among the Tombstones
36. About Time
37. Absolutely Anything
38. Across the Universe
39. Adoration
40. After Earth
41. Ah-ga-ssi
42. Alexander and the Terrible, Horrible, No Good, Very Bad Day
43. Alice Through the Looking Glass
44. Alice in Wonderland
45. Aliens vs Predator - Requ

Insert movie's title:
zootopia


# Create a list of (index, similarity score) pairs for the chosen movie

(movie_id, similarity_score for this movie_id)

In [11]:
# Pair each position in the chosen movie's similarity row with its score.
# Indexing cs by movie_id relies on Movie_ID matching the row position in cs.
scores = list(enumerate(cs[movie_id]))
scores[0:5]  # preview the first five (index, score) pairs

[(0, 0.06299407883487121),
 (1, 0.0668153104781061),
 (2, 0.0),
 (3, 0.14285714285714288),
 (4, 0.06900655593423542)]

# Sort the list of the scores

In [12]:
# Rank the other movies from most to least similar.
# Exclude the chosen movie by its id rather than by dropping the first sorted
# element: with a tie at the top score (e.g. a duplicate entry), the first
# element is not guaranteed to be the chosen movie itself.
sorted_scores = sorted(
    (pair for pair in scores if pair[0] != movie_id),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_scores[0:5]

[(403, 0.2760262237369417),
 (567, 0.2760262237369417),
 (607, 0.2672612419124244),
 (814, 0.2672612419124244),
 (840, 0.2672612419124244)]

# Create a loop to print the first n similar movies

In [13]:
# Ask how many recommendations to show; the upper bound is however many
# candidates were actually ranked, not a hard-coded dataset size.
max_n = len(sorted_scores)
n = None
while n is None:
    try:
        requested = int(input("Insert how many similar movies you want to see:\n"))
    except ValueError:
        # Non-numeric input: ask again instead of crashing the cell.
        print("Please enter a whole number.")
        continue
    if 0 <= requested <= max_n:
        n = requested
    else:
        print("Please enter a number between 0 and {}.".format(max_n))

print("\n\n\nThe {} most similar recommended movies to '{}' are:".format(n,title))
# Build the id -> title lookup once instead of filtering the frame per movie.
title_by_id = df.set_index('Movie_ID')['Title']
# Slice to exactly n items. The previous `index == n-1` break never fired for
# n == 0, so the whole list was printed.
for rank, (similar_id, _score) in enumerate(sorted_scores[:n], start=1):
    print(rank, title_by_id.loc[similar_id])

Insert how many similar movies you want to see:
5



The 5 most similar recommended movies to 'Zootopia' are:
1 Despicable Me
2 Juno
3 Horrible Bosses
4 Fantastic Mr. Fox
5 Horrible Bosses 2
