In [3]:
#IMPORTING THE REQUIRED LIBRARIES

import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

MAJOR ALGORITHMS OR FUNCTIONS USED:

1. Cosine Similarity Function:
   -Cosine similarity is like a measuring tool that helps us figure out how much two things (like movies) are alike. 
    For a movie recommender, it checks how similar two movies are by looking at their features (like genres or ratings).
    Higher similarity means they are more alike, and this helps the recommender suggest similar movies to users.

2. Count Vectorizer:
   -Count Vectorizer is a way to turn words in movie descriptions or reviews into numbers that a computer can understand.
    It counts how often words appear, turning each movie's words into a set of numbers. 
    This helps the recommender system consider the actual words people use when suggesting movies.

3. Port Stemmer Function: FROM NLTK
   -The Porter Stemmer is like a word simplifier. It takes words and chops off the ends to make them simpler. 
    This is useful in a movie recommender because it ensures that similar words (like "running" and "runner") are treated
    as the same. It helps the system understand the meaning of words better and suggests movies more accurately.

In [4]:
#READING THE REQUIRED CSV DATA FILES 

movies=pd.read_csv('tmdb_5000_movies.csv')
credits= pd.read_csv('tmdb_5000_credits.csv')

In [5]:
#OUTPUT THE FIRST FIVE ROWS FROM THE MOVIES DATA FILE
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [6]:
#OUTPUT THE FIRST FIVE ROWS FROM THE CREDITS DATA FILE
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [7]:
3#MERGING THE TWO DATASETS

movies=movies.merge(credits,on='title')

In [8]:
#OUTPUT THE MERGED DATASET'S FIRST FIVE ROWS

movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [9]:
#KEEPING ONLY THE REQUIRED COLOUMNS IN THE MOVIES DATASET

movies=movies[['movie_id','title','keywords','overview','genres','cast','crew']]

In [10]:
#READING THE FIRST FIVE LINES

movies.head()

Unnamed: 0,movie_id,title,keywords,overview,genres,cast,crew
0,19995,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [11]:
#REMOVING THE FIELDS WITH NULL VALUES IN THE DATA

movies.dropna(inplace=True)

In [12]:
#OUTPUTS THE NUMBER OF NULL VALUES IN THE COLOUMNS

movies.isnull().sum()

movie_id    0
title       0
keywords    0
overview    0
genres      0
cast        0
crew        0
dtype: int64

In [13]:
#CREATES A FUNCTION THAT EXTRACTS ONLY THE 'NAME' ATTRIBUTE FROM THE PARTICULAR COLOUMN AND DISCARDS THE REST OF THE ATTRIBUTES
#AST.LITERAL_EVAL IS USED TO CONVERT THE STRING INTO LIST

def convert(obj):
    l=[]
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l

In [14]:
#APPLIES FUNCTION 'CONVERT' THAT EXTRACTS ONLY THE 'NAME' ATTRIBUTE FROM THE GENRES AND DISCARDS THE REST OF THE ATTRIBUTES

movies['genres'].apply(convert)

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [15]:
movies['genres']= movies['genres'].apply(convert)

In [16]:
#SEE THE CHANGES MADE

movies.head()

Unnamed: 0,movie_id,title,keywords,overview,genres,cast,crew
0,19995,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [17]:
#APPLIES FUNCTION 'CONVERT' THAT EXTRACTS ONLY THE 'NAME' ATTRIBUTE FROM THE KEYWORDS AND DISCARDS THE REST OF THE ATTRIBUTES

movies['keywords']= movies['keywords'].apply(convert)

In [18]:
movies.head()

Unnamed: 0,movie_id,title,keywords,overview,genres,cast,crew
0,19995,Avatar,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [19]:
#DEFINE A FUNCTION CASTER WHICH TAKES THE FIRST THREE NAME ATTRIBUTES FROM A COLOUMN 


def caster(cas):
    l=[]
    count = 0
    for i in ast.literal_eval(cas):
        if count!=3:
            l.append(i['name'])
            count =count+1
        else:
            break
    return l

In [20]:
#APPLY CASTER TO CAST COLOUMN

movies['cast']=movies['cast'].apply(caster)

In [21]:
movies.head()

Unnamed: 0,movie_id,title,keywords,overview,genres,cast,crew
0,19995,Avatar,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [22]:
#DEFINE A FUNCTION DIRECTOR WHICH EXTRACTS THE NAME OF DIRECTOR ONLY FROM THE COLOUMN 'CREW'

def director(obj):
    l=[]
    for i in ast.literal_eval(obj):
        if(i['job']=='Director'):
            l.append(i['name'])
            break
    return l
                        

In [23]:
#APPLY DIRECTOR FUNCTION TO CREW

movies['crew']=movies['crew'].apply(director)

In [24]:
movies.head()

Unnamed: 0,movie_id,title,keywords,overview,genres,cast,crew
0,19995,Avatar,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [25]:
#SPLITS THE OVERVIEW COLOUMN INTO A LIST OF WORDS WHICH IS NECESSARY FOR CREATING TAG FIELD

movies['overview']=movies['overview'].apply(lambda x:x.split())

In [26]:
movies.head()

Unnamed: 0,movie_id,title,keywords,overview,genres,cast,crew
0,19995,Avatar,"[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[dc comics, crime fighter, terrorist, secret i...","[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[based on novel, mars, medallion, space travel...","[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [27]:
#REMOVES THE SPACE BETWEEN TWO INITIALS IN A SAME NAME IN ANY FIELDS...EG.   SAM ALTMAN IS WRITTEN AS   SAMALTMAN. THIS IS NECESSARY TO NOT LOOSE THEM TO TAGS

movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","")for i in x])

In [28]:
#REMOVES THE SPACE BETWEEN TWO INITIALS IN A SAME NAME IN ANY FIELDS...EG.   SAM ALTMAN IS WRITTEN AS   SAMALTMAN. THIS IS NECESSARY TO NOT LOOSE THEM TO TAGS

movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","")for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","")for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","")for i in x])

In [29]:
movies.head()

Unnamed: 0,movie_id,title,keywords,overview,genres,cast,crew
0,19995,Avatar,"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [30]:
#CREATES THE TAG FIELD BY JOINING THE OVERVIEW ,KEYWORDS,CAST,CREW AND GENRES FIELD

movies['tags']=movies['overview']+movies['keywords']+movies['cast']+movies['crew']+movies['genres']

In [31]:
movies.head()

Unnamed: 0,movie_id,title,keywords,overview,genres,cast,crew,tags
0,19995,Avatar,"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [32]:
#SEE THE FIRST ROW OF THE TAGS FIELD

movies['tags'][0]

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'JamesCameron',
 'Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction']

In [33]:
#CREATE A NEW LIST FROM THE MOVIE ID, TITLE AND TAGS FIELD

new_list=movies[['movie_id','title','tags']]

In [34]:
new_list.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [35]:
#CONVERT THE TAGS FIELD BACK TO STRING

new_list['tags']=new_list['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_list['tags']=new_list['tags'].apply(lambda x:" ".join(x))


In [36]:
new_list.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [37]:
#CONVERT THE TAGS FIELD INTO LOWER CASE

new_list['tags']=new_list['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_list['tags']=new_list['tags'].apply(lambda x:x.lower())


In [38]:
new_list.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [39]:
#USE THE COUNUTVECTORIZER FUNCTION FROM SKLEARN TO EXTRACT 5000 FEAUTRES AND REMOVE STOP WORDS 

cv=CountVectorizer(max_features=5000,stop_words='english')

In [40]:
#CONVERTS SPARSE MATRIX INTO NUMPY ARRAY

vectors=cv.fit_transform(new_list['tags']).toarray()

In [41]:
#EXTRTACTS THE FEATURE NAMES
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [42]:
#USES STEM FUNCTION TO REMOVE THE UNWANTED DUPLICITY

ps=PorterStemmer()

In [43]:
#DEFINE STEM FUNCTION WHICH SPLITS AND APPLIES IT AND AND THEN REJOINS IT BACK TO A STRING

def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [44]:
#APPLY STEM FUNCTION TO TAGS

new_list['tags']=new_list['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_list['tags']=new_list['tags'].apply(stem)


In [45]:
new_list.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [46]:
#EXTRACT FEATURE NAMES AFTER STEMMING

features=cv.get_feature_names_out()

In [47]:
print(features[:])

['000' '007' '10' ... 'zone' 'zoo' 'zooeydeschanel']


In [48]:
#EXTRACTS FEATURE AFTER STEMMING
cv=CountVectorizer(max_features=5000,stop_words='english')

In [49]:
#DOES THE DEFINING OF STEM FUNCTION #DUPLICATE LINES

vectors=cv.fit_transform(new_list['tags']).toarray()
cv.get_feature_names_out()
ps=PorterStemmer()
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [50]:
new_list['tags']=new_list['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_list['tags']=new_list['tags'].apply(stem)


In [51]:
new_list.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [52]:
#features=cv.get_feature_names_out()
#for i in range(len(features)):
    #print(features[i])

In [53]:
#USE COSINESIMILARITY FUNCTION TO EXTRACT THE SIMILARITY ANGLE DISTANCES BETWEEN THE VECTORS WITH EACH OF THE OTHER VECTORS
similarity=cosine_similarity(vectors)

In [54]:
#VEIWING THE SIMILARITY MATRIX
similarity[:]

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

In [57]:
#DEFINING THE RECOMMEND FUNCTION WHICH MATCHES THE PARAMETER WITH THE NAME OF MOVIES IN THE NEW_LIST AND THE CALCULATES THE SIMILARITY
#0F THAT MOVIE WITH OTHER ONES i.e. EXTRACTS ITS SIMILARITY VECTOR AND THEN DOES THE SORTING BASED ON THE SECOND ATTRIBUTE OF THE 
#INDEX COLOUMN IN THE REVERSE ORDER AND EXTRACTS THE FIRST FIVE MOVIE NAMES WHICH MATCH THE MOST OR HAVE THE HIGHEST SIMILARITY

def recommend(obj):
    movie_index=new_list[new_list['title']==obj].index[0]
    distances=similarity[movie_index]
    print('cosine similarity values are:')
    print(distances)
    last=sorted(list(enumerate(distances)),reverse=True,key= lambda x:x[1])[1:6]
    for i in last:
        print(new_list.iloc[i[0]].title)
        
        

In [61]:
#INPUT THE USERS QUERY AND PASS IT TO THE RECOMMEND FUNCTION WHICH THEN RECOMMENDS THE MOVIES

quest= input('tell me your favourite movie   ')
recommend(quest)

tell me your favourite movie   Superman
cosine similarity values are:
[0.09269795 0.09799919 0.05050763 ... 0.01981072 0.10204082 0.08714204]
Superman Returns
Superman II
Iron Man 2
Superman III
Superman IV: The Quest for Peace
