# Movie Recommendation System

Dataset link: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# warnings
import warnings
warnings.filterwarnings('ignore')

# abstract syntax trees
import ast

# feature extraction
from sklearn.feature_extraction.text import CountVectorizer

# measure document similarity in text analysis
from sklearn.metrics.pairwise import cosine_similarity

# split data
from sklearn.model_selection import train_test_split

# Linear Regression
from sklearn.linear_model import LinearRegression, LogisticRegression

# Polynomial features
from sklearn.preprocessing import PolynomialFeatures

# calculate accuracy
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

# natuaral language tool kit
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
df_credits = pd.read_csv("credits.csv")
df_movies_data = pd.read_csv("movies_metadata.csv")
df_keywords = pd.read_csv("keywords.csv")

In [3]:
df_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [4]:
pd.set_option('display.max_columns', None)
df_movies_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
df_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [6]:
df_credits.shape


(45476, 3)

In [7]:
df_movies_data.shape

(45466, 24)

In [8]:
df_keywords.shape

(46419, 2)

In [9]:
df_credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [10]:
df_movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [11]:
df_keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [12]:
df_movies_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [13]:
df_movies_data["cast"] = df_credits["cast"]
df_movies_data["crew"] = df_credits["crew"]
df_movies_data["keywords"] = df_keywords["keywords"]
df_movies = df_movies_data[["id", "original_title", "genres", "overview", "cast", "crew", "keywords"]]
df_movies.head()

Unnamed: 0,id,original_title,genres,overview,cast,crew,keywords
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [14]:
df_movies.isnull().sum()

id                  0
original_title      0
genres              0
overview          954
cast                0
crew                0
keywords            0
dtype: int64

In [15]:
df_movies.dropna(inplace = True)
df_movies.isnull().sum()

id                0
original_title    0
genres            0
overview          0
cast              0
crew              0
keywords          0
dtype: int64

In [16]:
df_movies.duplicated().sum()

8

In [17]:
df_movies.drop_duplicates(inplace = True)
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44504 entries, 0 to 45465
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              44504 non-null  object
 1   original_title  44504 non-null  object
 2   genres          44504 non-null  object
 3   overview        44504 non-null  object
 4   cast            44504 non-null  object
 5   crew            44504 non-null  object
 6   keywords        44504 non-null  object
dtypes: object(7)
memory usage: 2.7+ MB


In [18]:
df_movies.head()

Unnamed: 0,id,original_title,genres,overview,cast,crew,keywords
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [19]:
df_movies.iloc[0]["genres"]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [20]:
df_movies.iloc[0]["cast"]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [21]:
df_movies.iloc[0]["crew"]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [22]:
df_movies.iloc[0]["keywords"]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [23]:
def extract(obj):
    arr = []
    for dict in ast.literal_eval(obj):
        arr.append(dict["name"].lower())
    return arr
df_movies["genres"] = df_movies["genres"].apply(extract)
df_movies["cast"] = df_movies["cast"].apply(extract)
df_movies["crew"] = df_movies["crew"].apply(extract)
df_movies["keywords"] = df_movies["keywords"].apply(extract)
df_movies.head()

Unnamed: 0,id,original_title,genres,overview,cast,crew,keywords
0,862,Toy Story,"[animation, comedy, family]","Led by Woody, Andy's toys live happily in his ...","[tom hanks, tim allen, don rickles, jim varney...","[john lasseter, joss whedon, andrew stanton, j...","[jealousy, toy, boy, friendship, friends, riva..."
1,8844,Jumanji,"[adventure, fantasy, family]",When siblings Judy and Peter discover an encha...,"[robin williams, jonathan hyde, kirsten dunst,...","[larry j. franco, jonathan hensleigh, james ho...","[board game, disappearance, based on children'..."
2,15602,Grumpier Old Men,"[romance, comedy]",A family wedding reignites the ancient feud be...,"[walter matthau, jack lemmon, ann-margret, sop...","[howard deutch, mark steven johnson, mark stev...","[fishing, best friend, duringcreditsstinger, o..."
3,31357,Waiting to Exhale,"[comedy, drama, romance]","Cheated on, mistreated and stepped on, the wom...","[whitney houston, angela bassett, loretta devi...","[forest whitaker, ronald bass, ronald bass, ez...","[based on novel, interracial relationship, sin..."
4,11862,Father of the Bride Part II,[comedy],Just when George Banks has recovered from his ...,"[steve martin, diane keaton, martin short, kim...","[alan silvestri, elliot davis, nancy meyers, n...","[baby, midlife crisis, confidence, aging, daug..."


In [24]:
df_movies["overview"] = df_movies["overview"].apply(lambda char:char.lower())

In [25]:
df_movies["overview"] = df_movies["overview"].apply(lambda x:x.split())

In [26]:
df_movies.head()

Unnamed: 0,id,original_title,genres,overview,cast,crew,keywords
0,862,Toy Story,"[animation, comedy, family]","[led, by, woody,, andy's, toys, live, happily,...","[tom hanks, tim allen, don rickles, jim varney...","[john lasseter, joss whedon, andrew stanton, j...","[jealousy, toy, boy, friendship, friends, riva..."
1,8844,Jumanji,"[adventure, fantasy, family]","[when, siblings, judy, and, peter, discover, a...","[robin williams, jonathan hyde, kirsten dunst,...","[larry j. franco, jonathan hensleigh, james ho...","[board game, disappearance, based on children'..."
2,15602,Grumpier Old Men,"[romance, comedy]","[a, family, wedding, reignites, the, ancient, ...","[walter matthau, jack lemmon, ann-margret, sop...","[howard deutch, mark steven johnson, mark stev...","[fishing, best friend, duringcreditsstinger, o..."
3,31357,Waiting to Exhale,"[comedy, drama, romance]","[cheated, on,, mistreated, and, stepped, on,, ...","[whitney houston, angela bassett, loretta devi...","[forest whitaker, ronald bass, ronald bass, ez...","[based on novel, interracial relationship, sin..."
4,11862,Father of the Bride Part II,[comedy],"[just, when, george, banks, has, recovered, fr...","[steve martin, diane keaton, martin short, kim...","[alan silvestri, elliot davis, nancy meyers, n...","[baby, midlife crisis, confidence, aging, daug..."


In [27]:
df_movies["full"] = df_movies["genres"] + df_movies["overview"] + df_movies["cast"] + df_movies["crew"] + df_movies["keywords"]

In [28]:
df_movies["full"] = df_movies["full"].apply(lambda str:" ".join(str))

In [29]:
df_movies.head()

Unnamed: 0,id,original_title,genres,overview,cast,crew,keywords,full
0,862,Toy Story,"[animation, comedy, family]","[led, by, woody,, andy's, toys, live, happily,...","[tom hanks, tim allen, don rickles, jim varney...","[john lasseter, joss whedon, andrew stanton, j...","[jealousy, toy, boy, friendship, friends, riva...","animation comedy family led by woody, andy's t..."
1,8844,Jumanji,"[adventure, fantasy, family]","[when, siblings, judy, and, peter, discover, a...","[robin williams, jonathan hyde, kirsten dunst,...","[larry j. franco, jonathan hensleigh, james ho...","[board game, disappearance, based on children'...",adventure fantasy family when siblings judy an...
2,15602,Grumpier Old Men,"[romance, comedy]","[a, family, wedding, reignites, the, ancient, ...","[walter matthau, jack lemmon, ann-margret, sop...","[howard deutch, mark steven johnson, mark stev...","[fishing, best friend, duringcreditsstinger, o...",romance comedy a family wedding reignites the ...
3,31357,Waiting to Exhale,"[comedy, drama, romance]","[cheated, on,, mistreated, and, stepped, on,, ...","[whitney houston, angela bassett, loretta devi...","[forest whitaker, ronald bass, ronald bass, ez...","[based on novel, interracial relationship, sin...","comedy drama romance cheated on, mistreated an..."
4,11862,Father of the Bride Part II,[comedy],"[just, when, george, banks, has, recovered, fr...","[steve martin, diane keaton, martin short, kim...","[alan silvestri, elliot davis, nancy meyers, n...","[baby, midlife crisis, confidence, aging, daug...",comedy just when george banks has recovered fr...


In [30]:
df_movies.iloc[5]["full"]

"action crime drama thriller obsessive master thief, neil mccauley leads a top-notch crew on various insane heists throughout los angeles while a mentally unstable detective, vincent hanna pursues him without rest. each man recognizes and respects the ability and the dedication of the other even though they are aware their cat-and-mouse game may end in violence. al pacino robert de niro val kilmer jon voight tom sizemore diane venora amy brenneman ashley judd mykelti williamson natalie portman ted levine tom noonan tone loc hank azaria wes studi dennis haysbert danny trejo henry rollins william fichtner kevin gage susan traylor jerry trimble ricky harris jeremy piven xander berkeley begonya plaza rick avery hazelle goodman ray buktenica max daniels vince deadrick jr. steven ford farrah forke patricia healy paul herman cindy katz brian libby dan martin mario roberts thomas rosales, jr. yvonne zima mick gould bud cort viviane vives kim staunton martin ferrero brad baldridge andrew camucc

In [31]:
df_movies_random = df_movies.sample(5000).reset_index()

In [32]:
df_movies_random.head()

Unnamed: 0,index,id,original_title,genres,overview,cast,crew,keywords,full
0,29741,288931,Pump!,[documentary],"[a, documentary, about, the, history, of, amer...","[archie panjabi, renu setna, steve jackson, sy...",[kenneth glenaan],[],documentary a documentary about the history of...
1,28312,12504,Le marginal,"[action, crime, drama, thriller]","[in, another, typical, jean-paul, belmondo, ve...",[],[],[suspense],action crime drama thriller in another typical...
2,34333,81551,King Uncle,[],"[ashok, bansal, has, a, traumatic, childhood, ...","[salman khan, bhagyashree, alok nath, rajeev v...","[sooraj r. barjatya, sooraj r. barjatya, tarac...",[],ashok bansal has a traumatic childhood which t...
3,35636,17902,Windkracht 10: Koksijde Rescue,"[drama, action, thriller]","[na, een, ‘incidentje’, tijdens, een, receptie...","[ella-june henrard, laura ballyn, peter bastia...","[hans herbots, nele meirhaeghe, christian verv...","[airport, alien, teacher, school]",drama action thriller na een ‘incidentje’ tijd...
4,23625,169607,Finding Vivian Maier,[documentary],"[vivian, maier's, photos, were, seemingly, des...","[marcin dorociński, piotr nerlewski, patrick w...","[władysław pasikowski, władysław pasikowski, k...","[spy, 1970s, chase, martial law, biography, fu...",documentary vivian maier's photos were seeming...


In [33]:
cv = CountVectorizer(max_features = 5200, stop_words = "english")
arr = cv.fit_transform(df_movies_random["full"]).toarray()
arr.shape

(5000, 5200)

In [34]:
ps = PorterStemmer()
def joiningAll(text):
    array = []
    for string in text.split():
        array.append(ps.stem(string))
    return " ".join(array)
df_new = df_movies_random
df_new["full"] = df_new["full"].apply(joiningAll)

In [35]:
df_movies_random.head()

Unnamed: 0,index,id,original_title,genres,overview,cast,crew,keywords,full
0,29741,288931,Pump!,[documentary],"[a, documentary, about, the, history, of, amer...","[archie panjabi, renu setna, steve jackson, sy...",[kenneth glenaan],[],documentari a documentari about the histori of...
1,28312,12504,Le marginal,"[action, crime, drama, thriller]","[in, another, typical, jean-paul, belmondo, ve...",[],[],[suspense],action crime drama thriller in anoth typic jea...
2,34333,81551,King Uncle,[],"[ashok, bansal, has, a, traumatic, childhood, ...","[salman khan, bhagyashree, alok nath, rajeev v...","[sooraj r. barjatya, sooraj r. barjatya, tarac...",[],ashok bansal ha a traumat childhood which teac...
3,35636,17902,Windkracht 10: Koksijde Rescue,"[drama, action, thriller]","[na, een, ‘incidentje’, tijdens, een, receptie...","[ella-june henrard, laura ballyn, peter bastia...","[hans herbots, nele meirhaeghe, christian verv...","[airport, alien, teacher, school]",drama action thriller na een ‘incidentje’ tijd...
4,23625,169607,Finding Vivian Maier,[documentary],"[vivian, maier's, photos, were, seemingly, des...","[marcin dorociński, piotr nerlewski, patrick w...","[władysław pasikowski, władysław pasikowski, k...","[spy, 1970s, chase, martial law, biography, fu...",documentari vivian maier' photo were seemingli...


In [36]:
df_new.head()

Unnamed: 0,index,id,original_title,genres,overview,cast,crew,keywords,full
0,29741,288931,Pump!,[documentary],"[a, documentary, about, the, history, of, amer...","[archie panjabi, renu setna, steve jackson, sy...",[kenneth glenaan],[],documentari a documentari about the histori of...
1,28312,12504,Le marginal,"[action, crime, drama, thriller]","[in, another, typical, jean-paul, belmondo, ve...",[],[],[suspense],action crime drama thriller in anoth typic jea...
2,34333,81551,King Uncle,[],"[ashok, bansal, has, a, traumatic, childhood, ...","[salman khan, bhagyashree, alok nath, rajeev v...","[sooraj r. barjatya, sooraj r. barjatya, tarac...",[],ashok bansal ha a traumat childhood which teac...
3,35636,17902,Windkracht 10: Koksijde Rescue,"[drama, action, thriller]","[na, een, ‘incidentje’, tijdens, een, receptie...","[ella-june henrard, laura ballyn, peter bastia...","[hans herbots, nele meirhaeghe, christian verv...","[airport, alien, teacher, school]",drama action thriller na een ‘incidentje’ tijd...
4,23625,169607,Finding Vivian Maier,[documentary],"[vivian, maier's, photos, were, seemingly, des...","[marcin dorociński, piotr nerlewski, patrick w...","[władysław pasikowski, władysław pasikowski, k...","[spy, 1970s, chase, martial law, biography, fu...",documentari vivian maier' photo were seemingli...


In [37]:
similar_df = cosine_similarity(arr)
similar_df

array([[1.        , 0.        , 0.05356716, ..., 0.05825311, 0.        ,
        0.        ],
       [0.        , 1.        , 0.02782074, ..., 0.01512723, 0.        ,
        0.        ],
       [0.05356716, 0.02782074, 1.        , ..., 0.03198465, 0.        ,
        0.        ],
       ...,
       [0.05825311, 0.01512723, 0.03198465, ..., 1.        , 0.023879  ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.023879  , 1.        ,
        0.02499024],
       [0.        , 0.        , 0.        , ..., 0.        , 0.02499024,
        1.        ]])

In [38]:
similar_df[5]

array([0.        , 0.        , 0.        , ..., 0.16786398, 0.01440527,
       0.01097973])

In [39]:
similar_df.shape

(5000, 5000)

In [40]:
def recommended(movie):
    movie_index = df_new[df_new["original_title"] == movie].index[0]
    diff = similar_df[movie_index]
    movie_lists = sorted(list(enumerate(diff)), reverse = True, key = lambda x:x[1])[1:11]
    for index in movie_lists:
        print(df_new.iloc[index[0]].original_title)

In [42]:
recommended("Pump!")

The Crisis of Civilization
Ayn Rand & the Prophecy of Atlas Shrugged
Budd Boetticher: An American Original
Lon Chaney: A Thousand Faces
Hockney
Jonestown: Paradise Lost
Forget Baghdad: Jews and Arabs - The Iraqi Connection
B.B. King: The Life of Riley
Le Tombeau d'Alexandre
An Apology to Elephants


In [43]:
recommended("King Uncle")

Я тебя помню
Apenas o Fim
Sameblod
The Shift
Voyage à travers l'impossible
Gangster Squad
Kun taivas putoaa...
خانه‌ی دوست کجاست؟
Çoğunluk
Cześć, Tereska
