# Movie Recommender Using Cosine Similarity

In [1]:
import pandas as pd
import numpy as np
import json 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as  plt
import heapq
import re

In [2]:
#Read Data. At First all strings
df=pd.read_csv("data/movies_metadata.csv",dtype=str)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173


In [3]:
#Read Data Credits (Director)
credits=pd.read_csv("data/credits.csv",dtype=str)
credits[f'crew']=credits['crew'].str.replace("'",'"').str.replace("[","").str.replace("]","")
credits[f'cast']=credits['cast'].str.replace("'",'"').str.replace("[","").str.replace("]","")
credits.head()

Unnamed: 0,cast,crew,id
0,"{""cast_id"": 14, ""character"": ""Woody (voice)"", ...","{""credit_id"": ""52fe4284c3a36847f8024f49"", ""dep...",862
1,"{""cast_id"": 1, ""character"": ""Alan Parrish"", ""c...","{""credit_id"": ""52fe44bfc3a36847f80a7cd1"", ""dep...",8844
2,"{""cast_id"": 2, ""character"": ""Max Goldman"", ""cr...","{""credit_id"": ""52fe466a9251416c75077a89"", ""dep...",15602
3,"{""cast_id"": 1, ""character"": ""Savannah ""Vannah""...","{""credit_id"": ""52fe44779251416c91011acb"", ""dep...",31357
4,"{""cast_id"": 1, ""character"": ""George Banks"", ""c...","{""credit_id"": ""52fe44959251416c75039ed7"", ""dep...",11862


In [4]:
#Drop na in credits
credits=credits.replace("",np.NaN)
credits=credits.dropna()

In [5]:
df.shape,credits.shape

((45466, 24), (42708, 3))

In [6]:
#Extract Director from Credits database
def extractDirector(text,join=False):
    data=[txt.replace("{","").replace("}","") for txt in text.split("},")]
    new_data=None
    for i,d in enumerate(data):
        _dict={}
        for ele in d.split(","):
            if len(ele.split(":"))!=2:
                continue
            key,value=ele.split(":")
            key=key.replace('"',"").strip()
            value=value.replace('"',"").strip()
            _dict[key]=value
        if _dict.get('department')=='Directing':
            new_data=(_dict)
            break
    try:  
        director=new_data['name']
        if join:
            return "".join(director.split())
        return director
    except TypeError:
        return np.NaN


In [7]:
credits['_director']=credits['crew'].apply(lambda x: extractDirector(x,join=True))
credits['Director']=credits['crew'].apply(lambda x: extractDirector(x))

In [8]:
#credits[credits['Director']=='Martin Scorsese']

In [9]:
#Top Directors
#credits['Director'].value_counts()[:10]

In [10]:
def extractMainActor(text,join=False):
    data=[txt.replace("{","").replace("}","") for txt in text.split("},")]
    new_data=None
    d=data[0]
    _dict={}
    for ele in d.split(","):
        if len(ele.split(":"))!=2:
            continue
        key,value=ele.split(":")
        key=key.replace('"',"").strip()
        value=value.replace('"',"").strip()
        _dict[key]=value
    try:
        mainActor=_dict['name']
        if join:
            return "".join(mainActor.split())
        return mainActor
    except:
        #print("error:",text)
        return np.NaN
    

In [11]:
credits['mainActor']=credits['cast'].apply(lambda x: extractMainActor(x))
credits['main_actor']=credits['cast'].apply(lambda x: extractMainActor(x,join=True)) 

In [12]:
#Merge Both DataFrames
df=df.merge(credits[['id','mainActor','Director','main_actor','_director']])
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,mainActor,Director,main_actor,_director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415,Tom Hanks,John Lasseter,TomHanks,JohnLasseter
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,Robin Williams,Joe Johnston,RobinWilliams,JoeJohnston


In [13]:
for col in df.columns:
    print(col)

adult
belongs_to_collection
budget
genres
homepage
id
imdb_id
original_language
original_title
overview
popularity
poster_path
production_companies
production_countries
release_date
revenue
runtime
spoken_languages
status
tagline
title
video
vote_average
vote_count
mainActor
Director
main_actor
_director


In [17]:
#generate new columns from data in form of json

def readJson(string):
    try:
        x=json.loads(string)
    except:
        x=""
    return x

def newColFromJson(col,join=False):
    df[f'{col}2']=df[col].str.replace("'",'"')
    df[f'{col}2']=df[f'{col}2'].map(readJson)
    df[f'{col}2']=df[f'{col}2'].apply(lambda x: ",".join([ele['name'] for ele in x]))
    if join:
        df[f"_{col}"]=df[f'{col}2'].apply(lambda x: "".join(x.split()))
    

In [18]:
newColFromJson('genres',join=False)
newColFromJson("production_companies",join=True)

In [22]:
df['genres2']=df['genres2'].str.replace(","," ")

In [23]:
#Put number type
df['vote_average']=df['vote_average'].astype(float)
df['vote_count']=df['vote_count'].astype(float)
df['popularity']=df['popularity'].astype(float)

In [24]:
#Important Columns to Engine
columns=["title","genres2","production_companies2",'mainActor','Director']

In [25]:
#Show important cols
df[columns].head(2)

Unnamed: 0,title,genres2,production_companies2,mainActor,Director
0,Toy Story,Animation Comedy Family,Pixar Animation Studios,Tom Hanks,John Lasseter
1,Jumanji,Adventure Fantasy Family,"TriStar Pictures,Teitler Film,Interscope Commu...",Robin Williams,Joe Johnston


In [26]:
#look for Nas
df[columns].isnull().values.any()

True

In [27]:
#Clean the Nas
data=df[columns].replace("",np.NaN).dropna()
df=df.loc[data.index]
print(df.shape)
df.head()

(32216, 31)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,mainActor,Director,main_actor,_director,genres2,production_companies2,_production_companies
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,False,7.7,5415.0,Tom Hanks,John Lasseter,TomHanks,JohnLasseter,Animation Comedy Family,Pixar Animation Studios,PixarAnimationStudios
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,False,6.9,2413.0,Robin Williams,Joe Johnston,RobinWilliams,JoeJohnston,Adventure Fantasy Family,"TriStar Pictures,Teitler Film,Interscope Commu...","TriStarPictures,TeitlerFilm,InterscopeCommunic..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,False,6.5,92.0,Walter Matthau,Howard Deutch,WalterMatthau,HowardDeutch,Romance Comedy,"Warner Bros.,Lancaster Gate","WarnerBros.,LancasterGate"
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,False,6.1,34.0,Whitney Houston,Forest Whitaker,WhitneyHouston,ForestWhitaker,Comedy Drama Romance,Twentieth Century Fox Film Corporation,TwentiethCenturyFoxFilmCorporation
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,False,5.7,173.0,Steve Martin,Charles Shyer,SteveMartin,CharlesShyer,Comedy,"Sandollar Productions,Touchstone Pictures","SandollarProductions,TouchstonePictures"


In [28]:
#Clean Duplicates
df=df.drop_duplicates()

In [29]:
#filter Bad movies
#df=df[df['vote_average']>5]

In [30]:
#Create a Column with important features

df['importantFeatures']=df['title']+" "+ \
    df["genres2"]+" "+df["_production_companies"] +" " \
    + " " +df['main_actor']+ " " + df['_director']

In [31]:
#Filter popularity movies
#df[df['popularity']>4].sort_values(by="popularity",ascending=True)[['title','popularity']]
#df=df[df['popularity']>4]
df=df[df['popularity']>7]

In [32]:
#Reset the Index cuz drop values
df=df.reset_index(drop=True)

In [33]:
#Text to Matrix using Count Vector
vectorizer = CountVectorizer(stop_words='english')
cm = vectorizer.fit_transform(df['importantFeatures'])
features=vectorizer.get_feature_names()
print(len(features))
#features

16505


In [34]:
#Cosine Similarity

#cs[i,j]=x, 0=<x<=1. x is the similarity between movie i and movie j. if i=j, x=1.

cs=cosine_similarity(cm)
cs

array([[1.        , 0.11785113, 0.11785113, ..., 0.18898224, 0.15811388,
        0.1118034 ],
       [0.11785113, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.11785113, 0.        , 1.        , ..., 0.08908708, 0.1490712 ,
        0.10540926],
       ...,
       [0.18898224, 0.        , 0.08908708, ..., 1.        , 0.        ,
        0.        ],
       [0.15811388, 0.        , 0.1490712 , ..., 0.        , 1.        ,
        0.14142136],
       [0.1118034 , 0.        , 0.10540926, ..., 0.        , 0.14142136,
        1.        ]])

In [35]:
#Shape od cosine similarity
cs.shape,df.shape

((5898, 5898), (5898, 32))

In [36]:
#Round cs matrix
cs=cs.round(2)

In [37]:
#Directors count in df
df['Director'].value_counts()[:10]

Steven Spielberg    26
Woody Allen         24
Clint Eastwood      22
Ridley Scott        19
Ron Howard          19
Alfred Hitchcock    18
Tim Burton          18
Joel Schumacher     18
Joel Coen           17
Martin Scorsese     17
Name: Director, dtype: int64

In [38]:
df[df['main_actor']=="JimCarrey"][columns]

Unnamed: 0,title,genres2,production_companies2,mainActor,Director
11,Ace Ventura: When Nature Calls,Crime Comedy Adventure,"O Entertainment,Warner Bros.,Morgan Creek Prod...",Jim Carrey,Steve Oedekerk
91,Dumb and Dumber,Comedy,"New Line Cinema,Motion Picture Corporation of ...",Jim Carrey,Peter Farrelly
135,Ace Ventura: Pet Detective,Comedy Mystery,"Warner Bros.,Morgan Creek Productions",Jim Carrey,Tom Shadyac
148,The Mask,Romance Comedy Crime Fantasy,"New Line Cinema,Dark Horse Entertainment",Jim Carrey,Chuck Russell
274,The Cable Guy,Comedy Drama Thriller,Columbia Pictures Corporation,Jim Carrey,Ben Stiller
578,Liar Liar,Comedy,"Imagine Entertainment,Universal Pictures",Jim Carrey,Tom Shadyac
635,The Truman Show,Comedy Drama,"Paramount Pictures,Scott Rudin Productions",Jim Carrey,Peter Weir
1200,Man on the Moon,Comedy Drama Romance,Mutual Film Company,Jim Carrey,Miloš Forman
1387,"Me, Myself & Irene",Comedy,"Twentieth Century Fox Film Corporation,Conundr...",Jim Carrey,Bobby Farrelly
1456,How the Grinch Stole Christmas,Family Comedy Fantasy,"Imagine Entertainment,Universal Pictures",Jim Carrey,Ron Howard


In [45]:
df[df['_director']=='ChristopherNolan'][columns]

Unnamed: 0,title,genres2,production_companies2,mainActor,Director
1530,Memento,Mystery Thriller,Summit Entertainment Newmarket Capital Group T...,Guy Pearce,Christopher Nolan
1831,Insomnia,Crime Mystery Thriller,Section Eight Alcon Entertainment Witt/Thomas ...,Al Pacino,Christopher Nolan
2721,Batman Begins,Action Crime Drama,DC Comics Legendary Pictures Warner Bros. DC E...,Christian Bale,Christopher Nolan
2987,The Prestige,Drama Mystery Thriller,Warner Bros. Touchstone Pictures Syncopy,Hugh Jackman,Christopher Nolan
3262,The Dark Knight,Drama Action Crime Thriller,DC Comics Legendary Pictures Warner Bros. DC E...,Christian Bale,Christopher Nolan
3767,Inception,Action Thriller Science Fiction Mystery Adventure,Legendary Pictures Warner Bros. Syncopy,Leonardo DiCaprio,Christopher Nolan
4081,The Dark Knight Rises,Action Crime Drama Thriller,Legendary Pictures Warner Bros. DC Entertainme...,Christian Bale,Christopher Nolan
4684,Interstellar,Adventure Drama Science Fiction,Paramount Pictures Legendary Pictures Warner B...,Matthew McConaughey,Christopher Nolan
5857,Dunkirk,Action Drama History Thriller War,Canal+ Studio Canal Warner Bros. Syncopy RatPa...,Fionn Whitehead,Christopher Nolan


In [41]:
#Extend the columnd needed
columns.extend(["overview","release_date","id","imdb_id"])

In [42]:
#Get Similarity by name

def getRecommendations(title,n=6):

    if title in df['title'].unique():
        _id=df[df['title']==title].index
        a=cs[_id][0]
        recommendations=heapq.nlargest(n, range(len(a)), a.take)
        x=df.loc[recommendations][columns]
        x['Percentage']=a[recommendations]
        return x[1:]
    else:
        return "Movie not Found"
        
        
res=getRecommendations('Batman',n=10)
res

Unnamed: 0,title,genres2,production_companies2,mainActor,Director,overview,id,imdb_id,overview.1,release_date,id.1,imdb_id.1,Percentage
535,Batman Returns,Action Fantasy,"PolyGram Filmed Entertainment,Warner Bros.",Michael Keaton,Tim Burton,"Having defeated the Joker, Batman now faces th...",364,tt0103776,"Having defeated the Joker, Batman now faces th...",1992-06-19,364,tt0103776,0.71
55,Batman Forever,Action Crime Fantasy,"Warner Bros.,Polygram Filmed Entertainment",Val Kilmer,Joel Schumacher,The Dark Knight of Gotham City confronts a das...,414,tt0112462,The Dark Knight of Gotham City confronts a das...,1995-06-16,414,tt0112462,0.56
589,Batman & Robin,Action Crime Fantasy,"PolyGram Filmed Entertainment,Warner Bros.",George Clooney,Joel Schumacher,Along with crime-fighting partner Robin and ne...,415,tt0118688,Along with crime-fighting partner Robin and ne...,1997-06-20,415,tt0118688,0.56
2188,The Witches of Eastwick,Fantasy Horror Comedy,"Kennedy Miller Productions,The Guber-Peters Co...",Jack Nicholson,George Miller,Three single women in a picturesque village ha...,6069,tt0094332,Three single women in a picturesque village ha...,1987-06-12,6069,tt0094332,0.5
1502,Innerspace,Action Comedy Science Fiction,"Amblin Entertainment,The Guber-Peters Company,...",Martin Short,Joe Dante,Test pilot Tuck Pendleton volunteers to test a...,2614,tt0093260,Test pilot Tuck Pendleton volunteers to test a...,1987-06-30,2614,tt0093260,0.4
833,Beetlejuice,Fantasy Comedy,"Geffen Company, The,Warner Bros.",Geena Davis,Tim Burton,"Thanks to an untimely demise via drowning, a y...",4011,tt0094721,"Thanks to an untimely demise via drowning, a y...",1988-02-29,4011,tt0094721,0.38
5703,The Lego Batman Movie,Action Animation Comedy Family Fantasy,"Lin Pictures,Warner Bros. Animation,Warner Bro...",Will Arnett,Chris McKay,In the irreverent spirit of fun that made “The...,324849,tt4116284,In the irreverent spirit of fun that made “The...,2017-02-08,324849,tt4116284,0.36
4527,Batman: Mystery of the Batwoman,Adventure Fantasy Animation Action Science Fic...,"Warner Bros. Pictures,DC Comics",Kevin Conroy,Curt Geda,"A new vigilante, Batwoman, is wreaking havoc i...",21683,tt0346578,"A new vigilante, Batwoman, is wreaking havoc i...",2003-10-21,21683,tt0346578,0.34
544,Mars Attacks!,Comedy Fantasy Science Fiction,Tim Burton Productions,Jack Nicholson,Tim Burton,'We come in peace' is not what those green men...,75,tt0116996,'We come in peace' is not what those green men...,1996-12-12,75,tt0116996,0.33


In [34]:
import requests
def getAPIinfo(movie_id):
    api_key="24c0062c5a18f7b09b0f1c2fd6a3d34a"
    BASE_URL=f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
    
    r=requests.get(BASE_URL)
    if r.status_code==200:
        data=r.json()
        poster=data['poster_path']
        IMAGE_URL = f"https://image.tmdb.org/t/p/original{poster}"
        return IMAGE_URL
    else:
        return None

res['posterUrl']=res['imdb_id'].apply(lambda x: getAPIinfo(x))
res

Unnamed: 0,title,genres2,production_companies2,mainActor,Director,id,imdb_id,Percentage,posterUrl
1727,Toy Story 2,Animation Comedy Family,Pixar Animation Studios,Tom Hanks,Lee Unkrich,863,tt0120363,0.875,https://image.tmdb.org/t/p/original/eVGu0zseza...
6226,Toy Story 3,Animation Family Comedy,Walt Disney Pictures Pixar Animation Studios,Tom Hanks,Lee Unkrich,10193,tt0435761,0.75,https://image.tmdb.org/t/p/original/4cpGytCB0e...
8354,Toy Story That Time Forgot,Animation Family,Pixar Animation Studios,Tom Hanks,Steve Purcell,256835,tt3473654,0.707107,https://image.tmdb.org/t/p/original/pw1YgzcBw4...
5813,Up,Animation Comedy Family Adventure,Pixar Animation Studios,Ed Asner,Bob Peterson,14160,tt1049413,0.534522,https://image.tmdb.org/t/p/original/eAdO0qa9m0...
4812,Cars,Animation Adventure Comedy Family,Walt Disney Pictures Pixar Animation Studios,Owen Wilson,John Lasseter,920,tt0317219,0.5,https://image.tmdb.org/t/p/original/qa6HCwP4Z1...


In [35]:
import os
os.environ['IMDB_API_KEY'] = "24c0062c5a18f7b09b0f1c2fd6a3d34a"

In [36]:
os.environ['IMDB_API_KEY']

'24c0062c5a18f7b09b0f1c2fd6a3d34a'

In [43]:
#Export Cosine Matrix and data
df[columns].to_csv("output/data.csv",index=False)
#np.save('output/cosineSim.npy', cs)

In [44]:
from numpy import asarray
from numpy import savez_compressed
data = asarray(cs)
# save to npy file
savez_compressed('output/cosineSim.npz', data)

In [54]:
#cs=np.load("output/cosineSim.npy")

In [57]:

# load dict of arrays
dict_data = np.load('output/cosineSim.npz')
# extract the first array
cs = dict_data['arr_0']
cs

array([[1.  , 0.13, 0.12, ..., 0.19, 0.16, 0.12],
       [0.13, 1.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.12, 0.  , 1.  , ..., 0.09, 0.15, 0.11],
       ...,
       [0.19, 0.  , 0.09, ..., 1.  , 0.  , 0.  ],
       [0.16, 0.  , 0.15, ..., 0.  , 1.  , 0.15],
       [0.12, 0.  , 0.11, ..., 0.  , 0.15, 1.  ]])

## Get Posible Names for user entry

In [58]:
import string
from fuzzywuzzy import process
import pandas as pd
alphabet=list(string.ascii_lowercase)
alphabet.extend(['0','1','2','3','4','5','6','7','8','9'])
df=pd.read_csv("output/data.csv")
df.head()

Unnamed: 0,title,genres2,production_companies2,mainActor,Director,id,imdb_id
0,Toy Story,Animation Comedy Family,Pixar Animation Studios,Tom Hanks,John Lasseter,862,tt0114709
1,Jumanji,Adventure Fantasy Family,TriStar Pictures Teitler Film Interscope Commu...,Robin Williams,Joe Johnston,8844,tt0113497
2,Grumpier Old Men,Romance Comedy,Warner Bros. Lancaster Gate,Walter Matthau,Howard Deutch,15602,tt0113228
3,Father of the Bride Part II,Comedy,Sandollar Productions Touchstone Pictures,Steve Martin,Charles Shyer,11862,tt0113041
4,Heat,Action Crime Drama Thriller,Regency Enterprises Forward Pass Warner Bros.,Al Pacino,Michael Mann,949,tt0113277


In [59]:
#Función de normalizar textos
def NormText(text,myreplace={'á':'a','é':'e','í':'i','ó':'o','ú':'u',}):
    _str="".join(re.findall(r"[\w&\s]",text)).lower().strip()
    _str=re.sub("\s{2,}"," ",_str)
    for key,replace in myreplace.items():
        _str=_str.replace(key,replace)
    _str=re.sub("\s{2,}"," ",_str)
    return _str.strip()

In [60]:
#Doc->Array using alphabet
def arr2doc(doc):
    arr=[]
    for letter in alphabet:
        arr.append(doc.count(letter))
    return arr

def makeArray(documents):
    return list(map(arr2doc,documents))


In [61]:
#Documents are titles of movies

title=["Inside Out"]
documents=df['title']
new=list(map(NormText,documents))
new2=list(map(NormText,title))
Arr=makeArray(new)
Arr2=makeArray(new2)

In [62]:
#Matrix Count alphabet from all titles in df
pd.DataFrame(np.array(Arr),index=df['title'],columns=alphabet)

Unnamed: 0_level_0,a,b,c,d,e,f,g,h,i,j,...,0,1,2,3,4,5,6,7,8,9
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Jumanji,1,0,0,0,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
Grumpier Old Men,0,0,0,1,2,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Father of the Bride Part II,2,1,0,1,3,2,0,2,3,0,...,0,0,0,0,0,0,0,0,0,0
Heat,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Visitors: Bastille Day,2,1,0,1,2,0,0,1,3,0,...,0,0,0,0,0,0,0,0,0,0
The Pope Must Die,0,0,0,1,3,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
LEGO DC Super Hero Girls: Brain Drain,2,1,1,2,3,0,2,1,3,0,...,0,0,0,0,0,0,0,0,0,0
Take Me,1,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
#Matrix Count alphabet from input of user
pd.DataFrame(np.array(Arr2),index=[title],columns=alphabet)

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,0,1,2,3,4,5,6,7,8,9
Inside Out,0,0,0,1,1,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
#Cosine Similarity Names
csNames=cosine_similarity(Arr2,Arr)

In [65]:
a=csNames[0]
recommendations=heapq.nlargest(5, range(len(a)), a.take)
_idx=df['title'][recommendations].index
df.loc[_idx][columns]

Unnamed: 0,title,genres2,production_companies2,mainActor,Director,id,imdb_id
5201,Inside Out,Drama Comedy Animation Family,Walt Disney Pictures Pixar Animation Studios,Amy Poehler,Pete Docter,150540,tt2096673
3839,Undisputed III : Redemption,Action Thriller,Nu Image Films Undisputed 3 Productions,Scott Adkins,Isaac Florentine,38234,tt1156466
3006,Tenacious D in The Pick of Destiny,Comedy Music,New Line Cinema Red Hour Films,Jack Black,Liam Lynch,2179,tt0365830
3929,Insidious,Horror Thriller,Alliance Films IM Global Stage 6 Films Haunted...,Patrick Wilson,James Wan,49018,tt1591095
1529,Exit Wounds,Action Crime Thriller,Village Roadshow Pictures NPV Entertainment Wa...,Steven Seagal,Andrzej Bartkowiak,10877,tt0242445


In [66]:
def getSimilarNames(input_user,n=10,score=90):
    title=[input_user]
    documents=df['title']
    new=list(map(NormText,documents))
    new2=list(map(NormText,title))
    Arr=makeArray(new)
    Arr2=makeArray(new2)
    #Cosine Similarity Names
    csNames=cosine_similarity(Arr2,Arr)
    a=csNames[0]
    recommendations=heapq.nlargest(n, range(len(a)), a.take)
    _idx=df['title'][recommendations].index
    results=df.loc[_idx][columns]
    options=results['title'].map(NormText)
    fuzz=process.extractBests(NormText(input_user),options,score_cutoff=score)
    if fuzz!=[]:
        return df.loc[[f[2] for f  in fuzz]][columns]
    else:
        return "No Movie Found"


In [68]:
getSimilarNames(input("Movie: "))

Movie: Cars


Unnamed: 0,title,genres2,production_companies2,mainActor,Director,id,imdb_id
2904,Cars,Animation Adventure Comedy Family,Walt Disney Pictures Pixar Animation Studios,Owen Wilson,John Lasseter,920,tt0317219
3977,Cars 2,Animation Family Adventure Comedy,Walt Disney Pictures Pixar Animation Studios,Owen Wilson,John Lasseter,49013,tt1216475
5808,Cars 3,Family Comedy Animation Adventure,Walt Disney Pictures Pixar Animation Studios,Owen Wilson,Brian Fee,260514,tt3606752
