In [16]:
import pandas as pd
import datetime as dt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

movies_df = pd.read_csv('../Resources/modified_movies.csv',low_memory=False)
movies_df.sort_values(by=['avg_vote'], inplace=True, ascending=False)
rating = pd.read_csv('../Resources/IMDb_ratings.csv',low_memory=False)

In [17]:
sub_movies = movies_df[movies_df['language'].str.contains('English', case=False)] # english -> 58
sub_movies.reset_index(drop=True, inplace=True)
print(sub_movies.shape)

(34343, 28)


In [18]:
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
sub_movies['description']=sub_movies['description'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(sub_movies['description'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

#tfidf.get_feature_names()[1000:1020]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_movies['description']=sub_movies['description'].fillna('')


(34343, 37103)

In [19]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape)

indices = pd.Series(sub_movies.index, index=sub_movies['original_title']).drop_duplicates()
print(indices.size)


(34343, 34343)
34343


In [20]:
import difflib
import math 
import array

def similarity(word, pattern):
    return difflib.SequenceMatcher(a=word.lower(), b=pattern.lower()).ratio()

def fuzzy_search(title):
    threshold = 0.6
    output = []
    outputWeight = []
    for lookup in sub_movies['original_title']:
        s =similarity(title, lookup) 
        if s > threshold:
            m = sub_movies[sub_movies['original_title'] == lookup].original_title
            try:
                output.append(m.item())
                outputWeight.append(s*10000)
            except Exception:
                pass
            #print(sub_movies[sub_movies['original_title'] == lookup].original_title)

    zipped_lists = zip(outputWeight,output)
    sorted_zipped_lists = sorted(zipped_lists, reverse=True)
    sorted_list1 = [element for _, element in sorted_zipped_lists]

    return sorted_list1

fuzzy_search("Harry Potter and the ")

['Harry Potter and the Goblet of Fire',
 "Harry Potter and the Sorcerer's Stone",
 'Harry Potter and the Half-Blood Prince',
 'Harry Potter and the Chamber of Secrets',
 'Harry Potter and the Prisoner of Azkaban',
 'The Power and the Prize',
 'The Power and the Glory',
 'Harry Potter and the Order of the Phoenix',
 'The Monster and the Girl',
 'The Spider and the Fly',
 'The We and the I',
 'Harry Potter and the Deathly Hallows: Part 2',
 'Harry Potter and the Deathly Hallows: Part 1',
 'The Doctor and the Girl',
 'Harry Styles: Behind the Album',
 'The Soldier and the Lady',
 'The Baroness and the Pig',
 'Harry and the Hendersons',
 'Charley and the Angel',
 'The Professor and the Madman',
 'Harry and Tonto',
 'The Other End of the Line',
 'The Doctor and the Devils',
 'Quatermass and the Pit',
 'Androcles and the Lion']

In [21]:
def recommendation(title,cos=cosine_sim):
    if title in indices.keys():
        idx=indices[title]
    elif fuzzy_search(title)[0] in indices.keys():
        idx=indices[fuzzy_search(title)[0]]
        print("did you mean" ,fuzzy_search(title)[0], "?")

    idx = idx[0] if(isinstance(idx, list) or (not np.isscalar(idx)) ) else idx
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:len(sim_scores)]
    #print(sim_scores)
    movie_indices = [[i[0], i[1],  sub_movies['original_title'].iloc[i[0]]] for i in sim_scores]

    return movie_indices 
    #return sub_movies['original_title'].iloc[movie_indices]

In [22]:
recommendation("The Secret Life Of Pets")[1:11]

did you mean The Secret Life of Pets ?


[[19876, 0.27879295935995874, 'Max mon amour'],
 [12793, 0.2456706635313863, 'Uptown New York'],
 [24141, 0.24248044722678058, 'Free and Easy'],
 [33589, 0.2388051745671041, 'The Devil and Max Devlin'],
 [34136, 0.22082844355052314, 'A Broken Life'],
 [18578, 0.21647829451034037, 'The Zohar Secret'],
 [28028, 0.20529282079517244, 'Kissing a Fool'],
 [25237, 0.20362783695903228, 'One Night Stand'],
 [21236, 0.200881601199363, 'Margarita Happy Hour'],
 [28277, 0.20031586100214516, 'Go Fish']]

In [23]:
def getMovieName(title):

    if title in indices.keys():
        return True, sub_movies[sub_movies['original_title'] == title]
    elif fuzzy_search(title)[0] in indices.keys():
        print("did you mean" ,fuzzy_search(title)[0], "?")
        return False, sub_movies[sub_movies['original_title'] == fuzzy_search(title)[0]]


    

In [44]:
from scipy import spatial
from pandas import DataFrame
import numpy as np

def similarityFactor(movie1: DataFrame, movie2 : DataFrame, script_similarity):
    similarity = 0

    value1 = np.array(movie1['director_encoded'].strip("[").strip("]").split(), dtype=int)
    value2 = np.array(movie2['director_encoded'].strip("[").strip("]").split(), dtype=int)
    padding = abs(value1.shape[0] - value2.shape[0])
    value1 = np.pad(value1, ( padding ,0), 'constant') if value1.shape[0] < value2.shape[0] else value1
    value2 = np.pad(value2, ( padding ,0), 'constant') if value1.shape[0] > value2.shape[0] else value2
    similarity_factor = 0.1 *difflib.SequenceMatcher(a= value1, b=value2).ratio()

    value1 = np.array(movie1['writer_encoded'].strip("[").strip("]").split(), dtype=int)
    value2 = np.array(movie2['writer_encoded'].strip("[").strip("]").split(), dtype=int)
    padding = abs(value1.shape[0] - value2.shape[0])
    value1 = np.pad(value1, ( padding ,0), 'constant') if value1.shape[0] < value2.shape[0] else value1
    value2 = np.pad(value2, ( padding ,0), 'constant') if value1.shape[0] > value2.shape[0] else value2
    similarity_factor += 0.1 * difflib.SequenceMatcher(a= value1, b=value2).ratio()

    value1 = np.array(movie1['encoded_genre'].strip("[").strip("]").split(), dtype=int)
    value2 = np.array(movie2['encoded_genre'].strip("[").strip("]").split(), dtype=int)
    padding = abs(value1.shape[0] - value2.shape[0])
    value1 = np.pad(value1, ( padding ,0), 'constant') if value1.shape[0] < value2.shape[0] else value1
    value2 = np.pad(value2, ( padding ,0), 'constant') if value1.shape[0] > value2.shape[0] else value2
    similarity_factor += 0.3 * difflib.SequenceMatcher(a= value1, b=value2).ratio()

    similarity_factor += 0.1 * ((movie2['avg_vote'] - 5)/5)
    
    #print( "movies: " ,movie1['original_title'].item(), " : ", movie2['original_title'])
    similarity_factor += 0.4 * script_similarity

    
    return similarity_factor 

import math
def recommend(movie):
    recommend_score = 0
    aux = 0
    movies_list = recommendation(movie['original_title'])
    movies_list = movies_list[1:math.floor(len(movies_list)*0.1)]
    for m in movies_list: 
        found, name = getMovieName(m[2])
        for index,  n in name.iterrows():
            aux = similarityFactor(movie,n, m[1] )
            if aux > recommend_score and n["original_title"] != movie["original_title"]:
                recommend_score = aux
                movie_to_recommend = name

    return recommend_score, movie_to_recommend
    

In [25]:
r = np.array(sub_movies[sub_movies['original_title'] == 'Jeeudo']['director_encoded'].item().strip("[").strip("]").split(), dtype=int)
print(r.size)

r = np.pad(r, (r.shape[0] +1 ,0), 'constant')
print(r)


1
[ 0  0 42]


In [47]:
movie=input('Please enter the movie name:')
#recommendation(movie)
found, entry = getMovieName(movie)

for index,  e in entry.iterrows():
    score, result = recommend(e)
    print("similarity factor: ", score)
    print(result)
    #for r in result['original_title']:
    print( "movie name: ", result['original_title'].item() )



did you mean Speedway ?
similarity factor:  0.374
    Unnamed: 0 imdb_title_id                             title  \
54       65483     tt9509866  The Guy Who Didn't Like Musicals   

                      original_title  year date_published  \
54  The Guy Who Didn't Like Musicals  2018     2018-12-23   

                    genre  duration country     language  ...  \
54  ['Comedy', 'Musical']       112     USA  ['English']  ...   

   usa_gross_income worlwide_gross_income metascore reviews_from_users  \
54              NaN                   NaN       NaN                4.0   

   reviews_from_critics     id  encoded_genre writer_encoded director_encoded  \
54                  NaN  65483        [ 4 14]  [29529 32675]          [17604]   

   language_encoded  
54             [62]  

[1 rows x 28 columns]
movie name:  The Guy Who Didn't Like Musicals
