In [53]:
import pandas as pd
import datetime as dt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

movies_df = pd.read_csv('../../Resources/modified_movies.csv',low_memory=False)
movies_df.sort_values(by=['total_votes'], inplace=True, ascending=False)

In [35]:
movies_df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'imdb_title_id', 'title',
       'original_title', 'year', 'date_published', 'genre', 'duration',
       'country', 'language', 'director', 'writer', 'production_company',
       'actors', 'description', 'avg_vote', 'votes', 'budget',
       'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics', 'id', 'encoded_genre',
       'writer_encoded', 'director_encoded', 'language_encoded',
       'weighted_average_vote', 'total_votes', 'mean_vote', 'median_vote',
       'votes_10', 'votes_9', 'votes_8', 'votes_7', 'votes_6', 'votes_5',
       'votes_4', 'votes_3', 'votes_2', 'votes_1', 'allgenders_0age_avg_vote',
       'allgenders_0age_votes', 'allgenders_18age_avg_vote',
       'allgenders_18age_votes', 'allgenders_30age_avg_vote',
       'allgenders_30age_votes', 'allgenders_45age_avg_vote',
       'allgenders_45age_votes', 'males_allages_avg_vote',
       'males_allages_votes', 'males_0age_avg_vote

In [54]:
sub_movies = movies_df[movies_df['language'].str.contains('English', case=False)] 
sub_movies.reset_index(drop=True, inplace=True)
print(sub_movies.shape)
print(sub_movies.head())

(34343, 77)
   Unnamed: 0  Unnamed: 0.1 imdb_title_id                 title  \
0          12         24403     tt0111161  Le ali della libertà   
1          48         38972     tt0468569   Il cavaliere oscuro   
2         118         45652     tt1375666             Inception   
3         103         27399     tt0137523            Fight Club   
4          53         24349     tt0110912          Pulp Fiction   

             original_title  year date_published  \
0  The Shawshank Redemption  1994     1995-02-10   
1           The Dark Knight  2008     2008-07-23   
2                 Inception  2010     2010-09-24   
3                Fight Club  1999     1999-10-29   
4              Pulp Fiction  1994     1994-10-28   

                               genre  duration       country  ...  \
0                          ['Drama']       142           USA  ...   
1       ['Action', 'Crime', 'Drama']       152       USA, UK  ...   
2  ['Action', 'Adventure', 'Sci-Fi']       148       USA, UK  ...

In [55]:
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
sub_movies['description']=sub_movies['description'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(sub_movies['description'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

#tfidf.get_feature_names()[1000:1020]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_movies['description']=sub_movies['description'].fillna('')


(34343, 37103)

In [56]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape)

indices = pd.Series(sub_movies.index, index=sub_movies['original_title']).drop_duplicates()
print(indices.size)


In [57]:
import difflib
import math 
import array

def similarity(word, pattern):
    return difflib.SequenceMatcher(a=word.lower(), b=pattern.lower()).ratio()

def fuzzy_search(title):
    threshold = 0.6
    output = []
    outputWeight = []
    for lookup in sub_movies['original_title']:
        s =similarity(title, lookup) 
        if s > threshold:
            m = sub_movies[sub_movies['original_title'] == lookup].original_title
            try:
                output.append(m.item())
                outputWeight.append(s*10000)
            except Exception:
                pass
            #print(sub_movies[sub_movies['original_title'] == lookup].original_title)

    zipped_lists = zip(outputWeight,output)
    sorted_zipped_lists = sorted(zipped_lists, reverse=True)
    sorted_list1 = [element for _, element in sorted_zipped_lists]

    return sorted_list1

fuzzy_search("Harry Potter and the ")

(34343, 34343)
34343


['Harry Potter and the Goblet of Fire',
 "Harry Potter and the Sorcerer's Stone",
 'Harry Potter and the Half-Blood Prince',
 'Harry Potter and the Chamber of Secrets',
 'Harry Potter and the Prisoner of Azkaban',
 'The Power and the Prize',
 'The Power and the Glory',
 'Harry Potter and the Order of the Phoenix',
 'The Monster and the Girl',
 'The Spider and the Fly',
 'The We and the I',
 'Harry Potter and the Deathly Hallows: Part 2',
 'Harry Potter and the Deathly Hallows: Part 1',
 'The Doctor and the Girl',
 'Harry Styles: Behind the Album',
 'The Soldier and the Lady',
 'The Baroness and the Pig',
 'Harry and the Hendersons',
 'Charley and the Angel',
 'The Professor and the Madman',
 'Harry and Tonto',
 'The Other End of the Line',
 'The Doctor and the Devils',
 'Quatermass and the Pit',
 'Androcles and the Lion']

In [58]:
def recommendation(title,cos=cosine_sim):
    if title in indices.keys():
        idx=indices[title]
    elif fuzzy_search(title)[0] in indices.keys():
        idx=indices[fuzzy_search(title)[0]]
        print("did you mean" ,fuzzy_search(title)[0], "?")

    idx = idx[0] if(isinstance(idx, list) or (not np.isscalar(idx)) ) else idx
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:len(sim_scores)]
    #print(sim_scores)
    movie_indices = [[i[0], i[1],  sub_movies['original_title'].iloc[i[0]]] for i in sim_scores]

    return movie_indices 
    #return sub_movies['original_title'].iloc[movie_indices]

In [292]:
recommendation("The Lord of the Rings: The Return of the King")[1:11]

[[11, 0.20873295626329375, 'The Lord of the Rings: The Two Towers'],
 [7, 0.15278678771685827, 'The Lord of the Rings: The Fellowship of the Ring'],
 [27044, 0.1325964686774591, '30 Years to Life'],
 [28030, 0.12889357993578718, 'Desperate Search'],
 [19897, 0.1213977318715607, 'Bullet for a Badman'],
 [119, 0.11991560699822923, 'X-Men: Days of Future Past'],
 [1318, 0.11832718467807758, 'Conan the Barbarian'],
 [17709, 0.11484332120254412, 'Popcorn'],
 [31703, 0.11352555987110945, 'Men of Means'],
 [145, 0.11171706196642081, 'The Hobbit: The Desolation of Smaug']]

In [477]:
def getMovieName(title:str):
    title = title.strip()

    if title in indices.keys():
        return True, sub_movies[sub_movies['original_title'] == title].iloc[[0]]
    elif fuzzy_search(title)[0] in indices.keys():
        print("did you mean" ,fuzzy_search(title)[0], "?")
        return False, sub_movies[sub_movies['original_title'] == fuzzy_search(title)[0]].iloc[[0]]


    

In [479]:
getMovieName(" Fight Club")[1]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,imdb_title_id,title,original_title,year,date_published,genre,duration,country,...,females_30age_avg_vote,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes
3,103,27399,tt0137523,Fight Club,Fight Club,1999,1999-10-29,['Drama'],139,"USA, Germany",...,8.7,115892.0,8.1,20944.0,8.3,882.0,8.7,263329.0,8.8,727678.0


In [547]:
from scipy import spatial
from pandas import DataFrame
import numpy as np

def similarityFactor(movie1: DataFrame, movie2 : DataFrame, script_similarity, age=0, gender=0):
    similarity = 0

    value1 = np.array(movie1['director_encoded'].values[0].strip("[").strip("]").split(), dtype=int)
    value2 = np.array(movie2['director_encoded'].values[0].strip("[").strip("]").split(), dtype=int)
    padding = abs(value1.shape[0] - value2.shape[0])
    value1 = np.pad(value1, ( padding ,0), 'constant') if value1.shape[0] < value2.shape[0] else value1
    value2 = np.pad(value2, ( padding ,0), 'constant') if value1.shape[0] > value2.shape[0] else value2
    
    similarity_factor = 0.1 *difflib.SequenceMatcher(a= value1, b=value2).ratio()
    

    value1 = np.array(movie1['writer_encoded'].values[0].strip("[").strip("]").split(), dtype=int)
    value2 = np.array(movie2['writer_encoded'].values[0].strip("[").strip("]").split(), dtype=int)
    padding = abs(value1.shape[0] - value2.shape[0])
    value1 = np.pad(value1, ( padding ,0), 'constant') if value1.shape[0] < value2.shape[0] else value1
    value2 = np.pad(value2, ( padding ,0), 'constant') if value1.shape[0] > value2.shape[0] else value2
    
    similarity_factor += 0.1 * difflib.SequenceMatcher(a= value1, b=value2).ratio()

    value1 = np.array(movie1['encoded_genre'].values[0].strip("[").strip("]").split(), dtype=int)
    value2 = np.array(movie2['encoded_genre'].values[0].strip("[").strip("]").split(), dtype=int)
    padding = abs(value1.shape[0] - value2.shape[0])
    value1 = np.pad(value1, ( padding ,0), 'constant') if value1.shape[0] < value2.shape[0] else value1
    value2 = np.pad(value2, ( padding ,0), 'constant') if value1.shape[0] > value2.shape[0] else value2
    
    similarity_factor += 0.3 * difflib.SequenceMatcher(a= value1, b=value2).ratio()
    
    r = getMoveRating(movie2, age, gender)
    similarity_factor += 0.3 * (r/10)

    
    
    similarity_factor += 0.2 * script_similarity

    
    return similarity_factor , r

    

In [531]:
MALE = 1
FEMALE = 2
NONBINARY = 0

def getMoveRating(movie : DataFrame , age:int, gender:int):
    age_group = [0, 18, 30, 45]
    age_group = [abs(age_group[i]-age) for i in range(len(age_group))]
    group = age_group.index(min(age_group))
    attribute = ""
    attribute2 = ""

    if gender == 0 and age == 0:
        attribute = "avg_vote"
        attribute2 = "avg_vote"
    else:
        if gender == 0:
            attribute += "allgenders_"
            attribute2 += "allgenders_"
            
        elif gender == 1:
            attribute += "males_"
            attribute2 += "males_"
        else:
            attribute += "females_"
            attribute2 += "females_"

        if age == 0:
            attribute += "allages_avg_vote"
            attribute2 += "allages_votes"
        elif group == 0: # group 0
            attribute += "0age_avg_vote"
            attribute2 += "0age_votes"
        elif group == 1: # group 18
            attribute += "18age_avg_vote"
            attribute2 += "18age_votes"
        elif group == 1: # group 30
            attribute += "30age_avg_vote"
            attribute2 += "30age_votes"
        else: #group 45
            attribute += "45age_avg_vote"
            attribute2 += "45age_votes"
    

    avg = movie[attribute].values[0]
    num_voters = movie[attribute2].values[0]

    # rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C 
    # R = average for the movie (mean) = (Rating)
    # v = number of votes for the movie = (votes)
    # m = minimum votes required to be listed in the Top 250 (currently 3000)
    # C = the mean vote across the whole report (currently 6.9)
    M = 3000
    C = 6
    rating = ( num_voters / (num_voters + M )) * avg + (M / (num_voters + M)) * C

    #print(rating)


    return rating

In [462]:
getMoveRating( 1, FEMALE)

females_0age_avg_vote


In [434]:

import math
def recommend(movie):
    recommend_score = 0
    aux = 0
    movies_list = recommendation(movie['original_title'].item())
    movies_list = movies_list[1:math.floor(len(movies_list)*0.1)]
    print(math.floor(len(movies_list)*0.1))
    for m in movies_list: 
        found, name = getMovieName(m[2])
        aux = similarityFactor(movie,name, m[1] )
        if aux > recommend_score : 
            recommend_score = aux
            movie_to_recommend = name

    return recommend_score, movie_to_recommend

In [577]:
def recommend_from_user_list(user_movie_list:DataFrame,user_hate_movie_list:DataFrame,user_not_care_movie_list:DataFrame, age = 0, gender = NONBINARY):
    recommend_score = 0
    aux = 0
    the_list = []
    movie_to_recommend = None
    portion = 0.1/len(user_movie_list)
    for index, movie in user_movie_list.iterrows():
        movies_list = recommendation(movie['original_title'])
        movies_list = movies_list[1:math.floor(len(movies_list)*portion)]
        the_list.extend(movies_list)
    the_list = [list(x) for x in set(tuple(x) for x in the_list)]
    the_list = sorted(the_list, key=lambda x: x[1], reverse=True)
    print(len(the_list))
    for m in the_list: 
        found, name = getMovieName(m[2])
        if not name.original_title.isin(user_movie_list.original_title).astype(bool).all()\
            and not name.original_title.isin(user_hate_movie_list.original_title).astype(bool).all()\
            and not name.original_title.isin(user_not_care_movie_list.original_title).astype(bool).all():
            aux = 0
            aux_hate = 0
            for index in range(len(user_movie_list)):
                user_m = user_movie_list.iloc[[index]]
                s, r = similarityFactor(user_m ,name, m[1], age, gender )
                aux += s
            for index in range(len(user_hate_movie_list)):
                user_m = user_hate_movie_list.iloc[[index]]
                s, r = similarityFactor(user_m ,name, m[1], age, gender )
                aux_hate += s
            aux = aux - 0.4 * aux_hate
            aux /= len(user_movie_list)
            if aux > recommend_score : 
                recommend_score = aux
                movie_to_recommend = name
                print(r)

    return recommend_score, movie_to_recommend
    

In [490]:
sub_movies.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'imdb_title_id', 'title',
       'original_title', 'year', 'date_published', 'genre', 'duration',
       'country', 'language', 'director', 'writer', 'production_company',
       'actors', 'description', 'avg_vote', 'votes', 'budget',
       'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics', 'id', 'encoded_genre',
       'writer_encoded', 'director_encoded', 'language_encoded',
       'weighted_average_vote', 'total_votes', 'mean_vote', 'median_vote',
       'votes_10', 'votes_9', 'votes_8', 'votes_7', 'votes_6', 'votes_5',
       'votes_4', 'votes_3', 'votes_2', 'votes_1', 'allgenders_0age_avg_vote',
       'allgenders_0age_votes', 'allgenders_18age_avg_vote',
       'allgenders_18age_votes', 'allgenders_30age_avg_vote',
       'allgenders_30age_votes', 'allgenders_45age_avg_vote',
       'allgenders_45age_votes', 'males_allages_avg_vote',
       'males_allages_votes', 'males_0age_avg_vote

In [594]:
user_movie_list = pd.concat([
                getMovieName("Madagascar")[1],
                getMovieName("Madagascar: Escape 2 Africa")[1],
                getMovieName(" Madagascar 3: Europe's Most Wanted")[1],
                getMovieName("A Bug's Life")[1],
                getMovieName("Antz")[1],
                getMovieName(" Shrek")[1],
                ]
                )
user_hate_movie_list = pd.concat([
                
                getMovieName("Zootopia")[1],
                getMovieName("Tangled")[1],
                ]
                )

user_not_care_movie_list = pd.concat([
                getMovieName("Adventure Scouts")[1],
                getMovieName("Pitch Perfect")[1],
                ]
                )

score, result = recommend_from_user_list(user_movie_list,user_hate_movie_list,user_not_care_movie_list, age=30, gender=MALE)
print("similarity factor: ", score)
print( "movie name: ", result['original_title'].item() )
print( "statistics :" )
print()
print("avg vote: ", result['weighted_average_vote'].item(),  " , number of voters: ", result['total_votes'].item() )

print()
print("allgenders_0age_votes: ", result['allgenders_0age_avg_vote'].item(), " , number of voters: ", result['allgenders_0age_votes'].item() )
print("allgenders_18age_avg_vote: ", result['allgenders_18age_avg_vote'].item(), " , number of voters: ", result['allgenders_18age_votes'].item() )
print("allgenders_30age_avg_vote: ", result['allgenders_30age_avg_vote'].item(), " , number of voters: ", result['allgenders_30age_votes'].item() )
print("allgenders_45age_avg_vote: ", result['allgenders_45age_avg_vote'].item(), " , number of voters: ", result['allgenders_45age_votes'].item() )

print()
print("females_0age_votes: ", result['females_0age_avg_vote'].item(), " , number of voters: ", result['females_0age_votes'].item() )
print("females_18age_avg_vote: ", result['females_18age_avg_vote'].item(), " , number of voters: ", result['females_18age_votes'].item() )
print("females_30age_avg_vote: ", result['females_30age_avg_vote'].item(), " , number of voters: ", result['females_30age_votes'].item() )
print("females_45age_avg_vote: ", result['females_45age_avg_vote'].item(), " , number of voters: ", result['females_45age_votes'].item() )

print()
print("males_0age_votes: ", result['males_0age_avg_vote'].item(), " , number of voters: ", result['males_0age_votes'].item() )
print("males_18age_avg_vote: ", result['males_18age_avg_vote'].item(), " , number of voters: ", result['males_18age_votes'].item() )
print("males_30age_avg_vote: ", result['males_30age_avg_vote'].item(), " , number of voters: ", result['males_30age_votes'].item() )
print("males_45age_avg_vote: ", result['males_45age_avg_vote'].item(), " , number of voters: ", result['males_45age_votes'].item() )

3426
5.9758931445123915
5.962068965517241
5.9978160158835205
5.72864706956363
6.930177027583367
similarity factor:  0.4529658870204784
movie name:  Rango
statistics :

avg vote:  7.2  , number of voters:  231174

allgenders_0age_votes:  7.5  , number of voters:  247.0
allgenders_18age_avg_vote:  7.3  , number of voters:  59046.0
allgenders_30age_avg_vote:  7.2  , number of voters:  103846.0
allgenders_45age_avg_vote:  7.1  , number of voters:  19635.0

females_0age_votes:  7.4  , number of voters:  43.0
females_18age_avg_vote:  7.0  , number of voters:  13274.0
females_30age_avg_vote:  6.9  , number of voters:  17609.0
females_45age_avg_vote:  6.9  , number of voters:  2881.0

males_0age_votes:  7.5  , number of voters:  185.0
males_18age_avg_vote:  7.4  , number of voters:  44927.0
males_30age_avg_vote:  7.2  , number of voters:  85028.0
males_45age_avg_vote:  7.1  , number of voters:  16432.0


In [583]:
movie=input('Please enter the movie name:')

found, entry = getMovieName(movie)
score, result = recommend(entry)
print("similarity factor: ", score)
print( "movie name: ", result['original_title'].item() )



did you mean Madagascar ?
343


TypeError: '>' not supported between instances of 'tuple' and 'int'