In [1]:
from flask import Flask, jsonify, render_template, request
from flask_cors import CORS, cross_origin
import pandas as pd
from ast import literal_eval
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import boto3
from botocore.config import Config

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
#Importing the relevant datasets from the mounted Google Drive (change the code below if the data is hosted elsewhere)
metadata = pd.read_csv("D:\OneDrive - NITT\Custom_Download\movies_metadata.csv")
credits = pd.read_csv("D:\OneDrive - NITT\Custom_Download\credits.csv")

In [80]:
list(metadata)

['adult',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count',
 'cast',
 'crew',
 'soup']

In [23]:
row = []
for i,j in enumerate(metadata['id']):
    try:
        c = int(j)
    except:
        row.append(i)

In [25]:
metadata.drop(row,inplace=True)
metadata.reset_index(drop=True,inplace=True)

In [26]:
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

In [27]:
metadata = metadata.merge(credits, on='id')

In [28]:
# metadata = metadata.drop(columns=['original_title'])

In [30]:
#Print the important features of metadata
metadata[['title', 'cast', 'crew', 'genres']].head()

Unnamed: 0,title,cast,crew,genres
0,Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,Waiting to Exhale,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,Father of the Bride Part II,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 35, 'name': 'Comedy'}]"


In [31]:
#raises an exception if the input isn't a valid Python datatype, so the code won't be executed if it's not.
#Parse the stringified features into their corresponding python objects

from ast import literal_eval

features = ['cast', 'crew', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [33]:
# We can do something similar to extract the actors, keywords and genres with a function as well:

#Getting a list of the actors, keywords and genres
def get_list(x):
    if isinstance(x, list): #checking to see if the input is a list or not
        names = [i['name'] for i in x] #if we take a look at the data, we find that
        #the word 'name' is used as a key for the names actors, 
        #the actual keywords and the actual genres
        
        #Check if more than 3 elements exist. If yes, return only first three. 
        #If no, return entire list. Too many elements would slow down our algorithm 
        #too much, and three should be more than enough for good recommendations.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [37]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x] #cleaning up spaces in the data
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

The data is now cleaned! Finally, we are ready to create our soup for each movie. We can now create a funcion that iterates over the rows of our metadata and joins the keywords, cast, director and genres columns into one big word soup. Each element will be separated by a space " " that will signal to our vectorization function that that is a particular word, to be encoded separately and uniquely.




## Note: I need the columns original language (becuase I want this cilumn to be binary 'english' and 'arabic' and also the column soup to be in the whole merged dataset 

In [57]:
def f(x):
    d = []
    for i in x:
        d.append(i['name'])
    return d

In [58]:
metadata['genres'] = metadata['genres'].apply(lambda x:f(x))

In [62]:
#This function makes use of the property of the cosine similarity funciton that
#the order and types of inputs don't matter, what matters is the similarity
#between different soups of words
def create_soup(x):
    return ' '.join(x['cast']) + ' ' + ' '.join(x['genres'])

metadata['soup'] = metadata.apply(create_soup, axis=1)
#metadata.head()
# metadata[['title', 'soup', 'cast', 'director', 'keywords', 'genres', 'original_language']].head()

Take a look at the soup! It looks pretty good: all the information about each movie is now a compact list of names corresponding to the genres, director, actors and keywords.

Now that we have the soup for each movie, we want to create one more soup every time our recommender is run: a soup of inputs by the user. We wan't to collect what genres, directors, actors and keywords THEY like, so that we can then vectorize everything, compute pairwise cosine similarity between that input and each movie in our database, and rank which are the most similar movies to that input. You can see the below code: 

In [65]:
#Getting the user's input for genre, actors and directors of their liking.
def get_genres():
    genres = input("What Movie Genre are you interested in (if multiple, please separate them with a comma)? [Type 'skip' to skip this question] ")
    genres = " ".join(["".join(n.split()) for n in genres.lower().split(',')])
    return genres

def get_actors():
   actors = input("Who are some actors within the genre that you love (if multiple, please separate them with a comma)? [Type 'skip' to skip this question] ")
   actors = " ".join(["".join(n.split()) for n in actors.lower().split(',')])
   return actors

def get_language():
    original_language = input("What is the spoken language (type 'en' for English and  'عربي' للعربي [Type 'skip' to skip this question] ")
    original_language = " ".join(["".join(n.split()) for n in original_language.lower().split(',')])
    return original_language
#def get_directors():
#directors = input("Who are some directors within the genre that you love (if multiple, please separate them with a comma)? [Type 'skip' to skip this question] ")
 # directors = " ".join(["".join(n.split()) for n in directors.lower().split(',')])
# return directors

def get_keywords():
  keywords = input("What are some of the keywords that describe the movie you want to watch, like elements of the plot, whether or not it is about friendship, etc? (if multiple, please separate them with a comma)? [Type 'skip' to skip this question] ")
  keywords = " ".join(["".join(n.split()) for n in keywords.lower().split(',')])
  return keywords

def get_searchTerms():
    searchTerms = [] 
    genres = get_genres()
    if genres != 'skip':
     searchTerms.append(genres)

    actors = get_actors()
    if actors != 'skip':
     searchTerms.append(actors)
    
    original_language = get_language()
    if original_language != 'skip':
     searchTerms.append(original_language)

#directors = get_directors()
#if directors != 'skip':
    #searchTerms.append(directors)

#     keywords = get_keywords()
#     if keywords != 'skip':
#      searchTerms.append(keywords)
  
    
    return searchTerms

Note how each of the functions above are prompting a different type of inputs for search, and how we structured the input questions to make sure that they are adequately formatted for our funtions to convert them into lists that can then be 'word souped' and vectorized with the word soups for our movies.

Creating Our Recommendation Model Based on Count Vectoriser and Cosine Similarity

With our building blocks in place, and our data properly formatted, we can finally implement the ranking/recommendation function. As mentioned above, our function will take as an input the data that has already been pre-processed above, and will ask for user input. It will then word-soupify the user input, and add it as a row to our data. Next, it will vectorize these wordsoups using a function from the sklearn python library called CountVectorizer. CountVectorizer is extremely simple in what it does: it takes documents (different stings) and returns a tokenized matrix. Each wordsoup is encoded into frequencies of words in that wordsoup. For example, the following sentences, stored in a list:

corpus = [

'This is the first document.',

'This document is the second document.',

'And this is the third one.',

'Is this the first document?']

If we apply the CountVectorizer to them, we would get the following table:

Word 1	Word 2	Word 3	Word 4	Word 5	Word 6	Word 7	Word 8	Word 9
0	1	1	1	0	0	1	0	1
0	2	0	1	0	1	1	0	1
1	0	0	1	1	0	1	1	1
0	1	1	1	0	0	1	0	1



Note that for our recommendation algorithm, we also want to vectrorize the user input. We chose to do that by simiply adding the inputted word soup to the metadata table, as the last entry, and then running the vectorization. While this isn't the most efficient way to go about it, the CountVectorize function is very quick to run and spends little resources. The bigger problem we have to face is the cosine similarity calculations.

Cosine similarity is a mathematical computation that tells us the similarity between two vectors 
A
 and 
B
. In effect, we are calculating the cosine of the angle 
θ
between these two vectors. The function returns a value between -1, indicating complete opposite vectors, to 1, indicating the same vector. 0 indicates a lack of correlation between the vectors, and intermediate values indicate intermediate levels of similarity.

Note that the cosine similarity function increases linearly in complexity as we increase the size of A and B (note that A and B have the same size, 
n
). The dot product of A and B will require n+t more computations if we add t more values to A and B, and the magnitude of each of these will also increase linearly. So far, no trouble in computational complexity.

However, our algorithm performs cosine similarity computation between each possibe pair of movies. If we have 
k
 movies, then we need to perform 
k
2
computations. This is the reason why we had to reduce the number of movies in our dataset from 45,000 to 10,000: the 35,000 movie difference translates into 
1.925
∗
10
9
 computations. Of course, there are methods to decrease the number of computations required and therefore allow us to use the entire dataset, but we decided to leave these as backlogged potential improvements for now.

In [67]:
metadata = metadata.iloc[0:10000,:] # only using 10k movies

In [85]:
metadata = metadata[['title','genres','original_language','crew','soup','overview','release_date']]

In [89]:
metadata.dropna(inplace=True)

In [90]:
metadata['Year'] = metadata['release_date'].apply(lambda x:x.split('-')[0])

In [102]:
metadata.drop('release_date',axis=1,inplace=True)

In [91]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def make_recommendation(metadata=metadata):
 new_row = metadata.iloc[-1,:].copy() #creating a copy of the last row of the 
  #dataset, which we will use to input the user's input
  
  #grabbing the new wordsoup from the user
 searchTerms = get_searchTerms()  
 new_row.iloc[-1] = " ".join(searchTerms) #adding the input to our new row
  
  #adding the new row to the dataset
 metadata = metadata.append(new_row)
  
  #Vectorizing the entire matrix as described above!
 count = CountVectorizer(stop_words='english')
 count_matrix = count.fit_transform(metadata['soup'])

  #running pairwise cosine similarity 
 cosine_sim2 = cosine_similarity(count_matrix, count_matrix) #getting a similarity matrix
  
  #sorting cosine similarities by highest to lowest
 sim_scores = list(enumerate(cosine_sim2[-1,:]))
 sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  #matching the simix
# similarities to the movie titles and ids
 ranked_titles = []
 for i in range(1, 11):
  indx = sim_scores[i][0]
  ranked_titles.append([metadata['title'].iloc[indx]])
 ranked_titles = pd.DataFrame(np.array(ranked_titles)[:,0],columns=['Title'])
 return ranked_titles

#let's try our recommendation function now
make_recommendation()

What Movie Genre are you interested in (if multiple, please separate them with a comma)? [Type 'skip' to skip this question] skip
Who are some actors within the genre that you love (if multiple, please separate them with a comma)? [Type 'skip' to skip this question] skip
What is the spoken language (type 'en' for English and  'عربي' للعربي [Type 'skip' to skip this question] skip


Unnamed: 0,Title
0,Life Is Sweet
1,Topsy-Turvy
2,Swimming to Cambodia
3,Monster in a Box
4,Hometown Legend
5,Princess Caraboo
6,Rough Magic
7,Wide Eyed and Legless
8,Lotto Land
9,Mina Tannenbaum


In [103]:
list(metadata)

['title', 'genres', 'original_language', 'crew', 'soup', 'overview', 'Year']

# Arabic Movies Data From Kaggle: 

In [93]:
data = pd.read_csv('D:\OneDrive - NITT\Custom_Download\Arabicdata0.csv') 
data.head()

Unnamed: 0,اسم الفيلم,تاريخ العرض,تصنيف الفيلم,مدة الفيلم (دقيقة),ملخص,تمثيل
0,قلب المرأة,1940,رومانسي,87,دار حدث فيلم خيري فتاه ثري مخطوب ابناء عموم تف...,"سليمان نجيب,أمينة رزق,دولت أبيض,عقيلة راتب,سلو..."
1,الورشة,1940,دراما,105,سافر سطا عبدالرحمن صاحب ورشه ميكانزم زميل رحله...,"عزيزة أمير,محمود ذو الفقار,أنور وجدي,نجمة إبرا..."
2,حياة الظلام,1940,دراما,94,عاءله متوسط حال مكون زوج زوج ٱبن حصل ٱبن اجازه...,"ميمي شكيب,محسن سرحان,روحية خالد,فردوس محمد,علي..."
3,الباشمقاول,1940,دراما,85,دار حدث فيلم محامي ممتاز هوه مغازله نساء رغم ز...,"فوزي الجزايرلي,ميمي شكيب,إحسان الجزايرلي,زوزو ..."
4,يوم سعيد,1940,رومانسي,125,محمد كمال شاب بسيط هوه موسيقي غناء عطف صاحب بي...,"محمد عبدالوهاب,علوية جميل,فاطمة محجوب,فاتن حما..."


In [94]:
data['original_language'] = 'ar' 

In [97]:
moviesNames=data.get('اسم الفيلم')
date=data.get('تاريخ العرض')
categories=data.get('تصنيف الفيلم')
synopses=data.get('ملخص')
actors=data.get('تمثيل')

In [123]:
ae_data = pd.DataFrame()

In [124]:
ae_data['Year'] = date
ae_data['original_language'] = 'ae'
ae_data['genres'] = categories
ae_data['overview'] = synopses
ae_data['crew'] = actors
ae_data['title'] = moviesNames

In [125]:
ae_data.dropna(inplace=True)

In [126]:
ae_data['crew'] = ae_data['crew'].apply(lambda x:x.split(','))
ae_data['genres'] = ae_data['genres'].apply(lambda x:x.split(','))

In [127]:
ae_data.head()

Unnamed: 0,Year,original_language,genres,overview,crew,title
0,1940,ae,[رومانسي],دار حدث فيلم خيري فتاه ثري مخطوب ابناء عموم تف...,"[سليمان نجيب, أمينة رزق, دولت أبيض, عقيلة راتب...",قلب المرأة
1,1940,ae,[دراما],سافر سطا عبدالرحمن صاحب ورشه ميكانزم زميل رحله...,"[عزيزة أمير, محمود ذو الفقار, أنور وجدي, نجمة ...",الورشة
2,1940,ae,[دراما],عاءله متوسط حال مكون زوج زوج ٱبن حصل ٱبن اجازه...,"[ميمي شكيب, محسن سرحان, روحية خالد, فردوس محمد...",حياة الظلام
3,1940,ae,[دراما],دار حدث فيلم محامي ممتاز هوه مغازله نساء رغم ز...,"[فوزي الجزايرلي, ميمي شكيب, إحسان الجزايرلي, ز...",الباشمقاول
4,1940,ae,[رومانسي],محمد كمال شاب بسيط هوه موسيقي غناء عطف صاحب بي...,"[محمد عبدالوهاب, علوية جميل, فاطمة محجوب, فاتن...",يوم سعيد


In [129]:
def create_soup_ae(x):
    return ' '.join(x['crew']) + ' ' + ' '.join(x['genres'])

In [130]:
ae_data['soup'] = ae_data.apply(create_soup_ae, axis=1)

In [131]:
list(ae_data)

['Year', 'original_language', 'genres', 'overview', 'crew', 'title', 'soup']

In [144]:
data_full = metadata.append(ae_data)

In [145]:
data_full

Unnamed: 0,title,genres,original_language,crew,soup,overview,Year
0,Toy Story,"[Animation, Comedy, Family]",en,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",tomhanks timallen donrickles Animation Comedy ...,"Led by Woody, Andy's toys live happily in his ...",1995
1,Jumanji,"[Adventure, Fantasy, Family]",en,"[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",robinwilliams jonathanhyde kirstendunst Advent...,When siblings Judy and Peter discover an encha...,1995
2,Grumpier Old Men,"[Romance, Comedy]",en,"[{'credit_id': '52fe466a9251416c75077a89', 'de...",waltermatthau jacklemmon ann-margret Romance C...,A family wedding reignites the ancient feud be...,1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",en,"[{'credit_id': '52fe44779251416c91011acb', 'de...",whitneyhouston angelabassett lorettadevine Com...,"Cheated on, mistreated and stepped on, the wom...",1995
4,Father of the Bride Part II,[Comedy],en,"[{'credit_id': '52fe44959251416c75039ed7', 'de...",stevemartin dianekeaton martinshort Comedy,Just when George Banks has recovered from his ...,1995
...,...,...,...,...,...,...,...
1177,الشك يا حبيبي,[دراما],ae,"[شادية, محمود ياسين, ناهد شريف, يحيى شاهين, سن...",شادية محمود ياسين ناهد شريف يحيى شاهين سناء جم...,رباب زوجه رجل عمل عبدالرحيم طبيب مرض نساء تولي...,1979
1178,دعوني انتقم,[جريمة],ae,"[رشدي أباظة, حسين فهمي, مديحة كامل, عادل أدهم,...",رشدي أباظة حسين فهمي مديحة كامل عادل أدهم زيزي...,القي هدي زوجه الراءد محمود ٱبن مصرع يد رجل عصا...,1979
1179,عاصفة من الدموع,[دراما],ae,"[فريد شوقي, ليلى طاهر, عمر الحريري, مريم فخر ا...",فريد شوقي ليلى طاهر عمر الحريري مريم فخر الدين...,سكرتير عزيز توصل عصام محامي زواج هبه حامل عصام...,1979
1180,خائفة من شيء ما,[ﺗﺸﻮﻳﻖ ﻭﺇﺛﺎﺭﺓ],ae,"[نجوى إبراهيم, رشدي أباظة, عزت العلايلي, جميل ...",نجوى إبراهيم رشدي أباظة عزت العلايلي جميل راتب...,راشد خطف طالب عصم منزل فاجا ٱعتدي لص رءوف انقذ...,1979
