   #                    Basic Movie Recommendation System 

## Imports 

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#url = 'https://raw.githubusercontent.com/codeheroku/Introduction-to-Machine-Learning/master/Building%20a%20Movie%20Recommendation%20Engine/movie_dataset.csv'
df1= pd.read_csv('tmdb_5000_credits.csv')
df2 = pd.read_csv('tmdb_5000_movies.csv')

In [3]:
df1.columns = ['id','tittle','cast','crew']


In [4]:
df= df2.merge(df1,on='id')

## Finding the best movies following imdb rating criteria

In [5]:
C= df2['vote_average'].mean()
m= df2['vote_count'].quantile(0.75)
k = df.copy()
l = k.copy()
good_movies = k.loc[df2['vote_count'] >= m]

In [6]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [7]:
good_movies['score'] = good_movies.apply(weighted_rating, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
good_movies = good_movies.sort_values('score', ascending=False)
good_movies[['title', 'score']]

Unnamed: 0,title,score
1881,The Shawshank Redemption,8.301547
3337,The Godfather,8.143459
662,Fight Club,8.139688
3232,Pulp Fiction,8.122458
65,The Dark Knight,8.078054
...,...,...
91,Independence Day: Resurgence,5.172190
303,Catwoman,5.102609
3746,The Boy Next Door,4.934696
210,Batman & Robin,4.847114


In [9]:
#Print the top n movies
def goodies(n):
    goodies= good_movies[['title', 'vote_average', 'score']].head(n)
    print(goodies)



In [10]:
 # print top n movies by giving the no of movies you want in the arguement. 
goodies(10) 

                         title  vote_average     score
1881  The Shawshank Redemption           8.5  8.301547
3337             The Godfather           8.4  8.143459
662                 Fight Club           8.3  8.139688
3232              Pulp Fiction           8.3  8.122458
65             The Dark Knight           8.2  8.078054
809               Forrest Gump           8.2  8.020698
96                   Inception           8.1  7.997869
1818          Schindler's List           8.3  7.978806
3865                  Whiplash           8.3  7.973979
95                Interstellar           8.1  7.972478


## Recommendation using content-based filtering

In [11]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [12]:
# Returns the name of the director

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


# Returns the list top 4 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 4:
            names = names[:4]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [13]:

df['director'] = df['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(get_list)

In [14]:
# Print the new features of the first 3 films
df[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",James Cameron,"[culture clash, future, space war, space colony]","[Action, Adventure, Fantasy, Science Fiction]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Gore Verbinski,"[ocean, drug abuse, exotic island, east india ...","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",Sam Mendes,"[spy, based on novel, secret agent, sequel]","[Action, Adventure, Crime]"


In [15]:
# cleaning the data toi avoid the misunderstanding caused by space between name and surname 

def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [16]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)


In [17]:
def combine_features(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
for feature in features:
    df[feature] = df[feature].fillna('') #filling all NaNs with blank string
df["combined_features"] = df.apply(combine_features,axis=1) #applying combined_features() method over each rows of dataframe and storing the combined string in “combined_features” column

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

cv= CountVectorizer(stop_words='english')
count_matrix = cv.fit_transform(df['combined_features'])

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix)

In [20]:
df = df.reset_index()
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,status,tagline,title,vote_average,vote_count,tittle,cast,crew,director,combined_features
0,0,237000000,"[action, adventure, fantasy, sciencefiction]",http://www.avatarmovie.com/,19995,"[cultureclash, future, spacewar, spacecolony]",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[samworthington, zoesaldana, sigourneyweaver, ...","[{'credit_id': '52fe48009251416c750aca23', 'de...",jamescameron,cultureclash future spacewar spacecolony samwo...
1,1,300000000,"[adventure, fantasy, action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drugabuse, exoticisland, eastindiatrad...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[johnnydepp, orlandobloom, keiraknightley, ste...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",goreverbinski,ocean drugabuse exoticisland eastindiatradingc...
2,2,245000000,"[action, adventure, crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, basedonnovel, secretagent, sequel]",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[danielcraig, christophwaltz, léaseydoux, ralp...","[{'credit_id': '54805967c3a36829b5002c41', 'de...",sammendes,spy basedonnovel secretagent sequel danielcrai...
3,3,250000000,"[action, crime, drama, thriller]",http://www.thedarkknightrises.com/,49026,"[dccomics, crimefighter, terrorist, secretiden...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[christianbale, michaelcaine, garyoldman, anne...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...",christophernolan,dccomics crimefighter terrorist secretidentity...
4,4,260000000,"[action, adventure, sciencefiction]",http://movies.disney.com/john-carter,49529,"[basedonnovel, mars, medallion, spacetravel]",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[taylorkitsch, lynncollins, samanthamorton, wi...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...",andrewstanton,basedonnovel mars medallion spacetravel taylor...


In [21]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]
def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

In [22]:
def get_prediction (x, n):
    movie_user_likes = x
    number = n
    movie_index = get_index_from_title(movie_user_likes)
    similar_movies =  list(enumerate(cosine_sim[movie_index]))
    sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]
    i=0
    print("Top "+str(n)+" similar movies to "+movie_user_likes+" are:\n".format(n))
    for element in sorted_similar_movies:
        print(get_title_from_index(element[0]))
        i=i+1
        if i>=n:
            break

In [23]:
get_prediction('Gone Girl', 10)

Top 10 similar movies to Gone Girl are:

Fabled
The Girl with the Dragon Tattoo
The Town
Shutter Island
Jack Reacher
The Game
The Number 23
The Curious Case of Benjamin Button
Zodiac
The Bourne Identity


### With only  'Genres' & 'Keywords '

In [24]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    k[feature] = k[feature].apply(literal_eval)

In [25]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 6 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 6:
            names = names[:6]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [26]:
k['director'] = k['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    k[feature] = k[feature].apply(get_list)

In [27]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [28]:
features = ['keywords', 'genres']

for feature in features:
    k[feature] = k[feature].apply(clean_data)


In [29]:
def combine_features_2(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['genres'])
for feature in features:
    k[feature] = k[feature].fillna('') #filling all NaNs with blank string
k["combined_features_less"] = k.apply(combine_features_2,axis=1) #applying combined_features() method over each rows of dataframe and storing the combined string in “combined_features” column


In [30]:
count_matrix_2 = cv.fit_transform(k['combined_features_less'])
cosine_sim_2 = cosine_similarity(count_matrix_2)

In [31]:
k = k.reset_index()
k.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,status,tagline,title,vote_average,vote_count,tittle,cast,crew,director,combined_features_less
0,0,237000000,"[action, adventure, fantasy, sciencefiction]",http://www.avatarmovie.com/,19995,"[cultureclash, future, spacewar, spacecolony, ...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{'credit_id': '52fe48009251416c750aca23', 'de...",James Cameron,cultureclash future spacewar spacecolony socie...
1,1,300000000,"[adventure, fantasy, action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drugabuse, exoticisland, eastindiatrad...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",Gore Verbinski,ocean drugabuse exoticisland eastindiatradingc...
2,2,245000000,"[action, adventure, crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, basedonnovel, secretagent, sequel, mi6, ...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[{'credit_id': '54805967c3a36829b5002c41', 'de...",Sam Mendes,spy basedonnovel secretagent sequel mi6 britis...
3,3,250000000,"[action, crime, drama, thriller]",http://www.thedarkknightrises.com/,49026,"[dccomics, crimefighter, terrorist, secretiden...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman, A...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...",Christopher Nolan,dccomics crimefighter terrorist secretidentity...
4,4,260000000,"[action, adventure, sciencefiction]",http://movies.disney.com/john-carter,49529,"[basedonnovel, mars, medallion, spacetravel, p...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...",Andrew Stanton,basedonnovel mars medallion spacetravel prince...


In [32]:
def get_title_from_index(index):
    return k[k.index == index]["title"].values[0]
def get_index_from_title(title):
    return k[k.title == title]["index"].values[0]
def get_prediction_g_k (x, n):
    movie_user_likes = x
    number = n
    movie_index = get_index_from_title(movie_user_likes)
    similar_movies =  list(enumerate(cosine_sim_2[movie_index]))
    sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]
    i=0
    print("Top "+str(n)+" similar movies to "+movie_user_likes+" are:\n".format(n))
    for element in sorted_similar_movies:
        print(get_title_from_index(element[0]))
        i=i+1
        if i>=n:
            break

In [33]:
get_prediction_g_k('Gone Girl', 10)

Top 10 similar movies to Gone Girl are:

Slow Burn
Broken Horses
The Caveman's Valentine
Stoker
The Life Before Her Eyes
Red Riding: In the Year of Our Lord 1974
Goddess of Love
Amnesiac
#Horror
Brigham City
