In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
from sklearn.metrics.pairwise import euclidean_distances
from scipy import stats

In [2]:
def get_recommendations(title, cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` Series.
    cosine_sim : array-like
        Square pairwise score matrix; row ``i`` holds the score of movie ``i``
        against every movie, with larger values meaning "more similar".
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Entries of ``df_movies['title']`` for the ``top_n`` best-scoring
        movies, best first, never including the query movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series
    # instead of a scalar; fall back to the first matching row in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie by position rather than assuming it sorts first:
    # with tied scores an earlier row can be ranked ahead of it.
    movie_indices = [i for i, _ in ranked if i != idx][:top_n]
    return df_movies['title'].iloc[movie_indices]

In [6]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    ``x`` is an iterable of dicts with 'job' and 'name' keys. ``np.nan`` is
    returned when no entry has the job 'Director'.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

In [4]:
def get_list(x):
    """Return up to the first three 'name' values from a list of dicts.

    Anything that is not a list yields an empty list.
    """
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep the leading three.
    return [entry['name'] for entry in x][:3]

In [5]:
def clean_data(x):
    """Lower-case ``x`` and remove its spaces.

    A list is processed element by element, a string is processed directly,
    and any other value is replaced by an empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

In [7]:
def create_soup(x):
    """Build the whitespace-separated metadata string for one movie row.

    ``x`` must provide list-valued 'keywords', 'cast' and 'genres' and a string
    'director'. The result contains, in order: keywords, cast, director,
    genres, director, director, cast — so the director appears three times and
    the cast twice, exactly as in the original expression.
    """
    keywords = ' '.join(x['keywords'])
    cast = ' '.join(x['cast'])
    genres = ' '.join(x['genres'])
    director = x['director']
    # Joining a list avoids the dangling `+` line continuation, which is a
    # SyntaxError outside parentheses.
    return ' '.join([keywords, cast, director, genres, director, director, cast])

In [8]:
# Load the TMDB 5000 credits and movies tables. The paths are relative to the
# current working directory, so the kernel must be started from the folder
# that contains `Desktop/`.
df_credits=pd.read_csv('./Desktop/MovieRecommendor/tmdb_5000_credits.csv')
df_movies=pd.read_csv('./Desktop/MovieRecommendor/tmdb_5000_movies.csv')
df_movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [9]:
# Rename the credits columns, then join them onto the movies by `id`.
# NOTE(review): 'tittle' is presumably a deliberate misspelling so the credits
# title does not collide with df_movies['title'] in the merge — confirm.
# This cell rebinds df_movies, so running it a second time merges the credits
# columns in again.
df_credits.columns = ['id','tittle','cast','crew']
df_movies= df_movies.merge(df_credits,on='id')

In [10]:
# C: mean of `vote_average` over all movies.
# m: 70th percentile of `vote_count`, used below as the minimum-votes cutoff.
C= df_movies['vote_average'].mean()
m= df_movies['vote_count'].quantile(0.7)

In [11]:
# Keep only the movies with at least `m` votes, working on a copy so that
# later column assignments do not touch df_movies.
q_movies = df_movies.copy().loc[df_movies['vote_count'] >= m]
q_movies['title'].head(10)

0                                      Avatar
1    Pirates of the Caribbean: At World's End
2                                     Spectre
3                       The Dark Knight Rises
4                                 John Carter
5                                Spider-Man 3
6                                     Tangled
7                     Avengers: Age of Ultron
8      Harry Potter and the Half-Blood Prince
9          Batman v Superman: Dawn of Justice
Name: title, dtype: object

In [12]:
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own average rating with the overall mean ``C``.

    ``x`` supplies 'vote_count' (v) and 'vote_average' (R). The result is
    ``v/(v+m) * R + m/(m+v) * C``: the more votes a movie has relative to
    ``m``, the more its own average dominates. The defaults for ``m`` and
    ``C`` are captured from the module-level values when the function is
    defined.
    """
    votes, average = x['vote_count'], x['vote_average']
    own_weight = votes / (votes + m)
    prior_weight = m / (m + votes)
    return (own_weight * average) + (prior_weight * C)

In [13]:
# Score every qualifying movie, one row at a time, with weighted_rating.
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [14]:
# Rank by score, best first, and show the top ten.
q_movies = q_movies.sort_values('score', ascending=False)
q_movies[['title', 'score', 'vote_average']].head(10)

Unnamed: 0,title,score,vote_average
1881,The Shawshank Redemption,8.340775,8.5
3337,The Godfather,8.192887,8.4
662,Fight Club,8.171648,8.3
3232,Pulp Fiction,8.157615,8.3
65,The Dark Knight,8.102674,8.2
809,Forrest Gump,8.056059,8.2
1818,Schindler's List,8.038748,8.3
3865,Whiplash,8.034695,8.3
96,Inception,8.018611,8.1
1990,The Empire Strikes Back,8.010426,8.2


Content-Based Filtering

In [15]:
# Build a TF-IDF matrix from the plot overviews, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# Missing overviews become empty strings so every row can be vectorized.
df_movies['overview'] = df_movies['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(df_movies['overview'])

In [57]:
# Pairwise dot products of the TF-IDF rows. This equals cosine similarity as
# long as the rows are L2-normalised (TfidfVectorizer's default `norm`;
# re-check if that setting is ever changed).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Negated Euclidean distances, presumably so that larger values mean "more
# similar" and the matrix can be ranked like cosine_sim. Not used in the
# cells shown below.
euclidean = -1 * euclidean_distances(tfidf_matrix, tfidf_matrix)

In [51]:
# Map each title to its row position in df_movies.
# Series.drop_duplicates() compares the *values* (the row positions, which are
# already unique), so it never removed repeated titles. Deduplicate on the
# title index instead, keeping the first occurrence, so that indices[title]
# is always a scalar.
indices = pd.Series(df_movies.index, index=df_movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [52]:
# Overview-based (TF-IDF) recommendations for 'Avatar'.
# NOTE(review): the saved output shares its first four titles with the saved
# output of the 'The Green Mile' query, which looks like stale kernel state —
# re-check after Restart & Run All.
get_recommendations('Avatar',cosine_sim)

2656    Chiamatemi Francesco - Il Papa della gente
4140                   To Be Frank, Sinatra at 100
4401                           The Helix... Loaded
4431                                   Food Chains
3604                                     Apollo 18
2130                                  The American
634                                     The Matrix
1341                          The Inhabited Island
529                               Tears of the Sun
1610                                         Hanna
Name: title, dtype: object

In [53]:
# Overview-based (TF-IDF) recommendations for 'The Green Mile'.
# NOTE(review): the saved output shares its first four titles with the saved
# output of the 'Avatar' query, which looks like stale kernel state —
# re-check after Restart & Run All.
get_recommendations('The Green Mile', cosine_sim)

2656    Chiamatemi Francesco - Il Papa della gente
4140                   To Be Frank, Sinatra at 100
4401                           The Helix... Loaded
4431                                   Food Chains
3741                                Monster's Ball
1897                                Half Past Dead
980                         The Life of David Gale
3985       Friday the 13th Part VII: The New Blood
2926                              Dead Man Walking
931                         Race to Witch Mountain
Name: title, dtype: object

In [20]:
# Parse the stringified list-of-dict columns into Python objects.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    # literal_eval raises on anything that is not a string, so only parse
    # values that are still strings; this makes re-running the cell a no-op
    # instead of an error once the columns hold parsed lists.
    df_movies[feature] = df_movies[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [21]:
# Pull the director's name out of the parsed crew list.
df_movies['director'] = df_movies['crew'].apply(get_director)
# Reduce cast, keywords and genres to at most three plain names each.
# NOTE: get_list indexes every element with ['name'], so re-running this cell
# after the columns already hold plain strings would raise.
features = ['cast', 'keywords', 'genres']
for feature in features:
    df_movies[feature] = df_movies[feature].apply(get_list)

In [22]:
# Preview the extracted metadata columns.
df_movies[['title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent]","[Action, Adventure, Crime]"
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,"[dc comics, crime fighter, terrorist]","[Action, Crime, Drama]"
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"[based on novel, mars, medallion]","[Action, Adventure, Science Fiction]"


In [23]:
# Preview the full frame after feature extraction.
df_movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,tittle,cast,crew,director
0,237000000,"[Action, Adventure, Fantasy]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war]",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{'credit_id': '52fe48009251416c750aca23', 'de...",James Cameron
1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drug abuse, exotic island]",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",Gore Verbinski
2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, based on novel, secret agent]",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{'credit_id': '54805967c3a36829b5002c41', 'de...",Sam Mendes
3,250000000,"[Action, Crime, Drama]",http://www.thedarkknightrises.com/,49026,"[dc comics, crime fighter, terrorist]",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...",Christopher Nolan
4,260000000,"[Action, Adventure, Science Fiction]",http://movies.disney.com/john-carter,49529,"[based on novel, mars, medallion]",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...",Andrew Stanton


In [24]:
# Lower-case the metadata values and strip their spaces with clean_data,
# presumably so that multi-word names become single tokens for the vectorizer.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df_movies[feature] = df_movies[feature].apply(clean_data)

In [25]:
# Combine each row's cleaned metadata into a single string via create_soup.
df_movies['soup'] = df_movies.apply(create_soup, axis=1)

In [26]:
# Vectorize the soup strings as raw term counts, ignoring English stop words.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_movies['soup'])

In [27]:
# Pairwise cosine similarity between the count vectors of all movies.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [28]:
# NOTE(review): reset_index() without drop=True keeps the old index as a new
# 'index' column, so this cell is not idempotent; left unchanged in case that
# column is used further down.
df_movies = df_movies.reset_index()
# Map each title to its row position. Repeated titles are collapsed to their
# first occurrence so that indices[title] is always a scalar rather than a
# Series.
indices = pd.Series(df_movies.index, index=df_movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [37]:
# Metadata-based recommendations (scores from the soup count vectors).
get_recommendations('The Shawshank Redemption', cosine_sim2)

690               The Green Mile
559                 The Majestic
2751                    The Mist
2926            Dead Man Walking
1522            Cradle Will Rock
1283         The Hudsucker Proxy
1556                Mystic River
1661                   Antitrust
2704                Catch a Fire
4638    Amidst the Devil's Wings
Name: title, dtype: object

In [39]:
# Metadata-based recommendations (scores from the soup count vectors).
get_recommendations('The Rainmaker', cosine_sim2)

867     The Godfather: Part III
3337              The Godfather
1018            The Cotton Club
2731     The Godfather: Part II
3012              The Outsiders
4209           The Conversation
1525             Apocalypse Now
2333      Peggy Sue Got Married
3401                      Twixt
2600           New York Stories
Name: title, dtype: object

In [31]:
# Load the user ratings (path is relative to the current working directory).
ratings = pd.read_csv('./Desktop/MovieRecommendor/ratings_small.csv')
# Reader() assumes a 1-5 rating scale by default. The file contains half-star
# ratings, so the true minimum may lie below 1 (TODO confirm); derive the scale
# from the data so that predictions are clipped to the real range.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [32]:
# Wrap the (user, item, rating) columns in a Surprise Dataset.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [33]:
# Seed both RNGs before any stochastic step so that the fold split and the SVD
# initialisation — and therefore the reported RMSE/MAE — are reproducible
# under Restart & Run All.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

svd = SVD()
# 5-fold cross-validation reporting RMSE and MAE per fold.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)

{'test_rmse': array([0.89436598, 0.89260879, 0.89748886, 0.8907387 , 0.9100275 ]),
 'test_mae': array([0.68891284, 0.68823508, 0.69215914, 0.68859321, 0.69862209]),
 'fit_time': (5.27518105506897,
  5.764752626419067,
  6.771573781967163,
  5.065270662307739,
  5.307448387145996),
 'test_time': (0.22551250457763672,
  0.19232559204101562,
  0.19416284561157227,
  0.16683316230773926,
  0.19488978385925293)}

In [34]:
# Refit the model on every available rating before making predictions.
svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd95d9986a0>

In [36]:
# Print the q_movies titles whose id matches an item id that the model rates
# above 3.3 for user 1.
# NOTE(review): the item ids come from the ratings file's `movieId` column,
# while q_movies['id'] comes from the TMDB movies file — presumably two
# different id spaces, so these matches may be coincidental; verify (a mapping
# table may be needed).
list_of_movies=[]  # never populated in this cell
for i in range(0,10000):
    # r_ui (the true rating) is not needed to obtain an estimate, so the
    # placeholder value 500 is no longer passed.
    prediction = svd.predict(1, i)
    if prediction.est > 3.3:
        # Evaluate the id filter once per candidate instead of twice.
        matches = q_movies.loc[q_movies['id'] == prediction.iid, 'title']
        if not matches.empty:
            print(matches)

1260    Amélie
Name: title, dtype: object
690    The Green Mile
Name: title, dtype: object
43    Terminator Salvation
Name: title, dtype: object
1145    The Sixth Sense
Name: title, dtype: object
1053    Galaxy Quest
Name: title, dtype: object
509    Madagascar
Name: title, dtype: object
425    Mission: Impossible
Name: title, dtype: object
1170    The Talented Mr. Ripley
Name: title, dtype: object
1275    Sunshine
Name: title, dtype: object
452    Space Jam
Name: title, dtype: object
