In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

In [2]:
movies = pd.read_csv("movie_dataset.csv")
movies.isna().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [3]:
print(movies.shape)
movies.head(2)

(4803, 24)


Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski


# Helper Functions

In [4]:
def getIndexFromTitle(title):
    """Return the index label of the first row of ``movies`` whose ``title`` matches.

    Parameters
    ----------
    title : str
        Exact value to compare against the ``title`` column of the global
        ``movies`` DataFrame.

    Returns
    -------
    The index label of the first matching row.

    Raises
    ------
    IndexError
        If no row has that title. The exception type is the same one the bare
        ``.index[0]`` lookup raised; only the message is more descriptive.
    """
    matches = movies[movies.title == title].index
    if len(matches) == 0:
        raise IndexError(f"No movie with title {title!r} found in the movies table")
    return matches[0]

In [5]:
def getTitleFromIndex(index):
    """Return the ``original_title`` of the row at position ``index`` in ``movies``.

    Parameters
    ----------
    index : int
        Zero-based row position in the global ``movies`` DataFrame.

    Returns
    -------
    The ``original_title`` value stored in that row.

    Raises
    ------
    IndexError
        If ``index`` is outside the table.
    """
    # Purely positional access: slicing by position and then looking the row up
    # by label only works when the index is the default 0..n-1 range.
    # NOTE(review): this returns `original_title`, while getIndexFromTitle
    # matches on `title` -- confirm the two columns are meant to differ here.
    return movies.original_title.iloc[index]

# Remove Punctuation from Text

In [6]:
# Translation table that maps every character in string.punctuation to None,
# which makes str.translate delete it.
punc = str.maketrans("", "", string.punctuation)

def remove_punctuation(text):
    """Return ``text`` with all characters from ``string.punctuation`` removed."""
    stripped = text.translate(punc)
    return stripped

# Selecting Features to Be Used in the Model

In [7]:
# Text columns that get concatenated into each movie's content description.
features = ['keywords', 'cast', 'genres', 'director', 'tagline', 'original_title', 'overview']

# Replace missing values in those columns (and only those) with empty strings,
# so the string concatenation that follows never meets NaN.
movies[features] = movies[features].fillna("")

In [8]:
# Join the selected text columns of every row with single spaces ...
joined_text = movies[features].agg(" ".join, axis=1)

# ... then trim the ends, lower-case, and strip punctuation (in that order).
movies['combined_features'] = (
    joined_text
    .str.strip()
    .str.lower()
    .map(remove_punctuation)
)

# Creating TF-IDF Vectors and Finding Cosine Similarity

In [9]:
# Learn the TF-IDF vocabulary from the combined text and vectorise every movie.
corpus = movies['combined_features']
tf = TfidfVectorizer()
vector = tf.fit_transform(corpus)

# Pairwise cosine similarity of the movie vectors against themselves:
# entry [i, j] is the similarity between movie i and movie j.
similarity_scores = cosine_similarity(vector, vector)

In [10]:
# Title of the movie to find recommendations for; it is resolved to a row
# index with getIndexFromTitle in the recommendation cell below.
temp_movie = "The Matrix"

# Using the Content-Based Recommendation System

In [11]:
movie_index = getIndexFromTitle(temp_movie)

# Pair each movie's position with its similarity to the query movie.
# The query itself is excluded by index rather than by assuming it sorts to
# position 0: another movie with identical text would tie with it.
similar_movies = [
    (position, score)
    for position, score in enumerate(similarity_scores[movie_index])
    if position != movie_index
]
# list.sort is stable, so equally similar movies keep their table order.
similar_movies.sort(key=lambda pair: pair[1], reverse=True)

print(f"Movies Similar To: {temp_movie}\n\nRecommended Movies:")

# printing title of the 10 most similar movies (fewer if the table is smaller)
for rank, (position, _) in enumerate(similar_movies[:10], start=1):
    print(f"{rank}. {getTitleFromIndex(position)}")

Movies Similar To: The Matrix

Recommended Movies:
1. The Matrix Revolutions
2. The Matrix Reloaded
3. Commando
4. Transcendence
5. The Terminator
6. A.I. Artificial Intelligence
7. Interstellar
8. Terminator 3: Rise of the Machines
9. The Thirteenth Floor
10. Hackers


# Collaborative Recommendation System

In [12]:
# NOTE: `movies` is rebound here to a different table than the one loaded from
# "movie_dataset.csv" earlier. getIndexFromTitle / getTitleFromIndex read the
# global `movies`, so after this cell they operate on this new table.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

# Quick sanity check of both table sizes: (rows, columns) for each.
print(ratings.shape, movies.shape)

(58856, 4) (9742, 3)


In [13]:
# Attach movie metadata to every rating by joining on the columns the two
# tables share, then discard the columns the recommender does not use.
# errors='ignore' keeps the cell re-runnable: `ratings` is overwritten, so on a
# second execution `timestamp` is already gone and a strict drop would raise
# KeyError.
ratings = pd.merge(ratings, movies).drop(columns=['genres', 'timestamp'], errors='ignore')
ratings.shape

(58856, 4)

In [14]:
ratings.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [15]:
# Reshape the long ratings table into a wide matrix: one row per userId, one
# column per movie title, cells holding the rating (NaN where the user has not
# rated that title).
user_ratings = pd.pivot_table(ratings, values='rating', index='userId', columns='title')
user_ratings.head()

title,'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...All the Marbles (1981),...And Justice for All (1979),...,Zootopia (2016),Zulu (1964),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,4.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Keep only movies (columns) that have at least 10 ratings, then replace the
# remaining missing ratings with 0.
# NOTE(review): after the fill, "not rated" is indistinguishable from a rating
# of 0 in the correlation computed next -- confirm that is intended.
user_ratings = user_ratings.dropna(axis=1, thresh=10).fillna(0)

In [17]:
# Movie-by-movie Pearson correlation between the rating columns; both the rows
# and the columns of the result are movie titles.
movie_similarity = user_ratings.corr(method="pearson")
movie_similarity.head(n=5)

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),13 Going on 30 (2004),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,-0.041448,0.117769,-0.023799,0.103569,0.137886,0.011204,-0.027168,-0.026149,0.035116,...,0.120493,0.013679,-0.02936,0.105517,0.011302,-0.046504,0.001386,-0.037174,0.03988,0.147042
(500) Days of Summer (2009),-0.041448,1.0,0.209336,0.219671,0.139245,0.089198,0.167206,0.169419,0.231447,0.297795,...,-0.030329,0.026066,0.334488,0.029089,0.386179,0.317635,0.24009,0.227113,0.264045,0.045721
10 Things I Hate About You (1999),0.117769,0.209336,1.0,0.270613,0.19577,0.127772,-0.00306,0.065435,0.01301,0.161357,...,0.120401,0.1104,0.301059,0.038976,0.048745,0.147973,0.305589,0.043807,0.154498,0.045829
"10,000 BC (2008)",-0.023799,0.219671,0.270613,1.0,0.228515,0.104175,0.09758,-0.02724,0.077947,0.157426,...,0.072903,0.046936,0.322426,0.084153,0.20344,0.342681,0.283987,0.108378,0.206732,0.090097
101 Dalmatians (1996),0.103569,0.139245,0.19577,0.228515,1.0,0.269325,0.075406,0.061794,0.038203,0.174846,...,0.181398,0.030488,0.185425,0.071356,0.060675,0.146466,0.246673,0.108583,0.226002,0.071757


In [18]:
def getSimilarMovies(movie_name, user_rating, neutral_rating=2.5):
    """Score every movie by its similarity to ``movie_name``, weighted by the rating.

    Each movie's correlation with ``movie_name`` (taken from the global
    ``movie_similarity`` matrix) is multiplied by ``user_rating - neutral_rating``.
    Ratings above the neutral point therefore promote similar movies, and
    ratings below it demote them.

    Parameters
    ----------
    movie_name : str
        Column label in ``movie_similarity``.
    user_rating : float
        Rating the user gave to ``movie_name``.
    neutral_rating : float, default 2.5
        Rating treated as "no preference". The default keeps the previously
        hard-coded value (presumably the midpoint of a 0-5 scale -- confirm).

    Returns
    -------
    pandas.Series
        Weighted scores indexed by movie title, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``movie_similarity``.
    """
    if movie_name not in movie_similarity.columns:
        raise KeyError(f"{movie_name!r} is not present in the movie similarity matrix")
    weight = user_rating - neutral_rating
    weighted_scores = movie_similarity[movie_name] * weight
    return weighted_scores.sort_values(ascending=False)

In [19]:
def getRecommendedMovies(user_ratings):
    """Rank all movies for a user by summing their rating-weighted similarities.

    Parameters
    ----------
    user_ratings : iterable of (str, float)
        ``(movie title, rating)`` pairs; each pair is passed to
        ``getSimilarMovies``.

    Returns
    -------
    pandas.Series
        Total score per movie title, sorted from highest to lowest. Empty when
        ``user_ratings`` is empty.

    Notes
    -----
    Movies that appear in ``user_ratings`` are not removed from the result.
    """
    # Build all per-movie score Series first and combine them once.
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside a
    # loop copied it on every iteration.
    per_movie_scores = [getSimilarMovies(movie, rating) for movie, rating in user_ratings]
    if not per_movie_scores:
        # pd.concat rejects an empty list; an empty input has no recommendations.
        return pd.Series(dtype="float64")

    # One column per rated movie, aligned on title; summing across columns gives
    # each candidate's total. rename_axis(None) drops the index name so the
    # returned Series is displayed without a header line.
    totals = pd.concat(per_movie_scores, axis=1).sum(axis=1).rename_axis(None)
    return totals.sort_values(ascending=False)

In [20]:
# Example user profile: (movie title, rating) pairs fed to the recommender.
temp_user = [
    ("Zombieland (2009)", 5),
    ("Harry Potter and the Half-Blood Prince (2009)", 4),
    ("Angels & Demons (2009)", 2),
    ("Blair Witch Project, The (1999)", 1),
    ("Mad Max (1979)", 2),
]

# Show the profile as a small table.
print("User Information:")
user_profile = pd.DataFrame(temp_user, columns=['Movie', 'Rating'])
print(user_profile)

print("\nRecommendations:")

# Top 15 entries of the ranked scores, displayed as the cell's output.
getRecommendedMovies(temp_user).head(15)

User Information:
                                           Movie  Rating
0                              Zombieland (2009)       5
1  Harry Potter and the Half-Blood Prince (2009)       4
2                         Angels & Demons (2009)       2
3                Blair Witch Project, The (1999)       1
4                                 Mad Max (1979)       2

Recommendations:


Zombieland (2009)                                      2.779589
Harry Potter and the Half-Blood Prince (2009)          2.262980
Sherlock Holmes (2009)                                 1.978229
Avatar (2009)                                          1.944465
Harry Potter and the Deathly Hallows: Part 1 (2010)    1.897896
Harry Potter and the Deathly Hallows: Part 2 (2011)    1.832364
Scott Pilgrim vs. the World (2010)                     1.817541
Up (2009)                                              1.732310
Guardians of the Galaxy (2014)                         1.685299
Deadpool (2016)                                        1.666978
Kung Fu Panda (2008)                                   1.663672
National Treasure: Book of Secrets (2007)              1.653295
Hurt Locker, The (2008)                                1.648037
Skyfall (2012)                                         1.643549
Avengers, The (2012)                                   1.623800
dtype: float64