# RECOMENDER SYSTEM

A Collaborative Filtering method

In [1]:
import pandas as pd

## Data importing and Prepatation

In [2]:
# Importing the datasets
df_movie = pd.read_csv("../movie.csv")
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
df_movie.shape

(27278, 3)

In [4]:
df_rating = pd.read_csv("../rating.csv")
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
df_rating.shape

(20000263, 4)

In [6]:
# Merge the rating dataset and the movie dataset
df = pd.merge(df_rating, df_movie, on="movieId").sort_values("userId")

In [7]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
505014,1,541,4.0,2005-04-02 23:30:03,Blade Runner (1982),Action|Sci-Fi|Thriller
2380423,1,6807,3.5,2005-04-02 23:31:26,Monty Python's The Meaning of Life (1983),Comedy
2378699,1,6774,4.0,2005-04-02 23:49:46,Videodrome (1983),Fantasy|Horror|Sci-Fi|Thriller
2376750,1,6755,3.5,2004-09-10 03:14:27,Bubba Ho-tep (2002),Comedy|Horror


In [8]:
df.shape

(20000263, 6)

In [9]:
# Remove movies with less than 5000 votes from the dataset
votes_count = df["title"].value_counts().reset_index()
votes_count.columns = ["title", "count"]

rare_movies = votes_count[votes_count["count"] <= 5000]["title"]
popular_movies = df[~df["title"].isin(rare_movies)]

popular_movies.shape

(12865877, 6)

In [10]:
# Create a pivot table with userIDs in the index, the movie names in the columns and the ratings as values.
user_movie_df = popular_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

In [11]:
user_movie_df.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),"13th Warrior, The (1999)","20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),21 Grams (2003),28 Days Later (2002),...,X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Young Guns (1988),Zombieland (2009),Zoolander (2001),eXistenZ (1999),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,3.5,,3.5,...,,,4.0,,4.0,,,,,
2,,,,,,,,5.0,,,...,,,,,,,,,,
3,,,,,,,,5.0,,,...,,,,,5.0,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


  Determining the Movies watched by the user in order to make suggestions

In [12]:
# Select a random User
ran_user = int(pd.Series(user_movie_df.index).sample(1, random_state=45).iloc[0])

In [13]:
# Creating a Dataframe consists of observation units of the selected user
ran_user_df = user_movie_df[user_movie_df.index == ran_user]

In [14]:
ran_user_df.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),"13th Warrior, The (1999)","20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),21 Grams (2003),28 Days Later (2002),...,X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Young Guns (1988),Zombieland (2009),Zoolander (2001),eXistenZ (1999),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5979,,,,,,,,,,,...,,,,,,,,,,


In [15]:
# Create a list containing the movies voted by the selected user
movies_watched = ran_user_df.columns[ran_user_df.notna().any()].tolist()

Now, we try to reach other users who watched the same movies as our selected user

In [16]:
# Select the columns of the movies watched by the selected user from user_movie_df and create a new dataframe named movies_watched_df.
movies_watched_df = user_movie_df[movies_watched]

In [17]:
movies_watched_df.head()

title,Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Aladdin (1992),Apollo 13 (1995),Batman (1989),Batman Forever (1995),Beauty and the Beast (1991),Braveheart (1995),Clueless (1995),Dances with Wolves (1990),...,Outbreak (1995),Pretty Woman (1990),Pulp Fiction (1994),"Shawshank Redemption, The (1994)",Speed (1994),Star Trek: Generations (1994),Stargate (1994),Terminator 2: Judgment Day (1991),True Lies (1994),"Usual Suspects, The (1995)"
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,4.0,4.0,,,,3.5,,3.5
2,,,,,,,,4.0,,,...,,,,,,,,5.0,,
3,,,,,,,,,,,...,,,,5.0,,5.0,5.0,4.0,,5.0
4,,3.0,,,,,,,,,...,,,,,4.0,3.0,,4.0,3.0,
5,,,5.0,5.0,,,5.0,4.0,,5.0,...,,,,5.0,5.0,,4.0,5.0,5.0,


In [18]:
# Create a new dataframe, which contains information about the number of movies each user has watched.
user_movie_count = movies_watched_df.T.notnull().sum().reset_index()
user_movie_count.columns = ["userId", "movie_count"]

In [19]:
user_movie_count.head()

Unnamed: 0,userId,movie_count
0,1,5
1,2,4
2,3,8
3,4,12
4,5,16


In [20]:
# Create a list from the user IDs of those who watched 90% or more of the movies that the selected user voted for.
percent = len(movies_watched) * 0.90
users_same_movies = user_movie_count[user_movie_count["movie_count"] > percent]["userId"]

Now, we are determining the suggested users and the users who are most similar to them

In [21]:
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)], ran_user_df[movies_watched]])

In [22]:
final_df.head()

title,Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Aladdin (1992),Apollo 13 (1995),Batman (1989),Batman Forever (1995),Beauty and the Beast (1991),Braveheart (1995),Clueless (1995),Dances with Wolves (1990),...,Outbreak (1995),Pretty Woman (1990),Pulp Fiction (1994),"Shawshank Redemption, The (1994)",Speed (1994),Star Trek: Generations (1994),Stargate (1994),Terminator 2: Judgment Day (1991),True Lies (1994),"Usual Suspects, The (1995)"
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,3.0,1.0,3.0,4.0,4.0,3.0,3.0,5.0,3.0,5.0,...,4.0,5.0,5.0,,4.0,3.0,2.0,5.0,4.0,
26,3.0,3.0,3.0,4.0,3.0,3.0,5.0,4.0,4.0,4.0,...,3.0,3.0,5.0,5.0,3.0,3.0,3.0,5.0,3.0,5.0
69,4.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0,2.0,3.0,...,,3.0,4.0,4.0,2.0,,3.0,5.0,4.0,4.0
91,2.5,2.0,,3.5,4.0,3.0,,5.0,,4.5,...,4.0,3.0,3.5,4.0,2.5,3.5,4.0,2.5,3.0,3.5
184,2.0,3.0,3.0,4.0,4.0,4.0,3.0,5.0,,5.0,...,3.0,3.0,3.0,5.0,3.0,3.0,4.0,3.0,4.0,5.0


In [23]:
# Determining the correlations between users
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()

corr_df = pd.DataFrame(corr_df, columns=["corr"])

corr_df.index.names = ['user_id_1', 'user_id_2']

corr_df = corr_df.reset_index()

In [24]:
# Filter out users with a high correlation (over 0.65) with the selected user
top_users = corr_df[(corr_df["user_id_1"] == ran_user) & (corr_df["corr"]>0.65)][
    ["user_id_2", "corr"]].sort_values("corr", ascending=False).reset_index(drop=True)

top_users.rename(columns={"user_id_2": "userId"}, inplace= True)

In [25]:
# Merge the top_users dataframe with the rating dataset
top_users_ratings = top_users.merge(df_rating[["userId", "movieId", "rating"]], how = "inner")
top_users_ratings = top_users_ratings[~(top_users_ratings["userId"] == ran_user)]

Finally, Calculate the Weighted Average Recommendation Score and Keep the Top 5 Movies

In [26]:
# weighted_rating corresponds to the product of each user's correlation and rating.
top_users_ratings["weighted_rating"] = top_users_ratings["corr"] * top_users_ratings["rating"]

In [27]:
# Create a new dataframe containing the movie ID and the average weighted rating of all users
recommendation_df = top_users_ratings.groupby("movieId").agg({"weighted_rating": "mean"}).reset_index()

In [28]:
# Choose movies with a weighted rating greater than 3.5 in recommendation_df and rank them accordingly.
movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 3.5].sort_values("weighted_rating", ascending=False)

In [29]:
# Bring the movie names from the movie dataset and select the top 5 movies to recommend.
selected_movies = movies_to_be_recommend["movieId"].head(5).to_list()
df_movie[df_movie["movieId"].isin(selected_movies)]

Unnamed: 0,movieId,title,genres
56,57,Home for the Holidays (1995),Drama
278,281,Nobody's Fool (1994),Comedy|Drama|Romance
2910,2996,Music of the Heart (1999),Drama
4311,4406,"Man Who Shot Liberty Valance, The (1962)",Crime|Drama|Western
4615,4710,"Shootist, The (1976)",Drama|Western
