# Movie Recommendation System using Association Rule Mining

First, let us perform exploratory data analysis so that we get to know about attributes, size and null values present in the dataset.

In [1]:
import numpy as np
import pandas as pd
# Read the four CSV tables from the working directory, in the same order as before:
# movies, links, tags, ratings.
data_movies, data_links, data_tags, data_ratings = (
    pd.read_csv(csv_name)
    for csv_name in ('movies.csv', 'links.csv', 'tags.csv', 'ratings.csv')
)

# Summarise the movies table: (rows, columns), the first five rows, then the
# per-column dtype / non-null report.
print(data_movies.shape, data_movies.head(), sep='\n')
data_movies.info()

(9742, 3)
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [2]:
# Summarise the links table: (rows, columns), the first five rows, then the
# per-column dtype / non-null report.
print(data_links.shape, data_links.head(), sep='\n')
data_links.info()

(9742, 3)
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [3]:
# Summarise the ratings table: (rows, columns), the first five rows, then the
# per-column dtype / non-null report.
print(data_ratings.shape, data_ratings.head(), sep='\n')
data_ratings.info()

(100836, 4)
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
# Summarise the tags table: (rows, columns), the first five rows, then the
# per-column dtype / non-null report.
print(data_tags.shape, data_tags.head(), sep='\n')
data_tags.info()

(3683, 4)
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


Now computing the average rating of each movie in the dataset by averaging the individual user ratings for each movie.

In [5]:
# Plain-list views of the raw columns. `movie_ids` is reused by the tag-collection
# cell; the other two are kept under their original names in case other cells
# depend on them.
movie_ids = data_movies['movieId'].tolist()
movie_rating_ids = data_ratings['movieId'].tolist()
ratings = data_ratings['rating'].tolist()

# Accumulate the rating total and rating count per movie in ONE pass over the
# ratings, instead of rescanning every rating once per movie. Ratings are added
# in row order, so each total is built in the same order as before and the
# resulting averages are numerically identical.
# (The accumulator is deliberately not called `sum`: that would shadow the
# builtin for every later cell in the kernel.)
rating_totals = {}
rating_counts = {}
for rated_id, rating_value in zip(movie_rating_ids, ratings):
    rating_totals[rated_id] = rating_totals.get(rated_id, 0) + rating_value
    rating_counts[rated_id] = rating_counts.get(rated_id, 0) + 1

# Average rating per movie; a movie that has no ratings at all gets 0.
dict_ratings = {}
for movie_id in movie_ids:
    if movie_id in rating_counts:
        dict_ratings[movie_id] = rating_totals[movie_id] / rating_counts[movie_id]
    else:
        dict_ratings[movie_id] = 0

# Build the column by looking every row's movieId up in the dict. This always
# yields exactly one value per row, whereas assigning `dict_ratings.values()`
# directly fails with a length mismatch if a movieId ever appears twice.
data_movies['rating'] = [dict_ratings[movie_id] for movie_id in movie_ids]

Finding the distinct tags for each movie in the dataset and saving them in a dictionary keyed by movie ID.

In [6]:
movie_tag_ids = data_tags['movieId'].tolist()
movie_tags = data_tags['tag'].tolist()

# Group the tags by movie in ONE pass over the tag rows, instead of rescanning
# every tag row once per movie. The inner dict is used as an insertion-ordered
# set: it drops repeated tags while keeping first-seen order.
tags_by_movie = {}
for tagged_id, tag in zip(movie_tag_ids, movie_tags):
    tags_by_movie.setdefault(tagged_id, {})[tag] = None

# One entry per movie in the movies table; movies without any tag get [].
# The ids are read straight from `data_movies` so this cell does not depend on
# a variable left behind by another cell.
tagged_movie_ids = data_movies['movieId'].tolist()
dict_tags = {}
for movie_id in tagged_movie_ids:
    dict_tags[movie_id] = list(tags_by_movie.get(movie_id, ()))

# Build the column by looking every row's movieId up in the dict. This always
# yields exactly one value per row, whereas assigning `dict_tags.values()`
# directly fails with a length mismatch if a movieId ever appears twice.
data_movies['tags'] = [dict_tags[movie_id] for movie_id in tagged_movie_ids]

We have now added two columns to the movies dataset: the average rating and the list of tags for each movie.

In [7]:
# Display the movies table, which now carries the 'rating' and 'tags' columns
# added by the two preceding cells.
data_movies

Unnamed: 0,movieId,title,genres,rating,tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,"[pixar, fun]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,"[fantasy, magic board game, Robin Williams, game]"
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,"[moldy, old]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,[]
4,5,Father of the Bride Part II (1995),Comedy,3.071429,"[pregnancy, remake]"
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4.000000,[]
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.500000,[]
9739,193585,Flint (2017),Drama,3.500000,[]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.500000,[]


Now obtaining the transaction data for each user in the dataset and storing them in a list.

In [8]:
user_data = data_ratings['userId'].tolist()
movie_data = data_ratings['movieId'].tolist()

# Group the rated movie ids by user in ONE pass over the ratings. Dict insertion
# order keeps users in order of first appearance, and each user's movies stay in
# row order - the same ordering the per-user filtering produced, without a
# linear `in` scan per row or a full-frame filter per user.
movies_by_user = {}
for user_id, movie_id in zip(user_data, movie_data):
    movies_by_user.setdefault(user_id, []).append(movie_id)

user_list = list(movies_by_user)                  # distinct user ids
user_movie_list = list(movies_by_user.values())   # one movie-id list per user

# One row per user: the user id and that user's "transaction" of rated movies.
user_movie_data = pd.DataFrame(user_list, columns = ['user'])
user_movie_data['movie_list'] = user_movie_list
print(user_movie_data)

     user                                         movie_list
0       1  [1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163,...
1       2  [318, 333, 1704, 3578, 6874, 8798, 46970, 4851...
2       3  [31, 527, 647, 688, 720, 849, 914, 1093, 1124,...
3       4  [21, 32, 45, 47, 52, 58, 106, 125, 126, 162, 1...
4       5  [1, 21, 34, 36, 39, 50, 58, 110, 150, 153, 232...
..    ...                                                ...
605   606  [1, 7, 11, 15, 17, 18, 19, 28, 29, 32, 36, 46,...
606   607  [1, 11, 25, 34, 36, 86, 110, 112, 150, 153, 16...
607   608  [1, 2, 3, 10, 16, 19, 21, 24, 31, 32, 34, 39, ...
608   609  [1, 10, 110, 116, 137, 150, 161, 185, 208, 231...
609   610  [1, 6, 16, 32, 47, 50, 70, 95, 110, 111, 112, ...

[610 rows x 2 columns]


Sorting the movies by average rating (highest first) and saving both tables as CSV files for later use.

In [9]:
# Order the movies from highest to lowest average rating.
data_movies = data_movies.sort_values(by='rating', ascending=False)

# Write both tables to CSV in the working directory (index column included).
for frame, csv_path in (
    (data_movies, 'movies_data.csv'),
    (user_movie_data, 'user_movies_data.csv'),
):
    frame.to_csv(csv_path)

Importing libraries

In [10]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
from mlxtend.frequent_patterns import association_rules
import ast

Creating a list for all the transactions by all the users to find association rules

In [11]:
# One transaction per user: the list of movie ids that user rated.
transaction_data = [movies for movies in user_movie_data['movie_list']]

Now we use the mlxtend library to obtain the frequent itemsets and association rules. The FP-growth algorithm is used to mine the frequent itemsets.

In [12]:
# Encode the transactions as a table with one column per movie id.
te = TransactionEncoder()
te.fit(transaction_data)
te_array = te.transform(transaction_data)
df = pd.DataFrame(data=te_array, columns=te.columns_)

# Mine itemsets with support >= 0.20 using FP-growth, then keep only the
# association rules whose confidence is >= 0.9.
frequent_itemsets = fpgrowth(df, min_support=0.20, use_colnames=True)
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.9,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(296, 480)",(356),0.283607,0.539344,0.259016,0.913295,1.693343,0.106055,5.312896
1,"(480, 318)",(356),0.252459,0.539344,0.236066,0.935065,1.733707,0.099903,7.094098
2,"(296, 480, 318)",(356),0.219672,0.539344,0.208197,0.947761,1.757247,0.089718,8.818267
3,"(356, 2959)",(2571),0.262295,0.455738,0.244262,0.93125,2.04339,0.124725,7.916542
4,"(296, 356, 2959)",(2571),0.213115,0.455738,0.203279,0.953846,2.092972,0.106154,11.79235
5,"(260, 2959)",(2571),0.219672,0.455738,0.204918,0.932836,2.04687,0.104805,8.103461
6,(1196),(260),0.345902,0.411475,0.311475,0.900474,2.188403,0.169145,5.913271
7,"(2571, 1196)",(260),0.283607,0.411475,0.265574,0.936416,2.275752,0.148877,9.255887
8,"(1196, 356)",(260),0.242623,0.411475,0.22459,0.925676,2.24965,0.124757,7.918331
9,"(296, 1196)",(260),0.227869,0.411475,0.214754,0.942446,2.290407,0.120992,10.225615


Now, creating two lists: one for storing the antecedents of each rule and the other for storing the corresponding consequents.

In [13]:
# Each antecedent / consequent cell holds a set of movie ids. Iterate the set
# directly instead of slicing its string representation and re-parsing it with
# ast.literal_eval: the element order is the same, and it keeps working when an
# element's repr is not a plain Python literal (e.g. a NumPy scalar).
# rules_list[k] and recommendation_list[k] describe the same rule k.
rules_list = [list(antecedent) for antecedent in rules['antecedents']]
recommendation_list = [list(consequent) for consequent in rules['consequents']]

Printing the Association Rules

In [14]:
# Build the movieId -> title lookup once, instead of filtering the whole movies
# table for every title printed. drop_duplicates keeps the first row per
# movieId, matching the previous "first match" behaviour.
title_by_id = (
    data_movies.drop_duplicates(subset='movieId')
    .set_index('movieId')['title']
    .to_dict()
)

n = len(rules_list)
# Print one line per rule: "<k>) <antecedent titles>  -->  <consequent titles>".
# Every title is followed by a single space, exactly as before.
for rule_number, (cur_rule, cur_rec) in enumerate(zip(rules_list, recommendation_list), start=1):
    antecedent_titles = "".join(f"{title_by_id[movie_id]} " for movie_id in cur_rule)
    consequent_titles = "".join(f"{title_by_id[movie_id]} " for movie_id in cur_rec)
    print(f"{rule_number}) {antecedent_titles} -->  {consequent_titles}")

1) Pulp Fiction (1994) Jurassic Park (1993)  -->  Forrest Gump (1994) 
2) Jurassic Park (1993) Shawshank Redemption, The (1994)  -->  Forrest Gump (1994) 
3) Pulp Fiction (1994) Jurassic Park (1993) Shawshank Redemption, The (1994)  -->  Forrest Gump (1994) 
4) Forrest Gump (1994) Fight Club (1999)  -->  Matrix, The (1999) 
5) Pulp Fiction (1994) Forrest Gump (1994) Fight Club (1999)  -->  Matrix, The (1999) 
6) Star Wars: Episode IV - A New Hope (1977) Fight Club (1999)  -->  Matrix, The (1999) 
7) Star Wars: Episode V - The Empire Strikes Back (1980)  -->  Star Wars: Episode IV - A New Hope (1977) 
8) Matrix, The (1999) Star Wars: Episode V - The Empire Strikes Back (1980)  -->  Star Wars: Episode IV - A New Hope (1977) 
9) Star Wars: Episode V - The Empire Strikes Back (1980) Forrest Gump (1994)  -->  Star Wars: Episode IV - A New Hope (1977) 
10) Pulp Fiction (1994) Star Wars: Episode V - The Empire Strikes Back (1980)  -->  Star Wars: Episode IV - A New Hope (1977) 
11) Silence of