## Introduction to Dataset

In [1]:
# Importing libraries
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori, association_rules 

In [2]:
# Reading the dataset
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Data Preprocessing

In [5]:
final_dataset = ratings.pivot(index='userId',columns='movieId',values='rating')

In [6]:
final_dataset

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [7]:
final_dataset.fillna(0,inplace=True)

In [8]:
no_user_voted = ratings.groupby('movieId')['rating'].agg('count')
no_movies_voted = ratings.groupby('userId')['rating'].agg('count')

In [9]:
final_dataset = final_dataset.loc[:, no_user_voted[no_user_voted > 10].index]

In [10]:
final_dataset = final_dataset.loc[no_movies_voted[no_movies_voted > 50].index, :]

In [11]:
final_dataset

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,4.0,5.0,5.0,4.0,4.0,0.0,3.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,4.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
606,2.5,0.0,0.0,0.0,0.0,2.5,0.0,0.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
def hot_encode(x): 
    if(x < 3.5): 
        return 0
    else: 
        return 1

In [13]:
final_dataset = final_dataset.applymap(hot_encode)

In [14]:
final_dataset

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,1,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
607,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
movieIdToName = dict()
for mid in final_dataset.columns:
    movieIdToName[mid] = movies[movies["movieId"] == mid]["title"].values[0]

In [16]:
cnt = 0
for movieId, movieName in movieIdToName.items():
    print(f"{movieId} -> {movieName}")
    cnt += 1
    
    if(cnt == 5):
        break

1 -> Toy Story (1995)
2 -> Jumanji (1995)
3 -> Grumpier Old Men (1995)
5 -> Father of the Bride Part II (1995)
6 -> Heat (1995)


In [17]:
finalLst = []
for i in final_dataset.index:
    lst = []
    for j in final_dataset.columns:
        if(final_dataset[j][i]):
            lst.append(j)
    finalLst.append(lst)

In [18]:
print(finalLst[0])

[1, 3, 6, 47, 50, 101, 110, 151, 157, 163, 216, 231, 235, 260, 333, 349, 356, 362, 367, 441, 457, 480, 527, 543, 552, 553, 590, 592, 593, 596, 608, 661, 733, 919, 923, 954, 1023, 1025, 1029, 1031, 1032, 1042, 1049, 1060, 1073, 1080, 1089, 1090, 1092, 1097, 1127, 1136, 1196, 1197, 1198, 1206, 1208, 1210, 1213, 1214, 1220, 1222, 1224, 1240, 1256, 1265, 1270, 1275, 1278, 1282, 1291, 1298, 1348, 1500, 1517, 1552, 1573, 1587, 1617, 1620, 1625, 1732, 1777, 1805, 1920, 1954, 1967, 2000, 2005, 2012, 2018, 2028, 2046, 2054, 2058, 2078, 2090, 2094, 2096, 2105, 2115, 2116, 2137, 2139, 2141, 2143, 2161, 2174, 2193, 2268, 2273, 2291, 2329, 2353, 2366, 2387, 2395, 2406, 2427, 2450, 2459, 2470, 2478, 2502, 2529, 2542, 2571, 2580, 2596, 2616, 2628, 2640, 2641, 2648, 2692, 2700, 2716, 2761, 2797, 2826, 2858, 2872, 2916, 2944, 2947, 2948, 2949, 2959, 2985, 2987, 2991, 2993, 2997, 3033, 3034, 3052, 3053, 3062, 3147, 3168, 3253, 3273, 3386, 3439, 3440, 3441, 3448, 3450, 3479, 3489, 3527, 3578, 3617, 3639,

In [19]:
# storing data to file
with open("dataset.txt", "w") as fp:
    for lst in finalLst:
        for x in lst:
            fp.write(str(x))
            fp.write(" ")
        fp.write("\n")

## Manual Implementation

In [20]:
# encoding the movie id length to fixed size
movieIdSize = 6

# encoding value
encoder = 100000

# Total users
userCnt = 378

In [21]:
minSupport = 70

In [22]:
# Too generate new (k+1)-itemsets
def generateKPlus1thSet(itemSet):
    length = len(itemSet)
    candidates = []   # all (k + 1) candidates
    
    # for each candidate
    for (i, candidate) in enumerate(itemSet):
        # for next all candidates in itemSet
        for j in range(i + 1, length):
            nextCandidate = itemSet[j]
            # matching first (k - 1) elements
            if(candidate[:-movieIdSize] == nextCandidate[:-movieIdSize]):    
                newItem = candidate[:-movieIdSize] + candidate[-movieIdSize:] + nextCandidate[-movieIdSize:]
                candidates.append(newItem)
            
    return candidates

In [23]:
# Prune step
def prune(Ck):
    Lk = []

    for item in Ck:
        if(Ck[item] >= minSupport):
            Lk.append(item)
    
    return Lk

In [24]:
# calculating support for new itemset
def calculateSupport(candidates):
    
    Ck = dict()
    
    for line in finalLst:
        line = list(map(lambda x: str(x + encoder), line))
        
        for candidate in candidates:
            
            if(candidate not in Ck):
                Ck[candidate] = 0
                
            present = True
            
            for k in range(0, len(candidate), movieIdSize):
                item = candidate[k: k + movieIdSize]
                
                if(item not in line):
                    present = False
                    break
                    
            if(present):
                Ck[candidate] += 1
                
    return Ck

In [25]:
C1 = dict()

for line in finalLst:
    for item in line:
        item = str(item + encoder)
        C1[item] = C1.get(item, 0) + 1
        
L1 = prune(C1)

print('====================================')
print('     Generating 1 itemset')
print('====================================')

L = generateKPlus1thSet(L1)

k = 2
while(L != []):
    
    C = calculateSupport(L)
    
    frequentItemset = prune(C)

    print('     Generating', k, 'itemset')
    print('====================================')
    
    L = generateKPlus1thSet(frequentItemset)
    
    k += 1

     Generating 1 itemset
     Generating 2 itemset
     Generating 3 itemset
     Generating 4 itemset
     Generating 5 itemset


In [26]:
def decoder(frequentItemset):
    
    y = [[itemSet[x : x + movieIdSize] for x in range(0, len(itemSet), movieIdSize)] for itemSet in frequentItemset]

    x1 = [list(map(lambda x: str(int(x) - encoder), z)) for z in y]
    
    movieItemSet = []
    
    # for each itemset
    for itemSet in x1:
        tempSet = []
        for movieId in itemSet:
            tempSet.append(movieIdToName[int(movieId)])
            
        movieItemSet.append(tempSet)
    
    return movieItemSet

In [27]:
frequentItems = decoder(frequentItemset)

print("Final Frequent ItemSets\n\n")

for itemSet in frequentItems:   
    for movie in itemSet:
        print(movie)
        
    print("\n")

Final Frequent ItemSets


Star Wars: Episode IV - A New Hope (1977)
Star Wars: Episode V - The Empire Strikes Back (1980)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Star Wars: Episode VI - Return of the Jedi (1983)
Indiana Jones and the Last Crusade (1989)


Matrix, The (1999)
Fight Club (1999)
Lord of the Rings: The Fellowship of the Ring, The (2001)
Lord of the Rings: The Two Towers, The (2002)
Lord of the Rings: The Return of the King, The (2003)




In [28]:
# Formating frequent itemset to generate association rules
freqItems = []

items = "".join(frequentItemset)

for k in range(0, len(items), movieIdSize):
    item = items[k: k + movieIdSize]
    support = (C1[item] / userCnt)
    movieName = frozenset([movieIdToName[int(item) - encoder]])
    freqItems.append([support, movieName])
    
freqDf = pd.DataFrame(freqItems, columns=["support", "itemsets"])
print(freqDf)

    support                                           itemsets
0  0.481481        (Star Wars: Episode IV - A New Hope (1977))
1  0.415344  (Star Wars: Episode V - The Empire Strikes Bac...
2  0.407407  (Raiders of the Lost Ark (Indiana Jones and th...
3  0.380952  (Star Wars: Episode VI - Return of the Jedi (1...
4  0.285714        (Indiana Jones and the Last Crusade (1989))
5  0.507937                               (Matrix, The (1999))
6  0.431217                                (Fight Club (1999))
7  0.370370  (Lord of the Rings: The Fellowship of the Ring...
8  0.351852    (Lord of the Rings: The Two Towers, The (2002))
9  0.351852  (Lord of the Rings: The Return of the King, Th...


In [29]:
rules = association_rules(freqDf, metric ="confidence", min_threshold = 1) 
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


## Generating rules from frequent itemset

In [30]:
final_dataset.columns = [movieIdToName[mid] for mid in final_dataset.columns]

In [31]:
final_dataset

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Sudden Death (1995),GoldenEye (1995),"American President, The (1995)",Dracula: Dead and Loving It (1995),...,Now You See Me 2 (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Get Out (2017),Logan (2017),Dunkirk (2017),Blade Runner 2049 (2017),Coco (2017),Star Wars: The Last Jedi (2017),Deadpool 2 (2018)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,1,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
607,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Building the model 
frq_items = apriori(final_dataset, min_support = 0.3, use_colnames = True) 
print(frq_items)

# Collecting the inferred rules in a dataframe 
rules = association_rules(frq_items, metric ="lift", min_threshold = 1) 
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False]) 

     support                                           itemsets
0   0.357143                                 (Toy Story (1995))
1   0.301587        (Twelve Monkeys (a.k.a. 12 Monkeys) (1995))
2   0.351852                      (Seven (a.k.a. Se7en) (1995))
3   0.375661                       (Usual Suspects, The (1995))
4   0.380952                                (Braveheart (1995))
5   0.481481        (Star Wars: Episode IV - A New Hope (1977))
6   0.539683                              (Pulp Fiction (1994))
7   0.555556                 (Shawshank Redemption, The (1994))
8   0.568783                              (Forrest Gump (1994))
9   0.317460                             (Fugitive, The (1993))
10  0.380952                             (Jurassic Park (1993))
11  0.378307                          (Schindler's List (1993))
12  0.386243                (Terminator 2: Judgment Day (1991))
13  0.494709                 (Silence of the Lambs, The (1991))
14  0.338624                            

In [33]:
for i in rules.index:
    antecedents = []
    consequents = []
    for j in rules.antecedents[i]:
        antecedents.append(j[:30])
    for k in rules.consequents[i]:
        consequents.append(k[:30])
        
    print(f'({",".join(antecedents)}) --> ({",".join(consequents)})\n')

(Lord of the Rings: The Return ,Lord of the Rings: The Fellows) --> (Lord of the Rings: The Two Tow)

(Lord of the Rings: The Return ,Lord of the Rings: The Two Tow) --> (Lord of the Rings: The Fellows)

(Lord of the Rings: The Fellows,Lord of the Rings: The Two Tow) --> (Lord of the Rings: The Return )

(Lord of the Rings: The Two Tow) --> (Lord of the Rings: The Fellows)

(Star Wars: Episode V - The Emp) --> (Star Wars: Episode IV - A New )

(Lord of the Rings: The Return ) --> (Lord of the Rings: The Two Tow)

(Lord of the Rings: The Two Tow) --> (Lord of the Rings: The Return )

(Lord of the Rings: The Return ) --> (Lord of the Rings: The Fellows)

(Star Wars: Episode VI - Return) --> (Star Wars: Episode IV - A New )

(Lord of the Rings: The Fellows) --> (Lord of the Rings: The Two Tow)

(Lord of the Rings: The Two Tow) --> (Lord of the Rings: The Return ,Lord of the Rings: The Fellows)

(Lord of the Rings: The Return ) --> (Lord of the Rings: The Fellows,Lord of the Rings: The Two

In [34]:
# Recommendation for a particular movie
def getRecommendation(movie):
    similarMovies = []
    for movies in frequentItemset:
        if movie in movies:
            similarMovies.extend(movies)
    return similarMovies

In [35]:
movie = 'Star Wars: Episode IV - A New Hope (1977)' 
print("The Recommended Movies are\n")
recommended_movies = getRecommendation(movie)
for movies in recommended_movies:
    if(movies != movie):
        print(movies)

The Recommended Movies are



## The End