In [95]:
import sys
from collections import defaultdict
from operator import itemgetter

import pandas as pd

In [102]:
moviedata_input_file = 'ml-100k/u.data'
moviename_input_file = 'ml-100k/u.item'

In [103]:
# Load movie data
all_ratings = pd.read_csv(moviedata_input_file, 
                          delimiter='\t', 
                          header=None,
                          names=['UserID', 'MovieID', 'Rating', 'Datetime'])

In [113]:
# Parse datetime
all_ratings['Datetime'] = pd.to_datetime(all_ratings['Datetime'], unit='s')

In [114]:
all_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [122]:
# Create a new column
all_ratings['Favorable'] = all_ratings['Rating'] > 3

In [22]:
all_ratings.loc[10:15]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
10,62,257,2,1997-11-12 22:07:14,False
11,286,1014,5,1997-11-17 15:38:45,True
12,200,222,5,1997-10-05 09:05:40,True
13,210,40,3,1998-03-27 21:59:54,False
14,224,29,3,1998-02-21 23:40:57,False
15,303,785,3,1997-11-14 05:28:38,False


In [32]:
# Take the users with UserID in the range 0 to 200 as the training dataset
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]
ratings.head()

19531


Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
0,196,242,3,1997-12-04 15:55:49,False
1,186,302,3,1998-04-04 19:22:22,False
2,22,377,1,1997-11-07 07:18:36,False
4,166,346,1,1998-02-02 05:33:16,False
6,115,265,2,1997-12-03 17:51:28,False


In [35]:
# Take only users that has rated
favorable_ratings = ratings[ratings['Favorable']]

print('All Ratings: {}'.format(len(ratings)))
print('Favorable Ratings: {}'.format(len(favorable_ratings)))

favorable_ratings.head()

All Ratings: 19531
Favorable Ratings: 11043


Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
16,122,387,5,1997-11-11 17:47:39,True
20,119,392,4,1998-01-30 16:13:34,True
21,167,486,4,1998-04-16 14:54:12,True
26,38,95,5,1998-04-13 01:14:54,True
28,63,277,4,1997-10-01 23:10:01,True


In [56]:
favorable_reviews_by_users = dict((k, frozenset(v.values))
                                  for k, v in favorable_ratings.groupby('UserID')['MovieID'])
favorable_reviews_by_users

{1: frozenset({1,
            3,
            6,
            7,
            9,
            12,
            13,
            14,
            15,
            16,
            18,
            19,
            20,
            22,
            23,
            25,
            28,
            32,
            33,
            39,
            42,
            43,
            44,
            45,
            46,
            47,
            48,
            50,
            51,
            52,
            55,
            56,
            57,
            58,
            59,
            60,
            61,
            64,
            65,
            66,
            68,
            72,
            75,
            76,
            77,
            79,
            80,
            81,
            82,
            84,
            86,
            87,
            88,
            89,
            90,
            91,
            93,
            95,
            96,
            98,
            100,
            106,
        

In [58]:
num_favorable_by_movie = ratings[['MovieID', 'Favorable']].groupby('MovieID').sum()
sorted_by_favorable = num_favorable_by_movie.sort_values('Favorable', ascending=False)

# Top 5 movie list
sorted_by_favorable.head()

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
50,100.0
100,89.0
258,83.0
181,79.0
174,74.0


In [83]:
frequent_itemsets = {}
min_support = 50

frequent_itemsets[1] = dict((frozenset((movie_id,)),
                            row['Favorable'])
                            for movie_id, row in num_favorable_by_movie.iterrows()
                            if row['Favorable'] > min_support)
frequent_itemsets

{1: {frozenset({1}): 66.0,
  frozenset({7}): 67.0,
  frozenset({9}): 53.0,
  frozenset({50}): 100.0,
  frozenset({56}): 67.0,
  frozenset({64}): 58.0,
  frozenset({79}): 58.0,
  frozenset({98}): 70.0,
  frozenset({100}): 89.0,
  frozenset({127}): 70.0,
  frozenset({172}): 59.0,
  frozenset({174}): 74.0,
  frozenset({181}): 79.0,
  frozenset({258}): 83.0,
  frozenset({286}): 59.0,
  frozenset({313}): 60.0}}

In [84]:
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [85]:
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1], min_support)
    
    if len(cur_frequent_itemsets) == 0:
        print('Did not find any frequent itemsets of length {}'.format(k))
        sys.stdout.flush()
        break
    else:
        print('I found {} frequent itemsets of length {}'.format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()

    frequent_itemsets[k] = cur_frequent_itemsets
print('Done')

I found 93 frequent itemsets of length 2
I found 295 frequent itemsets of length 3
I found 593 frequent itemsets of length 4
I found 785 frequent itemsets of length 5
I found 677 frequent itemsets of length 6
I found 373 frequent itemsets of length 7
I found 126 frequent itemsets of length 8
I found 24 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11
Done


In [86]:
# The first itemset contains only one item - delete them
del frequent_itemsets[1]

In [88]:
# Extracting association rules
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))

In [90]:
candidate_rules[:5]

[(frozenset({7}), 1),
 (frozenset({1}), 7),
 (frozenset({50}), 1),
 (frozenset({1}), 50),
 (frozenset({1}), 56)]

In [93]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)

for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [94]:
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in candidate_rules}

In [98]:
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
sorted_confidence

[((frozenset({98, 181}), 50), 1.0),
 ((frozenset({79, 172}), 174), 1.0),
 ((frozenset({172, 258}), 174), 1.0),
 ((frozenset({1, 7, 181}), 50), 1.0),
 ((frozenset({1, 7, 172}), 174), 1.0),
 ((frozenset({1, 50, 56}), 174), 1.0),
 ((frozenset({1, 56, 181}), 50), 1.0),
 ((frozenset({1, 98, 181}), 50), 1.0),
 ((frozenset({1, 172, 181}), 50), 1.0),
 ((frozenset({1, 56, 64}), 98), 1.0),
 ((frozenset({1, 56, 64}), 174), 1.0),
 ((frozenset({1, 56, 172}), 174), 1.0),
 ((frozenset({1, 56, 181}), 174), 1.0),
 ((frozenset({1, 79, 172}), 174), 1.0),
 ((frozenset({7, 50, 56}), 174), 1.0),
 ((frozenset({7, 56, 181}), 50), 1.0),
 ((frozenset({7, 64, 181}), 50), 1.0),
 ((frozenset({7, 79, 181}), 50), 1.0),
 ((frozenset({7, 50, 98}), 174), 1.0),
 ((frozenset({7, 98, 181}), 50), 1.0),
 ((frozenset({7, 100, 181}), 50), 1.0),
 ((frozenset({7, 50, 172}), 174), 1.0),
 ((frozenset({7, 56, 258}), 98), 1.0),
 ((frozenset({7, 56, 181}), 174), 1.0),
 ((frozenset({79, 100, 172}), 7), 1.0),
 ((frozenset({79, 100, 18

In [107]:
# Load movie name
movie_name_data = pd.read_csv(moviename_input_file,
                              delimiter='|', 
                              header=None,
                              encoding='mac-roman')
movie_name_data.columns = ['MovieID', 'Title', 'Release Date', 'Video Release', 'IMDB', '<UNK>', 
                           'Action', 'Adventure', 'Animation', 
                           'Children\'s', 'Comedy', 'Crime', 
                           'Documentary', 'Drama', 
                           'Fantasy', 'Film-Noir',
                           'Horror', 
                           'Musical', 'Mystery', 
                           'Romance', 
                           'Sci-Fi', 
                           'Thriller', 
                           'War', 'Western']
movie_name_data.head()

Unnamed: 0,MovieID,Title,Release Date,Video Release,IMDB,<UNK>,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [108]:
def get_movie_name(movie_id):
    """Returns the movie name based on the provided movie id
    
    Parameters:
    ----------
    movie_id: int
        The id of the movie

    Returns:
    --------
    title: string
        The movie name for the equivalent movie id
    """
    title_object = movie_name_data[movie_name_data['MovieID'] == movie_id]['Title']
    title = title_object.values[0]
    return title

In [111]:
for index in range(5):
    print('Rule #{}'.format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print('Rule: If person recommends {}, they will also recommend {}'.format(premise_names, conclusion_name))
    print(' - Confidence: {0:.3f}'.format(rule_confidence[(premise, conclusion)]))
    print('')

Rule #1
Rule: If person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983), they will also recommend Star Wars (1977)
 - Confidence: 1.000

Rule #2
Rule: If person recommends Empire Strikes Back, The (1980), Fugitive, The (1993), they will also recommend Raiders of the Lost Ark (1981)
 - Confidence: 1.000

Rule #3
Rule: If person recommends Contact (1997), Empire Strikes Back, The (1980), they will also recommend Raiders of the Lost Ark (1981)
 - Confidence: 1.000

Rule #4
Rule: If person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995), they will also recommend Star Wars (1977)
 - Confidence: 1.000

Rule #5
Rule: If person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995), they will also recommend Raiders of the Lost Ark (1981)
 - Confidence: 1.000



In [124]:
# Evaluation
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset['Favorable']]
test_favorable_by_users = dict((k, frozenset(v.values)) for k, v
                               in test_favorable.groupby('UserID')['MovieID'])

In [126]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)

for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

test_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in rule_confidence}

In [127]:
for index in range(5):
    print('Rule #{}'.format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print('Rule: If person recommends {}, they will also recommend {}'.format(premise_names, conclusion_name))
    print(' - Train Confidence: {0:.3f}'.format(rule_confidence.get((premise, conclusion))))
    print(' - Test Confidence: {0:.3f}'.format(test_confidence.get((premise, conclusion))))
    print('')

Rule #1
Rule: If person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983), they will also recommend Star Wars (1977)
 - Train Confidence: 1.000
 - Test Confidence: 0.936

Rule #2
Rule: If person recommends Empire Strikes Back, The (1980), Fugitive, The (1993), they will also recommend Raiders of the Lost Ark (1981)
 - Train Confidence: 1.000
 - Test Confidence: 0.876

Rule #3
Rule: If person recommends Contact (1997), Empire Strikes Back, The (1980), they will also recommend Raiders of the Lost Ark (1981)
 - Train Confidence: 1.000
 - Test Confidence: 0.841

Rule #4
Rule: If person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995), they will also recommend Star Wars (1977)
 - Train Confidence: 1.000
 - Test Confidence: 0.932

Rule #5
Rule: If person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995), they will also recommend Raiders of the Lost Ark (1981)
 - Train Confidence: 1.000
 - Test Confidence: 0.9