In [1]:
# Load the notebook's formatting through `load_style` from the local `formats` module.
import os

# path: remember the current working directory so that it can be restored
# later (the next cell calls os.chdir(path))
path = os.getcwd()
# switch into the sibling folder before importing from it
# NOTE(review): presumably `formats` lives in ../notebook_format and load_style
# reads its style files relative to that folder -- confirm in formats.py
os.chdir('../notebook_format')
from formats import load_style
# kept as the cell's last expression so that whatever it returns gets displayed
load_style()

In [2]:
# go back to the directory that was current before the formatting cell changed it
os.chdir(path)
# render matplotlib figures inline in the notebook
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8, 6 # change default figure size
plt.rcParams['font.size'] = 12 # and font size

# 1. magic to print version
# 2. magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2

# print the author, date, time, Python/IPython versions and the versions of
# the listed packages
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib

Ethen 2016-09-09 14:15:26 

CPython 3.5.2
IPython 4.2.0

numpy 1.11.1
pandas 0.18.1
matplotlib 1.5.1


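Data: the [movielens dataset](http://grouplens.org/datasets/movielens/). The cells below read the ratings file `u.data` from an `ml-100k` folder, which is expected to be in the notebook's working directory.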

In [3]:
# location of the ratings file inside the ml-100k folder
filepath = os.path.join('ml-100k', 'u.data')

# the file is tab-separated and has no header row, so the four column
# names are supplied explicitly
movielens = pd.read_csv(
    filepath,
    sep='\t',
    header=None,
    names=['UserID', 'MovieID', 'Rating', 'Datetime']
)
movielens.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# flag every rating strictly greater than 3 as a favorable one
movielens['Favorable'] = movielens['Rating'].gt(3)
movielens.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
0,196,242,3,881250949,False
1,186,302,3,891717742,False
2,22,377,1,878887116,False
3,244,51,2,880606923,False
4,166,346,1,886397596,False


In [5]:
# keep only the ratings whose UserID is one of range(200), i.e. 0..199
ratings = movielens.loc[movielens['UserID'].isin(range(200))]
# ... and, out of those, only the rows flagged as favorable
favorable_ratings = ratings.loc[ratings["Favorable"]]

In [6]:
# map each user id to the frozenset of movie ids that user rated favorably
favorable_reviews_by_users = dict(
    (user_id, frozenset(movie_ids.values))
    for user_id, movie_ids in favorable_ratings.groupby("UserID")["MovieID"]
)

In [7]:
# inspect every rating given by user 1
movielens.loc[movielens['UserID'].eq(1)]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
202,1,61,4,878542420,True
305,1,189,3,888732928,False
333,1,33,4,878542699,True
334,1,160,4,875072547,True
478,1,20,4,887431883,True
639,1,202,5,875072442,True
687,1,171,5,889751711,True
820,1,265,4,878542441,True
933,1,155,2,878542201,False
972,1,117,3,874965739,False


In [8]:
# per movie, add up the boolean Favorable flags over the selected ratings,
# giving the number of favorable reviews each movie received
num_favorable_by_movie = ratings.groupby("MovieID")[["Favorable"]].sum()
num_favorable_by_movie.head()

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
1,66.0
2,5.0
3,4.0
4,21.0
5,6.0


In [9]:
# frequent itemsets, keyed by itemset length k
frequent_itemsets = {}
min_support = 50

# k=1: every single-movie itemset with strictly more than min_support
# favorable reviews, mapped to that number of favorable reviews
frequent_itemsets[1] = {
    frozenset([movie_id]): row["Favorable"]
    for movie_id, row in num_favorable_by_movie.iterrows()
    if row["Favorable"] > min_support
}

In [10]:
# show the k=1 itemsets: single-movie frozenset -> number of favorable reviews
frequent_itemsets[1]

{frozenset({286}): 59.0,
 frozenset({7}): 67.0,
 frozenset({64}): 58.0,
 frozenset({79}): 58.0,
 frozenset({258}): 83.0,
 frozenset({50}): 100.0,
 frozenset({313}): 60.0,
 frozenset({174}): 74.0,
 frozenset({100}): 89.0,
 frozenset({181}): 79.0,
 frozenset({1}): 66.0,
 frozenset({127}): 70.0,
 frozenset({172}): 59.0,
 frozenset({98}): 70.0,
 frozenset({56}): 67.0,
 frozenset({9}): 53.0}

## Reference

https://github.com/asaini/Apriori

https://github.com/timothyasp/apriori-python