## Aprendizaje de reglas de asociación

In [0]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display
from sklearn.utils import shuffle
!pip install -q efficient_apriori
from efficient_apriori import apriori



In [0]:
def size(obj):
    """Return the size of ``obj`` as a human-readable megabyte string.

    The byte count is whatever ``sys.getsizeof`` reports for ``obj``. It is
    converted with decimal megabytes (1 MB = 1,000,000 bytes) and rendered
    with two decimals, e.g. ``"4.50 MB"``.
    """
    bytes_per_mb = 1000 * 1000
    megabytes = sys.getsizeof(obj) / bytes_per_mb
    return format(megabytes, ".2f") + " MB"

In [4]:
# Load the movie catalogue and the user ratings from the local CSV files.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Working with the whole dataset is very heavy, so report the catalogue's
# dimensions and memory footprint before previewing the frames.
print('movies -- dimensions: {0};   size: {1}'.format(movies.shape, size(movies)))
display(movies.head(), movies.tail(), ratings.head())

movies -- dimensions: (27278, 3);   size: 4.50 MB


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,movieId,title,genres
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)
27277,131262,Innocence (2014),Adventure|Fantasy|Horror


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2.0,3.5,1112486000.0
1,1,29.0,3.5,1112485000.0
2,1,32.0,3.5,1112485000.0
3,1,47.0,3.5,1112485000.0
4,1,50.0,3.5,1112485000.0


In [5]:
# Movies with an invalid genre are to be removed: count how many carry the
# '(no genres listed)' placeholder.
print((movies['genres'] == '(no genres listed)').sum())

246


In [6]:
# Remove the movies whose genre is the '(no genres listed)' placeholder and
# print the row count before and after to confirm how many were dropped.
# `movies` is rebound to the frame returned by drop() rather than mutated with
# inplace=True, so the object previewed by earlier cells is not silently
# changed; the surviving rows and their index labels are the same either way.
print(len(movies.index))
movies = movies.drop(movies[movies.genres == '(no genres listed)'].index)
print(len(movies.index))

27278
27032


In [7]:
# Keep a reproducible random tenth of the catalogue: seeded shuffle, then take
# the first len(movies) // 10 rows.
# NOTE(review): this cell rebinds `movies` to its own sample, so running it a
# second time without re-running the cells above shrinks the sample by another
# factor of ten.
movies = shuffle(movies, random_state=47).head(len(movies) // 10)
print('movies -- dimensions: {0};   size: {1}'.format(movies.shape, size(movies)))

movies -- dimensions: (2703, 3);   size: 0.47 MB


In [8]:
# Decode the product names: join the sampled movies (id + title) with the
# ratings (id + user) so there is one row per (movie, user) pair.
merged_df = pd.merge(
    movies[['movieId', 'title']],
    ratings[['movieId', 'userId']],
    on='movieId',
    how='inner',
)

display(merged_df.head())
# Order the pairs by userId so that each user's rows are contiguous.
merged_df = merged_df.sort_values('userId')
# Project positional columns 0 (movieId) and 2 (userId); in `merged`,
# column 0 holds the movieId and column 1 holds the userId.
merged = merged_df.values[:, [0, 2]]
print(len(merged))
print(merged[:2])

Unnamed: 0,movieId,title,userId
0,26736,Riki-Oh: The Story of Ricky (Lik Wong) (1991),451
1,26736,Riki-Oh: The Story of Ricky (Lik Wong) (1991),1590
2,26736,Riki-Oh: The Story of Ricky (Lik Wong) (1991),2601
3,87234,Submarine (2010),348
4,87234,Submarine (2010),482


38725
[[2692 1]
 [7389 1]]


In [9]:
# Build one transaction per user: the list of movieIds found in that user's rows.
# itertools.groupby only merges consecutive rows sharing a key, so `merged`
# must already be ordered by its userId column (column 1).
transactions = [
    [pair[0] for pair in user_rows]
    for _user_id, user_rows in groupby(merged, key=lambda pair: pair[1])
]
print(len(transactions))
print(transactions[:2])

2791
[[2692, 7389, 1258, 1219, 1217, 1193, 1291, 3037, 5039, 2762], [3753, 2454, 62, 1356, 1121]]


In [10]:
# Mine the frequent itemsets and the association rules derived from them.
#   min_support=0.006   -> an itemset must occur in at least 0.6% of the transactions
#   min_confidence=0.8  -> a rule must hold in at least 80% of the transactions containing its left-hand side
#   max_length=3        -> itemsets of at most three movies are considered
itemsets, rules = apriori(
    transactions,
    min_support=0.006,
    min_confidence=0.8,
    max_length=3,
)
len(rules)

23455

In [11]:
# Optional filter, currently disabled: keep only rules with two items on the
# left-hand side and one item on the right-hand side.
#rules = filter(lambda rule: len(rule.lhs) == 2 and len(rule.rhs) == 1, rules)
# Order the rules from lowest to highest confidence and peek at the first three.
rules = sorted(rules, key=lambda candidate: candidate.confidence)
print(rules[0:3])


[{1260} -> {904}, {3169} -> {1193}, {2119} -> {1258}]


In [12]:
# Flatten every rule object into a plain record (one dict per rule) holding
# its two sides and its quality metrics. Despite the name, `rules_dict` is a
# list of dicts.
rules_dict = [
    {
        'rule_lhs': rule.lhs,
        'rule_rhs': rule.rhs,
        'confidence': rule.confidence,
        'support': rule.support,
        'lift': rule.lift,
        'conviction': rule.conviction,
    }
    for rule in rules
]
print(len(rules_dict))
print(rules_dict[:2])

23455
[{'rule_lhs': (1260,), 'rule_rhs': (904,), 'confidence': 0.8, 'support': 0.02293084915800788, 'lift': 6.0839237057220705, 'conviction': 4.342529537585096}, {'rule_lhs': (3169,), 'rule_rhs': (1193,), 'confidence': 0.8, 'support': 0.007165890361877463, 'lift': 3.5385103011093504, 'conviction': 3.869580776065927}]


In [0]:
# Tabulate the rule records: one row per rule, one column per record key
# (rule_lhs, rule_rhs, confidence, support, lift, conviction).
rules_df = pd.DataFrame(rules_dict)

In [14]:
# Number of rules, followed by the first and the last five rows of the table
# (the rules were sorted by ascending confidence before being tabulated).
print(rules_df.shape[0])
print(rules_df.head(), rules_df.tail(), sep='\n')

23455
   confidence  conviction      lift rule_lhs rule_rhs   support
0         0.8    4.342530  6.083924  (1260,)   (904,)  0.022931
1         0.8    3.869581  3.538510  (3169,)  (1193,)  0.007166
2         0.8    4.098889  4.438966  (2119,)  (1258,)  0.008599
3         0.8    3.575779  2.808553  (1648,)  (2762,)  0.008599
4         0.8    3.575779  2.808553  (2331,)  (2762,)  0.007166
       confidence    conviction       lift        rule_lhs  rule_rhs   support
23450         1.0  8.269437e+08   5.778468   (8985, 52328)   (6874,)  0.006091
23451         1.0  9.100681e+08  11.119522   (7099, 65261)  (60069,)  0.006091
23452         1.0  9.100681e+08  11.119522   (7099, 78499)  (60069,)  0.006091
23453         1.0  9.254747e+08  13.418269   (8665, 88163)  (54286,)  0.006091
23454         1.0  9.100681e+08  11.119522  (78499, 95167)  (60069,)  0.007166


In [15]:
rules_df.describe()
# Notes on the rule metrics summarised above, for a rule X -> Y.
# Support: empirical probability that a transaction contains BOTH itemsets X and Y.
## (# transactions containing X and Y) / (# transactions in total).
# Confidence: of the transactions that contain X, how many also contain Y?
## Empirical probability that Y is bought given that we KNOW X was bought, i.e.
## the percentage of those who bought itemset X that also bought itemset Y.
# Lift: measures the increase in the probability of Y given that we KNOW X was bought
# (confidence divided by the a-priori probability of Y). It addresses the case where the
# confidence is very close to the a-priori probability of Y (# transactions containing Y / # transactions in total).
## > 1, the probability of Y goes up once we KNOW the consumer bought X.
## = 1, X and Y are independent events: having bought X tells us nothing about buying Y (no association).
## < 1, the occurrence of X had a negative effect on the occurrence of Y, lowering its probability.
## People who buy X are less likely to buy Y than the a-priori probability of buying Y.
# Conviction: (1 - P(Y)) / (1 - confidence), where P(Y) is the a-priori probability of Y.
## It compares how often the rule would be wrong if X and Y were independent with how often
## it is actually wrong. It equals 1 under independence and grows without bound as the
## confidence approaches 1, which is why rules with confidence 1.0 show enormous values.

Unnamed: 0,confidence,conviction,lift,support
count,23455.0,23455.0,23455.0,23455.0
mean,0.871165,30545930.0,4.165142,0.010911
std,0.052946,148279000.0,2.019628,0.006992
min,0.8,3.230025,2.259919,0.006091
25%,0.826087,4.452436,2.860564,0.006808
50%,0.861111,5.509546,3.486098,0.008241
75%,0.904762,8.026514,4.940618,0.012182
max,1.0,925474700.0,28.242262,0.122178


In [0]:
# Keep only the rules whose lift is at least 7.
final_rules_df = rules_df.loc[rules_df['lift'] >= 7.0]

In [17]:
# How many rules survived the lift filter, and what the first ten look like.
print(final_rules_df.shape[0])
print(final_rules_df.head(10))

1740
    confidence  conviction       lift    rule_lhs rule_rhs   support
18         0.8    4.553923   8.967068   (5, 2405)  (1377,)  0.008599
21         0.8    4.539592   8.687938   (5, 2496)  (3253,)  0.007166
23         0.8    4.539592   8.687938   (5, 3617)  (3253,)  0.007166
24         0.8    4.722322  14.405161   (5, 3617)  (4270,)  0.007166
26         0.8    4.460767   7.417940   (12, 161)   (370,)  0.008599
27         0.8    4.460767   7.417940  (12, 1219)   (370,)  0.007166
28         0.8    4.460767   7.417940  (12, 2294)   (370,)  0.007166
36         0.8    4.761734  16.787970   (20, 216)   (413,)  0.007166
40         0.8    4.539592   8.687938  (20, 2762)  (3253,)  0.007166
79         0.8    4.539592   8.687938  (34, 3477)  (3253,)  0.008599
