In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Data source: MovieLens dataset on Kaggle
# https://www.kaggle.com/jneupane12/movielens
# Load the movie catalogue from movies.csv (path is relative to the working
# directory) and preview the first rows.
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Summarise the frame: row count, per-column dtype and non-null counts.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
movieId    27278 non-null int64
title      27278 non-null object
genres     27278 non-null object
dtypes: int64(1), object(2)
memory usage: 639.4+ KB


In [4]:
# Sanity check: every movieId should appear exactly once (the printed label
# reads "no duplicate IDs"). Series.is_unique states the intent directly
# instead of building a value_counts() table and comparing lengths.
print('无重复ID:', movies['movieId'].is_unique)

无重复ID: True


In [5]:
# One-hot encode `genres`: str.get_dummies() splits each value on its default
# '|' separator and emits one indicator column per distinct genre.
#
# - drop(columns=...) is used because the positional-axis form
#   drop('genres', 1) was deprecated in pandas 1.3 and raises TypeError on
#   pandas >= 2.0.
# - (movieId, title) become the index so that only the genre indicator
#   columns are handed to apriori.
# - The 0/1 indicators are cast to bool, the input type mlxtend expects
#   (it warns on non-boolean frames).
movies_hot_encoded = (
    movies.drop(columns='genres')
    .join(movies.genres.str.get_dummies())
    .set_index(['movieId', 'title'])
    .astype(bool)
)
# Mine frequent itemsets with a minimum support of 0.02.
itemsets = apriori(movies_hot_encoded, use_colnames=True, min_support=0.02)
# Derive association rules from the frequent itemsets, with a minimum lift of 2.
rules = association_rules(itemsets, metric='lift', min_threshold=2)

In [8]:
# Frequent itemsets, highest support first.
itemsets.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
7,0.489185,(Drama)
4,0.306987,(Comedy)
14,0.153164,(Thriller)
12,0.151294,(Romance)
0,0.129042,(Action)
5,0.107743,(Crime)
9,0.095718,(Horror)
31,0.094325,"(Romance, Drama)"
26,0.093335,"(Drama, Comedy)"
6,0.090586,(Documentary)


In [7]:
# Association rules, strongest lift first.
rules.sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
13,(Mystery),(Thriller),0.055503,0.153164,0.029144,0.525099,3.428352,0.020643,1.783185
12,(Thriller),(Mystery),0.153164,0.055503,0.029144,0.190282,3.428352,0.020643,1.166453
17,(Crime),"(Thriller, Drama)",0.107743,0.06848,0.024965,0.231711,3.383632,0.017587,1.212461
14,"(Thriller, Drama)",(Crime),0.06848,0.107743,0.024965,0.364561,3.383632,0.017587,1.404159
1,(Adventure),(Action),0.08538,0.129042,0.035633,0.417347,3.234198,0.024616,1.494813
0,(Action),(Adventure),0.129042,0.08538,0.035633,0.276136,3.234198,0.024616,1.263525
5,(Sci-Fi),(Action),0.063898,0.129042,0.023499,0.367757,2.849906,0.015253,1.377568
4,(Action),(Sci-Fi),0.129042,0.063898,0.023499,0.182102,2.849906,0.015253,1.144523
8,(Thriller),(Crime),0.153164,0.107743,0.045165,0.294878,2.736877,0.028662,1.265394
9,(Crime),(Thriller),0.107743,0.153164,0.045165,0.41919,2.736877,0.028662,1.458027
