In [4]:
# Part 1

from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [5]:
# Turn on pyLDAvis' notebook integration before any visualisation is built.
pyLDAvis.enable_notebook()

# Fetch three newsgroup categories; headers, footers and quoted replies are
# removed from every post so that only the message body is modelled.
newsgroups = fetch_20newsgroups(
    categories=['sci.med', 'sci.space', 'talk.politics.guns'],
    remove=('headers', 'footers', 'quotes'),
)
docs_raw = newsgroups.data
print(len(docs_raw))

1733


In [6]:
# Term-count matrix over purely alphabetic tokens of three or more letters.
# max_df=0.5 drops terms present in more than half of the documents, and
# min_df=10 drops terms present in fewer than ten documents.
tf_vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    min_df=10,
    max_df=0.5,
)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
print(dtm_tf.shape)

(1733, 2589)


In [7]:
# Build the TF-IDF vectorizer from the CountVectorizer's own parameters so
# both document-term matrices share the same tokenisation and filtering.
shared_params = tf_vectorizer.get_params()
tfidf_vectorizer = TfidfVectorizer(**shared_params)
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)



(1733, 2589)


In [8]:
# 20-topic LDA on the term-count matrix (fit() hands back the estimator).
lda_tf = LatentDirichletAllocation(n_components=20, random_state=0).fit(dtm_tf)
# 20-topic LDA on the TF-IDF matrix, using the same seed.
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0).fit(dtm_tfidf)
# Show the last fitted estimator as the cell output.
lda_tfidf

LatentDirichletAllocation(n_components=20, random_state=0)

In [9]:
# Visualise the 20-topic term-count model with the default `mds` setting.
pyLDAvis.sklearn.prepare(
    lda_model=lda_tf,
    dtm=dtm_tf,
    vectorizer=tf_vectorizer,
)

In [10]:
# Visualise the 20-topic TF-IDF model with the default `mds` setting.
pyLDAvis.sklearn.prepare(
    lda_model=lda_tfidf,
    dtm=dtm_tfidf,
    vectorizer=tfidf_vectorizer,
)

In [11]:
# Same term-count model, with the topic map laid out via mds='mmds'.
pyLDAvis.sklearn.prepare(
    lda_model=lda_tf,
    dtm=dtm_tf,
    vectorizer=tf_vectorizer,
    mds='mmds',
)

In [12]:
# Same term-count model, with the topic map laid out via mds='tsne'.
pyLDAvis.sklearn.prepare(
    lda_model=lda_tf,
    dtm=dtm_tf,
    vectorizer=tf_vectorizer,
    mds='tsne',
)

In [13]:
# Repeat both fits with n_components = 30. This rebinds lda_tf and
# lda_tfidf, so the cells below visualise the 30-topic models.
lda_tf = LatentDirichletAllocation(n_components=30, random_state=0).fit(dtm_tf)
lda_tfidf = LatentDirichletAllocation(n_components=30, random_state=0).fit(dtm_tfidf)
# Show the last fitted estimator as the cell output.
lda_tfidf

LatentDirichletAllocation(n_components=30, random_state=0)

In [14]:
# Visualise the current term-count model with the default `mds` setting.
pyLDAvis.sklearn.prepare(
    lda_model=lda_tf,
    dtm=dtm_tf,
    vectorizer=tf_vectorizer,
)

In [15]:
# Visualise the current TF-IDF model with the default `mds` setting.
pyLDAvis.sklearn.prepare(
    lda_model=lda_tfidf,
    dtm=dtm_tfidf,
    vectorizer=tfidf_vectorizer,
)

In [16]:
# Current term-count model, with the topic map laid out via mds='mmds'.
pyLDAvis.sklearn.prepare(
    lda_model=lda_tf,
    dtm=dtm_tf,
    vectorizer=tf_vectorizer,
    mds='mmds',
)

In [17]:
# Current term-count model, with the topic map laid out via mds='tsne'.
pyLDAvis.sklearn.prepare(
    lda_model=lda_tf,
    dtm=dtm_tf,
    vectorizer=tf_vectorizer,
    mds='tsne',
)

In [18]:
# Part 2

import os
from datetime import datetime

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth

# Location of the ratings file. The default is the original author's local
# path; set the RATINGS_CSV environment variable to run the notebook on
# another machine without editing this cell.
ratings_path = os.environ.get('RATINGS_CSV',
                              '/Users/brian80433/Downloads/ratings_small.csv')
ratings_raw = pd.read_csv(ratings_path)

# One transaction per user, in ascending userId order. The ids are taken from
# the data rather than from a hard-coded range(1, 672), so users with larger
# ids are no longer silently dropped.
# NOTE(review): assumes ratings_small.csv holds contiguous ids 1..671, in
# which case the transactions are identical to the hard-coded loop - confirm.
user_ids = sorted(ratings_raw['userId'].unique())

# Keep only ratings strictly above 3.
dataset = ratings_raw[ratings_raw.rating > 3]

# Collect each user's kept movies in a single grouping pass instead of
# re-filtering the whole frame once per user.
liked_by_user = {
    user_id: list(movie_ids)
    for user_id, movie_ids in dataset.groupby('userId')['movieId']
}
# A user with no kept rating still contributes an (empty) transaction, so the
# support denominator stays equal to the number of users.
total = [liked_by_user.get(user_id, []) for user_id in user_ids]

# One-hot encode the transactions: one row per user, one boolean column per
# movie id.
te = TransactionEncoder()
te_ary = te.fit(total).transform(total)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Time only the mining step.
startTime = datetime.now()
frequent_itemsets = fpgrowth(df, min_support=0.15, use_colnames=True)
# apriori(...) or fpmax(...) can be swapped in here with the same arguments.
endTime = datetime.now()
time = endTime - startTime
print("Time Consumned: ",time)
frequent_itemsets



Time Consumned:  0:00:00.160467


Unnamed: 0,support,itemsets
0,0.400894,(296)
1,0.318927,(527)
2,0.275708,(589)
3,0.274218,(50)
4,0.259314,(480)
...,...,...
190,0.154993,"(608, 260)"
191,0.154993,"(1, 318)"
192,0.156483,"(1, 356)"
193,0.150522,"(296, 1)"


In [19]:
# Mine again with a lower support threshold (0.1) and time the call.
startTime1 = datetime.now()
frequent_itemsets1 = fpgrowth(
    df,
    min_support=0.1,
    use_colnames=True,
)
endTime1 = datetime.now()
time1 = endTime1 - startTime1
print("Time Consumned: ",time1)
frequent_itemsets1

Time Consumned:  0:00:00.377831


Unnamed: 0,support,itemsets
0,0.400894,(296)
1,0.318927,(527)
2,0.275708,(589)
3,0.274218,(50)
4,0.259314,(480)
...,...,...
1427,0.110283,"(8961, 7153)"
1428,0.107303,"(4993, 8961)"
1429,0.104322,"(5952, 8961)"
1430,0.104322,"(4993, 7153, 8961)"


In [20]:
# Because the number of itemsets grows very quickly as min_support decreases, my computer was not able to finish the runs with min_support = 0.01 and 0.001; you can try them on your own machine.
# A smaller min_support requires more computation time and more RAM, so the runs with min_support = 0.01 and 0.001 take far longer.

In [21]:
from mlxtend.frequent_patterns import association_rules

# Derive rules from the min_support=0.15 itemsets, filtering on the
# confidence metric with a threshold of 0.5.
association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.5,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(296),(318),0.400894,0.426230,0.260805,0.650558,1.526308,0.089932,1.641960
1,(318),(296),0.426230,0.400894,0.260805,0.611888,1.526308,0.089932,1.543642
2,(296),(356),0.400894,0.406855,0.228018,0.568773,1.397974,0.064912,1.375482
3,(356),(296),0.406855,0.400894,0.228018,0.560440,1.397974,0.064912,1.362966
4,"(296, 356)",(318),0.228018,0.426230,0.172876,0.758170,1.778783,0.075688,2.372619
...,...,...,...,...,...,...,...,...,...
229,(1),(318),0.271237,0.426230,0.154993,0.571429,1.340659,0.039383,1.338798
230,(1),(356),0.271237,0.406855,0.156483,0.576923,1.418005,0.046129,1.401978
231,(1),(296),0.271237,0.400894,0.150522,0.554945,1.384268,0.041784,1.346139
232,(858),(1221),0.277198,0.177347,0.163934,0.591398,3.334689,0.114774,2.013334


In [22]:
# Same itemsets, with the confidence threshold raised to 0.7.
association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(296, 356)",(318),0.228018,0.426230,0.172876,0.758170,1.778783,0.075688,2.372619
1,(50),(296),0.274218,0.400894,0.204173,0.744565,1.857261,0.094241,2.345436
2,(50),(318),0.274218,0.426230,0.192250,0.701087,1.644858,0.075371,1.919523
3,"(296, 50)",(318),0.204173,0.426230,0.156483,0.766423,1.798147,0.069458,2.456455
4,"(50, 318)",(296),0.192250,0.400894,0.156483,0.813953,2.030345,0.079411,3.220194
...,...,...,...,...,...,...,...,...,...
69,"(593, 356)",(318),0.235469,0.426230,0.171386,0.727848,1.707644,0.071022,2.108273
70,"(296, 356)",(593),0.228018,0.385991,0.160954,0.705882,1.828753,0.072941,2.087630
71,(4993),(2571),0.251863,0.326379,0.184799,0.733728,2.248088,0.102596,2.529823
72,(1704),(318),0.205663,0.426230,0.156483,0.760870,1.785117,0.068823,2.399404


In [23]:
# Same itemsets, with the confidence threshold raised to 0.9.
association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.9,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(2571, 1196)",(260),0.183308,0.368107,0.165425,0.902439,2.451565,0.097948,6.4769
1,"(1210, 1196)",(260),0.216095,0.368107,0.196721,0.910345,2.473042,0.117175,7.048034
2,"(1210, 1198)",(1196),0.175857,0.299553,0.160954,0.915254,3.055401,0.108275,8.265276
3,"(1210, 1198)",(260),0.175857,0.368107,0.160954,0.915254,2.486379,0.09622,7.456334
4,"(1210, 1196, 1198)",(260),0.160954,0.368107,0.152012,0.944444,2.565677,0.092764,11.374069
5,"(1210, 260, 1198)",(1196),0.160954,0.299553,0.152012,0.944444,3.152847,0.103798,12.608048
6,(1291),(1198),0.172876,0.295082,0.157973,0.913793,3.096743,0.10696,8.177049
7,(5952),(4993),0.229508,0.251863,0.210134,0.915584,3.635249,0.15233,8.862547
8,"(5952, 2571)",(4993),0.175857,0.251863,0.165425,0.940678,3.734881,0.121133,12.611454
9,"(5952, 260)",(4993),0.153502,0.251863,0.150522,0.980583,3.893319,0.11186,38.529061


In [24]:
# As expected, the higher the confidence min_threshold, the fewer rules we obtain (233 rules at 0.5, 73 at 0.7, and 10 at 0.9).
# The results above are the rules obtained with min_threshold = 0.9.