In [16]:
# !pip install mlxtend
# !pip install surprise
import heapq
from collections import defaultdict

import matplotlib.pylab as plt
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from surprise import Dataset, KNNBasic, Reader
from surprise.model_selection import train_test_split

from pandas_util import load_data

In [20]:
# Load the Faceplate data and index the rows by the "transaction" column.
# Chaining set_index (instead of inplace=True) keeps the cell a single,
# re-runnable expression that always starts from the freshly loaded frame.
df = load_data("Faceplate.csv").set_index("transaction")
df.head()

Unnamed: 0_level_0,red,white,blue,orange,green,yellow
transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,0,1,0
2,0,1,0,1,0,0
3,0,1,1,0,0,0
4,1,1,0,1,0,0
5,1,0,1,0,0,0


In [22]:
# Create frequent itemsets: keep every item combination whose support
# (share of transactions containing it) is at least 0.2.
# The 0/1 frame is cast to bool at the call site because mlxtend's apriori is
# designed for boolean one-hot input (newer releases warn on other dtypes);
# `df` itself is left unchanged.
itemsets = apriori(df.astype(bool), min_support=0.2, use_colnames=True)
itemsets



Unnamed: 0,support,itemsets
0,0.6,(red)
1,0.7,(white)
2,0.6,(blue)
3,0.2,(orange)
4,0.2,(green)
5,0.4,"(red, white)"
6,0.4,"(red, blue)"
7,0.2,"(red, green)"
8,0.4,"(white, blue)"
9,0.2,"(white, orange)"


In [23]:
# Turn the frequent itemsets into association rules, keeping only rules
# whose confidence is at least 0.5.
rules = association_rules(itemsets, metric="confidence", min_threshold=0.5)

# Show the six rules with the highest lift, hiding the columns that are not
# needed for the discussion.
(
    rules.drop(columns=["antecedent support", "consequent support", "conviction"])
    .sort_values(by="lift", ascending=False)
    .head(6)
)

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
12,"(red, white)",(green),0.2,0.5,2.5,0.12
15,(green),"(red, white)",0.2,1.0,2.5,0.12
4,(green),(red),0.2,1.0,1.666667,0.08
14,"(white, green)",(red),0.2,1.0,1.666667,0.08
7,(orange),(white),0.2,1.0,1.428571,0.06
8,(green),(white),0.2,1.0,1.428571,0.06


In [24]:
# Interpreting the results
# If orange is purchased, then white is also purchased with 100% confidence
# (rule support 0.2, lift about 1.43).

In [30]:
df = load_data("CharlesBookClub.csv")

# Columns excluded from the incidence matrix; every remaining column is
# treated as an item (a per-customer purchase count).
ignore = [
    "seq#",
    "id#",
    "gender",
    "m",
    "r",
    "f",
    "firstpurch",
    "related_purchase",
    "mcode",
    "rcode",
    "fcode",
    "yes_florence",
    "no_florence",
]

# Create the binary incidence matrix: True when the count is positive.
# Building it as a boolean frame avoids in-place masked assignment and gives
# mlxtend's apriori the boolean one-hot input it is designed for.
count_books = df.drop(columns=ignore) > 0

# Create frequent itemsets and rules.
# An itemset must occur in at least 200 transactions; the threshold is derived
# from the actual number of rows rather than a hard-coded dataset size.
itemsets = apriori(
    count_books,
    min_support=200 / len(count_books),
    use_colnames=True,
)
rules = association_rules(
    itemsets,
    metric="confidence",
    min_threshold=0.5,
)

# Display 25 rules with the highest lift
rules.sort_values(by=["lift"], ascending=False).head(25)



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
64,"(youthbks, refbks)","(cookbks, childbks)",0.08125,0.242,0.05525,0.68,2.809917,0.035588,2.36875
73,"(doitybks, refbks)","(cookbks, childbks)",0.0925,0.242,0.06125,0.662162,2.736207,0.038865,2.24368
60,"(youthbks, doitybks)","(cookbks, childbks)",0.10325,0.242,0.067,0.64891,2.681448,0.042014,2.158993
80,"(refbks, geogbks)","(cookbks, childbks)",0.08175,0.242,0.05025,0.614679,2.539995,0.030467,1.96719
69,"(youthbks, geogbks)","(cookbks, childbks)",0.1045,0.242,0.06325,0.605263,2.501087,0.037961,1.920267
77,"(doitybks, geogbks)","(cookbks, childbks)",0.101,0.242,0.0605,0.59901,2.475248,0.036058,1.890321
68,"(cookbks, childbks, geogbks)",(youthbks),0.1095,0.23825,0.06325,0.577626,2.424452,0.037162,1.803495
70,"(cookbks, childbks, refbks)",(doitybks),0.1035,0.25475,0.06125,0.591787,2.323013,0.034883,1.825642
49,"(doitybks, geogbks)",(youthbks),0.101,0.23825,0.0545,0.539604,2.264864,0.030437,1.654554
63,"(cookbks, childbks, refbks)",(youthbks),0.1035,0.23825,0.05525,0.533816,2.240573,0.030591,1.634013


## Collaborative Filtering

In [32]:
import random

# Fixed seed so the synthetic ratings are reproducible.
random.seed(0)
nratings = 5000

# Draw each column in turn (item ids, then user ids, then ratings) from its
# inclusive integer range; the draw order determines the generated values.
random_data = pd.DataFrame(
    {
        column: [random.randint(low, high) for _ in range(nratings)]
        for column, (low, high) in {
            "item_id": (0, 99),
            "user_id": (0, 999),
            "rating": (1, 5),
        }.items()
    }
)
random_data.head()

Unnamed: 0,item_id,user_id,rating
0,49,665,1
1,97,974,5
2,53,542,5
3,5,634,3
4,33,694,2


In [33]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n highest estimates.

    Parameters
    ----------
    predictions : iterable
        Objects exposing a ``uid`` attribute (used for grouping) and an
        ``est`` attribute (used for ranking).
    n : int, default 10
        Maximum number of predictions to keep per user.

    Returns
    -------
    collections.defaultdict
        Maps each ``uid`` to a list of at most ``n`` predictions, ordered
        from highest to lowest ``est``.
    """
    grouped = defaultdict(list)
    for prediction in predictions:
        grouped[prediction.uid].append(prediction)

    def estimate_of(prediction):
        return prediction.est

    # Iterate over a snapshot of the keys while replacing each user's list
    # with its top-n selection.
    for uid in list(grouped):
        grouped[uid] = heapq.nlargest(n, grouped[uid], key=estimate_of)

    return grouped

In [34]:
# Wrap the ratings in a surprise Dataset. The columns must be passed in the
# order user id, item id, rating; ratings here range from 1 to 5.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    random_data[["user_id", "item_id", "rating"]],
    reader,
)

# Hold out 25% of the ratings as a test set (fixed random_state for
# reproducibility).
trainset, testset = train_test_split(data, test_size=0.25, random_state=1)

## User-based filtering
# Fit a KNN model that uses cosine similarity between users.
sim_options = dict(name="cosine", user_based=True)
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x11c2a0e50>

In [35]:
# Predict ratings for the held-out (user, item) pairs in the test set.
predictions = algo.test(testset)

# Keep the highest-estimated items per user. The header is built from the
# same value that is passed to get_top_n so the printed title always matches
# the number of items requested.
n_recommendations = 4
top_n = get_top_n(predictions, n=n_recommendations)

# Print the recommended items for the first five users only.
print(f"Top-{n_recommendations} recommended items for each user")
for uid, user_ratings in list(top_n.items())[:5]:
    print(f"User {uid}")
    for prediction in user_ratings:
        print("  Item {0.iid} ({0.est:.2f})".format(prediction), end="")
    print()

Top-3 recommended items for each user
User 6
  Item 6 (5.00)  Item 77 (2.50)  Item 60 (1.00)
User 222
  Item 77 (3.50)  Item 75 (2.78)
User 424
  Item 14 (3.50)  Item 45 (3.10)  Item 54 (2.34)
User 87
  Item 27 (3.00)  Item 54 (3.00)  Item 82 (3.00)  Item 32 (1.00)
User 121
  Item 98 (3.48)  Item 32 (2.83)


In [None]:
# Refit on every available rating (no hold-out split), this time using
# cosine similarity between items (user_based=False).
trainset = data.build_full_trainset()
sim_options = {"name": "cosine", "user_based": False}
algo = KNNBasic(sim_options=sim_options).fit(trainset)

# Predict rating for user 383 and item 7
algo.predict(uid=383, iid=7)