# Module 11: Association Rules Mining and Recommendation Systems 
## Case Study – 1


In [20]:
# Install mlxtend if not already installed
# 1. Import Libraries
try:
    from mlxtend.frequent_patterns import apriori, association_rules
except ImportError:
    import sys
    !{sys.executable} -m pip install mlxtend
    # !{sys.executable} -m pip install --force-reinstall --upgrade mlxtend

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import warnings
warnings.filterwarnings('ignore')

# Load datasets
df_ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='ISO-8859-1')
df_books = pd.read_csv('BX-Books.csv', encoding='ISO-8859-1')
df_users = pd.read_csv('BX-Users.csv', encoding='ISO-8859-1')

# Check data
print("Ratings Data:\n", df_ratings.head(), '\n','-'*60)
print("Books Data:\n", df_books.head(), '\n','-'*60)
print("Users Data:\n", df_users.head(), '\n','-'*60)

# Check the null values 
print(df_ratings.isnull().sum(), '\n')
print(df_books.isnull().sum(),'\n')
print(df_users.isnull().sum(),'\n')


print(df_ratings.info(), '\n')
print(df_books.info(),'\n')
print(df_users.info(),'\n')


Ratings Data:
    user_id        isbn  rating
0   276725  034545104X       0
1   276726   155061224       5
2   276727   446520802       0
3   276729  052165615X       3
4   276729   521795028       6 
 ------------------------------------------------------------
Books Data:
         isbn                                         book_title  \
0  195153448                                Classical Mythology   
1    2005018                                       Clara Callan   
2   60973129                               Decision in Normandy   
3  374157065  Flu: The Story of the Great Influenza Pandemic...   
4  393045218                             The Mummies of Urumchi   

            book_author year_of_publication                   publisher  
0    Mark P. O. Morford                2002     Oxford University Press  
1  Richard Bruce Wright                2001       HarperFlamingo Canada  
2          Carlo D'Este                1991             HarperPerennial  
3      Gina Bari Kolata 

In [2]:
# 2. Cleaning the Data:

# Replace missing Location values with 'Unknown Location'
df_users['Location'].fillna('Unknown Location', inplace=True)

# Replace missing Age values with the median age
df_users['Age'].fillna(df_users['Age'].median(), inplace=True)

# Replace missing values in df_books
df_books['book_author'].fillna('Unknown Author', inplace=True)
df_books['publisher'].fillna('Unknown Publisher', inplace=True)

# Verify updated null values
print("Updated Null Values in df_books:\n", df_books.isnull().sum(), '\n')
print("Updated Null Values in df_users:\n", df_users.isnull().sum(), '\n')


Updated Null Values in df_books:
 isbn                   0
book_title             0
book_author            0
year_of_publication    0
publisher              0
dtype: int64 

Updated Null Values in df_users:
 user_id     0
Location    0
Age         0
dtype: int64 



In [13]:
# 3. Data Volume: reduce memory usage and focus on meaningful data

# The MemoryError occurs because the pivot operation generates a massive matrix 
# (61K rows × 128K columns), which consumes too much memory. 
# Focusing on a smaller subset of the data will address this.

# To reduce memory usage and focus on meaningful data, you can filter users 
# who have interacted with a significant number of books 
# (e.g., users who have rated or rented a certain threshold of books). 
# This helps limit the size of the user-book matrix while keeping high-quality data.

# Step 1: Filter ratings (remove 0 ratings)
df_ratings_filtered = df_ratings[df_ratings['rating'] > 0]

# Step 2: Count the number of books each user interacted with
user_book_counts = df_ratings_filtered['user_id'].value_counts()

# Step 3: Filter users who interacted with significant books (threshold = 5)
threshold = 20
users_significant = user_book_counts[user_book_counts >= threshold].index

# Filter the ratings data for these users
df_ratings_filtered = df_ratings_filtered[df_ratings_filtered['user_id'].isin(users_significant)]

# Step 4: Merge with books data
df_merged = pd.merge(df_ratings_filtered, df_books, on='isbn', how='left')

# Keep only necessary columns
df_merged = df_merged[['user_id', 'book_title']]
print(df_merged)

# Step 5: Build the User-Book Matrix
user_book_matrix = df_merged.pivot_table(index='user_id', 
                                         columns='book_title', 
                                         aggfunc='size', 
                                         fill_value=0)
'''
This code creates a matrix (a table) where:

Rows (index='user_id') represent each user.
Columns (columns='book_title') represent each book title.
Values inside the table represent how many times a user interacted with (e.g., rented, rated) a book.

What is aggfunc='size'?
aggfunc: Stands for aggregation function. It tells how to combine or summarize the data.
'size': Instead of summing or averaging values, 'size' counts the number of occurrences of each combination.

fill_value=0: If a user hasn't interacted with a book, the value will be 0 instead of leaving it blank.
'''

# Convert to binary format (1 if user interacted with a book)
user_book_matrix = user_book_matrix.applymap(lambda x: 1 if x > 0 else 0)

print("User-Book Matrix (Filtered):\n", user_book_matrix.head())
print(f"Shape of the User-Book Matrix: {user_book_matrix.shape}")



        user_id                                         book_title
0        276847                                                NaN
1        276847                              Der Stein der Kelten.
2        276847                                         Nordermoor
3        276847                        Nur der Tod ist ohne Makel.
4        276847                                  Der Kleine Hobbit
...         ...                                                ...
227252   250709            Ghost in the Shell (Ghost in the Shell)
227253   250709                    The Bubblegum Crisis: Grand Mal
227254   250709  The Hidden Army: The Untold Story of Japan's M...
227255   250709  The Global Me: New Cosmopolitans and the Compe...
227256   250709                                       Dragon Dance

[227257 rows x 2 columns]
User-Book Matrix (Filtered):
 book_title   A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)  \
user_id          

In [12]:
user_book_matrix

book_title,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",Dark Justice,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",Final Fantasy Anthology: Official Strategy Guide (Brady Games),Flight of Fancy: American Heiresses (Zebra Ballad Romance),Garfield Bigger and Better (Garfield (Numbered Paperback)),God's Little Promise Book,"Good Wives: Image and Reality in the Lives of Women in Northern New England, 1650-1750","Goosebumps Monster Edition 1: Welcome to Dead House, Stay Out of the Basement, and Say Cheese and Die!",Highland Desire (Zebra Splendor Historical Romances),...,Ã?ngeles fugaces (Falling Angels),Ã?Â?. Kolumnen.,Ã?Â?a,Ã?Â?ber das Fernsehen.,Ã?Â?ber den ProzeÃ?Â? der Zivilisation 1.,Ã?Â?ber den ProzeÃ?Â? der Zivilisation 2.,Ã?Â?ber die Freiheit.,Ã?Â?rger mit Produkt X. Roman.,Ã?Â?stlich der Berge.,Ã?Â?thique en toc
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
183,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
242,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
507,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278356,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278582,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278633,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# 4. Apply Association Rule Mining
# We will use the Apriori algorithm to find frequently rented books and generate association rules.

# Step 1: Apply the Apriori algorithm
min_support = 0.01 # min_support stands for minimum support: threshold that determines how frequently an item or itemset must appear in the dataset to be considered "frequent.
frequent_itemsets = apriori(user_book_matrix, min_support=min_support, use_colnames=True)

print("\nFrequent Itemsets:")
print(frequent_itemsets)

# Step 2: Generate association rules with a dummy 'num_itemsets'
#rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0, num_itemsets=len(frequent_itemsets))

# Step 3: Sort and display the rules
rules = rules.sort_values(by="confidence", ascending=False)
print("\nTop Association Rules:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())


0.23.3

Frequent Itemsets:
      support                                           itemsets
0    0.021137                                             (1984)
1    0.028580                              (1st to Die: A Novel)
2    0.021137                                       (2nd Chance)
3    0.010717                            (84 Charing Cross Road)
4    0.015779                               (A Bend in the Road)
..        ...                                                ...
376  0.014290  (Harry Potter and the Sorcerer's Stone (Book 1...
377  0.011015  (Harry Potter and the Sorcerer's Stone (Harry ...
378  0.010420  (Harry Potter and the Order of the Phoenix (Bo...
379  0.014290  (Harry Potter and the Chamber of Secrets (Book...
380  0.013397  (Harry Potter and the Chamber of Secrets (Book...

[381 rows x 2 columns]

Top Association Rules:
                                           antecedents  \
125  (Harry Potter and the Chamber of Secrets (Book...   
104  (Harry Potter and the Or

In [8]:
rules[ (rules['lift'] >= 10) &
      (rules['confidence'] >= 0.8) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
125,(Harry Potter and the Chamber of Secrets (Book...,(Harry Potter and the Prisoner of Azkaban (Boo...,0.013695,0.045847,0.013397,0.978261,21.337521,1.0,0.012769,43.891039,0.966368,0.290323,0.977216,0.635234
104,(Harry Potter and the Order of the Phoenix (Bo...,(Harry Potter and the Prisoner of Azkaban (Boo...,0.010717,0.045847,0.01042,0.972222,21.205808,1.0,0.009928,34.349509,0.963166,0.225806,0.970888,0.599747
93,(Harry Potter and the Sorcerer's Stone (Book 1...,(Harry Potter and the Prisoner of Azkaban (Boo...,0.014885,0.045847,0.01429,0.96,20.939221,1.0,0.013608,23.853826,0.966631,0.307692,0.958078,0.635844
127,(Harry Potter and the Sorcerer's Stone (Book 1...,(Harry Potter and the Chamber of Secrets (Book...,0.01429,0.053885,0.013397,0.9375,17.398135,1.0,0.012627,15.137839,0.956186,0.244565,0.93394,0.593059
58,(Harry Potter and the Sorcerer's Stone (Book 1...,(Harry Potter and the Chamber of Secrets (Book...,0.014885,0.053885,0.013695,0.92,17.07337,1.0,0.012892,11.826436,0.955655,0.248649,0.915444,0.587072
76,(Harry Potter and the Sorcerer's Stone (Book 1...,(Harry Potter and the Chamber of Secrets (Book...,0.016969,0.053885,0.015481,0.912281,16.930115,1.0,0.014566,10.78571,0.957176,0.27957,0.907285,0.599787
110,(Harry Potter and the Chamber of Secrets (Book...,(Harry Potter and the Prisoner of Azkaban (Boo...,0.015779,0.045847,0.01429,0.90566,19.753982,1.0,0.013567,10.114022,0.964597,0.301887,0.901127,0.608674
132,(Harry Potter and the Sorcerer's Stone (Book 1...,(Harry Potter and the Chamber of Secrets (Book...,0.014885,0.031557,0.013397,0.9,28.519811,1.0,0.012927,9.68443,0.979517,0.405405,0.896741,0.662264
111,(Harry Potter and the Chamber of Secrets (Book...,(Harry Potter and the Goblet of Fire (Book 4)),0.016374,0.042274,0.01429,0.872727,20.644302,1.0,0.013598,7.524986,0.967401,0.322148,0.867109,0.605378
124,(Harry Potter and the Chamber of Secrets (Book...,(Harry Potter and the Goblet of Fire (Book 4)),0.015481,0.042274,0.013397,0.865385,20.470612,1.0,0.012742,7.114532,0.966106,0.302013,0.859443,0.591143
