### 1. Load the Data (assuming you downloaded the dataset and saved it as `dataset.csv`)
Note: This might take a minute or two to read because it's a large file.

In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Location of the transactions file, resolved relative to the notebook's working directory.
DATA_PATH = 'dataset.csv'

print("Loading data... (this might take a moment)")
df = pd.read_csv(DATA_PATH)

Loading data... (this might take a moment)


In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 08:26,3.39,17850.0,United Kingdom


### 2. Data Cleaning

In [6]:
# Clean the raw transactions without mutating the frame in place:
#   * drop rows that have no invoice number,
#   * strip leading/trailing whitespace from product descriptions,
#   * normalise the invoice number to a string so it can be prefix-matched.
df = (
    df.dropna(axis=0, subset=['InvoiceNo'])
      .assign(
          Description=lambda frame: frame['Description'].str.strip(),
          InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'),
      )
)
# Remove credit transactions: invoices *starting* with 'C' are cancellations.
# startswith() is used instead of contains() so that only the prefix is tested
# (contains() performs an unanchored regex search anywhere in the value).
df = df[~df['InvoiceNo'].str.startswith('C')]

### 3. Create the Basket (Focusing on France for speed/memory safety)

In [7]:
print("Structuring the market basket...")

# Restrict the analysis to invoices from France.
france_orders = df[df['Country'] == "France"]

# Total quantity bought per (invoice, product) pair.
quantity_per_item = france_orders.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()

# Pivot products into columns: one row per invoice, one column per product,
# with 0 where the invoice does not contain the product.
# NOTE(review): 'POSTAGE' is kept as an ordinary item and appears in the mined
# rules below - consider whether it should be excluded from the basket.
basket = (
    quantity_per_item
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

# Convert counts to a 0/1 flag - we only care IF they bought it, not how many
def encode_units(x):
    """Map a summed quantity to a purchase flag.

    Parameters
    ----------
    x : number
        Total quantity of one product on one invoice.

    Returns
    -------
    int
        1 if the quantity is strictly positive, otherwise 0. Zero, negative
        and NaN quantities all map to 0, so the function never falls through
        and returns None.
    """
    # NaN compares False against everything, so it lands in the 0 branch.
    return 1 if x > 0 else 0

# DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map;
# fall back to applymap only on older versions that do not have DataFrame.map.
# The check is done on the class so a product column named "map" cannot shadow it.
if hasattr(pd.DataFrame, "map"):
    basket_sets = basket.map(encode_units)
else:
    basket_sets = basket.applymap(encode_units)
# mlxtend's apriori accepts 0/1 or True/False; boolean input is the recommended dtype.
basket_sets = basket_sets.astype(bool)

Structuring the market basket...


  basket_sets = basket.applymap(encode_units)


### 4. The Algorithm (Apriori)

In [8]:
# Minimum support: an itemset must occur in at least 7% of the baskets to be kept.
MIN_SUPPORT = 0.07

print("Mining frequent itemsets...")
frequent_itemsets = apriori(
    basket_sets,
    min_support=MIN_SUPPORT,
    use_colnames=True,  # report product names rather than column positions
)

Mining frequent itemsets...




### 5. Generate Rules

In [9]:
print("Generating rules...")
# Filter candidate rules on lift, keeping those that meet the threshold of 1.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

Generating rules...


### Show the top rules sorted by Confidence (How likely Y is if X happens)

In [10]:
# Columns shown in the report, then the five rules with the highest confidence.
report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
top_by_confidence = (
    rules[report_columns]
    .sort_values(by='confidence', ascending=False)
    .head(5)
)

print("\n--- Top 5 Association Rules ---")
print(top_by_confidence)


--- Top 5 Association Rules ---
                                           antecedents  \
17                        (JUMBO BAG WOODLAND ANIMALS)   
122  (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...   
121  (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...   
127  (POSTAGE, SET/6 RED SPOTTY PAPER CUPS, SET/20 ...   
128  (POSTAGE, SET/6 RED SPOTTY PAPER PLATES, SET/2...   

                         consequents   support  confidence      lift  
17                         (POSTAGE)  0.076531    1.000000  1.306667  
122    (SET/6 RED SPOTTY PAPER CUPS)  0.099490    0.975000  7.077778  
121  (SET/6 RED SPOTTY PAPER PLATES)  0.099490    0.975000  7.644000  
127  (SET/6 RED SPOTTY PAPER PLATES)  0.081633    0.969697  7.602424  
128    (SET/6 RED SPOTTY PAPER CUPS)  0.081633    0.969697  7.039282  


In [11]:
# 1. Sort by lift (strongest associations first)
rules_sorted = rules.sort_values(by='lift', ascending=False)

# 2. Print the top 10 rules
print("\n--- TOP RULES BY LIFT ---")
print(rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))

# 3. Save to CSV
# The antecedents/consequents columns hold frozensets, which to_csv would write
# as their Python repr ("frozenset({...})"). Export a copy in which each itemset
# is a sorted, " | "-separated list of product names; rules_sorted itself is
# left unchanged for any later cells.
rules_export = rules_sorted.assign(
    antecedents=rules_sorted['antecedents'].map(lambda items: " | ".join(sorted(items))),
    consequents=rules_sorted['consequents'].map(lambda items: " | ".join(sorted(items))),
)
rules_export.to_csv('market_basket_rules_france.csv', index=False)
print("\n✅ Success! Saved to 'market_basket_rules_france.csv'")


--- TOP RULES BY LIFT ---
                                           antecedents  \
81                          (ALARM CLOCK BAKELIKE RED)   
80               (POSTAGE, ALARM CLOCK BAKELIKE GREEN)   
3                         (ALARM CLOCK BAKELIKE GREEN)   
2                           (ALARM CLOCK BAKELIKE RED)   
78                 (ALARM CLOCK BAKELIKE RED, POSTAGE)   
83                        (ALARM CLOCK BAKELIKE GREEN)   
6                           (ALARM CLOCK BAKELIKE RED)   
7                          (ALARM CLOCK BAKELIKE PINK)   
124                    (SET/6 RED SPOTTY PAPER PLATES)   
121  (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...   

                                           consequents   support  confidence  \
81               (POSTAGE, ALARM CLOCK BAKELIKE GREEN)  0.071429    0.756757   
80                          (ALARM CLOCK BAKELIKE RED)  0.071429    0.848485   
3                           (ALARM CLOCK BAKELIKE RED)  0.079082    0.815789   
2             