In [1]:
import pandas as pd
import dask.dataframe as dd  # 🔹 Using Dask for large datasets
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Only the columns needed downstream are read from each CSV.
cols_to_use = ["Invoice", "StockCode", "InvoiceDate"]

# Dask infers column dtypes from a sample at the start of each file. Both
# identifier columns are forced to 'object' so that a later partition whose
# values look different from the sampled rows cannot raise a dtype-mismatch
# error when the graph is computed.
csv_dtypes = {'Invoice': 'object', 'StockCode': 'object'}

df1 = dd.read_csv("/kaggle/input/new-data-sales/Year 2009-2010.csv",
                  usecols=cols_to_use, dtype=csv_dtypes)

df2 = dd.read_csv("/kaggle/input/new-data-sales/Year 2010-2011.csv",
                  usecols=cols_to_use, dtype=csv_dtypes)

# Stack the two yearly files row-wise into a single lazy Dask DataFrame.
df = dd.concat([df1, df2], axis=0)

df["InvoiceDate"] = dd.to_datetime(df["InvoiceDate"])
df["StockCode"] = df["StockCode"].astype("category")

# Materialise the lazy Dask graph into an in-memory pandas DataFrame.
# NOTE(review): despite the name, no rows are filtered at this point.
df_filtered = df.compute()

def prepare_transaction_data(ds: pd.DataFrame, min_items=2, min_freq=10) -> list:
    """Build per-invoice item lists suitable for frequent-itemset mining.

    Args:
        ds: DataFrame with at least the columns ``'Invoice'`` and ``'StockCode'``.
        min_items: Minimum number of *distinct* items a transaction must hold
            to be kept.
        min_freq: Minimum number of rows a StockCode must appear in (counted
            over all of ``ds``) to be kept.

    Returns:
        A list of transactions, each a list of distinct StockCode values, one
        per invoice, in the order produced by grouping on ``'Invoice'``.
    """
    # Drop rarely sold products first; frequency is counted per input row.
    product_counts = ds['StockCode'].value_counts()
    frequent_products = product_counts[product_counts >= min_freq].index
    ds_filtered = ds.loc[ds['StockCode'].isin(frequent_products), ['Invoice', 'StockCode']]

    # An invoice can list the same product on several lines. Without
    # de-duplication those repeats would inflate the basket size and let
    # single-product invoices pass the ``min_items`` filter.
    ds_filtered = ds_filtered.drop_duplicates()

    transactions = ds_filtered.groupby('Invoice')['StockCode'].apply(list).tolist()
    return [t for t in transactions if len(t) >= min_items]

# Build the per-invoice item lists using the function's default thresholds.
transactions = prepare_transaction_data(df_filtered)

def encode_transactions(transactions: list) -> pd.DataFrame:
    """Encode transaction lists as a sparse pandas DataFrame.

    A ``TransactionEncoder`` is fitted on ``transactions`` and then used to
    transform the same data with ``sparse=True``. The resulting sparse matrix
    is wrapped in a DataFrame whose columns come from the encoder's
    ``columns_`` attribute.

    Args:
        transactions: List of transactions, each a list of item labels.

    Returns:
        A sparse-backed DataFrame with one row per transaction.
    """
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    sparse_matrix = encoder.transform(transactions, sparse=True)
    return pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=encoder.columns_)

# Encode the transaction lists as a sparse pandas DataFrame.
df_encoded = encode_transactions(transactions)

def find_product_associations_apriori(df: pd.DataFrame, min_support: float = 0.01) -> pd.DataFrame:
    """Mine frequent itemsets from an encoded transaction table.

    Thin wrapper around mlxtend's ``apriori`` that always requests column
    names (``use_colnames=True``) in the reported itemsets.

    Args:
        df: Encoded transaction DataFrame passed straight to ``apriori``.
        min_support: Minimum support threshold forwarded to ``apriori``.

    Returns:
        The DataFrame returned by ``apriori``.
    """
    itemsets = apriori(df, min_support=min_support, use_colnames=True)
    return itemsets

# Mine frequent itemsets with the function's default minimum support (0.01).
frequent_itemsets = find_product_associations_apriori(df_encoded)

def generate_association_rules(frequent_itemsets: pd.DataFrame, min_lift: float = 1.0) -> pd.DataFrame:
    """Derive association rules from frequent itemsets, thresholded on lift.

    Args:
        frequent_itemsets: Frequent-itemset DataFrame to feed to
            ``association_rules``.
        min_lift: Minimum lift a rule must reach to be returned.

    Returns:
        The rules DataFrame from ``association_rules``, or an empty
        ``pd.DataFrame()`` when ``frequent_itemsets`` is empty.
    """
    if not frequent_itemsets.empty:
        # NOTE(review): num_itemsets receives the number of frequent itemsets;
        # mlxtend documents this argument as the number of transactions in the
        # original data -- confirm this is the intended value.
        return association_rules(
            frequent_itemsets,
            metric="lift",
            min_threshold=min_lift,
            support_only=False,
            num_itemsets=len(frequent_itemsets),
        )

    # Nothing to derive rules from: hand back an empty frame.
    return pd.DataFrame()

# Derive rules from the mined itemsets and persist them only when any exist.
rules = generate_association_rules(frequent_itemsets)

if rules.empty:
    print("⚠️ No association rules found.")
else:
    # Written to the working directory without the DataFrame index.
    rules.to_csv("optimized_association_rules.csv", index=False)
    print("✅ Association rules saved as 'optimized_association_rules.csv'!")


  return pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)


✅ Association rules saved as 'optimized_association_rules.csv'!


In [3]:
pip install --upgrade mlxtend


  and should_run_async(code)


Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Collecting scikit-learn>=1.3.1 (from mlxtend)
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn, mlxtend
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstallin