In [1]:
import pandas as pd
from itertools import combinations

df = pd.read_csv('/content/groceries_dataset.csv')

df.head()


Unnamed: 0,Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,Item 10,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,...,,,,,,,,,,
1,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
2,whole milk,,,,,,,,,,...,,,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# -----------------------------
# 1. Load & Preprocess Dataset
# -----------------------------
df = pd.read_csv('/content/groceries_dataset.csv')

# Replace spaces in item names with underscores
df = df.applymap(lambda x: x.replace(" ", "_") if isinstance(x, str) else x)

# Create "documents" (each transaction as string of items)
transactions = df.astype(str).apply(lambda x: ' '.join(x.dropna()), axis=1)

# -----------------------------
# 2. TF-IDF Transformation
# -----------------------------
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(transactions)
items = vectorizer.get_feature_names_out()

# Convert to DataFrame
tfidf_df = pd.DataFrame(X.toarray(), columns=items)

# Binary presence for support/confidence/lift
binary_df = (tfidf_df > 0).astype(int)
n_transactions = len(binary_df)

# -----------------------------
# 3. Rule Calculation Function
# -----------------------------
def compute_rule(item_a, item_b):
    support_a = binary_df[item_a].sum() / n_transactions
    support_b = binary_df[item_b].sum() / n_transactions
    support_ab = ((binary_df[item_a] & binary_df[item_b]).sum()) / n_transactions

    if support_a == 0 or support_b == 0:
        return None

    confidence = support_ab / support_a
    lift = confidence / support_b

    return {
        "antecedent": item_a.replace("_", " "),
        "consequent": item_b.replace("_", " "),
        "support": round(support_ab, 3),
        "confidence": round(confidence, 3),
        "lift": round(lift, 3)
    }

# -----------------------------
# 4. Generate Rules
# -----------------------------
rules = []
for i, item_a in enumerate(items):
    for j, item_b in enumerate(items):
        if i != j:
            rule = compute_rule(item_a, item_b)
            if rule:
                rules.append(rule)

rules_df = pd.DataFrame(rules)

# Sort by lift (strongest relationships)
rules_df = rules_df.sort_values(by="lift", ascending=False).reset_index(drop=True)

# -----------------------------
# 5. Save to Pickle
# -----------------------------
with open("tfidf_rules.pkl", "wb") as f:
    pickle.dump(rules_df, f)

print("✅ Rules generated and saved to tfidf_rules.pkl")

# -----------------------------
# 6. Load Pickle Later
# -----------------------------
with open("tfidf_rules.pkl", "rb") as f:
    loaded_rules = pickle.load(f)

print("Top 5 rules from pickle:")
print(loaded_rules.head())


  df = df.applymap(lambda x: x.replace(" ", "_") if isinstance(x, str) else x)


✅ Rules generated and saved to tfidf_rules.pkl
Top 5 rules from pickle:
    antecedent   consequent  support  confidence     lift
0  flower soil   fertilizer    0.002         1.0  517.632
1   fertilizer  flower soil    0.002         1.0  517.632
2    sweetener        artif    0.003         1.0  307.344
3        artif    sweetener    0.003         1.0  307.344
4       prunes         nuts    0.003         1.0  298.030
