In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [18]:
# Read the raw transaction file: one transaction per line, items separated by
# whitespace. Each line becomes a list of string tokens (a blank line becomes []).
with open("transaction.txt", "r") as file:
    # str.split() with no separator already discards leading/trailing whitespace.
    transactions = [line.split() for line in file]

In [19]:
# Parse every token of every transaction into an int item ID.
# A token that is not a valid integer literal raises ValueError here.
transactions = [list(map(int, tokens)) for tokens in transactions]

In [20]:
# Item-ID combinations treated as sensitive. A transaction that contains every
# item of any one tuple is labelled 1 by is_sensitive() and has that tuple's
# items stripped by remove_sensitive_items(). Order matters for the removal
# step, because each match is tested against the already-reduced transaction.
sensitive_itemsets = [
    (24458, 36306),
    (24458, 25608),
    (24458, 35918),
    (24458, 43336),
    (11788, 24458),
    (10550, 11776, 11788),
    (10550, 36308),
    (10550, 11788, 36308),
    (10550, 11788),
    (37994, 43331),
    (11777, 12408),
    (11774, 11777),
    (11777, 43338),
    (11776, 11777),
    (11777, 36308),
    (11777, 35918),
    (11777, 11788, 35918),
    (11777, 11788),
    (11777, 43337),
    (11777, 11788, 43337),
    (25608, 26826),
    (11776, 25608),
    (25608, 64866),
    (25608, 43338),
    (12408, 25608),
    (25608, 43028),
    (25608, 37996),
    (25608, 36306),
    (25608, 36306, 36308),
    (25608, 36305),
    (11788, 37996, 38176),
    (36308, 37996, 38176),
    (36306, 38176),
    (11788, 36306, 38176),
    (36306, 36308, 38176),
    (11776, 38176),
    (11776, 11788, 38176),
    (36305, 38176),
    (11788, 36305, 38176),
    (36305, 36308, 38176),
    (11776, 37996, 64866),
    (36306, 37996, 64866),
    (37996, 43336, 64866),
    (11788, 37996, 64866),
    (36308, 37996, 64866),
    (43336, 64866),
    (11776, 43336, 64866),
    (36306, 43336, 64866),
    (36306, 36308, 43336, 64866),
    (11788, 43336, 64866),
]

In [21]:
def is_sensitive(transaction, sensitive_itemsets):
    """Return 1 if the transaction fully contains at least one sensitive itemset.

    Args:
        transaction: Iterable of hashable items (e.g. integer item IDs).
        sensitive_itemsets: Iterable of itemsets; each itemset is an iterable
            of items (tuples in this notebook).

    Returns:
        int: 1 when every item of some itemset occurs in the transaction,
        0 otherwise. An empty itemset counts as contained.
    """
    # Build the lookup set once: each membership probe becomes O(1) instead of
    # rescanning the whole transaction for every item of every itemset, and a
    # one-shot iterable is consumed exactly once rather than on the first probe.
    present = set(transaction)
    if any(all(item in present for item in itemset) for itemset in sensitive_itemsets):
        return 1
    return 0

In [22]:
def remove_sensitive_items(transaction, sensitive_itemsets):
    """Strip the items of every sensitive itemset that the transaction fully contains.

    Itemsets are applied in order, and each one is checked against what is left
    after the earlier removals, so an itemset that shared an item with an
    earlier match no longer matches. The input list is never modified; if no
    itemset matches, the very same list object is returned.
    """
    remaining = transaction
    for pattern in sensitive_itemsets:
        fully_present = True
        for member in pattern:
            if member not in remaining:
                fully_present = False
                break
        if not fully_present:
            continue
        # Drop every occurrence of every item belonging to the matched itemset.
        remaining = [entry for entry in remaining if entry not in pattern]
    return remaining

In [9]:
# Label each ORIGINAL transaction (1 = contains a sensitive itemset, 0 = not),
# then build a sanitised copy of each transaction with the matched items removed.
labels = [
    is_sensitive(original, sensitive_itemsets)
    for original in transactions
]
cleaned_transactions = [
    remove_sensitive_items(original, sensitive_itemsets)
    for original in transactions
]


In [23]:
# Render each cleaned transaction as one space-separated string of item IDs.
transactions_str = [
    " ".join(str(item) for item in cleaned)
    for cleaned in cleaned_transactions
]

In [24]:
# One row per transaction: the cleaned item string and its 0/1 sensitivity label.
df = pd.DataFrame(
    data={
        "transaction": transactions_str,
        "label": labels,
    }
)

In [25]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["transaction"],
    df["label"],
    test_size=0.3,
    random_state=42,
)

In [26]:
# Each document is a space-separated list of item IDs. CountVectorizer's
# default token_pattern only keeps tokens of two or more word characters, so a
# single-digit item ID would be dropped silently; accept one-character tokens too.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
# Learn the vocabulary from the training split only, then reuse it for the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [27]:
# Multinomial Naive Bayes over the item-count features, fitted on the training split.
model = MultinomialNB()
model.fit(X=X_train_vec, y=y_train)

In [28]:
# Predicted 0/1 labels for the held-out split.
y_pred = model.predict(X=X_test_vec)

In [29]:
# Overall accuracy on the held-out split, followed by the per-class report.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

Accuracy: 0.7978900255754475

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88     13258
           1       0.32      0.28      0.30      2382

    accuracy                           0.80     15640
   macro avg       0.59      0.59      0.59     15640
weighted avg       0.79      0.80      0.79     15640



In [30]:
# Persist the cleaned transactions: one per line, items separated by single
# spaces. A transaction left with no items is written as an empty line.
output_file = "Naive_transaction2.txt"
with open(output_file, "w") as file:
    file.writelines(
        " ".join(str(item) for item in cleaned) + "\n"
        for cleaned in cleaned_transactions
    )