Market Basket Analysis Using the Apriori/FP-Growth Algorithm
Medhanie Yonatan Haile 
2120246059
Software Engineering NKU

# importing libraries

In [None]:
#importing libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth


# importing the sales dataset transaction

In [None]:
# Load the sales transactions; the CSV has no header row, so every line is data.
dataset = pd.read_csv(
    "Sales1998_normalized.csv",
    engine='python',
    header=None,
)
# Show the (rows, columns) dimensions of the loaded dataset
dataset.shape

In [None]:
# Derive the column count from the data itself rather than hard-coding it
num_columns = len(dataset.columns)
num_columns

 # Convert the transactions to a list

In [None]:
 # Convert the transactions to a list
# Build one list of item labels (as strings) per dataset row.
# Missing cells are skipped: str(NaN) would otherwise add a literal 'nan'
# "item" to the transaction, which the encoder would treat as a real product.
# The underlying array is fetched once instead of once per row.
dataset_rows = dataset.values
transactions = [
    [str(item) for item in row if not pd.isna(item)]
    for row in dataset_rows
]
transactions

# Verification step

In [None]:
# Report how many transaction rows were built
row_total = len(transactions)
print(f"Number of rows processed: {row_total}")
# Count the missing values in each column of the raw dataset
missing_per_column = dataset.isnull().sum()
print(missing_per_column)

# Encode the transaction dataset into a one-hot encoded DataFrame

In [None]:
# One-hot encode the transactions: one row per transaction, one column per
# unique item label learned by the encoder.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)


# Convert column names from float to integers

In [None]:
# Convert column names to integers if they are numeric, otherwise keep them as strings
def convert_column_names(column):
    """Return *column* as an int when it parses as a finite number, else as a str.

    Labels such as "177.0" become 177 (any fractional part is truncated by
    ``int``). Labels that do not parse as a number, or that parse to NaN or
    infinity, are returned as strings.
    """
    try:
        return int(float(column))  # Convert to integer if possible
    except (ValueError, OverflowError):
        # ValueError: not numeric, or NaN. OverflowError: +/- infinity.
        return str(column)  # Keep as string if conversion fails

# Relabel every column through convert_column_names
df.columns = list(map(convert_column_names, df.columns))
# Show the resulting labels for a quick visual check
print("Updated Column Names:", df.columns, sep="\n")

# Remove any rows in the dataset DataFrame that contain missing values (NaN)

In [None]:
# Drop every row that has at least one missing value.
# NOTE(review): `transactions` and `df` were already built from `dataset`
# above, so this does not change the encoded data used by the later cells.
dataset = dataset.dropna(axis=0, how="any")

# verification step

In [None]:
# Show the column labels of the encoded DataFrame
print("Columns in df:", df.columns)
# Show its (rows, columns) dimensions
print("Shape of df:", df.shape)
# Preview the first five rows
print(df.head(5))

# Check if the column '177' exists in the DataFrame for verification

In [None]:
# Look up item 177 among the encoded columns and list the transactions that contain it
if 177 not in df.columns:
    print("Item '177' is not found in the DataFrame columns.")
else:
    # Row mask: transactions in which item 177 is flagged as present
    contains_177 = df[177] == 1
    transactions_with_177 = df[contains_177]
    print("Transactions containing item 177:")
    print(transactions_with_177)

#  Analyze co-occurrence of items

In [None]:
# Calculate the co-occurrence matrix.
# TransactionEncoder yields a boolean frame, and a matrix product of boolean
# arrays collapses to True/False. Cast to integers first so each cell holds
# the number of transactions containing both items (the diagonal is each
# item's own transaction count).
item_flags = df.astype(int)
co_occurrence = item_flags.T.dot(item_flags)

# Print the co-occurrence matrix
print("Co-occurrence Matrix:")
print(co_occurrence)

In [None]:
# Keep only the items that appear in at least 5 transactions
item_counts = df.sum()
is_frequent = item_counts >= 5
frequent_items = item_counts.index[is_frequent]
df = df[frequent_items]

In [None]:
# Mine frequent itemsets with FP-Growth using a very low support threshold;
# use_colnames=True reports itemsets by column label instead of column index.
min_support = 0.0001
model = fpgrowth(df, use_colnames=True, min_support=min_support)

In [None]:
# Debug output: heading followed by the mined frequent itemsets
print("Frequent Itemsets:", model, sep="\n")

In [None]:
# Select the frequent itemsets that contain two or more items
print("Frequent Itemsets with More Than One Item:")
has_multiple_items = model['itemsets'].apply(lambda itemset: len(itemset) > 1)
larger_itemsets = model[has_multiple_items]
print(larger_itemsets)

In [None]:
""" This checks if there are any larger itemsets (itemsets with more than one item) in the frequent itemsets.
If frequent itemsets and larger itemsets exist, the code generates association rules using the `association_rules` function.
  - The rules are filtered based on the `min_confidence` threshold (set to `0.05` in this case).
  - **Confidence** measures the likelihood of the consequent being present in a transaction, given that the antecedent is present.
   If rules are generated, they are sorted by the **lift** metric in descending order.
  - Lift measures the strength of the association between the antecedent and the consequent. A lift value greater than 1 indicates a positive association.
  - The sorted rules are printed, showing key metrics such as `antecedents`, `consequents`, `support`, `confidence`, and `lift`. """

In [None]:
# Generate association rules only when multi-item frequent itemsets exist;
# otherwise explain which threshold to relax.
if model.empty:
    print("No frequent itemsets found. Try lowering the min_support value.")
elif larger_itemsets.empty:
    print("No larger itemsets found. Try lowering the min_support value.")
else:
    # Keep rules whose confidence reaches this (deliberately low) threshold
    min_confidence = 0.05
    rules = association_rules(
        model,
        metric='confidence',
        min_threshold=min_confidence,
    )

    # Debug output: every rule that passed the confidence filter
    print("Generated Rules:")
    print(rules)

    if not rules.empty:
        # Strongest associations (highest lift) first
        rules = rules.sort_values(by='lift', ascending=False)

        print("Association Rules:")
        report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
        print(rules[report_columns])
    else:
        print("No rules generated. Try lowering the min_confidence value.")