# Datasets Characteristics Analysis

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
DATASET_DIR = "datasets" 

datasets = {
    "Retail": "retail.txt",
    "Foodmart": "foodmartFIM.txt",
    "Mushroom": "mushrooms.txt", 
    "Chainstore": "chainstore.txt"
}

def get_dataset_characteristics(name, filename):
    """
    Parses an SPMF dataset and returns key characteristics.
    """
    # This joins "datasets" + "retail.txt" -> "datasets/retail.txt"
    filepath = os.path.join(DATASET_DIR, filename)
    print(f"Scanning {name}...", end=" ")
    
    # Debugging check: Print where we are looking if it fails
    if not os.path.exists(filepath):
        print(f"\n  [Error] File not found at: {os.path.abspath(filepath)}")
        return None

    transaction_lengths = []
    unique_items = set()
    
    try:
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if not line: continue
                
                # Split by space (standard SPMF)
                items = line.split()
                # Remove SPMF metadata tags like -1, -2 if they exist
                items = [x for x in items if x not in ['-1', '-2']]
                
                transaction_lengths.append(len(items))
                unique_items.update(items)
                
        # Calculations
        num_trans = len(transaction_lengths)
        num_items = len(unique_items)
        avg_len = np.mean(transaction_lengths)
        max_len = np.max(transaction_lengths)
        
        # Density Calculation: (AvgLength / TotalUniqueItems) * 100
        density = (avg_len / num_items) * 100 if num_items > 0 else 0
        
        print("Done.")
        
        return {
            "Dataset": name,
            "Transactions (|D|)": num_trans,
            "Distinct Items (|I|)": num_items,
            "Avg. Length": round(avg_len, 2),
            "Max Length": max_len,
            "Density (%)": round(density, 4),
            # Threshold for dense is usually > 5-10%
            "Type": "Dense" if density > 5.0 else "Sparse" 
        }
        
    except Exception as e:
        print(f"\n[Error] Could not read {name}: {e}")
        return None

# --- Main Execution Loop ---
# Check if the folder exists first
if not os.path.exists(DATASET_DIR):
    print(f"CRITICAL ERROR: The folder '{DATASET_DIR}' was not found in {os.getcwd()}")
    print("Please check that your notebook is in the same folder as the 'datasets' directory.")
else:
    stats_list = []

    for name, file in datasets.items():
        stats = get_dataset_characteristics(name, file)
        if stats:
            stats_list.append(stats)

    # --- Display Result ---
    if stats_list:
        df_stats = pd.DataFrame(stats_list)
        
        # Sort by Transactions to look organized
        df_stats = df_stats.sort_values(by="Transactions (|D|)")
        
        print("DATASET CHARACTERISTICS TABLE")
        display(df_stats)
    else:
        print("No datasets were analyzed successfully.")

Scanning Retail... Done.
Scanning Foodmart... Done.
Scanning Mushroom... Done.
Scanning Chainstore... Done.
DATASET CHARACTERISTICS TABLE


Unnamed: 0,Dataset,Transactions (|D|),Distinct Items (|I|),Avg. Length,Max Length,Density (%),Type
1,Foodmart,4141,1559,4.42,14,0.2838,Sparse
2,Mushroom,8416,119,23.0,23,19.3277,Dense
0,Retail,88162,16470,10.31,76,0.0626,Sparse
3,Chainstore,1112949,910126,13.45,339,0.0015,Sparse
