In [2]:
import pandas as pd
import numpy as np # Useful for generating sequences

print("Starting data processing...")

# --- Step 1: Read the CSV ---
# Assuming no header row in the CSV, columns are indexed 0, 1, 2...
try:
    df = pd.read_csv('data.csv', header=None)
    print(f"Successfully read data.csv. Initial shape: {df.shape}")
    if df.empty:
        print("Error: data.csv is empty.")
        exit()
except FileNotFoundError:
    print("Error: data.csv not found. Please check the file path.")
    exit()
except pd.errors.EmptyDataError:
	print("Error: data.csv is empty.")
	exit()
except Exception as e:
    print(f"An error occurred while reading data.csv: {e}")
    exit()

# --- Step 2: Define which columns contain items ---
# ***** CRITICAL: Decide if original column 0 should be dropped *****
# If original column 0 (the first one in the CSV) IS NOT an item and should be dropped:
columns_to_drop = [0]
# If original column 0 IS an item, make this list empty:
# columns_to_drop = []
# Add any other original column indices if they need dropping.

item_columns = [col for col in df.columns if col not in columns_to_drop]

# Keep only the item columns for processing
# Use .copy() to avoid potential SettingWithCopyWarning later
if not item_columns:
    print("Error: No item columns selected after dropping specified columns.")
    exit()

item_df = df[item_columns].copy()
print(f"Selected item columns. Shape for processing: {item_df.shape}")

# --- Step 3: Process Rows to Get Lists of Actual Items (Efficiently) ---
# Use apply row-wise. Handles original NaNs correctly.
# - dropna() removes the NaNs (ignores missing items).
# - astype(str) converts valid items to strings (good practice).
# - tolist() converts the remaining items in the row to a list.
transactions = item_df.apply(lambda row: row.dropna().astype(str).tolist(), axis=1)

# --- Step 4: Create Final DataFrame with TransactionID ---
# Create Transaction IDs (starting from 1) matching the number of rows processed
transaction_ids = np.arange(1, len(transactions) + 1)

# Create the final DataFrame
final_df = pd.DataFrame({
    'TransactionID': transaction_ids,
    'Items': transactions # Assign the Series directly
})

# Remove rows where the 'Items' list might be empty (if a row had only NaNs)
final_df = final_df[final_df['Items'].map(len) > 0]

# Set TransactionID as index
final_df.set_index('TransactionID', inplace=True)

print(f"\nProcessing complete. Final DataFrame shape: {final_df.shape}")
print("Final DataFrame (head):")
final_df

# --- Ready for Apriori ---
# You can now use 'final_df['Items'].tolist()' or the DataFrame itself
# depending on your Apriori library requirements.
# For example, for mlxtend:
# from mlxtend.preprocessing import TransactionEncoder
# from mlxtend.frequent_patterns import apriori

# item_list = final_df['Items'].tolist()
# te = TransactionEncoder()
# te_ary = te.fit(item_list).transform(item_list)
# apriori_df = pd.DataFrame(te_ary, columns=te.columns_)
# frequent_itemsets = apriori(apriori_df, min_support=0.01, use_colnames=True) # Adjust min_support
# print("\nFrequent Itemsets:")
# print(frequent_itemsets)

Starting data processing...
Successfully read data.csv. Initial shape: (7501, 21)
Selected item columns. Shape for processing: (7501, 20)

Processing complete. Final DataFrame shape: (7501, 1)
Final DataFrame (head):


Unnamed: 0_level_0,Items
TransactionID,Unnamed: 1_level_1
1,"[shrimp, almonds, avocado, vegetables mix, gre..."
2,"[burgers, meatballs, eggs]"
3,[chutney]
4,"[turkey, avocado]"
5,"[mineral water, milk, energy bar, whole wheat ..."
...,...
7497,"[butter, light mayo, fresh bread]"
7498,"[burgers, frozen vegetables, eggs, french frie..."
7499,[chicken]
7500,"[escalope, green tea]"
