In [None]:
#--------------------------------------------------
# 3. Analysis of Merchants Data
#--------------------------------------------------

# Cell: Load Merchants Data
# Read the merchants table located under DATA_PATH and report its
# (rows, columns) shape.
print("\n=== Loading Merchants Data ===")
merchants_csv = DATA_PATH + 'merchants.csv'
merchants = pd.read_csv(merchants_csv)
print(f"Merchants shape: {merchants.shape}")

# Cell: Examine Merchants Data - Overview
# Show the first rows of the table, then its per-column summary.
print("\n=== Merchants Data Overview ===")
print(merchants.head())
print("\nColumn information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
merchants.info()

# Cell: Examine Merchants Data - Statistics
# Descriptive statistics of the table; describe() is called with its defaults.
print("\nStatistical summary (numeric columns):")
numeric_summary = merchants.describe()
print(numeric_summary)

# Cell: Examine Merchants Data - Missing Values
# Per-column count of null cells, followed by the same figure expressed as a
# percentage of the number of rows.
print("\nMissing values in merchants data:")
missing_merchants = merchants.isnull().sum()
print(missing_merchants)

missing_merchants_pct = missing_merchants / len(merchants) * 100
# Print the label and the Series separately: interpolating a multi-line Series
# into an f-string puts its first row on the label line and misaligns the rest.
print("Percentage of missing values:")
print(missing_merchants_pct)

# Cell: Explore Merchant Categories
# Bar chart of category_1 value counts, then each value's share in percent.
# The whole cell is skipped when the column is not present.
if 'category_1' in merchants.columns:
    # Create the figure only after the column check; creating it beforehand
    # leaves an empty, never-shown figure behind when the column is missing.
    plt.figure(figsize=(12, 6))
    merchants['category_1'].value_counts().plot(kind='bar')
    plt.title('Distribution of Merchant Category 1')
    plt.ylabel('Count')
    plt.show()

    # Print percentages
    cat1_pct = merchants['category_1'].value_counts(normalize=True) * 100
    print("Category 1 Distribution (%):")
    print(cat1_pct)

# Cell: Explore More Merchant Categories
# category_2: bar chart of value counts, then each value's share in percent.
# Nothing is drawn or printed when the column is not present.
if 'category_2' in merchants.columns:
    cat2_values = merchants['category_2']

    plt.figure(figsize=(12, 6))
    cat2_values.value_counts().plot.bar()
    plt.title('Distribution of Merchant Category 2')
    plt.ylabel('Count')
    plt.show()

    # Relative frequencies scaled from fractions to percentages.
    cat2_pct = cat2_values.value_counts(normalize=True) * 100
    print("Category 2 Distribution (%):")
    print(cat2_pct)

# category_3: same treatment -- count bar chart followed by percentage shares.
if 'category_3' in merchants.columns:
    cat3_values = merchants['category_3']
    cat3_counts = cat3_values.value_counts()

    plt.figure(figsize=(12, 6))
    cat3_counts.plot.bar()
    plt.title('Distribution of Merchant Category 3')
    plt.ylabel('Count')
    plt.show()

    # Relative frequencies scaled from fractions to percentages.
    cat3_pct = cat3_values.value_counts(normalize=True) * 100
    print("Category 3 Distribution (%):")
    print(cat3_pct)

# Cell: Explore Merchant Category Combinations
# Heatmap of the joint category_1 x category_2 distribution, in percent.
# Runs only when both columns are present.
if {'category_1', 'category_2'}.issubset(merchants.columns):
    # normalize='all' normalizes over the whole table; multiply by 100 to
    # turn the fractions into percentages.
    category_crosstab = 100 * pd.crosstab(
        index=merchants['category_1'],
        columns=merchants['category_2'],
        normalize='all',
    )

    plt.figure(figsize=(12, 8))
    sns.heatmap(
        category_crosstab,
        annot=True,
        fmt='.1f',
        cmap='YlGnBu',
    )
    plt.title('Percentage Distribution of Category 1 vs Category 2')
    plt.show()

# Cell: Save key insights from merchants
# Keep a few lightweight facts about the table (ids, shape, column names) so
# the DataFrame itself can be released at the end of this cell.
merchant_ids = merchants['merchant_id'].copy()  # independent copy; stays usable after `del merchants`
merchant_shape = merchants.shape
merchant_columns = merchants.columns.tolist()

# Summary of merchants data
print("\n=== Merchants Data Summary ===")
print(f"Shape: {merchant_shape}")
print(f"Columns: {merchant_columns}")
# Count distinct ids: len(merchant_ids) is the number of rows, which overstates
# the number of unique merchants whenever a merchant_id appears more than once.
print(f"Number of unique merchants: {merchant_ids.nunique()}")

# Clear memory
del merchants
gc.collect()
print("Merchants data cleared from memory")