In [1]:
# ELO Merchant Category Recommendation - Exploratory Data Analysis (Merchant.csv)

In [2]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from IPython.display import display

# Set the visualization style
sns.set(style="whitegrid")
plt.style.use('fivethirtyeight')


# Define data path - adjust as needed
DATA_PATH = '../data/raw/'  # Add your path here if needed

In [3]:
# Cell 2: Load Merchant Data
print("\n=== Loading Merchant Data ===")
merchant = pd.read_csv(DATA_PATH + 'merchants.csv')
print(f"Merchant dataset shape: {merchant.shape}")


=== Loading Merchant Data ===
Merchant dataset shape: (334696, 22)


In [4]:
# Cell 3: Basic Merchant Data Overview
print("\n=== Merchant Data Overview ===")
print(merchant.head())
print("\nColumn information:")
print(merchant.info())



=== Merchant Data Overview ===
       merchant_id  merchant_group_id  merchant_category_id  subsector_id  \
0  M_ID_838061e48c               8353                   792             9   
1  M_ID_9339d880ad               3184                   840            20   
2  M_ID_e726bbae1e                447                   690             1   
3  M_ID_a70e9c5f81               5026                   792             9   
4  M_ID_64456c37ce               2228                   222            21   

   numerical_1  numerical_2 category_1 most_recent_sales_range  \
0    -0.057471    -0.057471          N                       E   
1    -0.057471    -0.057471          N                       E   
2    -0.057471    -0.057471          N                       E   
3    -0.057471    -0.057471          Y                       E   
4    -0.057471    -0.057471          Y                       E   

  most_recent_purchases_range  avg_sales_lag3  ...  avg_sales_lag6  \
0                           E         

In [5]:
# Cell 4: Statistical Summary
print("\nStatistical summary:")
print(merchant.describe())



Statistical summary:
       merchant_group_id  merchant_category_id   subsector_id    numerical_1  \
count      334696.000000         334696.000000  334696.000000  334696.000000   
mean        31028.736143            423.131663      25.116404       0.011476   
std         31623.043426            252.898046       9.807371       1.098154   
min             1.000000             -1.000000      -1.000000      -0.057471   
25%          3612.000000            222.000000      19.000000      -0.057471   
50%         19900.000000            373.000000      27.000000      -0.057471   
75%         51707.250000            683.000000      33.000000      -0.047556   
max        112586.000000            891.000000      41.000000     183.735111   

         numerical_2  avg_sales_lag3  avg_purchases_lag3  active_months_lag3  \
count  334696.000000   334683.000000        3.346960e+05       334696.000000   
mean        0.008103       13.832993                 inf            2.994108   
std         1.070

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


In [6]:
# Cell 5: Missing Values Analysis
print("\nMissing values in merchant data:")
missing_values = merchant.isnull().sum()
missing_percentages = (missing_values / len(merchant) * 100).round(2)

# Create a DataFrame with both counts and percentages
missing_data = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage (%)': missing_percentages
})

# Only display columns that actually have missing values
missing_cols = missing_data[missing_data['Missing Values'] > 0]

if missing_cols.empty:
    print("No missing values found in any column.")
else:
    print(missing_cols)
    
    # Simple summary
    total_missing = missing_values.sum()
    print(f"\nTotal missing values: {total_missing}")


Missing values in merchant data:
                 Missing Values  Percentage (%)
avg_sales_lag3               13            0.00
avg_sales_lag6               13            0.00
avg_sales_lag12              13            0.00
category_2                11887            3.55

Total missing values: 11926


In [7]:
# Cell 6: Categorical Columns Analysis
# First identify categorical columns (will be adjusted based on actual data)
cat_columns = merchant.select_dtypes(include=['object']).columns.tolist()

if cat_columns:
    print("\n=== Categorical Columns Analysis ===")
    
    for col in cat_columns:
        print(f"\n{col} Value Counts:")
        value_counts = merchant[col].value_counts().head(20)  # Show top 20 values
        print(value_counts)
        
        # Calculate cardinality and uniqueness
        unique_count = merchant[col].nunique()
        unique_percent = (unique_count / len(merchant) * 100).round(2)
        print(f"Unique values: {unique_count} ({unique_percent}% of total rows)")
        
        # Only create visualizations for low-cardinality categorical features (adjust threshold as needed)
        if unique_count <= 20:  # Only visualize if 20 or fewer categories
            plt.figure(figsize=(12, 6))
            
            # Create DataFrame for plotting
            plot_df = pd.DataFrame({'value': value_counts.index, 'count': value_counts.values})
            
            # Create the plot with proper hue
            ax = sns.barplot(x='value', y='count', hue='value', data=plot_df, palette='viridis', legend=False)
            
            plt.title(f'Distribution of {col}', fontsize=14)
            plt.xlabel(col, fontsize=12)
            plt.ylabel('Count', fontsize=12)
            plt.xticks(rotation=45)
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.show()


=== Categorical Columns Analysis ===

merchant_id Value Counts:
merchant_id
M_ID_dbbf07ebf0    4
M_ID_6464db3b45    4
M_ID_30340088f2    4
M_ID_d123532c72    4
M_ID_ebbdb42da6    4
M_ID_42697d5d44    4
M_ID_1802942aaf    4
M_ID_c2b9ac2ea4    4
M_ID_ef233cff26    4
M_ID_992a180b15    4
M_ID_bd49e37dda    4
M_ID_00a6ca8a8a    2
M_ID_d2b5d4418d    2
M_ID_fea38c640b    2
M_ID_28c2aace87    2
M_ID_b794b9d9e8    2
M_ID_c470fbcfb9    2
M_ID_0c4018d3a0    2
M_ID_6017075769    2
M_ID_ae9fe1605a    2
Name: count, dtype: int64


AttributeError: 'float' object has no attribute 'round'

In [None]:
# Cell 7: Numerical Columns Analysis
num_columns = merchant.select_dtypes(include=['int64', 'float64']).columns.tolist()

if num_columns:
    print("\n=== Numerical Columns Analysis ===")
    
    for col in num_columns:
        print(f"\n{col} Statistics:")
        print(merchant[col].describe())
        
        plt.figure(figsize=(12, 5))
        
        # Distribution plot
        plt.subplot(1, 2, 1)
        sns.histplot(merchant[col].dropna(), kde=True)
        plt.title(f'Distribution of {col}')
        plt.grid(True, alpha=0.3)
        
        # Box plot
        plt.subplot(1, 2, 2)
        sns.boxplot(y=merchant[col].dropna())
        plt.title(f'Boxplot of {col}')
        plt.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()

In [None]:
# Cell 8: Correlation Analysis (if multiple numerical columns exist)
if len(num_columns) > 1:
    print("\n=== Correlation Analysis ===")
    
    # Calculate correlation matrix
    corr_matrix = merchant[num_columns].corr()
    
    # Create heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Correlation Matrix of Numerical Features', fontsize=16)
    plt.tight_layout()
    plt.show()


In [None]:
# Cell 9: Summary of Findings
print("\n=== Merchant Data Summary ===")
print(f"Total merchants: {len(merchant)}")
print(f"Total columns: {merchant.shape[1]}")
print(f"Categorical columns: {len(cat_columns)}")
print(f"Numerical columns: {len(num_columns)}")

if not missing_cols.empty:
    print(f"Columns with missing values: {len(missing_cols)}")
    print("Top 3 columns with most missing values:")
    print(missing_cols.sort_values(by='Missing Values', ascending=False).head(3))

# Clear memory
merchant_shape = merchant.shape
del merchant
gc.collect()
print("Merchant data cleared from memory")

In [None]:
#--------------------------------------------------
# 3. Analysis of Merchants Data
#--------------------------------------------------

# Cell: Load Merchants Data
print("\n=== Loading Merchants Data ===")
merchants = pd.read_csv(DATA_PATH + 'merchants.csv')
print(f"Merchants shape: {merchants.shape}")

# Cell: Examine Merchants Data - Overview
print("\n=== Merchants Data Overview ===")
print(merchants.head())
print("\nColumn information:")
print(merchants.info())

# Cell: Examine Merchants Data - Statistics
print("\nStatistical summary (numeric columns):")
print(merchants.describe())

# Cell: Examine Merchants Data - Missing Values
print("\nMissing values in merchants data:")
missing_merchants = merchants.isnull().sum()
print(missing_merchants)
print(f"Percentage of missing values: {missing_merchants / len(merchants) * 100}")

# Cell: Explore Merchant Categories
# Check merchant categories
plt.figure(figsize=(12, 6))
if 'category_1' in merchants.columns:
    merchants['category_1'].value_counts().plot(kind='bar')
    plt.title('Distribution of Merchant Category 1')
    plt.ylabel('Count')
    plt.show()
    
    # Print percentages
    cat1_pct = merchants['category_1'].value_counts(normalize=True) * 100
    print("Category 1 Distribution (%):")
    print(cat1_pct)

# Cell: Explore More Merchant Categories
if 'category_2' in merchants.columns:
    plt.figure(figsize=(12, 6))
    merchants['category_2'].value_counts().plot(kind='bar')
    plt.title('Distribution of Merchant Category 2')
    plt.ylabel('Count')
    plt.show()
    
    # Print percentages
    cat2_pct = merchants['category_2'].value_counts(normalize=True) * 100
    print("Category 2 Distribution (%):")
    print(cat2_pct)

if 'category_3' in merchants.columns:
    plt.figure(figsize=(12, 6))
    merchants['category_3'].value_counts().plot(kind='bar')
    plt.title('Distribution of Merchant Category 3')
    plt.ylabel('Count')
    plt.show()
    
    # Print percentages
    cat3_pct = merchants['category_3'].value_counts(normalize=True) * 100
    print("Category 3 Distribution (%):")
    print(cat3_pct)

# Cell: Explore Merchant Category Combinations
if 'category_1' in merchants.columns and 'category_2' in merchants.columns:
    # Create cross-tabulation
    category_crosstab = pd.crosstab(
        merchants['category_1'], 
        merchants['category_2'],
        normalize='all'
    ) * 100
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(category_crosstab, annot=True, cmap='YlGnBu', fmt='.1f')
    plt.title('Percentage Distribution of Category 1 vs Category 2')
    plt.show()

# Cell: Save key insights from merchants
merchant_ids = merchants['merchant_id'].copy()
merchant_shape = merchants.shape
merchant_columns = merchants.columns.tolist()

# Summary of merchants data
print("\n=== Merchants Data Summary ===")
print(f"Shape: {merchant_shape}")
print(f"Columns: {merchant_columns}")
print(f"Number of unique merchants: {len(merchant_ids)}")

# Clear memory
del merchants
gc.collect()
print("Merchants data cleared from memory")