In [1]:
# =============================================================================
# ANALYZE CHURN DATA FOR STREAMLIT APP
# Find min/max/mean/median for numerical features and unique values for categorical features
# =============================================================================


import pandas as pd
import json
import os

# 1. Setting up: Load the data
# The CSV is looked up relative to the current working directory, first under
# data/ and then under ../data/; the first location that exists is used.
candidate_paths = ["data/strat_train_set.csv", "../data/strat_train_set.csv"]
data_path = next((path for path in candidate_paths if os.path.exists(path)), None)

if data_path is None:
    # Fail early and name every probed location, rather than letting read_csv
    # report only the last fallback path.
    raise FileNotFoundError(
        f"Training data not found. Looked in: {', '.join(candidate_paths)} "
        f"(working directory: {os.getcwd()})"
    )

print(f"📂 Loading data from: {data_path}")
df = pd.read_csv(data_path)

# Lower-case the column names; the feature names used later in this notebook
# are all written in lower case.
df.columns = df.columns.str.lower()

print(f"✅ Data loaded. Shape: {df.shape}")

# 2. Define feature lists
# Columns that get a min / max / mean / median summary.
numerical_features = [
    "credit_score", "age", "tenure",
    "balance", "num_of_products", "estimated_salary",
]

# Columns that get a list of distinct values plus their frequencies.
categorical_features = ["geography", "gender", "has_cr_card", "is_active_member"]

# 3. Create data schema dictionary
# Starts with one empty sub-dictionary per feature kind; the analysis steps
# add one entry per feature.
data_schema = dict(numerical={}, categorical={})

# Reusable horizontal rules for the console report.
banner_rule = "=" * 80
section_rule = "-" * 80

print("\n" + banner_rule)
print("ANALYZING CHURN DATA FOR STREAMLIT APP")
print(banner_rule)

# 4. Analyze numerical features
print("\n" + section_rule)
print("NUMERICAL FEATURES")
print(section_rule)
print(f"{'Feature':<25} {'Min':<15} {'Max':<15} {'Mean':<15} {'Median':<15}")
print(section_rule)

for column_name in numerical_features:
    column = df[column_name]

    # float() turns the pandas/NumPy scalars into built-in floats before they
    # are stored in the schema.
    summary = {
        "min": float(column.min()),
        "max": float(column.max()),
        "mean": float(column.mean()),
        "median": float(column.median()),
    }
    data_schema["numerical"][column_name] = summary

    # One table row: left-aligned name, then each statistic with two decimals.
    cells = " ".join(f"{summary[stat]:<15.2f}" for stat in ("min", "max", "mean", "median"))
    print(f"{column_name:<25} {cells}")

# 5. Analyze categorical features
print("\n" + "-" * 80)
print("CATEGORICAL FEATURES")
print("-" * 80)

for feature in categorical_features:
    # Drop missing values before listing the distinct ones: value_counts()
    # below skips them by default, so the two schema fields stay consistent,
    # and no NaN ends up in the JSON file.
    observed = df[feature].dropna().unique().tolist()

    # tolist() normally yields built-in Python values already. Any element that
    # still exposes .item() (a NumPy scalar) is unwrapped individually, which
    # keeps its real type and also works when the column has no values at all.
    unique_values = [
        value.item() if hasattr(value, "item") else value
        for value in observed
    ]

    value_counts = df[feature].value_counts().to_dict()

    data_schema["categorical"][feature] = {
        "unique_values": unique_values,
        # Keys are stringified and counts cast to built-in ints for json.dump.
        "value_counts": {str(k): int(v) for k, v in value_counts.items()}
    }

    print(f"\n{feature}:")
    print(f"  Unique values: {unique_values}")

# 6. Save the data schema to a JSON file
# Write the schema into the same directory the CSV was loaded from, so the
# output lands in the right place whichever candidate data path was used.
# (`or "."` covers a data_path with no directory component.)
output_dir = os.path.dirname(data_path) or "."

output_file = os.path.join(output_dir, "data_schema.json")

# Explicit encoding keeps the file identical across platforms.
with open(output_file, 'w', encoding="utf-8") as f:
    json.dump(data_schema, f, indent=2)

print("\n" + "=" * 80)
print(f"✓ Data schema saved to {output_file}")
print("=" * 80)

📂 Loading data from: ../data/strat_train_set.csv
✅ Data loaded. Shape: (8000, 13)

ANALYZING CHURN DATA FOR STREAMLIT APP

--------------------------------------------------------------------------------
NUMERICAL FEATURES
--------------------------------------------------------------------------------
Feature                   Min             Max             Mean            Median         
--------------------------------------------------------------------------------
credit_score              350.00          850.00          650.75          652.00         
age                       18.00           92.00           38.95           37.00          
tenure                    0.00            10.00           5.02            5.00           
balance                   0.00            238387.56       76381.21        97055.15       
num_of_products           1.00            4.00            1.53            1.00           
estimated_salary          11.58           199992.48       99730.81    