In [10]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

# Try to import prince (FAMD implementation)
try:
    import prince
    PRINCE_AVAILABLE = True
except ImportError:
    PRINCE_AVAILABLE = False
    print("Warning: 'prince' package not found. Install it with: pip install prince")

In [2]:
# ============================================================================
# 1. LOAD DATA
# ============================================================================
print("=" * 80)
print("DATA PROCESSING WITH FAMD")
print("=" * 80)

print("\n1. Loading raw data...")
data_path = '../../Data/raw/new_Base_CDM_balanced_V2.csv'

# Load data (skip descriptive label row)
df = pd.read_csv(data_path, sep=';', skiprows=[1])
print(f"   ✓ Loaded {len(df)} rows and {len(df.columns)} columns")

DATA PROCESSING WITH FAMD

1. Loading raw data...
   ✓ Loaded 25782 rows and 8 columns


In [3]:
# ============================================================================
# 2. PREPARE DATA - REMOVE X1, X2
# ============================================================================
print("\n2. Preparing data...")

# Target variable
target = 'Y'

# Variables to keep
# Continuous: X3, X4, X6 (X1, X2 removed)
# Categorical: X5, X7
continuous_vars = ['X3', 'X4', 'X6']
categorical_vars = ['X5', 'X7']

# Remove X1 and X2
df_processed = df.drop(['X1', 'X2'], axis=1)

# Convert continuous variables to numeric
for col in continuous_vars:
    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

print(f"   ✓ Removed X1 and X2")
print(f"   ✓ Continuous variables: {continuous_vars}")
print(f"   ✓ Categorical variables: {categorical_vars}")
print(f"   ✓ Data shape after removal: {df_processed.shape}")

# Check for missing values
missing = df_processed[continuous_vars].isnull().sum()
if missing.sum() > 0:
    print(f"\n   ⚠ Warning: Missing values in continuous variables:")
    print(missing[missing > 0])
    # Fill missing values with median
    for col in continuous_vars:
        if df_processed[col].isnull().sum() > 0:
            median_val = df_processed[col].median()
            df_processed[col].fillna(median_val, inplace=True)
            print(f"   ✓ Filled missing values in {col} with median: {median_val}")


2. Preparing data...
   ✓ Removed X1 and X2
   ✓ Continuous variables: ['X3', 'X4', 'X6']
   ✓ Categorical variables: ['X5', 'X7']
   ✓ Data shape after removal: (25782, 6)


In [4]:
# ============================================================================
# 3. NORMALIZE CONTINUOUS VARIABLES
# ============================================================================
print("\n3. Normalizing continuous variables...")

# Extract continuous variables
X_continuous = df_processed[continuous_vars].copy()

# Apply StandardScaler (Z-score normalization: mean=0, std=1)
scaler = StandardScaler()
X_continuous_scaled = scaler.fit_transform(X_continuous)
X_continuous_scaled_df = pd.DataFrame(
    X_continuous_scaled,
    columns=[f'{col}_norm' for col in continuous_vars],
    index=df_processed.index
)

print("   ✓ Applied StandardScaler (mean=0, std=1)")
print(f"   ✓ Normalized variables: {list(X_continuous_scaled_df.columns)}")
print(f"\n   Normalization statistics:")
for i, col in enumerate(continuous_vars):
    print(f"     {col}:")
    print(f"       Original - Mean: {X_continuous[col].mean():.2f}, Std: {X_continuous[col].std():.2f}")
    print(f"       Normalized - Mean: {X_continuous_scaled_df[f'{col}_norm'].mean():.4f}, Std: {X_continuous_scaled_df[f'{col}_norm'].std():.4f}")



3. Normalizing continuous variables...
   ✓ Applied StandardScaler (mean=0, std=1)
   ✓ Normalized variables: ['X3_norm', 'X4_norm', 'X6_norm']

   Normalization statistics:
     X3:
       Original - Mean: 64641.18, Std: 54924.65
       Normalized - Mean: -0.0000, Std: 1.0000
     X4:
       Original - Mean: 37.65, Std: 23.50
       Normalized - Mean: 0.0000, Std: 1.0000
     X6:
       Original - Mean: 587.86, Std: 1821.34
       Normalized - Mean: -0.0000, Std: 1.0000


In [5]:
# ============================================================================
# 4. ONE-HOT ENCODE CATEGORICAL VARIABLES
# ============================================================================
print("\n4. One-hot encoding categorical variables...")

# Extract categorical variables
X_categorical = df_processed[categorical_vars].copy()

# Apply one-hot encoding
encoder = OneHotEncoder(drop='first', sparse_output=False)  # drop='first' to avoid multicollinearity
X_categorical_encoded = encoder.fit_transform(X_categorical)

# Get feature names
feature_names = encoder.get_feature_names_out(categorical_vars)
X_categorical_encoded_df = pd.DataFrame(
    X_categorical_encoded,
    columns=feature_names,
    index=df_processed.index
)

print(f"   ✓ Applied One-Hot Encoding (drop='first')")
print(f"   ✓ Encoded variables: {list(X_categorical_encoded_df.columns)}")
print(f"   ✓ Number of encoded features: {len(feature_names)}")

# Display encoding info
print(f"\n   Encoding details:")
for col in categorical_vars:
    n_categories = X_categorical[col].nunique()
    n_encoded = len([c for c in feature_names if col in c])
    print(f"     {col}: {n_categories} categories → {n_encoded} encoded features")


4. One-hot encoding categorical variables...
   ✓ Applied One-Hot Encoding (drop='first')
   ✓ Encoded variables: ['X5_CARREFOUR', 'X5_CARREFOUR MARKET', 'X5_CASINO', 'X5_CORA', 'X5_ECOMARCHE', 'X5_FRANPRIX', 'X5_GEANT', 'X5_HYPER U', 'X5_INTERMARCHE', 'X5_LECLERC', 'X5_MARCHE U', 'X5_MATCH', 'X5_MONOPRIX', 'X5_OTHERS', 'X5_PRISUNIC', 'X5_SHOPI', 'X5_SIMPLY MARKET', 'X5_SUPER U', 'X7_No_Feat']
   ✓ Number of encoded features: 19

   Encoding details:
     X5: 19 categories → 18 encoded features
     X7: 2 categories → 1 encoded features


In [6]:
# ============================================================================
# 5. COMBINE NORMALIZED CONTINUOUS AND ENCODED CATEGORICAL
# ============================================================================
print("\n5. Combining normalized and encoded features...")

# Combine all features
X_combined = pd.concat([X_continuous_scaled_df, X_categorical_encoded_df], axis=1)

print(f"   ✓ Combined feature matrix shape: {X_combined.shape}")
print(f"   ✓ Total features: {X_combined.shape[1]}")


5. Combining normalized and encoded features...
   ✓ Combined feature matrix shape: (25782, 22)
   ✓ Total features: 22


In [13]:
if not PRINCE_AVAILABLE:
    print("✗ ERROR: Cannot apply FAMD - prince package not available")
    print("  Please install: pip install prince")
else:
    # For FAMD, we need the original data (not normalized/encoded)
    # FAMD handles mixed data internally
    X_famd_input = pd.concat([df_processed[continuous_vars], df_processed[categorical_vars]], axis=1)
    
    # Determine number of components (adjust as needed)
    # Using components that explain most variance (max 20 or n_features-1)
    max_components = min(20, X_famd_input.shape[1] - 1)
    
    print(f"Applying FAMD with up to {max_components} components...")
    
    # Create and fit FAMD
    famd = prince.FAMD(
        n_components=max_components,
        n_iter=10,
        copy=True,
        check_input=True,
        random_state=42,
        engine='sklearn'
    )
    
    # Fit and transform
    X_famd_transformed = famd.fit_transform(X_famd_input)
    
    # Convert to DataFrame
    X_famd = pd.DataFrame(
        X_famd_transformed,
        columns=[f'FAMD_{i+1}' for i in range(X_famd_transformed.shape[1])],
        index=df_processed.index
    )
    
    n_components = X_famd.shape[1]
    
    # Calculate explained variance
    # Calculate explained variance from eigenvalues
    eigenvalues = famd.eigenvalues_
    total_variance = eigenvalues.sum()
    explained_variance = eigenvalues / total_variance
    cumulative_variance = np.cumsum(explained_variance)
    # cumulative_variance = np.cumsum(explained_variance)
    
    print(f"\n✓ FAMD transformation complete")
    print(f"✓ Number of components: {n_components}")
    print(f"\nExplained variance (first 10 components):")
    for i in range(min(10, n_components)):
        print(f"  Component {i+1}: {explained_variance[i]*100:.2f}% (Cumulative: {cumulative_variance[i]*100:.2f}%)")
    
    # Find number of components for 95% variance
    n_components_95 = np.where(cumulative_variance >= 0.95)[0]
    if len(n_components_95) > 0:
        n_95 = n_components_95[0] + 1
        print(f"\n✓ {n_95} components explain 95% of variance")
        print(f"  Total variance explained: {cumulative_variance[n_95-1]*100:.2f}%")

Applying FAMD with up to 4 components...

✓ FAMD transformation complete
✓ Number of components: 4

Explained variance (first 10 components):
  Component 1: 25.53% (Cumulative: 25.53%)
  Component 2: 24.99% (Cumulative: 50.52%)
  Component 3: 24.90% (Cumulative: 75.42%)
  Component 4: 24.58% (Cumulative: 100.00%)

✓ 4 components explain 95% of variance
  Total variance explained: 100.00%


In [14]:
# ============================================================================
# 7. CREATE FINAL DATASET
# ============================================================================
print("\n7. Creating final dataset...")

# Create final DataFrame with FAMD components and target
df_final = X_famd.copy()
df_final[target] = df_processed[target].values

print(f"   ✓ Final dataset shape: {df_final.shape}")
print(f"   ✓ Columns: {list(df_final.columns)}")

# ============================================================================
# 8. SAVE PROCESSED DATA
# ============================================================================
print("\n8. Saving processed data...")

# Create output directory
output_dir = '../../Data/processed'
os.makedirs(output_dir, exist_ok=True)

# Save to CSV
output_path = os.path.join(output_dir, 'data_continuous_famd.csv')
df_final.to_csv(output_path, index=False, sep=';')

print(f"   ✓ Saved to: {output_path}")


7. Creating final dataset...
   ✓ Final dataset shape: (25782, 5)
   ✓ Columns: ['FAMD_1', 'FAMD_2', 'FAMD_3', 'FAMD_4', 'Y']

8. Saving processed data...
   ✓ Saved to: ../../Data/processed\data_continuous_famd.csv
