In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# MLP entry for the model registry: the base estimator plus its grid-search
# space. Grid keys use the 'classifier__' prefix, matching the 'classifier'
# step name of the pipeline built in the training loop.
mlp_estimator = MLPClassifier(
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    solver='adam',
    activation='relu',
)
mlp_param_grid = {
    # Candidate architectures, each with two hidden layers
    'classifier__hidden_layer_sizes': [(100, 50), (50, 25)],
    # Candidate L2 regularization strengths
    'classifier__alpha': [0.0001, 0.001, 0.01, 0.1],
}

models = {
    # ... keep other models as-is ...
    'MLP': {'model': mlp_estimator, 'params': mlp_param_grid},
}

# Modified training loop section
# For every configured model: wrap it in a preprocessing + SMOTE pipeline, tune
# it with a cross-validated grid search scored by ROC AUC and, for the MLP only,
# evaluate the best estimator found by the search on the test split.
# NOTE(review): preprocessor, cv, X_train/y_train, X_test/y_test, ImbPipeline,
# SMOTE, GridSearchCV, plt and classification_report are not defined in this
# cell and must already exist in the kernel. The notebook also contains another
# copy of this MLP training cell; consider keeping only one.
import traceback

for model_name, config in models.items():
    try:
        print(f"\n{'='*40}\nTraining {model_name}\n{'='*40}")

        pipeline = ImbPipeline([
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('classifier', config['model'])
        ])

        search = GridSearchCV(
            estimator=pipeline,
            param_grid=config['params'],
            cv=cv,
            scoring='roc_auc',
            n_jobs=-1,
            verbose=1,
            return_train_score=True
        )

        search.fit(X_train, y_train)

        # For MLP specifically
        if model_name == 'MLP':
            # Get best estimator
            best_mlp = search.best_estimator_

            # Generate predictions
            y_pred = best_mlp.predict(X_test)

            # Confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            disp = ConfusionMatrixDisplay(
                confusion_matrix=cm,
                display_labels=['Non-User', 'Long-Acting User']
            )

            # Plot with annotations on an explicit figure so it can be saved
            # and released without relying on pyplot's "current figure" state.
            fig, ax = plt.subplots()
            disp.plot(ax=ax, cmap='Blues', values_format='d')
            ax.set_title(f'MLP Confusion Matrix\n(Best Alpha: {search.best_params_["classifier__alpha"]})')

            # A failed write (e.g. the target directory does not exist) must
            # not discard the plot and the metrics printed below.
            try:
                fig.savefig(r"C:\Users\HP\Desktop\edited\mlp_confusion_matrix.png",
                            bbox_inches='tight', dpi=300)
            except OSError as save_error:
                print(f"Could not save MLP confusion matrix: {save_error}")
            plt.show()
            plt.close(fig)

            # Print metrics
            print("\nMLP Classification Report:")
            print(classification_report(y_test, y_pred))
            print(f"Best Parameters: {search.best_params_}")

    except Exception as e:
        # Best-effort: report the failure and move on to the next model, but
        # keep the traceback so the failing line can be located.
        print(f"Error training {model_name}: {str(e)}")
        traceback.print_exc()

In [6]:
import pandas as pd
import numpy as np
# Read the CSV at this fixed local path into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=r"C:\Users\HP\Desktop\PMA\merged_Sub_saharan_processed.csv")

In [7]:
# Clean yes/no columns more thoroughly
yes_no_cols = ['radio', 'tv', 'heard_implants', 'heard_IUD',
               'fp_side_effects', 'fp_ever_used', 'visited_by_health_worker',
               'visited_a_facility', 'fp_ad_radio', 'fp_ad_tv', 'fp_ad_magazine']

# A value counts as "yes" when it contains the whole word "yes", or when the
# entire value is the number one ("1", "1.0"). The previous unanchored
# 'yes|1' substring test also flagged any value that merely contained the
# digit 1 (e.g. "10", "21", "-1", "0.1").
yes_pattern = r'\byes\b|^1(?:\.0+)?$'

for col in yes_no_cols:
    # Treat missing as "No", then normalize to stripped lower-case text
    normalized = df[col].fillna('No').astype(str).str.strip().str.lower()
    # Convert to binary 0/1 integers
    df[col] = normalized.str.contains(yes_pattern, regex=True).astype(int)

# Convert numeric columns to floats
numeric_cols = ['age', 'num_HH_members', 'age_at_first_sex',
                'age_at_first_use', 'age_at_first_use_children']
for col in numeric_cols:
    # Unparseable entries become NaN and are then replaced by 0.
    # NOTE(review): filling with 0 means the KNNImputer configured for these
    # same columns in the preprocessing pipeline never sees a missing value
    # (assuming it is fed from this frame) - confirm 0 is an intended placeholder.
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

In [None]:
import shap
# Column groups routed through the preprocessing ColumnTransformer
numeric_features = ['age', 'num_HH_members', 'age_at_first_sex',
                    'age_at_first_use', 'age_at_first_use_children']
categorical_features = ['ur', 'marital_status', 'religion']

# Numeric branch: KNN imputation (5 neighbours) followed by standardization
num_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen at fit time are ignored at transform time
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numeric_features),
        ('cat', cat_pipeline, categorical_features),
    ],
    remainder='drop',  # columns outside the two lists are discarded
)

# Assemble the end-to-end model: preprocessing -> SMOTE -> XGBoost classifier
xgb_classifier = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
full_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', xgb_classifier),
])

# Train every step of the pipeline on the training split
full_pipeline.fit(X_train, y_train)

# SHAP Analysis Integration
# ===============================================================
# Pull the fitted steps out of the pipeline once
fitted_preprocessor = full_pipeline.named_steps['preprocessor']
fitted_classifier = full_pipeline.named_steps['classifier']

# Rebuild the transformed column names: numeric names are reused unchanged,
# followed by the one-hot encoder's expanded names (assumes the 'num' branch
# precedes the 'cat' branch in the ColumnTransformer).
numeric_features_processed = numeric_features
categorical_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
categorical_features_processed = categorical_encoder.get_feature_names_out(categorical_features)
all_feature_names = np.concatenate([numeric_features_processed, categorical_features_processed])

# Explain the fitted classifier on the preprocessed training rows
explainer = shap.TreeExplainer(fitted_classifier)
X_processed = fitted_preprocessor.transform(X_train)
shap_values = explainer.shap_values(X_processed)

# Bar-style summary plot of the SHAP values
shap.summary_plot(shap_values, X_processed, feature_names=all_feature_names, plot_type='bar')
shap.initjs()

# Rank features by mean absolute SHAP value, largest first
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_df = pd.DataFrame({'features': all_feature_names, 'importance': mean_abs_shap})
shap_df = shap_df.sort_values('importance', ascending=False)

print("\nTop 10 Important Features by SHAP Values:")
top_features = shap_df.head(10)
print(top_features.to_string(index=False))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# MLP entry for the model registry: the base estimator plus its grid-search
# space. Grid keys use the 'classifier__' prefix, matching the 'classifier'
# step name of the pipeline built in the training loop.
mlp_estimator = MLPClassifier(
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    solver='adam',
    activation='relu',
)
mlp_param_grid = {
    # Candidate architectures, each with two hidden layers
    'classifier__hidden_layer_sizes': [(100, 50), (50, 25)],
    # Candidate L2 regularization strengths
    'classifier__alpha': [0.0001, 0.001, 0.01, 0.1],
}

models = {
    # ... keep other models as-is ...
    'MLP': {'model': mlp_estimator, 'params': mlp_param_grid},
}

# Modified training loop section
# For every configured model: wrap it in a preprocessing + SMOTE pipeline, tune
# it with a cross-validated grid search scored by ROC AUC and, for the MLP only,
# evaluate the best estimator found by the search on the test split.
# NOTE(review): cv, X_train/y_train, X_test/y_test, ImbPipeline, SMOTE,
# GridSearchCV, plt and classification_report are not defined in this cell and
# must already exist in the kernel. The notebook also contains another copy of
# this MLP training cell; consider keeping only one.
import traceback

for model_name, config in models.items():
    try:
        print(f"\n{'='*40}\nTraining {model_name}\n{'='*40}")

        pipeline = ImbPipeline([
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('classifier', config['model'])
        ])

        search = GridSearchCV(
            estimator=pipeline,
            param_grid=config['params'],
            cv=cv,
            scoring='roc_auc',
            n_jobs=-1,
            verbose=1,
            return_train_score=True
        )

        search.fit(X_train, y_train)

        # For MLP specifically
        if model_name == 'MLP':
            # Get best estimator
            best_mlp = search.best_estimator_

            # Generate predictions
            y_pred = best_mlp.predict(X_test)

            # Confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            disp = ConfusionMatrixDisplay(
                confusion_matrix=cm,
                display_labels=['Non-User', 'Long-Acting User']
            )

            # Plot with annotations on an explicit figure so it can be saved
            # and released without relying on pyplot's "current figure" state.
            fig, ax = plt.subplots()
            disp.plot(ax=ax, cmap='Blues', values_format='d')
            ax.set_title(f'MLP Confusion Matrix\n(Best Alpha: {search.best_params_["classifier__alpha"]})')

            # A failed write (e.g. the target directory does not exist) must
            # not discard the plot and the metrics printed below.
            try:
                fig.savefig(r"C:\Users\HP\Desktop\edited\mlp_confusion_matrix.png",
                            bbox_inches='tight', dpi=300)
            except OSError as save_error:
                print(f"Could not save MLP confusion matrix: {save_error}")
            plt.show()
            plt.close(fig)

            # Print metrics
            print("\nMLP Classification Report:")
            print(classification_report(y_test, y_pred))
            print(f"Best Parameters: {search.best_params_}")

    except Exception as e:
        # Best-effort: report the failure and move on to the next model, but
        # keep the traceback so the failing line can be located.
        print(f"Error training {model_name}: {str(e)}")
        traceback.print_exc()