In [None]:
# Identify numeric feature columns. The respondent ID and the four target
# columns are removed so they are never treated as predictors.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
exclude_cols = ['respondentPk', 'has_wonky_study', 'has_wonky_task', 
                'wonky_study_count', 'wonky_task_instances']
numeric_cols = [col for col in numeric_cols if col not in exclude_cols]

# Correlation of every numeric feature with each count target.
# Each target is appended on its own so the other target columns never enter
# the matrix. The resulting Series still contain the target's self-correlation
# entry (1.0); later cells that consume these Series expect that entry.
corr_study_count = df[numeric_cols + ['wonky_study_count']].corr()['wonky_study_count'].sort_values(ascending=False)
corr_task_instances = df[numeric_cols + ['wonky_task_instances']].corr()['wonky_task_instances'].sort_values(ascending=False)

# Display view: remove the target by label instead of assuming it is the first
# row (a feature tied at 1.0 could sort ahead of it), and drop NaN correlations
# (e.g. constant columns), which sort_values places last and which would
# otherwise fill the "negative" tail.
corr_study_features = corr_study_count.drop('wonky_study_count').dropna()

print("=" * 80)
print("TOP CORRELATIONS WITH wonky_study_count")
print("=" * 80)
print("\nTop 20 positive correlations:")
display(corr_study_features.head(20))
print("\nTop 20 negative correlations:")
display(corr_study_features.tail(20))

In [None]:
# Group comparison for the binary study flag: respondents with
# has_wonky_study == 1 versus has_wonky_study == 0, over every metric listed in
# test_features.
print("=" * 80, "STATISTICAL TESTS: Features vs has_wonky_study", "=" * 80, sep="\n")
print("\nComparing respondents with wonky studies (1) vs without (0)")

# compare_groups_statistically is a project helper defined outside this cell;
# the code below relies on it returning a DataFrame with at least the columns
# 'mw_p_value' and 'mw_significant' (presumably Mann-Whitney results - confirm
# against the helper).
stats_results_study = compare_groups_statistically(
    df=df,
    group_col='has_wonky_study',
    metrics=test_features,
    group1_value=1, group2_value=0,
    significance_level=0.05,
    include_welch=True,
)

# Smallest p-values first, so head() shows the strongest results.
stats_results_study = stats_results_study.sort_values(by='mw_p_value')

print(f"\n✓ Completed tests on {len(stats_results_study)} features")
print(f"\nSignificant features (p < 0.05): {stats_results_study['mw_significant'].sum()}")

display(stats_results_study.head(20))

In [None]:
# Group comparison for the binary task flag: respondents with
# has_wonky_task == 1 versus has_wonky_task == 0, over every metric listed in
# test_features.
print("=" * 80, "STATISTICAL TESTS: Features vs has_wonky_task", "=" * 80, sep="\n")
print("\nComparing respondents with wonky tasks (1) vs without (0)")

# compare_groups_statistically is a project helper defined outside this cell;
# the code below relies on it returning a DataFrame with at least the columns
# 'mw_p_value' and 'mw_significant' (presumably Mann-Whitney results - confirm
# against the helper).
stats_results_task = compare_groups_statistically(
    df=df,
    group_col='has_wonky_task',
    metrics=test_features,
    group1_value=1, group2_value=0,
    significance_level=0.05,
    include_welch=True,
)

# Smallest p-values first, so head() shows the strongest results.
stats_results_task = stats_results_task.sort_values(by='mw_p_value')

print(f"\n✓ Completed tests on {len(stats_results_task)} features")
print(f"\nSignificant features (p < 0.05): {stats_results_task['mw_significant'].sum()}")

display(stats_results_task.head(20))

In [None]:
# Visualize statistical test results: one horizontal-bar panel per binary
# target, where bar length is -log10(mw_p_value) of the top significant features.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Statistical Significance: has_wonky_study',
                    'Statistical Significance: has_wonky_task'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}]]
)


def _add_significance_bars(fig, results, results_name, color, trace_name, col, alpha=0.05):
    """Add the top significant features of `results` to subplot (1, col).

    Parameters:
        fig: the two-panel figure created above.
        results: DataFrame expected to hold 'metric', 'mw_p_value' and a
            boolean 'mw_significant' column, already sorted by p-value.
        results_name: variable name used in warning messages.
        color: bar colour.
        trace_name: name of the bar trace.
        col: 1-based subplot column to draw into.
        alpha: significance threshold drawn as a reference line.

    Returns:
        The (at most 15-row) frame of significant features that was plotted;
        an empty frame when nothing could be plotted.
    """
    required = ('mw_significant', 'mw_p_value', 'metric')
    if len(results) == 0 or not all(name in results.columns for name in required):
        print(f"Warning: {results_name} is empty or missing required columns")
        return results.head(0)

    top_sig = results[results['mw_significant']].head(15)
    if len(top_sig) == 0:
        print(f"Note: {results_name} has no significant features to plot")
        return top_sig

    # A p-value that underflowed to exactly 0 would give -log10(0) = inf and an
    # undrawable bar; clip to the smallest positive float instead.
    safe_p = top_sig['mw_p_value'].clip(lower=np.finfo(float).tiny)
    fig.add_trace(
        go.Bar(x=-np.log10(safe_p),
               y=top_sig['metric'],
               orientation='h',
               marker_color=color,
               name=trace_name),
        row=1, col=col
    )
    # The bars are horizontal, so -log10(p) lives on the x-axis: the threshold
    # must be a vertical line. A horizontal line would be positioned on the
    # categorical feature axis and carry no meaning.
    fig.add_vline(x=-np.log10(alpha), line_dash="dash", line_color="red",
                  annotation_text=f"p={alpha}", row=1, col=col)
    return top_sig


# Top significant features for wonky_study (left) and wonky_task (right)
top_sig_study = _add_significance_bars(
    fig, stats_results_study, 'stats_results_study', 'steelblue', 'wonky_study', col=1
)
top_sig_task = _add_significance_bars(
    fig, stats_results_task, 'stats_results_task', 'coral', 'wonky_task', col=2
)

fig.update_layout(
    height=600,
    title_text="Top Significant Features (-log10 p-value)",
    showlegend=False
)

fig.update_xaxes(title_text="-log10(p-value)", row=1, col=1)
fig.update_xaxes(title_text="-log10(p-value)", row=1, col=2)

# Categories are drawn bottom-up by default; reverse so the first (most
# significant) feature appears at the top of each panel.
fig.update_yaxes(autorange="reversed")

fig.show()

In [None]:
# Collect up to six significant metric names per target for the distribution
# plots. Both lists start empty and are filled only when the corresponding
# results frame has rows and carries the columns read here.
top_features_study = []
top_features_task = []

if len(stats_results_study) > 0 and all(
    name in stats_results_study.columns for name in ('mw_significant', 'metric')
):
    # Row order of the results frame is kept, so the first six significant rows win.
    top_features_study = (
        stats_results_study
        .loc[stats_results_study['mw_significant'], 'metric']
        .head(6)
        .tolist()
    )

if len(stats_results_task) > 0 and all(
    name in stats_results_task.columns for name in ('mw_significant', 'metric')
):
    top_features_task = (
        stats_results_task
        .loc[stats_results_task['mw_significant'], 'metric']
        .head(6)
        .tolist()
    )

print(f"Top features for wonky_study analysis: {top_features_study}")
print(f"Top features for wonky_task analysis: {top_features_task}")

In [None]:
# Box plots comparing feature distributions between has_wonky_study groups
print("=" * 80)
print("DISTRIBUTION COMPARISONS: has_wonky_study")
print("=" * 80)

# Use significant features if available, otherwise fall back to the features
# with the highest correlation to wonky_study_count.
if len(top_features_study) == 0:
    print("No significant features found. Using top correlated features instead...")
    # Remove the target by label rather than by position (a feature tied at 1.0
    # could sort ahead of it) and ignore undefined (NaN) correlations.
    fallback_corr = corr_study_count.drop('wonky_study_count', errors='ignore').dropna()
    if len(fallback_corr) > 0:
        top_features_study = fallback_corr.head(6).index.tolist()
        print(f"Using top correlated features: {top_features_study}")

if len(top_features_study) > 0:
    for feature in top_features_study[:6]:
        if feature not in df.columns:
            print(f"Skipping {feature}: column not found in df")
            continue
        # Plotting is best-effort: a failure on one feature is reported and the
        # remaining features are still plotted.
        try:
            # Accept every NumPy numeric dtype (int32, float32, unsigned, ...),
            # not only int64/float64; same selector as used for numeric_cols.
            if df[[feature]].select_dtypes(include=[np.number]).shape[1] == 0:
                print(f"Skipping {feature}: not numeric")
                continue

            # Remove NaN values for plotting
            plot_df = df[['has_wonky_study', feature]].dropna()
            if len(plot_df) == 0:
                print(f"Skipping {feature}: no valid data after removing NaNs")
                continue

            fig = create_box_plot(
                df=plot_df,
                x='has_wonky_study',
                y=feature,
                title=f'{feature} by Wonky Study Status',
                labels={'has_wonky_study': 'Has Wonky Study', feature: feature}
            )
            # Force integer ticks so the binary group axis shows only 0 and 1.
            fig.update_xaxes(tickmode='linear', tick0=0, dtick=1)
            fig.update_layout(height=500)
            fig.show()
        except Exception as e:
            print(f"Error creating plot for {feature}: {e}")
            continue
else:
    print("No features available for plotting")

In [None]:
# Box plots comparing feature distributions between has_wonky_task groups
print("=" * 80)
print("DISTRIBUTION COMPARISONS: has_wonky_task")
print("=" * 80)

# Use significant features if available, otherwise fall back to the features
# with the highest correlation to wonky_task_instances.
if len(top_features_task) == 0:
    print("No significant features found. Using top correlated features instead...")
    # Remove the target by label rather than by position (a feature tied at 1.0
    # could sort ahead of it) and ignore undefined (NaN) correlations.
    fallback_corr = corr_task_instances.drop('wonky_task_instances', errors='ignore').dropna()
    if len(fallback_corr) > 0:
        top_features_task = fallback_corr.head(6).index.tolist()
        print(f"Using top correlated features: {top_features_task}")

if len(top_features_task) > 0:
    for feature in top_features_task[:6]:
        if feature not in df.columns:
            print(f"Skipping {feature}: column not found in df")
            continue
        # Plotting is best-effort: a failure on one feature is reported and the
        # remaining features are still plotted.
        try:
            # Accept every NumPy numeric dtype (int32, float32, unsigned, ...),
            # not only int64/float64; same selector as used for numeric_cols.
            if df[[feature]].select_dtypes(include=[np.number]).shape[1] == 0:
                print(f"Skipping {feature}: not numeric")
                continue

            # Remove NaN values for plotting
            plot_df = df[['has_wonky_task', feature]].dropna()
            if len(plot_df) == 0:
                print(f"Skipping {feature}: no valid data after removing NaNs")
                continue

            fig = create_box_plot(
                df=plot_df,
                x='has_wonky_task',
                y=feature,
                title=f'{feature} by Wonky Task Status',
                labels={'has_wonky_task': 'Has Wonky Task', feature: feature}
            )
            # Force integer ticks so the binary group axis shows only 0 and 1.
            fig.update_xaxes(tickmode='linear', tick0=0, dtick=1)
            fig.update_layout(height=500)
            fig.show()
        except Exception as e:
            print(f"Error creating plot for {feature}: {e}")
            continue
else:
    print("No features available for plotting")

In [None]:
# Feature-importance table for wonky_study_count: joins the per-feature test
# results with the feature/target correlations and ranks features by
# |correlation| * -log10(mw_p_value + 1e-10).
print("=" * 80, "FEATURE IMPORTANCE SUMMARY: wonky_study_count", "=" * 80, sep="\n")

if len(stats_results_study) == 0 or 'metric' not in stats_results_study.columns:
    # Nothing to join on: explain which precondition failed.
    print("Warning: stats_results_study is empty or missing required columns")
    if len(stats_results_study) == 0:
        print("stats_results_study is empty")
    else:
        print(f"Available columns: {list(stats_results_study.columns)}")
elif len(corr_study_count) == 0:
    print("Warning: corr_study_count is empty")
else:
    # Turn the correlation Series into a two-column frame keyed by 'metric'.
    corr_df = corr_study_count.rename_axis('metric').reset_index(name='correlation')

    # Left join keeps every tested feature, even those without a correlation.
    importance_study = stats_results_study.merge(corr_df, how='left', on='metric')

    print(f"Merged {len(importance_study)} features")
    print(f"Features with correlation data: {importance_study['correlation'].notna().sum()}")

    if 'mw_p_value' not in importance_study.columns:
        print("Warning: mw_p_value column not found in stats_results_study")
        print(f"Available columns: {list(stats_results_study.columns)}")
    else:
        # Features with no correlation value count as 0 so their score is 0, not NaN.
        importance_study['correlation'] = importance_study['correlation'].fillna(0)

        # The 1e-10 offset keeps log10 finite for p == 0 and caps the
        # significance factor at 10.
        importance_study['importance_score'] = (
            importance_study['correlation'].abs()
            * -np.log10(importance_study['mw_p_value'] + 1e-10)
        )
        importance_study = importance_study.sort_values('importance_score', ascending=False)

        # Column order for display; optional columns are included only when present.
        display_cols = ['metric', 'correlation', 'mw_p_value']
        if 'mw_significant' in importance_study.columns:
            display_cols.append('mw_significant')
        display_cols += ['mean_difference', 'importance_score']

        print("\nTop 20 Most Important Features:")
        result_df = importance_study[
            [name for name in display_cols if name in importance_study.columns]
        ].head(20)
        if len(result_df) == 0:
            print("No results to display")
        else:
            display(result_df)

In [None]:
# Feature-importance table for wonky_task_instances: joins the per-feature test
# results with the feature/target correlations and ranks features by
# |correlation| * -log10(mw_p_value + 1e-10).
print("=" * 80)
print("FEATURE IMPORTANCE SUMMARY: wonky_task_instances")
print("=" * 80)

if len(stats_results_task) > 0 and 'metric' in stats_results_task.columns:
    if len(corr_task_instances) > 0:
        # Build the lookup frame by naming the index and the values explicitly.
        # Renaming a column literally called 'index' only works while the
        # Series index is unnamed; this form does not depend on that.
        corr_task_df = corr_task_instances.rename_axis('metric').reset_index(name='correlation')

        # Merge correlation and statistical test results; the left join keeps
        # every tested feature even when no correlation is available.
        importance_task = stats_results_task.merge(
            corr_task_df,
            on='metric',
            how='left'
        )

        # Calculate importance score
        if 'mw_p_value' in importance_task.columns:
            # Features without a correlation value (not in the numeric set, or
            # constant columns) get 0 so their score is 0 rather than NaN,
            # matching the wonky_study_count summary.
            importance_task['correlation'] = importance_task['correlation'].fillna(0)

            # The 1e-10 offset keeps log10 finite for p == 0 and caps the
            # significance factor at 10.
            importance_task['importance_score'] = (
                np.abs(importance_task['correlation']) * 
                (-np.log10(importance_task['mw_p_value'] + 1e-10))
            )
            importance_task = importance_task.sort_values('importance_score', ascending=False)

            # Column order for display; optional columns are included only when present.
            display_cols = ['metric', 'correlation', 'mw_p_value']
            if 'mw_significant' in importance_task.columns:
                display_cols.append('mw_significant')
            display_cols += ['mean_difference', 'importance_score']

            print("\nTop 20 Most Important Features:")
            display(importance_task[[col for col in display_cols if col in importance_task.columns]].head(20))
        else:
            print("Warning: mw_p_value column not found in stats_results_task")
    else:
        print("Warning: corr_task_instances is empty")
else:
    print("Warning: stats_results_task is empty or missing required columns")

In [None]:
# Executive summary: prints headline numbers gathered from the target columns,
# the correlation Series, the group-comparison results and the chi-square
# results (chi2_results_study / chi2_results_task come from cells outside this
# view).
print("=" * 80)
print("EXECUTIVE SUMMARY")
print("=" * 80)

print("\n1. TARGET VARIABLE STATISTICS:")
print(f"   - Respondents with wonky studies: {df['has_wonky_study'].sum():,} ({df['has_wonky_study'].mean()*100:.2f}%)")
print(f"   - Respondents with wonky tasks: {df['has_wonky_task'].sum():,} ({df['has_wonky_task'].mean()*100:.2f}%)")
print(f"   - Mean wonky_study_count: {df['wonky_study_count'].mean():.3f}")
print(f"   - Mean wonky_task_instances: {df['wonky_task_instances'].mean():.3f}")

print("\n2. STATISTICAL SIGNIFICANCE:")
print(f"   - Significant features for wonky_study: {stats_results_study['mw_significant'].sum()}/{len(stats_results_study)}")
print(f"   - Significant features for wonky_task: {stats_results_task['mw_significant'].sum()}/{len(stats_results_task)}")

# The correlation Series include the target's own entry (1.0). Remove it by
# label rather than by slicing off the first row, because a feature tied at 1.0
# could sort ahead of the target; NaN correlations are skipped as well.
print("\n3. TOP CORRELATED FEATURES (wonky_study_count):")
top_corr_study = corr_study_count.drop('wonky_study_count', errors='ignore').dropna().head(5)
for idx, (feature, corr) in enumerate(top_corr_study.items(), 1):
    print(f"   {idx}. {feature}: {corr:.3f}")

print("\n4. TOP CORRELATED FEATURES (wonky_task_instances):")
top_corr_task = corr_task_instances.drop('wonky_task_instances', errors='ignore').dropna().head(5)
for idx, (feature, corr) in enumerate(top_corr_task.items(), 1):
    print(f"   {idx}. {feature}: {corr:.3f}")

# Results frames were sorted by mw_p_value when they were created, so head(5)
# of the significant rows gives the five smallest p-values.
print("\n5. TOP STATISTICALLY SIGNIFICANT FEATURES (wonky_study_count):")
top_sig_study = stats_results_study[stats_results_study['mw_significant']].head(5)
for idx, row in enumerate(top_sig_study.itertuples(), 1):
    print(f"   {idx}. {row.metric}: p={row.mw_p_value:.4f}, mean_diff={row.mean_difference:.3f}")

print("\n6. TOP STATISTICALLY SIGNIFICANT FEATURES (wonky_task_instances):")
top_sig_task = stats_results_task[stats_results_task['mw_significant']].head(5)
for idx, row in enumerate(top_sig_task.itertuples(), 1):
    print(f"   {idx}. {row.metric}: p={row.mw_p_value:.4f}, mean_diff={row.mean_difference:.3f}")

# Categorical sections are printed only when the chi-square frames have rows
# and a 'significant' column.
if len(chi2_results_study) > 0 and 'significant' in chi2_results_study.columns:
    print("\n7. SIGNIFICANT CATEGORICAL FEATURES (wonky_study_count):")
    sig_cat_study = chi2_results_study[chi2_results_study['significant']].head(5)
    for idx, row in enumerate(sig_cat_study.itertuples(), 1):
        print(f"   {idx}. {row.feature}: p={row.chi_p_value:.4f}")

if len(chi2_results_task) > 0 and 'significant' in chi2_results_task.columns:
    print("\n8. SIGNIFICANT CATEGORICAL FEATURES (wonky_task_instances):")
    sig_cat_task = chi2_results_task[chi2_results_task['significant']].head(5)
    for idx, row in enumerate(sig_cat_task.itertuples(), 1):
        print(f"   {idx}. {row.feature}: p={row.chi_p_value:.4f}")

print("\n" + "=" * 80)