#### Setup and Data RetrievalSetup and Data Retrieval

In [1]:
import pandas as pd
import numpy as np
import mysql.connector
from scipy import stats # Crucial for statistical inference (T-test)
from dotenv import load_dotenv
import os

# 1. Load Environment Variables and Setup
load_dotenv()
MYSQL_USER = os.getenv('MYSQL_USER')
MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD')
MYSQL_HOST = os.getenv('MYSQL_HOST')
MYSQL_DATABASE = os.getenv('MYSQL_DATABASE')

def get_clean_data():
    """Fetches the clean data from the gts_processed_data table."""
    conn = None
    try:
        conn = mysql.connector.connect(
            user=MYSQL_USER,
            password=MYSQL_PASSWORD,
            host=MYSQL_HOST,
            database=MYSQL_DATABASE
        )
        
        # Only fetch records marked as valid (is_valid=1) for core analysis
        query = "SELECT * FROM gts_processed_data WHERE is_valid = 1;" 
        df = pd.read_sql(query, conn)
        
        # Ensure scores are integers for accurate calculation
        score_cols = ['aid_satisfaction', 'trust_in_aid_provider', 'communication_clarity', 'aid_fairness']
        for col in score_cols:
             df[col] = df[col].astype(int)
             
        print(f"Successfully retrieved {len(df)} clean records from MySQL.")
        return df
        
    except mysql.connector.Error as err:
        print(f"Database Error during data fetch: {err}")
        return pd.DataFrame()
    finally:
        if conn and conn.is_connected():
            conn.close()

df_clean = get_clean_data()

if df_clean.empty:
    raise RuntimeError("Analysis halted: Could not retrieve clean data from the database.")

  df = pd.read_sql(query, conn)


Successfully retrieved 8091 clean records from MySQL.


#### 2.2: KPI Calculation (Descriptive Statistics)

In [2]:
# --- 2. KPI Calculation Function (Net Satisfaction Score and Mean Scores) ---

def calculate_kpis(df):
    """
    Calculates key descriptive statistics for executive reporting.
    """
    print("\n--- Phase 2.1: KPI Calculation (Descriptive Analysis) ---")
    
    # --- A. Overall Net Satisfaction Score (NSS) ---
    # NSS is derived from Aid Satisfaction (1-5 scale) and is critical for advocacy.
    total_responses = len(df)
    
    # Promoters (Score 5)
    promoters = len(df[df['aid_satisfaction'] == 5])
    # Detractors (Score 1, 2)
    detractors = len(df[df['aid_satisfaction'].isin([1, 2])])
    
    # NSS = (% Promoters - % Detractors)
    nss = round(((promoters / total_responses) - (detractors / total_responses)) * 100, 2)
    
    print(f"Total Valid Responses Analyzed: {total_responses}")
    print(f"Overall Net Satisfaction Score (NSS): {nss}")
    
    # --- B. Mean Scores Grouped by Location and Provider ---
    
    score_cols = ['aid_satisfaction', 'trust_in_aid_provider', 'communication_clarity', 'aid_fairness']
    
    # Grouped by Location (Shows regional variance)
    location_summary = df.groupby('location')[score_cols].mean().round(2).sort_values(by='aid_satisfaction', ascending=False)
    print("\nMean Scores by Location (Top 5):")
    print(location_summary.head())
    
    # Grouped by Provider (Holds partners accountable)
    provider_summary = df.groupby('aid_provider')[score_cols].mean().round(2).sort_values(by='trust_in_aid_provider', ascending=False)
    print("\nMean Scores by Aid Provider:")
    print(provider_summary)
    
    return location_summary, provider_summary

location_summary, provider_summary = calculate_kpis(df_clean)


--- Phase 2.1: KPI Calculation (Descriptive Analysis) ---
Total Valid Responses Analyzed: 8091
Overall Net Satisfaction Score (NSS): 9.12

Mean Scores by Location (Top 5):
                               aid_satisfaction  trust_in_aid_provider  \
location                                                                 
Cox's Bazar (Bangladesh)                   3.61                   2.82   
North-East DRC                             3.59                   2.80   
Jigjiga Zone (Ethiopia)                    3.58                   2.77   
Darfur Region (Sudan)                      3.57                   2.80   
Lviv / Odesa Region (Ukraine)              3.57                   2.79   

                               communication_clarity  aid_fairness  
location                                                            
Cox's Bazar (Bangladesh)                        3.34          3.33  
North-East DRC                                  3.38          3.34  
Jigjiga Zone (Ethiopia)         

#### 2.3: Statistical Significance Testing (Inference)

In [5]:
def check_significance(df):
    """
    Performs T-tests to check if differences in mean scores between key groups 
    (e.g., Male vs. Female) are statistically significant (P < 0.05).
    """
    
    comparison_results = []
    
    # Comparison 1: Male vs. Female on 'Trust in Aid Provider'
    metric = 'trust_in_aid_provider'
    group1_name = 'Female'
    group2_name = 'Male'
    
    group1 = df[df['gender'] == group1_name][metric]
    group2 = df[df['gender'] == group2_name][metric]
    
    if len(group1) < 20 or len(group2) < 20:
        print(f"Skipping {group1_name} vs. {group2_name}: insufficient sample size.")
        return pd.DataFrame()

    # Perform Welch's t-test (ttest_ind with equal_var=False) 
    # This is more robust as it does not assume equal variances between groups.
    t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)
    
    is_significant = p_value < 0.05
    
    comparison_results.append({
        'Comparison': f'{group1_name} vs. {group2_name} (on {metric})',
        'Mean_G1': round(group1.mean(), 3),
        'Mean_G2': round(group2.mean(), 3),
        'Difference': round(group1.mean() - group2.mean(), 3),
        'P_Value': round(p_value, 5),
        'Significance': 'Yes' if is_significant else 'No'
    })
    
    # --- Comparison 2: IDP vs. Host Community on 'Aid Fairness' ---
    metric = 'aid_fairness'
    group1_name = 'IDP'
    group2_name = 'Host Community'
    
    group1 = df[df['displacement_status'].str.contains('IDP')][metric] # Use str.contains to catch IDP variations
    group2 = df[df['displacement_status'].str.contains('Host Community')][metric]
    
    if len(group1) < 20 or len(group2) < 20:
        print(f"Skipping {group1_name} vs. {group2_name}: insufficient sample size.")
    else:
        t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)
        is_significant = p_value < 0.05
        
        comparison_results.append({
            'Comparison': f'{group1_name} vs. {group2_name} (on {metric})',
            'Mean_G1': round(group1.mean(), 3),
            'Mean_G2': round(group2.mean(), 3),
            'Difference': round(group1.mean() - group2.mean(), 3),
            'P_Value': round(p_value, 5),
            'Significance': 'Yes' if is_significant else 'No'
        })


    results_df = pd.DataFrame(comparison_results)
    print("\n Summary of T-Test Results ")
    print(results_df[['Comparison', 'Mean_G1', 'Mean_G2', 'P_Value', 'Significance']])
    # 
    
    return results_df

significance_results = check_significance(df_clean)


 Summary of T-Test Results 
                                   Comparison  Mean_G1  Mean_G2  P_Value  \
0  Female vs. Male (on trust_in_aid_provider)    2.811    2.784  0.38051   
1    IDP vs. Host Community (on aid_fairness)    3.328    3.296  0.39126   

  Significance  
0           No  
1           No  


#### 2.4: Finalizing and Saving Results

In [6]:
# Reset index of location_summary for clean CSV export
location_summary = location_summary.reset_index()

# Save the necessary dataframes for Phase 3
location_summary.to_csv('location_summary.csv', index=False)
significance_results.to_csv('significance_results.csv', index=False)

print("Results saved to 'location_summary.csv' and 'significance_results.csv'.")

Results saved to 'location_summary.csv' and 'significance_results.csv'.
