#### 1. Setup

In [14]:
import pandas as pd
import numpy as np
import mysql.connector
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment first,
# so the lookups below can see them.
load_dotenv()

# MySQL connection settings; each is None when the variable is not set.
MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_DATABASE = (
    os.environ.get(env_key)
    for env_key in ('MYSQL_USER', 'MYSQL_PASSWORD', 'MYSQL_HOST', 'MYSQL_DATABASE')
)

# Path of the raw survey extract read by the load step.
CSV_FILE = 'synthetic_gts_survey_data.csv'

#### 2. Load Raw Data (E - Extract)

In [15]:
# Only the file read can raise FileNotFoundError, so keep the try body minimal.
try:
    df_raw = pd.read_csv(CSV_FILE)
except FileNotFoundError:
    print(f"Error: CSV file '{CSV_FILE}' not found. Aborting.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper
    # that shuts a notebook kernel down (losing all state) and, when run as a
    # script, reports success (status 0). Re-raising stops "Run All" with a
    # visible traceback and a failing status.
    raise

# Convert date column to date objects for MySQL DATE type compatibility
df_raw['survey_date'] = pd.to_datetime(df_raw['survey_date']).dt.date

print(f"Loaded Raw Data (df_raw): {len(df_raw)} records.")

# Define the columns expected in the final gts_processed_data table
CLEAN_COLUMNS = [
    'response_id', 'survey_date', 'location', 'aid_provider', 'displacement_status', 
    'gender', 'age_group', 'aid_satisfaction', 'trust_in_aid_provider', 
    'communication_clarity', 'aid_fairness', 'feedback_comment', 'is_valid', 'processing_notes'
]

Loaded Raw Data (df_raw): 10000 records.


#### 3. Data Cleaning Function (T - Transform)

In [16]:
def create_clean_dataframe(df_raw, columns=None):
    """
    Applies standardization and imputation rules to create the final, 
    clean 'gts_processed_data' DataFrame (df_clean).

    The input frame is copied, never modified. Two audit columns are added:
    'processing_notes' (free-text log of every correction applied to the row)
    and 'is_valid' (False for any row that needed a correction).

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw survey records. Must contain the four score columns plus
        'gender', 'aid_provider' and 'feedback_comment'.
    columns : sequence of str, optional
        Columns (and their order) of the returned frame. Defaults to the
        module-level CLEAN_COLUMNS list.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame restricted to the requested columns.

    Raises
    ------
    ValueError
        If a score column needs imputation but has no non-missing values
        to compute a median from.
    """
    df_clean = df_raw.copy()

    # Resolve the output columns up front; the global is only consulted when
    # the caller did not pass an explicit list.
    output_columns = CLEAN_COLUMNS if columns is None else list(columns)

    # Define Cleaning/Standardization Configurations
    # Gender keys are lower-case: values are lower-cased before lookup so any
    # capitalisation ('female', 'FEMALE', 'fEmAlE', ...) is recognised.
    gender_lookup = {'female': 'Female', 'male': 'Male'}
    # Applied AFTER whitespace stripping, so only genuine spelling variants
    # need an entry (padded values such as 'UNHCR ' are fixed by the strip).
    aid_provider_map = {'Intl Rescue Commitee': 'IRC'}
    score_cols = ['aid_satisfaction', 'trust_in_aid_provider', 'communication_clarity', 'aid_fairness']

    # Initialize new columns for processing audit
    df_clean['processing_notes'] = ''
    df_clean['is_valid'] = True

    # --- Apply Cleaning and Auditing ---

    # 1. Score Imputation and Type Conversion (Addressing NOT NULL requirement)
    for col in score_cols:
        is_missing = df_clean[col].isna()
        if is_missing.any():
            # Median is robust for ordinal data. It is computed only when
            # needed, so an empty frame passes straight through.
            median_value = df_clean[col].median(skipna=True)
            if pd.isna(median_value):
                raise ValueError(
                    f"Cannot impute '{col}': the column has no non-missing values."
                )
            # round() sends exact halves to the nearest even integer (2.5 -> 2).
            imputed_value = int(round(median_value))
            df_clean.loc[is_missing, col] = imputed_value
            df_clean.loc[is_missing, 'processing_notes'] += f"Imputed missing {col} with {imputed_value}. "
            df_clean.loc[is_missing, 'is_valid'] = False

        # Crucial step: Convert score columns to the strict integer type (TINYINT compatible)
        df_clean[col] = df_clean[col].astype(int)

    # 2. Standardization for Categorical Columns

    # Gender: anything that does not resolve to a known label (including
    # missing values) is recorded as 'Unknown' and flagged.
    gender_key = df_clean['gender'].astype(str).str.strip().str.lower()
    df_clean['gender'] = gender_key.map(gender_lookup)
    is_unknown_gender = df_clean['gender'].isna()
    if is_unknown_gender.any():
        df_clean.loc[is_unknown_gender, 'gender'] = 'Unknown'
        df_clean.loc[is_unknown_gender, 'processing_notes'] += "Standardized non-standard/unknown gender. "
        df_clean.loc[is_unknown_gender, 'is_valid'] = False

    # Aid Provider: remember which rows were missing BEFORE the string cast,
    # otherwise astype(str) would turn them into the literal text 'nan'.
    is_missing_provider = df_clean['aid_provider'].isna()
    df_clean['aid_provider'] = df_clean['aid_provider'].astype(str).str.strip().replace(aid_provider_map)
    if is_missing_provider.any():
        df_clean.loc[is_missing_provider, 'aid_provider'] = 'Unknown'
        df_clean.loc[is_missing_provider, 'processing_notes'] += "Missing aid_provider set to Unknown. "
        df_clean.loc[is_missing_provider, 'is_valid'] = False

    # 3. Final Cleanup
    # Handle missing/NaN comments (required for MySQL insertion, replace with None)
    df_clean['feedback_comment'] = df_clean['feedback_comment'].replace({np.nan: None}).astype(object)

    return df_clean[output_columns]

# Build the cleaned frame from the raw extract, then preview its first rows
df_clean = create_clean_dataframe(df_raw)

df_clean.head()

Unnamed: 0,response_id,survey_date,location,aid_provider,displacement_status,gender,age_group,aid_satisfaction,trust_in_aid_provider,communication_clarity,aid_fairness,feedback_comment,is_valid,processing_notes
0,GTS-00001,2024-10-13,Jigjiga Zone (Ethiopia),UNICEF,Refugee,Male,18-25,2,2,3,4,UNHCR staff were very helpful.,True,
1,GTS-00002,2024-11-14,Jigjiga Zone (Ethiopia),ICRC,Refugee,Unknown,41-60,5,1,5,4,I feel safe here now.,False,Standardized non-standard/unknown gender.
2,GTS-00003,2024-07-09,Cox's Bazar (Bangladesh),NRC,Refugee,Female,26-40,3,4,3,3,WFP delivered on time.,True,
3,GTS-00004,2024-05-01,Jigjiga Zone (Ethiopia),WFP,IDP (Internally Displaced Person),Female,41-60,3,3,5,3,WFP delivered on time.,False,Imputed missing aid_satisfaction with 3. Imput...
4,GTS-00005,2024-09-04,Gaza Strip / West Bank (oPt),UNICEF,IDP (Internally Displaced Person),Unknown,41-60,1,1,4,4,UNHCR staff were very helpful.,False,Standardized non-standard/unknown gender.


#### 4. MySQL Injection Function (L - Load)

In [18]:
def inject_clean_data_to_mysql(df, table_name, column_names):
    """
    Connects to MySQL and injects the CLEANED DataFrame into the specified table.

    All rows are sent with a single executemany() call and committed together;
    on any error the transaction is rolled back and the error is printed
    (it is not re-raised).

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned records. Must contain every column listed in column_names.
    table_name : str
        Target table. Interpolated directly into the SQL text, so it must
        come from trusted code, never from user input.
    column_names : sequence of str
        Columns to insert, in INSERT order. Also interpolated directly into
        the SQL text (trusted input only).

    Returns
    -------
    None
    """
    conn = None
    cursor = None  # bound up front so the cleanup below never hits an unbound name
    try:
        # Establish connection
        conn = mysql.connector.connect(
            user=MYSQL_USER,
            password=MYSQL_PASSWORD,
            host=MYSQL_HOST,
            database=MYSQL_DATABASE
        )
        cursor = conn.cursor()
        print(f"\nSuccessfully connected to MySQL database.")

        print(f"Injecting {len(df)} CLEAN records into '{table_name}'")

        # Prepare injection query
        placeholders = ', '.join(['%s'] * len(column_names))
        cols_str = ', '.join(column_names)
        insert_query = f"INSERT INTO {table_name} ({cols_str}) VALUES ({placeholders})"

        # Select the columns explicitly so each tuple lines up with the INSERT
        # column list even if the frame has extra or differently ordered columns.
        # Casting to object lets missing values (NaN/NaT) become None, which
        # the connector sends as SQL NULL.
        payload = df[list(column_names)].astype(object)
        payload = payload.where(payload.notna(), None)
        data_to_insert = list(payload.itertuples(index=False, name=None))

        cursor.executemany(insert_query, data_to_insert)
        conn.commit()

        print(f"SUCCESS: {len(data_to_insert)} clean records committed to {table_name}.")

    except mysql.connector.Error as err:
        print(f"\nFATAL MySQL Error during Injection: {err}")
        print("Please verify connection details, database name, and table schema (especially NOT NULL constraints).")
        # Rollback on error; skipped when the connection never opened or has dropped
        if conn is not None and conn.is_connected():
            conn.rollback()
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # Non-MySQL failures must not leave a half-finished transaction open either
        if conn is not None and conn.is_connected():
            conn.rollback()
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except mysql.connector.Error:
                # Best-effort cleanup: a failing cursor close must not hide the
                # outcome reported above.
                pass
        if conn is not None and conn.is_connected():
            conn.close()
            print("MySQL connection closed.")

# --- Execute Injection ---
if __name__ == "__main__":

    # Bail out with a message when the cleaning cell has not produced a
    # non-empty df_clean; otherwise load it into MySQL.
    if 'df_clean' not in locals() or df_clean.empty:
        print("\nInjection failed: Clean DataFrame is empty or was not created successfully.")
    else:
        inject_clean_data_to_mysql(df_clean, 'gts_processed_data', CLEAN_COLUMNS)


Successfully connected to MySQL database.
Injecting 10000 CLEAN records into 'gts_processed_data'...
SUCCESS: 10000 clean records committed to gts_processed_data.
MySQL connection closed.
