In [7]:
import sqlite3
import pandas as pd

# -------------------------------
# Function: Iteratively Remove Columns/Rows with High Missing Values
# -------------------------------
def iterative_filter(df, col_thresh=40, row_thresh=20, max_iter=5):
    """Iteratively drop columns, then rows, whose share of missing values exceeds a threshold.

    Each pass first removes every column whose percentage of null cells is
    strictly greater than ``col_thresh`` and then removes every row whose
    percentage of null cells (measured over the surviving columns) is strictly
    greater than ``row_thresh``. The pass is repeated because dropping rows
    changes the per-column percentages and vice versa.

    Args:
        df: pandas DataFrame to filter. The caller's object is never assigned
            into; a new, filtered frame is returned.
        col_thresh: Maximum allowed percentage (0-100) of missing values per
            column. Columns exactly at the threshold are kept.
        row_thresh: Maximum allowed percentage (0-100) of missing values per
            row. Rows exactly at the threshold are kept.
        max_iter: Upper bound on the number of passes. With ``max_iter <= 0``
            the frame is returned unchanged.

    Returns:
        The filtered DataFrame.

    Side effects:
        Prints one progress line per pass showing the shape before and after.
    """
    for iteration in range(1, max_iter + 1):
        prev_shape = df.shape

        # Keep columns whose missing share does not exceed col_thresh%.
        # "<=" (not "<") so that only columns with MORE than the threshold
        # are dropped, as the contract above states.
        col_missing_pct = df.isnull().mean() * 100
        df = df.loc[:, col_missing_pct <= col_thresh]

        # Same rule for rows, evaluated on the columns that survived above.
        row_missing_pct = df.isnull().mean(axis=1) * 100
        df = df.loc[row_missing_pct <= row_thresh, :]

        print(f"Iteration {iteration}: Reduced from {prev_shape} → {df.shape}")

        # A pass that removed nothing means the frame is stable; stop early.
        if df.shape == prev_shape:
            break
    return df

# -------------------------------
# Function: Load Country Metadata from SQLite
# -------------------------------
def load_country_meta(db_path, income_group=None, region=None):
    """Load country metadata from SQLite, optionally filtered by income group and/or region.

    Reads the ``[Short Name]``, ``[Income Group]`` and ``Region`` columns of
    the ``Country`` table.

    Args:
        db_path: Path to the SQLite database file.
        income_group: If truthy, only rows whose ``[Income Group]`` equals this
            value are returned.
        region: If truthy, only rows whose ``Region`` equals this value are
            returned. When both filters are given they are combined with AND.

    Returns:
        pandas DataFrame with the three columns listed above.
    """
    conn = sqlite3.connect(db_path)
    try:
        query = "SELECT [Short Name], [Income Group], Region FROM Country"

        # Filter values are bound as parameters rather than formatted into the
        # SQL text, so values containing quotes (e.g. "O'Region") work and
        # cannot alter the statement.
        conditions = []
        params = []
        if income_group:
            conditions.append("[Income Group] = ?")
            params.append(income_group)
        if region:
            conditions.append("Region = ?")
            params.append(region)

        if conditions:
            query += " WHERE " + " AND ".join(conditions)

        return pd.read_sql_query(query, conn, params=params)
    finally:
        # Close the connection even when the query raises.
        conn.close()

# -------------------------------
# Function: Load and Filter CSV Data Based on Selected Countries
# -------------------------------
def load_and_filter_csv(csv_path, country_list):
    """Read a CSV, keep the rows of the requested countries, and prune sparse columns/rows.

    Args:
        csv_path: Path of the CSV file to read with ``pd.read_csv``.
        country_list: Values of the ``CountryShortName`` column to keep.

    Returns:
        The DataFrame returned by ``iterative_filter`` (called with its default
        thresholds) for the selected rows.

    Side effects:
        Prints the shape of the raw CSV and of the filtered result.
    """
    raw = pd.read_csv(csv_path)
    print("Initial CSV shape:", raw.shape)

    # Boolean mask: True for rows belonging to one of the selected countries.
    is_selected = raw['CountryShortName'].isin(country_list)
    selected = raw[is_selected]

    # Hand a copy to the missing-value filter so the selection itself is left alone.
    cleaned = iterative_filter(selected.copy())

    print("Filtered CSV shape:", cleaned.shape)
    return cleaned

# -------------------------------
# Function: Build Indicator Mapping from SQLite
# -------------------------------
def build_indicator_mapping(db_path):
    """Build a mapping from indicator name to the SQLite table it was found in.

    Every table whose name matches ``LIKE '%Indicator%'`` is scanned. The
    name column read is ``Series Name`` for tables with ``QPSD`` in their
    name and ``Indicator Name`` otherwise. Null names are ignored.

    If the same indicator name occurs in several tables, the table processed
    last wins.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        dict mapping indicator name -> table name.

    Side effects:
        Prints a message for each table that could not be read; such tables
        are skipped rather than aborting the whole scan.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Discover the indicator tables dynamically from the schema catalogue.
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%Indicator%'")
        indicator_tables = [row[0] for row in cursor.fetchall()]

        indicator_mapping = {}
        for table in indicator_tables:
            col_name = "Series Name" if "QPSD" in table else "Indicator Name"
            query = f"SELECT [{col_name}] FROM [{table}]"

            try:
                df_temp = pd.read_sql(query, conn)
                for indicator in df_temp[col_name].dropna():
                    indicator_mapping[indicator] = table
            except Exception as e:
                # Best effort: a table without the expected column (or any
                # other read problem) is reported and skipped.
                print(f"Skipping table {table} due to error: {e}")

        return indicator_mapping
    finally:
        # Release the connection on every path, including unexpected errors
        # raised while listing the tables.
        conn.close()

# -------------------------------
# Function: Compute Missing Data by Country
# -------------------------------
def compute_missing_by_country(df):
    """Aggregate missing values per country.

    For every distinct ``CountryShortName`` the function counts the null
    cells in all other columns and expresses that count as a percentage of
    the cells available for that country (rows of the country x number of
    non-key columns).

    Args:
        df: pandas DataFrame containing a ``CountryShortName`` column.

    Returns:
        DataFrame with columns ``CountryShortName``, ``missing_count`` (float)
        and ``missing_percentage`` (float), sorted by ``missing_percentage``
        in descending order.
    """
    key = "CountryShortName"

    # The grouping column itself is excluded from both the null count and the
    # cell total.
    value_cols = [col for col in df.columns if col != key]

    # Count nulls once per row, then sum/size per country. This avoids a
    # Python-level callback per group and does not depend on the
    # ``include_groups`` keyword of ``groupby.apply``.
    nulls_per_row = df[value_cols].isnull().sum(axis=1)
    by_country = nulls_per_row.groupby(df[key])

    # Cast to float so the column dtype matches the previous apply-based result.
    missing_count = by_country.sum().astype(float)
    total_cells = by_country.size() * len(value_cols)

    missing_info = pd.DataFrame({
        "missing_count": missing_count,
        "missing_percentage": (missing_count / total_cells) * 100,
    }).reset_index()

    return missing_info.sort_values("missing_percentage", ascending=False)

# -------------------------------
# Main Processing Function
# -------------------------------
def main(csv_path, db_path, income_group=None, region=None):
    """Run the whole pipeline and print a per-country missing-value report.

    Args:
        csv_path: Path of the CSV file passed to ``load_and_filter_csv``.
        db_path: Path of the SQLite database passed to ``load_country_meta``
            and ``build_indicator_mapping``.
        income_group: Optional income-group filter forwarded to
            ``load_country_meta``.
        region: Optional region filter forwarded to ``load_country_meta``.

    Returns:
        Tuple ``(df_filtered, report)``: the filtered CSV data and the
        per-country missing-value table joined with the country metadata.

    Side effects:
        Prints the metadata shape and the final report (in addition to
        whatever the called helpers print).
    """
    # 1) Country metadata, optionally narrowed by income group / region.
    country_meta = load_country_meta(db_path, income_group, region)
    print("Country metadata shape:", country_meta.shape)

    # 2) Restrict the CSV to the countries that survived step 1.
    country_names = country_meta['Short Name'].tolist()
    df_filtered = load_and_filter_csv(csv_path, country_names)

    # 3) Indicator -> table lookup. NOTE(review): the result is not used
    #    anywhere below in this function.
    indicator_lookup = build_indicator_mapping(db_path)

    # 4) Missing-value totals per country.
    missing_by_country = compute_missing_by_country(df_filtered)

    # 5) Attach income group / region, drop the duplicate key column and rank
    #    by absolute number of missing cells.
    report = (
        missing_by_country
        .merge(country_meta, left_on="CountryShortName", right_on="Short Name", how="left")
        .drop(columns=["Short Name"])
        .sort_values("missing_count", ascending=False)
    )

    print("\nFinal Merged Report (Missing Values with Income Group and Region):")
    print(report)

    return df_filtered, report

In [10]:
# -------------------------------
# Run the Main Processing Function
# -------------------------------
# Entry point of the notebook: configure the inputs, run the pipeline, and
# leave `df_filtered` / `df_final_report` in the namespace for later cells.
if __name__ == "__main__":
    # NOTE(review): both paths are absolute and specific to one Windows user
    # profile, so this cell will not run on another machine as-is. Consider
    # paths relative to a configurable data directory.
    CSV_PATH = r"C:\Users\Namrata Patil\Desktop\bana698-project\culminating-project-group-1\Week 3\NP\Group1Data.csv"   
    # NOTE(review): the file name ends in ".db.db" — confirm this matches the file on disk.
    DB_PATH = r"C:\Users\Namrata Patil\Desktop\bana698-project\culminating-project-group-1\Week 2\Database Files\BANA698GROUP1.db.db"
    # Set income group or region for filtering (set to None if you want all)
    SELECTED_INCOME_GROUP = "High income" 
    SELECTED_REGION = None

    # main() returns (filtered CSV data, per-country missing-value report).
    df_filtered, df_final_report = main(
        CSV_PATH, DB_PATH,
        income_group=SELECTED_INCOME_GROUP,
        region=SELECTED_REGION
    )

Country metadata shape: (85, 3)
Initial CSV shape: (5380, 1764)
Iteration 1: Reduced from (1742, 1764) → (1082, 662)
Iteration 2: Reduced from (1082, 662) → (1082, 662)
Filtered CSV shape: (1082, 662)

Final Merged Report (Missing Values with Income Group and Region):
        CountryShortName  missing_count  missing_percentage Income Group  \
3    Trinidad and Tobago         1697.0           14.262901  High income   
4       Macao SAR, China         1692.0           14.220877  High income   
5                 Guyana         1690.0           14.204068  High income   
6   United Arab Emirates         1690.0           14.204068  High income   
7             Seychelles         1659.0           13.943520  High income   
2               Barbados         1368.0           14.782797  High income   
8                  Qatar         1359.0           10.820925  High income   
9   Hong Kong SAR, China         1327.0           10.566128  High income   
1                  Palau         1008.0        

In [11]:
# Write the filtered data (the `df_filtered` produced by the run cell above)
# to a CSV under a relative path, omitting the DataFrame index.
df_filtered.to_csv("df_filtered_col_first.csv",index=False)