In [17]:
import pandas as pd
import numpy as np
import sqlite3

## Initial Cleaning

- Drop columns where `null` values exceed 90%

In [45]:
import sqlite3
import pandas as pd

# -------------------------------
# Step 1. Load and filter the DataFrame
# -------------------------------
df = pd.read_csv(r'D:\GitHub Repos\culminating-project-group-1\Week 3\JC\Group1Data.csv')
# Calculate the absolute missing counts for each column
missing_counts = df.isnull().sum()

# Define the threshold as 30% of the total number of rows
threshold_count = 0.1 * len(df)

# Filter the DataFrame columns based on the absolute missing counts
df = df.loc[:, missing_counts < threshold_count]

# Identify indicator columns (exclude 'CountryShortName' and 'year')
indicator_cols = [col for col in df.columns if col not in ['CountryShortName', 'year']]

# -------------------------------
# Step 2. Build a mapping of indicator name -> source table from the database
# -------------------------------
db_path = r"D:\GitHub Repos\culminating-project-group-1\Week 2\Database Files\BANA698GROUP1.db.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Query SQLite master to get tables that contain the word "Indicator"
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%Indicator%'")
tables = [row[0] for row in cursor.fetchall()]

# Dictionary to map indicator name to its source table
indicator_mapping = {}

for table in tables:
    # Determine which column holds the indicator name
    if table.startswith("QPSD"):
        col_name = "Series Name"
    else:
        # For SPI and WDI tables, assume the indicator name is in "Indicator Name"
        col_name = "Indicator Name"
    
    # Build and execute the query
    query = f"SELECT [{col_name}] FROM [{table}]"
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
        for row in rows:
            indicator_name = row[0]
            if indicator_name:
                # If an indicator appears in multiple tables, the last one will be kept.
                indicator_mapping[indicator_name] = table
    except Exception as e:
        print(f"Error querying table {table} (column '{col_name}'): {e}")

# Close the connection when done
conn.close()

# -------------------------------
# Step 3. Build the Indicator Mapping Report DataFrame
# -------------------------------
report_data = []
for indicator in indicator_cols:
    source_table = indicator_mapping.get(indicator, "Unknown Source")
    report_data.append({"Indicator": indicator, "Source Table": source_table})

report_df = pd.DataFrame(report_data)
# Sort the report by source table and indicator name
report_df = report_df.sort_values(by=["Source Table", "Indicator"]).reset_index(drop=True)

print("Indicator Mapping Report:")
print(report_df)

# -------------------------------
# Step 4. Compute missing values aggregated by country
# -------------------------------
missing_total_by_country = (
    df.groupby("CountryShortName")
      .apply(lambda group: group.isnull().sum().sum())
      .reset_index(name="missing_count")
)
missing_total_by_country = missing_total_by_country.sort_values("missing_count", ascending=False)

# -------------------------------
# Step 5. Query the Country table for metadata (Income Group and Region)
# -------------------------------
conn = sqlite3.connect(db_path)
country_query = "SELECT [Short Name], [Income Group], Region FROM Country"
df_country_meta = pd.read_sql_query(country_query, conn)
conn.close()

# -------------------------------
# Step 6. Merge missing count with country metadata
# -------------------------------
merged_report = missing_total_by_country.merge(
    df_country_meta,
    left_on="CountryShortName",
    right_on="Short Name",
    how="left"
)

# Optionally drop the duplicate country name column after merge
merged_report = merged_report.drop(columns=["Short Name"])
merged_report = merged_report.sort_values("missing_count", ascending=False)

print("\nMissing Values Count by Country with Income Group and Region:")
print(merged_report)


Indicator Mapping Report:
                                          Indicator  \
0                                              Year   
1                           GDP (constant 2015 US$)   
2                                 GDP (current US$)   
3                             GDP growth (annual %)   
4                GDP per capita (constant 2015 US$)   
..                                              ...   
118  Secure Internet servers (per 1 million people)   
119               Merchandise exports (current US$)   
120               Merchandise imports (current US$)   
121                    Merchandise trade (% of GDP)   
122                                   Net migration   

                   Source Table  
0                Unknown Source  
1          WDIEconomicIndicator  
2          WDIEconomicIndicator  
3          WDIEconomicIndicator  
4          WDIEconomicIndicator  
..                          ...  
118  WDIInfrastructureIndicator  
119   WDIPrivateSectorIndicator  
120   W