## Need: Dataset, Column Report, Row Report

In [None]:
import pandas as pd

# Locations of the column-report workbook and of the main dataset
column_report_path = '/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 5/Datasets and Research Questions/JC/Group1Data_ColumnReport.xlsx'
main_dataset_path = '/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 5/Datasets and Research Questions/JC/Group1Dataset_LCU_removed(except_exchange_rate).csv'

# Default sheet of the workbook; it supplies the "Indicator" and
# "Source Table" columns used to build the lookup below
allCols_df = pd.read_excel(column_report_path)

# Lookup: indicator name -> source table name
indicator_to_source = {
    indicator: source
    for indicator, source in zip(allCols_df["Indicator"], allCols_df["Source Table"])
}

# One sheet per research question, each listing the indicators to keep
LifeExpCols_df = pd.read_excel(column_report_path, sheet_name='Life Expectancy Columns')
UnemCols_df = pd.read_excel(column_report_path, sheet_name='Unemployment Columns')

# Load main dataset
dataset_df = pd.read_csv(main_dataset_path)

# ---------------------------------------------------------
# 1. Extract column lists from the Excel sheets
# ---------------------------------------------------------
life_exp_cols = LifeExpCols_df['Indicator'].tolist()
unem_cols = UnemCols_df['Indicator'].tolist()

# ---------------------------------------------------------
# 2. Retain only the columns that exist in the dataset
# ---------------------------------------------------------
valid_life_exp_cols = [name for name in life_exp_cols if name in dataset_df.columns]
valid_unem_cols = [name for name in unem_cols if name in dataset_df.columns]

# ---------------------------------------------------------
# 3. Define a helper function to ensure essential columns appear at the beginning
# ---------------------------------------------------------
def ensure_essential_columns_at_start(cols, df, essential_cols):
    """
    Build a column list that starts with the essential columns.

    Essential columns are kept (in the order given by essential_cols) only
    when they are present in df.columns; the entries of cols follow in their
    original order, skipping any that were already placed at the front.
    """
    dataset_columns = df.columns

    # Essential columns that the dataset actually has
    leading = []
    for name in essential_cols:
        if name in dataset_columns:
            leading.append(name)

    # Everything else from the requested list, without repeating the leaders
    trailing = [name for name in cols if name not in leading]

    return [*leading, *trailing]

# Identifier columns that must lead every filtered dataset, in this order
essential_cols = ['CountryShortName', 'Year']

# ---------------------------------------------------------
# 4. Put the essential columns first in each group's column list
# ---------------------------------------------------------
final_life_exp_cols, final_unem_cols = (
    ensure_essential_columns_at_start(group_cols, dataset_df, essential_cols)
    for group_cols in (valid_life_exp_cols, valid_unem_cols)
)

# ---------------------------------------------------------
# 5. Select each group's columns from the main dataset
# ---------------------------------------------------------
life_exp_data = dataset_df[final_life_exp_cols]
unem_data = dataset_df[final_unem_cols]

In [None]:
# ---------------------------
# Data Loading Functions
# ---------------------------
def load_dataset(dataset_path):
    """Read the CSV at dataset_path and return it as a DataFrame."""
    dataset = pd.read_csv(dataset_path)
    return dataset

def load_excel_sheet(excel_path, sheet_name=None):
    """
    Load a single sheet of an Excel workbook as a DataFrame.

    Args:
        excel_path: Path of the workbook, forwarded to pd.read_excel.
        sheet_name: Sheet to read (name or position). When None, the first
            sheet is read.

    Returns:
        The result of pd.read_excel for the selected sheet.
    """
    # pd.read_excel treats sheet_name=None as "every sheet" and returns a
    # dict of DataFrames, so None is translated to position 0 (first sheet).
    target_sheet = 0 if sheet_name is None else sheet_name
    return pd.read_excel(excel_path, sheet_name=target_sheet)

# ---------------------------
# Filtering Functions
# ---------------------------
def get_valid_columns(df, columns):
    """Keep, in their given order, the entries of columns found in df.columns."""
    present = []
    for name in columns:
        if name in df.columns:
            present.append(name)
    return present

def ensure_essential_columns_at_start(cols, df, essential_cols):
    """
    Return a column list led by the essential columns.

    Only essential columns present in df.columns are used, in the order of
    essential_cols. The entries of cols come afterwards in their original
    order, minus any that already appear among the leading essentials.
    """
    front = []
    for name in essential_cols:
        if name in df.columns:
            front.append(name)

    back = []
    for name in cols:
        if name in front:
            # Already placed at the start; do not list it twice
            continue
        back.append(name)

    return front + back

def filter_dataset(df, columns, essential_cols=('CountryShortName', 'Year')):
    """
    Filter the DataFrame to only include the valid columns from the provided list.
    Essential columns will be moved to the beginning.

    Args:
        df: DataFrame to select columns from.
        columns: Requested column names; names missing from df are dropped.
        essential_cols: Column names to place first, in this order. Those
            present in df are included even when they are not listed in
            columns. The default is an immutable tuple so the default value
            cannot be shared and altered between calls.

    Returns:
        df restricted to the essential columns followed by the remaining
        valid requested columns.
    """
    valid_cols = get_valid_columns(df, columns)
    final_cols = ensure_essential_columns_at_start(valid_cols, df, essential_cols)
    return df[final_cols]

# ---------------------------
# Missing Data Report Functions
# ---------------------------
def create_missing_report_by_column_with_source(df, mapping):
    """
    Create a DataFrame reporting the count and percentage of missing values for each column,
    and add a column for the Source Table based on the mapping.

    Args:
        df: DataFrame whose columns are inspected for missing values.
        mapping: Lookup passed to Series.map to translate each column name
            into its source table; names without an entry in a plain dict
            yield a missing value.

    Returns:
        A DataFrame with one row per column of df and the columns
        "Column", "missing_count", "missing_percent" and "Source Table".
    """
    # Compute the missing-value mask once and derive both statistics from it
    missing_mask = df.isna()
    report = pd.DataFrame({
        'missing_count': missing_mask.sum(),
        'missing_percent': missing_mask.mean() * 100
    })

    # Name the index explicitly before resetting it. Relying on the default
    # 'index' label fails when df.columns carries its own name, because
    # reset_index would then use that name for the new column.
    report = report.rename_axis('Column').reset_index()

    # Add the Source Table column by mapping the "Column" names to the dictionary
    report["Source Table"] = report["Column"].map(mapping)
    return report

# ---------------------------
# Utility Functions
# ---------------------------
def save_csv(df, output_path, index=False):
    """Write df as CSV to output_path; index controls whether the row index is written."""
    df.to_csv(path_or_buf=output_path, index=index)

In [None]:
# ---------------------------
# Main Workflow
# ---------------------------
# Define file paths
excel_path = '/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 5/Datasets and Research Questions/JC/Group1Data_ColumnReport.xlsx'
dataset_path = '/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 5/Datasets and Research Questions/JC/Group1Dataset_LCU_removed(except_exchange_rate).csv'

# Load Excel sheets containing the column lists.
# The first sheet is requested explicitly (sheet_name=0) so that allCols_df is
# a single DataFrame; pd.read_excel returns a dict of every sheet when it is
# given sheet_name=None.
allCols_df = load_excel_sheet(excel_path, sheet_name=0)
life_exp_cols_df = load_excel_sheet(excel_path, sheet_name='Life Expectancy Columns')
unem_cols_df = load_excel_sheet(excel_path, sheet_name='Unemployment Columns')

# Map each indicator to its source table. Built here so the missing-data
# reports below do not depend on a variable left over from another cell.
indicator_to_source = dict(zip(allCols_df["Indicator"], allCols_df["Source Table"]))

# Extract column lists from the Excel sheets
life_exp_cols = life_exp_cols_df['Indicator'].tolist()
unem_cols = unem_cols_df['Indicator'].tolist()

# Load the main dataset
dataset_df = load_dataset(dataset_path)

# Filter datasets using the lists (ensuring CountryShortName and Year are first)
life_exp_data = filter_dataset(dataset_df, life_exp_cols)
unem_data = filter_dataset(dataset_df, unem_cols)

# Generate missing reports for Life Expectancy data
life_exp_missing_report = create_missing_report_by_column_with_source(life_exp_data, indicator_to_source)

# Generate missing reports for Unemployment data
unem_missing_report = create_missing_report_by_column_with_source(unem_data, indicator_to_source)

# Save the filtered datasets to CSV files
save_csv(life_exp_data, '/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 5/Datasets and Research Questions/JC/LifeExpectancyDataset.csv')
save_csv(unem_data, '/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 5/Datasets and Research Questions/JC/UnemploymentDataset.csv')

# Save the missing reports for Life Expectancy data
save_csv(life_exp_missing_report, '/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 5/Datasets and Research Questions/JC/LifeExp_MissingReport.csv')

# Save the missing reports for Unemployment data
save_csv(unem_missing_report, '/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 5/Datasets and Research Questions/JC/Unem_MissingReport.csv')