In [1]:
import os
import pandas as pd

# Default location of the dataset CSV, relative to the current working
# directory. Each path component is passed separately so os.path.join inserts
# the platform's own separator instead of relying on a hard-coded "/".
DATA_PATH = os.path.join("..", "HW3", "data", "co2-gdp-pop-growth.csv")


def load_dataset(file_path: str = DATA_PATH) -> pd.DataFrame:
    """
    Load a CSV dataset into a pandas DataFrame.

    On success, a confirmation line and the frame's row/column counts are
    printed before the frame is returned.

    Parameters
    ----------
    file_path : str, optional
        The path to the CSV file. Defaults to ``DATA_PATH``
        ('../HW3/data/co2-gdp-pop-growth.csv' with POSIX separators).

    Returns
    -------
    pd.DataFrame
        The loaded dataset.

    Raises
    ------
    IsADirectoryError
        If ``file_path`` points to a directory rather than a file.
    FileNotFoundError
        If the specified CSV file does not exist.
    pd.errors.EmptyDataError
        If the CSV file is empty.
    pd.errors.ParserError
        If the CSV file is malformed.
    """
    # A directory passes an os.path.exists() check, so reject it explicitly
    # here instead of letting read_csv fail with a platform-dependent error.
    if os.path.isdir(file_path):
        raise IsADirectoryError(
            f"Error: '{file_path}' is a directory, not a CSV file."
        )

    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"Error: The file '{file_path}' was not found.\n"
            "Please verify the path or ensure the file is in the 'data' directory."
        )

    try:
        dataset = pd.read_csv(file_path)
        print(f"Successfully loaded dataset: {file_path}")
        print(f"Data shape: {dataset.shape[0]} rows × {dataset.shape[1]} columns")
        return dataset

    # Each handler reports the problem and then re-raises, so callers still
    # receive the original exception type.
    except pd.errors.EmptyDataError:
        print(f"Warning: The file '{file_path}' is empty.")
        raise
    except pd.errors.ParserError as e:
        print(f"Error parsing '{file_path}': {e}")
        raise
    except Exception as e:
        print(f"Unexpected error while reading '{file_path}': {e}")
        raise

# Smoke-test load_dataset() with its default path when run as the main module.
# On failure only the message is printed, so `df` is not assigned by this cell.
if __name__ == "__main__":
    try:
        df = load_dataset()
    except Exception as load_error:
        print(f"Failed to load dataset: {load_error}")


Successfully loaded dataset: ../HW3/data/co2-gdp-pop-growth.csv
Data shape: 27645 rows × 6 columns


In [2]:
# For reference, this is what the loading code looked like before it was refactored into load_dataset().

# # Load the dataset from the 'data' folder

# try:
#     df = pd.read_csv('../HW3/data/co2-gdp-pop-growth.csv')
# except FileNotFoundError:
#     print("Error: 'data/co2-gdp-pop-growth.csv' not found. Please ensure the file is in the 'data' folder.")

In [3]:
# Preview the dataset contents

# A non-empty dataset gets its first 5 rows printed as plain text (index hidden)
if not df.empty:
    print("\nLet's check the first 5 rows of the dataset:")
    first_rows = df.head()
    print(first_rows.to_string(index=False))

# An empty dataset has nothing to preview, so say so instead
else:
    print("The dataset is empty. No rows to display.")


Let's check the first 5 rows of the dataset:
     Entity Code  Year  Population growth (annual %)  GDP growth (annual %)  Annual CO₂ emissions growth (%)
Afghanistan  AFG  1961                      1.962239                    NaN                        18.583180
Afghanistan  AFG  1962                      2.044523                    NaN                        40.300896
Afghanistan  AFG  1963                      2.105208                    NaN                         2.634644
Afghanistan  AFG  1964                      2.161195                    NaN                        18.651236
Afghanistan  AFG  1965                      2.233709                    NaN                        20.078205


In [4]:
# For reference, this is what the preview code looked like before refactoring.

# # Check dataset
# print("First 5 rows of the raw data:")
# df.head()

In [5]:
# Get an initial look at the data: df.info() reports the index range, each
# column's non-null count and dtype, and the frame's memory usage, which shows
# the structure of the data and where values are missing.

print("Initial Dataset Information:")
df.info()

Initial Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27645 entries, 0 to 27644
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Entity                           27645 non-null  object 
 1   Code                             23674 non-null  object 
 2   Year                             27645 non-null  int64  
 3   Population growth (annual %)     14458 non-null  float64
 4   GDP growth (annual %)            11811 non-null  float64
 5   Annual CO₂ emissions growth (%)  26002 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 1.3+ MB


In [6]:
import io
import pandas as pd

def summarize_dataset(df: pd.DataFrame, name: str = "Dataset") -> None:
    """
    Print a detailed summary of a pandas DataFrame.

    The summary contains, in order:
    - Basic info (columns, dtypes) captured from df.info()
    - Missing value summary (only columns with at least one missing value)
    - Descriptive statistics for numeric columns, or a short notice when the
      frame has no numeric columns
    - Shape and deep memory usage

    Parameters
    ----------
    df : pd.DataFrame
        The frame to summarize. It is only read, never modified.
    name : str, optional
        Label used in the summary heading. Defaults to "Dataset".

    Returns
    -------
    None
        Everything is written to standard output.
    """
    heading = f"Initial Information for {name}:"
    print(f"\n{heading}")
    # Underline exactly as wide as the heading text.
    print("=" * len(heading))

    # Capture df.info() into a string buffer; the context manager closes it.
    with io.StringIO() as buf:
        df.info(buf=buf)
        info_str = buf.getvalue()
    print(info_str)   # or log it somewhere

    # Missing values summary
    print("\nMissing Values Summary:")
    missing = df.isnull().sum()
    missing = missing[missing > 0].sort_values(ascending=False)
    if missing.empty:
        print("No missing values detected.")
    else:
        print(missing.to_string())
        print(f"\nTotal columns with missing data: {len(missing)}")

    # Descriptive statistics for numeric columns.
    # Select the numeric columns explicitly: a bare df.describe() raises
    # ValueError on a frame with no columns and silently switches to
    # object-column statistics when no numeric column exists, which would
    # contradict the heading printed here.
    print("\nDescriptive Statistics (Numeric Columns):")
    numeric = df.select_dtypes(include="number")
    if numeric.shape[1] == 0:
        print("No numeric columns to describe.")
    else:
        print(numeric.describe().T.round(2).to_string())

    # Shape and memory usage summary
    print(f"\nDataset shape: {df.shape[0]} rows × {df.shape[1]} columns")
    mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage: {mem:.2f} MB")
