This code loads the data and metadata, selects the training rows (2010–2016, i.e. Year ≤ 2016), drops the GEO_ID and Year columns from the training set, replaces the top-coded value '1,000+' with '1000', and replaces zeros in the training data with a small constant (1e-9) so that later log transformations are defined.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Paths to the data and metadata files
data_path = "/Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/1_feature_filtering/final_data.csv"
metadata_path = "/Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/1_feature_filtering/final_metadata.csv"
output_data_path = "/Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/2_feature_standardization/final_data_transformed.csv"
output_metadata_path = "/Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/2_feature_standardization/final_metadata_with_transformation.csv"

# Load data and metadata
df = pd.read_csv(data_path)
metadata_df = pd.read_csv(metadata_path)

# Clean the full dataset first so the training subset derived below inherits the
# cleaned values: replace the top-coded marker '1,000+' with '1000'.
df.replace({r'1,000\+': '1000'}, regex=True, inplace=True)

# A column that contained '1,000+' was read as text, and the replacement above leaves
# it as text. Convert such columns to numbers in the full dataset (and therefore in
# the training subset too); otherwise determine_transformations skips them as
# non-numeric and they are written out untransformed. The conversion is adopted only
# when it introduces no new NaNs, so genuinely textual columns are left untouched.
for column in df.columns.drop(['GEO_ID', 'Year']):
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    converted = pd.to_numeric(df[column], errors='coerce')
    if converted.isna().sum() == df[column].isna().sum():
        df[column] = converted

# Separate the training data (2010–2016) and drop the identifier/year columns,
# which are not features. drop() returns a new frame, so later edits to train_df
# never write back into df.
train_df = df[df['Year'] <= 2016].drop(columns=['GEO_ID', 'Year'])

# Replace all zeros in the training DataFrame with a small positive constant
# so that a later log transformation is defined.
train_df = train_df.replace(0, 1e-9)

This code determines appropriate transformations for each column based on skewness using training data, updates metadata with transformation details, and applies the transformations (logarithmic, standardization, or both) consistently to the full dataset before saving the transformed data.

In [35]:
def determine_transformations(df, threshold_skew=1):
    """Choose a transformation for every numeric column of ``df`` from its skewness.

    Args:
        df: DataFrame whose columns are inspected (the training data in this notebook).
        threshold_skew: Absolute skewness above which a column is log-transformed
            before standardization.

    Returns:
        dict mapping column name to one of ``"log only"``, ``"log + standardize"``
        or ``"standardize only"``. Non-numeric columns are omitted.
    """
    transformations = {}
    for column in df.columns:
        series = df[column]

        # Non-numeric columns cannot be transformed; leave them out of the result.
        if not np.issubdtype(series.dtype, np.number):
            continue

        # The target is always log-transformed and never standardized.
        if column == 'target_TOT_POPULATION':
            transformations[column] = "log only"
            continue

        # A NaN skewness compares False here and falls through to "standardize only".
        heavily_skewed = abs(series.skew()) > threshold_skew

        # The log of a negative number is NaN, so a column containing negative
        # values must not be assigned a log transformation however skewed it is.
        has_negative = bool((series < 0).any())

        if heavily_skewed and not has_negative:
            transformations[column] = "log + standardize"
        else:
            transformations[column] = "standardize only"

    return transformations

# Pick a transformation per column from the training data, then record the choice in
# the metadata (variables without an entry are labelled "standardize only").
transformations = determine_transformations(train_df)
metadata_df['transformation'] = (
    metadata_df['variable']
    .map(transformations)
    .fillna("standardize only")
)

# Persist the metadata together with the new transformation column
metadata_df.to_csv(output_metadata_path, index=False)
print("Updated metadata with transformation column saved to:", output_metadata_path)

# One scaler is reused: it is re-fitted on the training column immediately before
# each full-dataset column is transformed, so only training statistics are applied.
scaler = StandardScaler()

for column, transformation in transformations.items():
    if transformation == "log only":
        # Swap zeros for a tiny constant so log(0) never occurs, then take the log;
        # missing values stay missing.
        df[column] = df[column].replace(0, 1e-9)
        logged = np.log(df[column])
        df[column] = np.where(df[column].notna(), logged, np.nan)
        continue

    if transformation == "log + standardize":
        # Log the training column, fit on it, then log the full-dataset column
        train_df[column] = np.log(train_df[column].replace(0, 1e-9))
        scaler.fit(train_df[[column]])
        df[column] = np.log(df[column].replace(0, 1e-9))
    elif transformation == "standardize only":
        scaler.fit(train_df[[column]])
    else:
        # Unknown labels are left untouched
        continue

    # Shared final step: scale the full dataset with the training parameters
    df[column] = scaler.transform(df[[column]])

# Write out the fully transformed dataset
df.to_csv(output_data_path, index=False)
print("Transformed dataset saved to:", output_data_path)


Updated metadata with transformation column saved to: /Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/2_feature_standardization/final_metadata_with_transformation.csv
Transformed dataset saved to: /Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/2_feature_standardization/final_data_transformed.csv


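This code checks for invalid values (NaNs and infinities) in the dataset, excluding the GEO_ID column, and reports every column that contains such values. Only the target_TOT_POPULATION column is expected to contain NaNs; the code prints this expectation as a reminder but does not enforce it, so the report must be inspected manually.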

In [33]:
import pandas as pd
import numpy as np

def find_invalid_values(df):
    """Report the columns of ``df`` that contain NaNs or infinities.

    Each column is coerced to numeric first, so any value that cannot be parsed
    as a number is counted as a NaN.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict mapping each offending column name to
        ``{"NaNs": <count>, "Infinities": <count>}``. Columns with no invalid
        values are omitted, so an empty dict means the data is clean.
    """
    invalid_values_report = {}

    for column in df.columns:
        # Coerce so np.isinf works on every column; unparseable values become NaN.
        numeric_col = pd.to_numeric(df[column], errors='coerce')

        # Cast to built-in int: NumPy scalars print as ``np.int64(...)`` under
        # NumPy 2 and cannot be serialized by the json module.
        nan_count = int(numeric_col.isna().sum())
        inf_count = int(np.isinf(numeric_col).sum())

        if nan_count or inf_count:
            invalid_values_report[column] = {
                "NaNs": nan_count,
                "Infinities": inf_count,
            }

    return invalid_values_report


# GEO_ID is an identifier rather than a feature, so check a copy of df without it
df_without_geo_id = df.drop(columns=['GEO_ID'])

# Collect NaN / infinity counts for every remaining column
invalid_values_report = find_invalid_values(df_without_geo_id)

# Reminder of what a healthy report looks like (not enforced by the code)
print("target_TOT_POPULATION should have NaNs; nothing else")

# Show the findings, or confirm that the dataset is clean
if not invalid_values_report:
    print("No invalid values found in the dataset.")
else:
    print("Columns with invalid values:")
    for column, issues in invalid_values_report.items():
        print(f"Column '{column}': {issues}")


target_TOT_POPULATION should have NaNs; nothing else
Columns with invalid values:
Column 'target_TOT_POPULATION': {'NaNs': 101, 'Infinities': 0}
