This code loads a dataset, splits it into a training set (2010–2016) and a test set (2017–2018), separates the features and target variable for both sets, and prints their shapes to confirm that the data is correctly prepared for modeling.

In [16]:
import pandas as pd

# Read the lagged-feature table from disk
df = pd.read_csv('/Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/6_feature_generation/all_lagged.csv')

# Years belonging to each split; range() excludes its upper bound
train_years = range(2010, 2017)  # 2010 through 2016
test_years = range(2017, 2019)   # 2017 through 2018

# Column to predict (replace with your actual target column name)
target_column = 'target_TOT_POPULATION'

# Identifier, year and target columns are removed from the feature matrices
columns_to_drop = ['GEO_ID', 'Year', target_column]

# Independent copies of the rows for each split, selected by year
train_df = df.loc[df['Year'].isin(train_years)].copy()
test_df = df.loc[df['Year'].isin(test_years)].copy()

# Training features / target
y_train = train_df[target_column]
X_train_features = train_df.drop(columns=columns_to_drop)

# Test features / target
y_test = test_df[target_column]
X_test_features = test_df.drop(columns=columns_to_drop)

# Report the resulting shapes as a sanity check
print("X_train_features shape:", X_train_features.shape)
print("y_train shape:", y_train.shape)
print("X_test_features shape:", X_test_features.shape)
print("y_test shape:", y_test.shape)


X_train_features shape: (707, 150)
y_train shape: (707,)
X_test_features shape: (202, 150)
y_test shape: (202,)


# Grid search

This code implements expanding window cross-validation: a model is trained on an incrementally larger set of years (starting with the first three) and evaluated on the following year's data, with evaluation metrics (MSE, MAE, RMSE) computed for each validation iteration.

In [17]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# Fit a model on one training split and score it on one validation split
def cross_val_model(X_train, y_train, X_val, y_val, model):
    """
    Fits the model on the training split and scores its validation predictions.

    Parameters:
    - X_train: Features passed to model.fit.
    - y_train: Target values passed to model.fit.
    - X_val: Features passed to model.predict.
    - y_val: Target values the predictions are compared against.
    - model: Object exposing fit and predict; it is fitted in place.

    Returns:
    - (mse, mae, rmse): Mean squared error, mean absolute error and the
      square root of the MSE. The metrics are computed on the values exactly
      as given; no back-transformation is applied here.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)

    squared_error = mean_squared_error(y_val, predictions)
    absolute_error = mean_absolute_error(y_val, predictions)
    return squared_error, absolute_error, np.sqrt(squared_error)


# Function to perform expanding window cross-validation
def expanding_window_cv(df, start_year, end_year, target_column, model, min_train_years=3):
    """
    Evaluates a model with expanding window cross-validation over years.

    The distinct values of df['Year'] within [start_year, end_year] are sorted.
    For each fold, the model is trained on the first i years and validated on
    year i + 1, with i starting at min_train_years and growing by one year per
    fold.

    Parameters:
    - df: DataFrame with 'Year' and 'GEO_ID' columns, the target column, and
      feature columns. Every column other than these three is used as a feature.
    - start_year: First year (inclusive) considered for cross-validation.
    - end_year: Last year (inclusive) considered for cross-validation.
    - target_column: The name of the target column.
    - model: Object exposing fit and predict. The same instance is re-fitted
      on every fold.
    - min_train_years: Number of years in the first training window
      (default 3, the previously hard-coded value).

    Returns:
    - DataFrame with one row per validation year and the columns
      'Year', 'MSE', 'MAE', 'RMSE'. When there are not enough years to form a
      single fold the frame is empty but still has these columns, so callers
      that read cv_results['MSE'] do not fail with a KeyError.

    Raises:
    - ValueError: If min_train_years is smaller than 1.
    """
    if min_train_years < 1:
        raise ValueError("min_train_years must be at least 1")

    metric_columns = ['Year', 'MSE', 'MAE', 'RMSE']
    # Columns that are never fed to the model as features
    non_feature_columns = [target_column, 'GEO_ID', 'Year']

    years = sorted(y for y in df['Year'].unique() if start_year <= y <= end_year)

    results = []
    for i in range(min_train_years, len(years)):
        train_years = years[:i]
        val_year = years[i]
        train_data = df[df['Year'].isin(train_years)]
        val_data = df[df['Year'] == val_year]

        # Separate features and target
        X_train = train_data.drop(columns=non_feature_columns)
        y_train = train_data[target_column]
        X_val = val_data.drop(columns=non_feature_columns)
        y_val = val_data[target_column]

        # Train and evaluate model
        mse, mae, rmse = cross_val_model(X_train, y_train, X_val, y_val, model)
        results.append({'Year': val_year, 'MSE': mse, 'MAE': mae, 'RMSE': rmse})

    # Explicit columns keep the schema stable even when no fold was run
    return pd.DataFrame(results, columns=metric_columns)

This code performs hyperparameter tuning for a Random Forest model using a grid search over specified parameter combinations and evaluates each configuration with expanding window cross-validation, identifying the best parameters based on the lowest mean MSE across validation segments.

In [68]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from itertools import product

# Candidate values for each Random Forest hyperparameter
rf_param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Settings for the expanding window CV
start_year = 2010
end_year = 2016
target_column = 'target_TOT_POPULATION'  # replace with your target column

# df must already be loaded and contain the columns 'Year', 'GEO_ID',
# target_column and the feature columns before this cell is run.

# Running best (lowest mean MSE) and the full log of tried configurations
best_score = np.inf
best_params = None
results_list = []

# Walk the Cartesian product of the grid; dict order fixes the iteration order
for combo in product(*rf_param_grid.values()):
    params = dict(zip(rf_param_grid, combo))

    # Model for this parameter combination
    model = RandomForestRegressor(random_state=42, n_jobs=-1, **params)

    # Score it with expanding window CV and average the per-year MSE
    cv_results = expanding_window_cv(df, start_year, end_year, target_column, model)
    mean_mse = cv_results['MSE'].mean()

    results_list.append({'params': params, 'mean_MSE': mean_mse})

    # Progress report for this combination
    print("Params:", params, "Mean MSE:", mean_mse)

    # Keep the combination if it beats the best seen so far
    if mean_mse < best_score:
        best_score, best_params = mean_mse, params

print("\nBest Parameters for Random Forest:", best_params)
print("Best Mean MSE for Random Forest:", best_score)

# Tabular view of every tried configuration
final_results_df = pd.DataFrame(results_list)


Params: {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 2} Mean MSE: 0.00034903714636115254
Params: {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 5} Mean MSE: 0.000535969370298172
Params: {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 10} Mean MSE: 0.0022264747777607586
Params: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2} Mean MSE: 0.00031322622198988253
Params: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 5} Mean MSE: 0.0005097768586732026
Params: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 10} Mean MSE: 0.0022332656955308715
Params: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 2} Mean MSE: 0.00034903714636115146
Params: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 5} Mean MSE: 0.0005359693702981668
Params: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 10} Mean MSE: 0.0022264747777607495
Params: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 2} M

This code performs hyperparameter tuning for an XGBoost model using a grid search over specified parameter combinations, evaluates each configuration with expanding window cross-validation, and identifies the best parameters based on the lowest mean MSE across validation segments.

In [66]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from itertools import product

# Candidate values for each XGBoost hyperparameter
xgb_param_grid = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [4, 5, 6],
    'learning_rate': [0.025, 0.05, 0.1]
}

# Settings for the expanding window CV
start_year = 2010
end_year = 2016
target_column = 'target_TOT_POPULATION'

# Running best (lowest mean MSE) and the full log of tried configurations
best_score = np.inf
best_params = None
results_list = []

# Walk the Cartesian product of the grid; dict order fixes the iteration order
for combo in product(*xgb_param_grid.values()):
    params = dict(zip(xgb_param_grid, combo))

    # Model for this parameter combination
    model = XGBRegressor(random_state=42, n_jobs=-1, **params)

    # Score it with expanding window CV and average the per-year MSE
    cv_results = expanding_window_cv(df, start_year, end_year, target_column, model)
    mean_mse = cv_results['MSE'].mean()

    results_list.append({'params': params, 'mean_MSE': mean_mse})

    # Progress report for this combination
    print("Params:", params, "Mean MSE:", mean_mse)

    # Keep the combination if it beats the best seen so far
    if mean_mse < best_score:
        best_score, best_params = mean_mse, params

print("\nBest Parameters for XGBoost:", best_params)
print("Best Mean MSE for XGBoost:", best_score)

# Tabular view of every tried configuration
final_results_df = pd.DataFrame(results_list)


Params: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.025} Mean MSE: 1.0544733330298317
Params: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.05} Mean MSE: 0.007315448263825889
Params: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1} Mean MSE: 0.0004596471463860209
Params: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.025} Mean MSE: 1.0544733330298317
Params: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05} Mean MSE: 0.00731537427906152
Params: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1} Mean MSE: 0.00044858774979759426
Params: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.025} Mean MSE: 1.0544733330298317
Params: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.05} Mean MSE: 0.00731537427906152
Params: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1} Mean MSE: 0.0004667398076207074
Params: {'n_estimators': 150, 'max_depth': 4, 'learning_rate': 0.025} Mean MSE: 0.0906562705548671
Param

This code defines a function to perform hyperparameter tuning for regression models like Lasso or Ridge using expanding window cross-validation, iterating over all combinations of parameters from a grid to identify the configuration that minimizes mean MSE across validation segments.

In [52]:
# Regularization strengths to try for each linear model.
# Both grids hold the same values but are kept as independent lists.
lasso_param_grid = dict(alpha=[1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1.0])

ridge_param_grid = dict(alpha=[1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1.0])

In [53]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Assuming expanding_window_cv is already defined as per your code
# from your_existing_code import expanding_window_cv

def tune_hyperparameters(df, start_year, end_year, target_column, model_class, param_grid, model_name_prefix):
    """
    Tunes hyperparameters for a given regression model using expanding window cross-validation.

    Every combination in the Cartesian product of the grid values is scored by
    the mean MSE that expanding_window_cv reports across its validation years.

    Parameters:
    - df: DataFrame containing the data.
    - start_year: The starting year for cross-validation.
    - end_year: The ending year for cross-validation.
    - target_column: The name of the target column.
    - model_class: The regression model class (e.g., Lasso, Ridge). It must be
      constructible without arguments and expose get_params().
    - param_grid: Dictionary mapping each hyperparameter name to the list of
      values to try.
    - model_name_prefix: String used in the progress messages (e.g., 'Lasso').

    Returns:
    - best_params: The hyperparameters that achieved the lowest mean MSE, or
      None if no combination produced a mean MSE below infinity.
    - tuning_results: DataFrame with one row per combination, holding the
      hyperparameter values and a 'Mean_MSE' column.
    """
    # Imported locally so the function does not depend on another cell's imports
    from itertools import product

    tuning_results = []
    best_score = np.inf
    best_params = None

    print(f"Starting hyperparameter tuning for {model_name_prefix}...")

    # Probe once (not once per combination) whether the model takes a seed
    supports_random_state = 'random_state' in model_class().get_params()

    # Iterate over all combinations of hyperparameters
    for values in product(*param_grid.values()):
        # Create a dictionary of current parameter settings
        current_params = dict(zip(param_grid.keys(), values))

        # Seed the model when supported; a random_state supplied by the grid
        # takes precedence instead of colliding with the default seed
        if supports_random_state:
            model_kwargs = {'random_state': 42, **current_params}
        else:
            model_kwargs = current_params
        model = model_class(**model_kwargs)

        # Perform expanding window CV
        cv_results = expanding_window_cv(df, start_year, end_year, target_column, model)

        # Calculate mean MSE across all validation segments
        mean_mse = cv_results['MSE'].mean()

        # Store results
        tuning_results.append({
            **current_params,
            'Mean_MSE': mean_mse
        })

        # Print progress
        print(f"Params: {current_params}, Mean MSE: {mean_mse:.8f}")

        # Update best parameters if current mean MSE is lower
        if mean_mse < best_score:
            best_score = mean_mse
            best_params = current_params

    # Explicit columns keep the schema stable even when nothing was evaluated
    tuning_results_df = pd.DataFrame(tuning_results, columns=[*param_grid.keys(), 'Mean_MSE'])

    print(f"Best parameters for {model_name_prefix}: {best_params} with Mean MSE: {best_score:.8f}\n")

    return best_params, tuning_results_df


In [54]:
from itertools import product

# Cross-validation covers these years (both ends inclusive)
start_year, end_year = 2010, 2016

# Arguments common to both tuning runs
_shared_cv_args = dict(
    df=df,
    start_year=start_year,
    end_year=end_year,
    target_column=target_column,
)

# Lasso: search over lasso_param_grid
best_lasso_alpha, lasso_tuning_results = tune_hyperparameters(
    model_class=Lasso,
    param_grid=lasso_param_grid,
    model_name_prefix='Lasso Regression',
    **_shared_cv_args,
)

# Ridge: search over ridge_param_grid
best_ridge_alpha, ridge_tuning_results = tune_hyperparameters(
    model_class=Ridge,
    param_grid=ridge_param_grid,
    model_name_prefix='Ridge Regression',
    **_shared_cv_args,
)


Starting hyperparameter tuning for Lasso Regression...
Params: {'alpha': 1e-11}, Mean MSE: 0.00008498
Params: {'alpha': 1e-05}, Mean MSE: 0.00007911
Params: {'alpha': 0.0001}, Mean MSE: 0.00008294
Params: {'alpha': 0.001}, Mean MSE: 0.00008616
Params: {'alpha': 0.01}, Mean MSE: 0.00018487
Params: {'alpha': 0.1}, Mean MSE: 0.01025032
Params: {'alpha': 1.0}, Mean MSE: 0.53887931
Best parameters for Lasso Regression: {'alpha': 1e-05} with Mean MSE: 0.00007911

Starting hyperparameter tuning for Ridge Regression...
Params: {'alpha': 1e-11}, Mean MSE: 0.00010936
Params: {'alpha': 1e-05}, Mean MSE: 0.00008776
Params: {'alpha': 0.0001}, Mean MSE: 0.00008376
Params: {'alpha': 0.001}, Mean MSE: 0.00008370
Params: {'alpha': 0.01}, Mean MSE: 0.00008515
Params: {'alpha': 0.1}, Mean MSE: 0.00009102
Params: {'alpha': 1.0}, Mean MSE: 0.00011641
Best parameters for Ridge Regression: {'alpha': 0.001} with Mean MSE: 0.00008370



In [55]:
def compute_mean_mse_for_alpha_zero(df, start_year, end_year, target_column):
    """
    Scores plain Linear Regression with expanding window cross-validation.

    This serves as the unregularized (alpha=0) reference point for the
    Lasso/Ridge tuning results.

    Parameters:
    - df: DataFrame containing the data.
    - start_year: The starting year for cross-validation.
    - end_year: The ending year for cross-validation.
    - target_column: The name of the target column.

    Returns:
    - mean_mse: The mean of the 'MSE' column returned by expanding_window_cv.
    """
    from sklearn.linear_model import LinearRegression

    # Run the CV with an ordinary least-squares model
    fold_metrics = expanding_window_cv(
        df, start_year, end_year, target_column, LinearRegression()
    )

    # Average the per-year MSE, report it, and hand it back
    mean_mse = fold_metrics['MSE'].mean()
    print(f"Mean MSE for Linear Regression (alpha=0): {mean_mse:.7f}")
    return mean_mse

# Baseline: mean CV MSE of unregularized Linear Regression on the same years
mean_mse_linear_regression = compute_mean_mse_for_alpha_zero(
    df, start_year, end_year, target_column
)

Mean MSE for Linear Regression (alpha=0): 0.0001094
