<a href="https://colab.research.google.com/github/basugautam/Reproducibility-Challenge-Project/blob/Architecture-Files/window_STD_ERM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# B. Window STD of the ERM, constant-constrained, and exponentially-constrained settings on the ECL dataset, for three model architectures

In [None]:
# Why we import these libraries:

# Numpy & Pandas: These are essential for data manipulation and numerical calculations.
# Numpy is used for performing mathematical operations efficiently, such as array manipulation and statistical operations like mean, std, etc.
# Pandas is used for loading the dataset, manipulating it, and organizing it into DataFrames, which makes it easier to work with tabular data.

import numpy as np  # Numpy for numerical computing
import pandas as pd  # Pandas for data manipulation and analysis

# TensorFlow & Keras: These libraries are used for building, training, and evaluating deep learning models.
# TensorFlow provides the core functionality for implementing machine learning models, while Keras offers a high-level interface for model building, making the process more user-friendly.

import tensorflow as tf  # TensorFlow for deep learning model implementation
from tensorflow import keras  # Keras is a high-level API for TensorFlow to build and train models

# Sklearn: We use this for preprocessing data and splitting it into training/testing sets.
# The MinMaxScaler from Sklearn will be used to scale the features, and train_test_split will help in dividing the dataset into training and testing subsets.
# This is crucial for model validation to ensure that the model is evaluated properly and can generalize to unseen data.

from sklearn.preprocessing import MinMaxScaler  # For scaling features to a specific range, such as 0 to 1
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets

# Matplotlib & Seaborn: These are used for data visualization.
# Matplotlib is a basic plotting library that helps in creating various types of graphs such as line plots, bar plots, etc.
# Seaborn builds on Matplotlib and makes it easier to create more attractive and informative visualizations like heatmaps and statistical plots.

import matplotlib.pyplot as plt  # For plotting graphs and visualizing results
import seaborn as sns  # For creating high-level statistical plots

# Google Colab's drive module: This is used for accessing datasets stored on Google Drive.
# By mounting Google Drive, we can directly access the dataset files and load them into the Colab environment for processing.
# This is especially useful for retrieving datasets like the Canadian Census data from the Google Drive and using them in the analysis.

from google.colab import drive  # For mounting Google Drive in Colab and accessing files stored on it

# What they do:

# TensorFlow/Keras: These libraries help in defining, training, and evaluating deep learning models.
# We will use TensorFlow for implementing the model and Keras to simplify model-building processes like adding layers and compiling the model.
# The Canadian Census data will be used as input for these models, allowing us to forecast long-term trends based on the historical data.

# Google Colab's drive module: This allows access to files stored on Google Drive directly within Colab.
# It enables us to load the Canadian Census dataset into the environment and work with it, performing necessary preprocessing, model training, and evaluation steps.
drive.mount('/content/drive')  # Mounting Google Drive to access the dataset files


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Define custom loss functions for ERM, Constant, and Exponential Constraints

# Define River Functions
def river_erm(y_true, y_pred):
    """
    Compute the ERM (Empirical Risk Minimization) objective as a Mean Squared Error.

    This is the single definition of river_erm. It keeps the NumPy semantics,
    which is the behaviour that was actually in effect when two same-named
    definitions existed and the later one shadowed the earlier one.
    Because it relies on NumPy, it is meant for concrete values (arrays, lists),
    not for symbolic tensors.

    Parameters:
    y_true (array-like): True values.
    y_pred (array-like): Predicted values, broadcast-compatible with y_true.

    Returns:
    float: The Mean Squared Error between y_true and y_pred.
    """
    # Coerce first: plain Python lists do not support `-`, so "array-like"
    # inputs would otherwise raise TypeError.
    errors = np.asarray(y_true) - np.asarray(y_pred)
    return float(np.mean(errors ** 2))


def river_constant(y_true, y_pred, growth_rate=0.02):
    """
    MSE loss plus a penalty for deviating from a constant relative growth rate.

    The penalty is the mean absolute difference between the relative change of
    adjacent columns of y_pred, (y_pred[:, t+1] - y_pred[:, t]) / y_pred[:, t],
    and growth_rate.

    Parameters:
    y_true: True values, broadcast-compatible with y_pred.
    y_pred: Predicted values; indexed as [:, step], so it must have at least two axes.
    growth_rate (float): Target relative change between adjacent columns.

    Returns:
    Scalar tensor: mean squared error plus the growth penalty.
    """
    prev_step = y_pred[:, :-1]
    next_step = y_pred[:, 1:]

    # divide_no_nan returns 0 where the denominator is 0, so a zero prediction
    # no longer turns the whole loss into inf/NaN.
    relative_growth = tf.math.divide_no_nan(next_step - prev_step, prev_step)
    deviation = tf.abs(relative_growth - growth_rate)

    # With a single output column there are no adjacent pairs and `deviation`
    # is empty; reduce_mean of an empty tensor is NaN. Summing and dividing by
    # the element count (0 -> result 0) makes the penalty vanish instead.
    pair_count = tf.cast(tf.size(deviation), deviation.dtype)
    constant_penalty = tf.math.divide_no_nan(tf.reduce_sum(deviation), pair_count)

    return tf.reduce_mean(tf.square(y_true - y_pred)) + constant_penalty

def river_exp(y_true, y_pred, growth_rate=0.02):
    """
    MSE loss plus a penalty for deviating from exponential growth.

    The penalty is the mean absolute difference between the ratio of adjacent
    columns of y_pred, y_pred[:, t+1] / y_pred[:, t], and exp(growth_rate).

    Parameters:
    y_true: True values, broadcast-compatible with y_pred.
    y_pred: Predicted values; indexed as [:, step], so it must have at least two axes.
    growth_rate (float): Exponent of the target ratio between adjacent columns.

    Returns:
    Scalar tensor: mean squared error plus the growth penalty.
    """
    prev_step = y_pred[:, :-1]
    next_step = y_pred[:, 1:]

    # divide_no_nan returns 0 where the denominator is 0, so a zero prediction
    # no longer turns the whole loss into inf/NaN.
    step_ratio = tf.math.divide_no_nan(next_step, prev_step)
    deviation = tf.abs(step_ratio - tf.exp(growth_rate))

    # With a single output column there are no adjacent pairs and `deviation`
    # is empty; reduce_mean of an empty tensor is NaN. Summing and dividing by
    # the element count (0 -> result 0) makes the penalty vanish instead.
    pair_count = tf.cast(tf.size(deviation), deviation.dtype)
    exp_penalty = tf.math.divide_no_nan(tf.reduce_sum(deviation), pair_count)

    return tf.reduce_mean(tf.square(y_true - y_pred)) + exp_penalty

# Explanation:
# We define three custom loss functions: river_erm for ERM, river_constant for the constant growth penalty, and river_exp for the exponential growth penalty.
# These loss functions are tailored to ensure the model predictions follow the desired constraints.


In [None]:
# Mount Google Drive to access dataset
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import csv

# Step 1: Try loading the dataset from Google Drive
csv_path = '/content/drive/MyDrive/your_file.csv'  # Replace with your actual filename
expected_fields = 2826  # Replace 2826 with the expected number of columns

# Start from a known state so a failed load never leaves a stale DataFrame
# from an earlier cell in scope.
df = None

try:
    # Load while skipping bad lines
    df = pd.read_csv(csv_path, on_bad_lines='skip')  # on_bad_lines replaces deprecated error_bad_lines
    print("✅ Dataset loaded successfully.")
except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    print("Attempting to locate malformed lines manually...")

    # Manual inspection of malformed rows (optional)
    try:
        # newline='' lets the csv module handle embedded line breaks itself.
        with open(csv_path, 'r', newline='') as f:
            for row_number, row in enumerate(csv.reader(f), start=1):
                if len(row) != expected_fields:
                    print(f"⚠️ Row {row_number} has {len(row)} fields (Expected: {expected_fields})")
    except Exception as e2:
        print(f"❌ Error during manual row inspection: {e2}")

# Step 2: If dataframe loaded, proceed with processing
if isinstance(df, pd.DataFrame):
    print("\n🔍 First few rows of the dataset:")
    print(df.head())

    # Check for missing values
    print("\n🔧 Missing values per column:")
    print(df.isnull().sum())

    # Forward-fill missing values. DataFrame.ffill replaces the deprecated
    # fillna(method='ffill') spelling.
    df.ffill(inplace=True)

    # Normalize population column
    if 'population' in df.columns:
        scaler = MinMaxScaler()
        df['population'] = scaler.fit_transform(df[['population']])

    # Encode 'year' as sine and cosine features for periodicity
    # NOTE(review): a period of 365 suggests day-of-year values — confirm that
    # the 'year' column really holds such values.
    if 'year' in df.columns:
        df['year_sin'] = np.sin(2 * np.pi * df['year'] / 365)
        df['year_cos'] = np.cos(2 * np.pi * df['year'] / 365)

    # Preview the processed DataFrame
    print("\n✅ Processed DataFrame Preview:")
    print(df.head())

    # Split the data into train and test sets (no shuffling: keeps row order)
    train_data, test_data = train_test_split(df, test_size=0.2, shuffle=False)

    print(f"\n📊 Train Data Shape: {train_data.shape}")
    print(f"📊 Test Data Shape: {test_data.shape}")
else:
    print("❌ Dataset not loaded. Cannot proceed with preprocessing.")


Mounted at /content/drive
❌ Error loading dataset: [Errno 2] No such file or directory: '/content/drive/MyDrive/your_file.csv'
Attempting to locate malformed lines manually...
❌ Error during manual row inspection: [Errno 2] No such file or directory: '/content/drive/MyDrive/your_file.csv'
❌ Dataset not loaded. Cannot proceed with preprocessing.


In [None]:
# Mount Google Drive to access dataset
from google.colab import drive
drive.mount('/content/drive', force_remount=True)  # Remount if needed

# Use the file ID to locate the file in Google Drive
file_id = '1f8aDJV4oSnKTFeHirIOt8IxMAs1YHSre'

# Construct the file URL
file_url = f'https://drive.google.com/uc?id={file_id}'

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Step 1: Load the data once (with error handling for malformed rows).
# on_bad_lines='warn' is the current spelling of the removed
# error_bad_lines=False / warn_bad_lines=True pair.
df = None
try:
    df = pd.read_csv(file_url, on_bad_lines='warn')
    print("Data loaded successfully.")
    print(df.head())  # Check the first few rows to ensure it's loaded correctly
except Exception as e:
    print(f"Error loading data: {e}")

# A Drive link that is not publicly shared answers with an HTML sign-in page,
# which read_csv parses without raising. Reject it explicitly.
if df is not None and len(df.columns) > 0:
    first_header = str(df.columns[0]).lstrip().lower()
    if first_header.startswith('<!doctype html') or first_header.startswith('<html'):
        print("Error loading data: received an HTML page instead of CSV. "
              "Check the sharing settings of the Drive file.")
        df = None

if df is not None:
    # Step 2: Data Preprocessing (done BEFORE splitting so the engineered
    # columns are present in train_data/test_data).
    # NOTE(review): the scaler is fitted on the full dataset, so test rows
    # influence the scaling — fit on the training rows only if that matters.
    if 'population' in df.columns:
        # Apply MinMaxScaler to normalize 'population'
        scaler = MinMaxScaler()
        df['population'] = scaler.fit_transform(df[['population']])

    # Temporal Encoding for 'year' (sine & cosine encoding)
    if 'year' in df.columns:
        df['year_sin'] = np.sin(2 * np.pi * df['year'] / 365)  # Sine encoding for cyclic features
        df['year_cos'] = np.cos(2 * np.pi * df['year'] / 365)  # Cosine encoding for cyclic features
        print("Temporal encoding applied.")
    else:
        print("Error: 'year' column not found in the dataset.")

    # Step 3: Split the data into training and testing datasets
    train_data, test_data = train_test_split(df, test_size=0.2, shuffle=False)
    print(f"Train Data Shape: {train_data.shape}")
    print(f"Test Data Shape: {test_data.shape}")

    # Step 4: Build the model only when its input feature is available
    if 'year_sin' in train_data.columns:
        # Define the input shape based on the 'year_sin' feature
        input_shape = (train_data[['year_sin']].shape[1],)
        model = Sequential()
        model.add(Dense(64, activation='relu', input_shape=input_shape))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(1))  # Output layer

        # Compile the model
        model.compile(optimizer='adam', loss='mean_squared_error')
        print("Model compiled successfully.")
    else:
        print("Error: 'year_sin' column not found in the training data.")
else:
    print("Failed to load data.")
# Step 4: Define the Model Creation Function
def transformer_model(input_shape):
    """
    Build and compile the dense regression network used in this notebook.

    Despite the name, no attention layers are involved: the layers mirror the
    Dense(64) -> Dense(32) -> Dense(1) stack that the surrounding cells build inline.

    Parameters:
    input_shape (tuple): Shape of one input sample, e.g. (n_features,).

    Returns:
    Sequential: The compiled Keras model (Adam optimizer, mean squared error loss).
    """
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=input_shape))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))  # Output layer
    model.compile(optimizer='adam', loss='mean_squared_error')
    # Return the model: without this the caller receives None and the
    # network built above is discarded.
    return model
# Assuming that the train_data has columns like 'year_sin' and 'population' after preprocessing,
# you need to reshape the input for the model.


# Check if 'year' column exists for temporal encoding
if 'year' in df.columns:
    df['year_sin'] = np.sin(2 * np.pi * df['year'] / 365)  # Sine encoding for cyclic features
    df['year_cos'] = np.cos(2 * np.pi * df['year'] / 365)  # Cosine encoding for cyclic features
    print("Temporal encoding applied.")

    # Columns added to df do not appear in frames split off earlier, so
    # re-split to make the encoded features visible in train_data/test_data.
    train_data, test_data = train_test_split(df, test_size=0.2, shuffle=False)
else:
    print("Error: 'year' column not found in the dataset.")

# Check if 'year_sin' is available in the training data before proceeding
if 'year_sin' in train_data.columns:
    # Define the input shape based on the 'year_sin' feature
    input_shape = (train_data[['year_sin']].shape[1],)

    # Initialize the model
    model = Sequential()

    # Add layers to the model
    model.add(Dense(64, activation='relu', input_shape=input_shape))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))  # Output layer

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')
    print("Model compiled successfully.")
else:
    print("Error: 'year_sin' column not found in the training data.")




Mounted at /content/drive
Mounted at /content/drive
Data loaded successfully.
  <!doctype html><html lang="en-US" dir="ltr"><head><base href="https://accounts.google.com/v3/signin/"><link rel="preconnect" href="//www.gstatic.com"><meta name="referrer" content="origin"><style data-href="https://www.gstatic.com/_/mss/boq-identity/_/ss/k=boq-identity.AccountsSignInUi.bRRGln9G7ms.L.X.O/am=yQEMSyZrBArlIACgEAAAAgAUAAAAAAAAAAACAwAAmBA/d=1/ed=1/rs=AOaEmlFHRAC5Qsb49KIaVYK_q9hj3AV-8A/m=identifierview  \
0                                           --c-ps-s                                                                                                                                                                                                                                                                                                                                                                                
1                                                  )                            

In [None]:
# Mount Google Drive to access dataset
from google.colab import drive
drive.mount('/content/drive', force_remount=True)  # Remount if needed

# Use the file ID to locate the file in Google Drive
file_id = '1f8aDJV4oSnKTFeHirIOt8IxMAs1YHSre'

# Construct the file URL
file_url = f'https://drive.google.com/uc?id={file_id}'

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Reset df so a failed load cannot silently fall back to a DataFrame left
# over from a previously executed cell.
df = None

# Try to read the CSV file into a DataFrame with error handling for malformed lines
try:
    # Try reading the dataset while skipping bad lines
    df = pd.read_csv(file_url, on_bad_lines='skip')  # Skips problematic lines

    # Display the first few rows of the dataset
    print("First few rows of the dataset:")
    print(df.head())

except Exception as e:
    print(f"Error: {e}")

# A Drive link that is not publicly shared answers with an HTML sign-in page,
# which read_csv parses without raising. Reject it explicitly.
if df is not None and len(df.columns) > 0:
    first_header = str(df.columns[0]).lstrip().lower()
    if first_header.startswith('<!doctype html') or first_header.startswith('<html'):
        print("Error: received an HTML page instead of CSV. "
              "Check the sharing settings of the Drive file.")
        df = None

# Proceed with data preprocessing only when the load succeeded
if df is not None:
    # Check for missing values and handle them
    print("\nMissing values in each column:")
    print(df.isnull().sum())

    # Forward-fill missing values. DataFrame.ffill replaces the deprecated
    # fillna(method='ffill') spelling.
    df.ffill(inplace=True)

    # Example: Normalize 'population' column (assuming it's a continuous variable)
    if 'population' in df.columns:
        scaler = MinMaxScaler()
        df['population'] = scaler.fit_transform(df[['population']])

    # Temporal Encoding (Sine & Cosine encoding for 'year')
    if 'year' in df.columns:
        df['year_sin'] = np.sin(2 * np.pi * df['year'] / 365)
        df['year_cos'] = np.cos(2 * np.pi * df['year'] / 365)

    # Display the processed dataset
    print("\nProcessed DataFrame:")
    print(df.head())

    # Now, split the dataset into training and test sets
    train_data, test_data = train_test_split(df, test_size=0.2, shuffle=False)

    # Display the shape of the resulting datasets
    print(f"\nTrain Data Shape: {train_data.shape}")
    print(f"Test Data Shape: {test_data.shape}")

    # Check if 'year_sin' exists in the train data before training the model
    if 'year_sin' in train_data.columns:
        # Create a simple model using Keras
        model = Sequential()
        model.add(Dense(64, input_dim=1, activation='relu'))  # Input dimension should match the number of features
        model.add(Dense(1))  # Output layer (single value prediction)

        # Compile the model
        model.compile(optimizer=Adam(), loss='mse')

        # Train the model with ERM loss
        model.fit(train_data[['year_sin']], train_data['population'], epochs=50, batch_size=32)

        # Recompile with the constant growth constraint and continue training
        # the same model (its weights start from the ERM fit above).
        # Only river_constant is applied here; no exponential-constraint run is performed.
        model.compile(optimizer=Adam(), loss=lambda y_true, y_pred: river_constant(y_true, y_pred, growth_rate=0.02))
        model.fit(train_data[['year_sin']], train_data['population'], epochs=50, batch_size=32)

    else:
        print("Column 'year_sin' not found in train_data. Please check the preprocessing steps.")

else:
    print("Dataset not loaded. Please check the file path or file access.")


Mounted at /content/drive
First few rows of the dataset:
  <!doctype html><html lang="en-US" dir="ltr"><head><base href="https://accounts.google.com/v3/signin/"><link rel="preconnect" href="//www.gstatic.com"><meta name="referrer" content="origin"><style data-href="https://www.gstatic.com/_/mss/boq-identity/_/ss/k=boq-identity.AccountsSignInUi.bRRGln9G7ms.L.X.O/am=yQEMSyZrBArlIACgEAAAAgAUAAAAAAAAAAACAwAAmBA/d=1/ed=1/rs=AOaEmlFHRAC5Qsb49KIaVYK_q9hj3AV-8A/m=identifierview  \
0                                           --c-ps-s                                                                                                                                                                                                                                                                                                                                                                                
1                                                  )                                                 

  df.fillna(method='ffill', inplace=True)


In [None]:
# Train a small feedforward regressor on the sine/cosine features
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Features fed to the network; the input shape follows from the selected columns
feature_columns = ['year_sin', 'year_cos']
input_shape = train_data[feature_columns].shape[1:]

# Two hidden ReLU layers followed by a single linear output for regression
model = Sequential([
    Dense(64, activation='relu', input_shape=input_shape),
    Dense(64, activation='relu'),
    Dense(1),
])

# Adam optimizer with mean squared error loss
model.compile(optimizer='adam', loss='mse')

# Fit on the training split, targeting the 'population' column
model.fit(
    train_data[feature_columns],
    train_data['population'],
    epochs=50,
    verbose=1,
)

# Function to calculate the standard deviation of predictions over a sliding window
def river_window_std(predictions, window_size):
    """
    Standard deviation of every overlapping (stride-1) window of predictions.

    Parameters:
    predictions (array-like): Sequence of predicted values; must support len() and slicing.
    window_size (int): Number of consecutive elements in each window.

    Returns:
    np.array: One standard deviation per window start position; empty when
    window_size exceeds len(predictions).
    """
    # Number of full windows that fit; non-positive counts produce no windows.
    n_windows = len(predictions) - window_size + 1
    return np.array([
        np.std(predictions[start:start + window_size])
        for start in range(n_windows)
    ])

def window_std(predictions, window_size):
    """
    Standard deviation of consecutive, non-overlapping chunks of predictions.

    Parameters:
    predictions (array-like): Sequence of predicted values; must support len() and slicing.
    window_size (int): Chunk length; the final chunk may be shorter.

    Returns:
    np.array: One standard deviation per chunk.
    """
    chunk_starts = range(0, len(predictions), window_size)
    chunk_stds = [
        np.std(predictions[start:start + window_size])
        for start in chunk_starts
    ]
    return np.array(chunk_stds)

# Example: Calculate Window STD for different window sizes
window_sizes = [96, 192, 336, 720]
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
print(train_data.columns.tolist())

# Temporal Encoding (Sine & Cosine) for 'year'
# This updates df only; frames already split from df are not affected.
if 'year' in df.columns:
    df['year_sin'] = np.sin(2 * np.pi * df['year'] / 365)
    df['year_cos'] = np.cos(2 * np.pi * df['year'] / 365)

# Feature names kept for the cells that follow
feature_columns = ['year_sin', 'year_cos']

# The model evaluated below uses the single 'year_sin' feature
input_shape = (train_data[['year_sin']].shape[1],)

# Build the model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=input_shape))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))  # Assuming you're predicting a single output like population

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model once and predict once. Pass a one-column DataFrame (2-D),
# not a Series, so the input matches the model's (1,) input shape.
model.fit(train_data[['year_sin']], train_data['population'], epochs=50, verbose=1)
predictions = model.predict(test_data[['year_sin']]).ravel()  # flatten (n, 1) -> (n,)

# Calculate and display Window STD for each window size
for size in window_sizes:
    stds = window_std(predictions, size)
    print(f"Window Size: {size} - STD: {np.mean(stds)}")

# Explanation:
# We define the window_std function to calculate the standard deviation within a window of data points.
# For each window size, we calculate the standard deviation of the predictions to assess model stability over time.


KeyError: "None of [Index(['year_sin', 'year_cos'], dtype='object')] are in the [columns]"

In [None]:
# Re-split df in its current state: first 80% of rows for training, last 20% for testing
train_data, test_data = train_test_split(df, shuffle=False, test_size=0.2)


In [None]:
# Model inputs and the per-sample input shape derived from them
feature_columns = ['year_sin', 'year_cos']
input_shape = train_data[feature_columns].shape[1:]


In [None]:
# Report any requested feature columns that train_data does not contain
missing = list(filter(lambda name: name not in train_data.columns, feature_columns))
print(f"Missing columns: {missing}" if missing else "All feature columns are present.")


In [None]:
# You can plot the results for comparison using matplotlib
plt.figure(figsize=(10, 6))

# Mean of the per-window standard deviations, one value per window size
mean_window_stds = []
for size in window_sizes:
    mean_window_stds.append(np.mean(window_std(predictions, size)))

plt.plot(window_sizes, mean_window_stds)
plt.xlabel('Window Size')
plt.ylabel('Window STD')
plt.title('Window STD for Different Models')
plt.show()

# Explanation:
# We plot the mean Window STD of the current model's predictions against each window size to visually assess its stability; repeat the plot per model to compare several models.
