# **GeoCryoAI Model | Modeling Codebase**
---
Decoding the Spatiotemporal Complexities of the Permafrost Carbon Feedback with Multimodal Ensemble Learning <br>
Journal of Geophysical Research - Machine Learning and Computation (2024JH000402)<br>

Bradley A. Gay, PhD | Jet Propulsion Laboratory, California Institute of Technology<br>
31 December 2024

# JGR-MLC Code
To skip ahead to the modeling analyses, either access the ensemble tensors in the [ORNL DAAC repository](https://doi.org/10.3334/ORNLDAAC/2371), or use the chunked parquet files in the GitHub subfolders and run the `reassembly.py` script to stack the data chunks and regenerate the original dataframe (i.e., scaled, detrended, and normalized). Then navigate to the **Tuning** heading below. For the training, validation, and testing setup and the model simulation code, continue from the **Load** heading. <br>
<br>
For a deeper dive into how each individual dataset was created before being concatenated into the base dataset used to train the GeoCryoAI framework, open the Jupyter notebook located in the `_/geocryoai/preprocessing/code/notebook` directory and work through the headings **In Situ**, **UAVSAR**, **AVIRIS-NG**, **SIBBORK-TTE**, and **TCFM-Arctic**, followed by the concatenation of these individual datasets into the harmonized base dataset used for model development, simulations, and forecasts.

# Libraries and Functions

In [None]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import os, sys
import datetime
import re
import glob
#from glob import glob,iglob
import requests
import warnings, intake
import shutil
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings("ignore", category=RuntimeWarning)
#
import bs4
import eofs
import pyts
import pyarrow
import fastparquet
import math
import numpy as np
import pandas as pd
import subprocess
import pyarrow.parquet as pq
import pyarrow as pa
from tqdm import tqdm
#import polars
#import polars as pl
import pickle
import math
import time
import cftime
import netCDF4 as nc
import xarray as xr
from netCDF4 import Dataset
import h5py
os.environ['HDF5_USE_FILE_LOCKING']='FALSE'
import progressbar
from dask.diagnostics import ProgressBar
ProgressBar().register()
#
import sklearn
#!pip install --upgrade tensorflow
import tensorflow as tf
tf.config.run_functions_eagerly(True)
#import tensorflow_addons
import tensorflow.keras
import tensorflow.keras.backend as K
import keras.backend as K
import keras.optimizers
import keras_tuner
import statsmodels
import pyrsgis
#
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as col
import matplotlib.cm as cm
import matplotlib.colors as col
import matplotlib.cm as cm
%matplotlib inline
mpl.rcParams['agg.path.chunksize'] = 10000
import seaborn as sns
sns.set(font_scale=1.5, style="white")
import csv
import io
#
import dask.dataframe as dd
import geopandas as gpd
import rioxarray as rxr
import spectral
import earthpy as et
import geopandas as gpd
import utm
#
import pydot
import pydotplus
import graphviz
import datetime
import cv2 as cv
import rasterio
import tiledb
import polars as pl
import pyproj
import dask.dataframe as dd
import tqdm
import rasterio
import rasterio as rio
import rioxarray
import pandas as pd
import psutil
import dask.dataframe as dd
import time
import memory_profiler
import zipfile
import getpass
import pyinterp
import pyinterp.backends.xarray
import requests
import h5py
import rasterio
import h5py
import pyproj
import affine
import rasterio
import rioxarray
import matplotlib.pyplot as plt
import dask
import dask.array as da
import scipy
import tensorflow as tf
import warnings
import cdsapi
import libpysal
#
import rasterio
import codecs # for text parsing code
import netrc
import rioxarray as rxr
import pyarrow as pa
import pyarrow.parquet as pq
import dask.dataframe as dd
import time
import tensorboard
tensorboard.__version__
# Clear logs
#rm -rf ./logs/
# Load TensorBoard
%load_ext tensorboard
import gc
##

from osgeo import gdal
from pyproj import Proj, Transformer, CRS
from affine import Affine
from rasterio import enums
from rasterio.enums import Resampling
from rasterio.warp import calculate_default_transform, reproject
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.errors import NotGeoreferencedWarning
from collections import defaultdict
from functools import partial
from dask import delayed, compute
from dask.diagnostics import ProgressBar
from scipy.ndimage import zoom
from scipy.interpolate import griddata
from scipy.ndimage import zoom
from uncertain_panda import pandas as up
from numpy import isnan, array, count_nonzero
from pandas import read_csv, DataFrame, concat
from collections import defaultdict
from openpyxl import Workbook
from itertools import groupby, islice
from zipfile import ZipFile
from operator import itemgetter
from pathlib import Path
from datetime import datetime as dt
from datetime import timedelta
from pyrsgis import raster
#
from pywaffle import Waffle
from eofs.standard import Eof
from statsmodels.tsa.arima.model import ARIMA
#
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, accuracy_score, \
confusion_matrix, ConfusionMatrixDisplay, consensus_score, explained_variance_score,r2_score, \
roc_auc_score, roc_curve
from sklearn.metrics import jaccard_score, mean_squared_error, nan_euclidean_distances, precision_score
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import *
from tensorflow.keras import mixed_precision
from tensorflow.python.client import device_lib
from keras_tuner.engine.hyperparameters import HyperParameters
from keras_tuner.tuners import RandomSearch, BayesianOptimization, Hyperband
from keras_tuner import HyperModel
#import tensorflow_addons as tfa
#from tensorflow_addons.metrics.r_square import RSquare
from keras_tuner.tuners import *
from keras import backend as K
from keras import optimizers 
from keras.preprocessing import *
from keras.models import Sequential
from keras.layers import Dense, InputLayer, Dropout, Flatten, BatchNormalization, Conv1D, Bidirectional
from keras.layers import *
from keras.activations import swish, elu, gelu, selu, sigmoid, relu, tanh, linear, softmax, swish
from keras.layers import MaxPool1D
from keras import *
from keras.models import Model, Sequential
#
from spectral import *
from libpysal import weights
from libpysal.weights import lat2W
from esda.moran import Moran
#
from keras.layers import Reshape
from keras import layers
from keras import models
from keras.utils import plot_model
from datetime import datetime
from dateutil.rrule import DAILY,rrule
from time import gmtime, strftime 
#
from rasterio.enums import Resampling
from rasterio.mask import mask
from rasterio.plot import show
from rasterio.transform import from_origin
from pyrsgis.ml import array_to_chips
from shapely import geometry
from pyproj import Proj, transform
from pyrsgis import ml
from pyrsgis import ml, raster, convert
from itertools import chain
from shapely.geometry import Point, box
from pyproj import Proj, transform, CRS
from scipy.interpolate import griddata
from scipy.stats import bootstrap
from scipy.interpolate import interpn
from rasterio.mask import mask
from rasterio import sample
from scipy.interpolate import Rbf
from scipy.interpolate import griddata
from scipy.spatial import cKDTree
from rasterio.plot import show
from scipy.interpolate import griddata
from scipy.interpolate import interp1d
#
from dask import delayed, compute
from dask.distributed import Client, LocalCluster
from tqdm import tqdm
from dask import delayed, compute
from dask.distributed import Client, LocalCluster
from rasterio.windows import Window
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.plot import show
from rasterio.plot import show_hist
from rasterio.transform import from_origin
from tqdm import tqdm
from joblib import Parallel, delayed
from osgeo import gdal
from rasterio.windows import Window

##########If error thrown, Just run cell again!

from netCDF4 import Dataset
from datetime import datetime
from packaging import version
from osgeo import gdal
from osgeo import gdal_array
from pyproj import Transformer, ProjError, Proj
from IPython.display import display
#
from skimage import data, io   # Import skimage library (data - Test images and example data.
#                          io - Reading, saving, and displaying images.)
from skimage.color import rgb2gray
from pandas.plotting import autocorrelation_plot
#
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tensorflow.keras import utils
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
from tensorflow.python.keras.utils import conv_utils, tf_utils
#from tensorflow_addons.metrics.r_square import RSquare
from keras_tuner import HyperModel
from keras_tuner.engine.hyperparameters import HyperParameters
from keras_tuner.tuners import RandomSearch, BayesianOptimization, Hyperband
#
from uncertain_panda import pandas as pd
tqdm.pandas()
#
from esda.moran import Moran
from libpysal.weights import KNN  # libpysal now contains the weights module
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import adfuller, acf, pacf, kpss
from esda.moran import Moran
from esda.geary import Geary
from pysal.lib import weights
from scipy.stats import linregress
from libpysal.weights import KNN
from shapely.geometry import Point

# Report the TensorFlow version and refuse to continue on TensorFlow 1.x.
print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."

print("==========================")

# Number of GPUs TensorFlow can see at this point in the session.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Print the recursion limit before and after raising it.
print(sys.getrecursionlimit())
# NOTE(review): a limit of 1e9 effectively disables Python's recursion guard,
# so runaway recursion would likely exhaust the interpreter stack instead of
# raising RecursionError. Confirm whether a smaller value is sufficient.
sys.setrecursionlimit(1000000000)
print(sys.getrecursionlimit())



##########################
##FUNCTIONS##
##########################

def divisorGenerator(n):
    """
    Yield every positive divisor of ``n`` in ascending order.

    Divisors up to ``sqrt(n)`` are yielded as they are found; their cofactors
    are buffered and yielded afterwards in reverse, which keeps the overall
    sequence sorted.

    :param n: Non-negative integer whose divisors are wanted.
    :return: Generator of integer divisors of ``n``.
    """
    large_divisors = []
    for i in range(1, int(math.sqrt(n)) + 1):
        if n % i == 0:
            yield i
            if i * i != n:
                # Floor division keeps the cofactor an exact int. True division
                # produced floats (e.g. 6.0), mixing types in the output and
                # losing precision once n exceeds 2**53.
                large_divisors.append(n // i)
    yield from reversed(large_divisors)

def series_to_supervised(data, lags = 1, forecasting_steps = 1, dropna=True):
    """
    Reframe a time series as a supervised-learning table of lagged columns.

    For every column of ``data`` the result holds ``lags`` past copies
    (labelled ``t-lags`` ... ``t-1``) followed by ``forecasting_steps`` copies
    starting at the current step (labelled ``t``, ``t+1``, ...). Column labels
    are ``(original_column, step_label)`` tuples.

    :param data: Anything ``pd.DataFrame`` accepts (list, list of rows, array,
        Series, DataFrame).
    :param lags: Number of past steps to include as inputs.
    :param forecasting_steps: Number of steps, starting at ``t``, to include.
    :param dropna: Drop rows made incomplete by the shifting when True.
    :return: DataFrame of shifted copies concatenated column-wise.
    """
    df = pd.DataFrame(data)
    # Count variables from the frame itself. The previous
    # ``1 if type(data) is list else data.shape[1]`` mislabelled list-of-rows
    # input (length mismatch on assignment) and raised IndexError for 1-D
    # arrays and Series.
    n_vars = df.shape[1]
    cols, names = [], []
    # Input sequence: t-lags, ..., t-1
    for i in range(lags, 0, -1):
        cols.append(df.shift(i))
        names += [(df.columns[j], 't-%d' % i) for j in range(n_vars)]
    # Forecast sequence: t, t+1, ..., t+(forecasting_steps-1)
    for a in range(forecasting_steps):
        cols.append(df.shift(-a))
        label = 't' if a == 0 else 't+%d' % a
        names += [(df.columns[b], label) for b in range(n_vars)]
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Keep only the first occurrence of any repeated (column, step) label.
    agg = agg.loc[:, ~agg.columns.duplicated()]
    if dropna:
        # Reassign instead of dropping in place on the ``.loc`` selection.
        agg = agg.dropna()
    return agg

def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance between two points using the haversine formula.

    :param lon1: Longitude of the first point in decimal degrees.
    :param lat1: Latitude of the first point in decimal degrees.
    :param lon2: Longitude of the second point in decimal degrees.
    :param lat2: Latitude of the second point in decimal degrees.
    :return: Distance between the two points in meters, on a sphere of
        radius 6,371,000 m.
    """
    earth_radius_m = 6371000

    # Work in radians throughout.
    lam1 = math.radians(lon1)
    phi1 = math.radians(lat1)
    lam2 = math.radians(lon2)
    phi2 = math.radians(lat2)

    sin_half_dphi = math.sin((phi2 - phi1) / 2)
    sin_half_dlam = math.sin((lam2 - lam1) / 2)

    # Haversine of the central angle between the two points.
    hav = sin_half_dphi**2 + math.cos(phi1) * math.cos(phi2) * sin_half_dlam**2
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))

    return earth_radius_m * central_angle

def fast_flatten(input_list):
    """Flatten one level of nesting: return the items of every inner iterable as one list."""
    return [item for inner in input_list for item in inner]

def convert_to_numeric(value):
    """
    Coerce a string to a number where possible.

    Non-string values are returned untouched. The exact string ``'-9999'``
    becomes NaN. Any other string is tried as ``int`` and then as ``float``;
    if neither parses, the original string is returned.

    :param value: Value to convert.
    :return: ``int``, ``float`` (including NaN), or the unchanged input.
    """
    if not isinstance(value, str):
        return value
    if value == '-9999':
        return np.nan
    # Try the strictest parser first so "5" stays an int rather than 5.0.
    for parse in (int, float):
        try:
            return parse(value)
        except ValueError:
            continue
    return value

def convert_to_datetime_underscore(match):
    """
    Convert a regex match holding ``month_day_year`` into a ``YYYYMMDD`` string.

    :param match: Match object whose group 1 is the underscore-separated date.
    :return: The date formatted as ``%Y%m%d``, or None when group 1 does not
        have exactly three parts, a part is not an integer, or the parts do
        not form a valid calendar date.
    """
    parts = match.group(1).split('_')
    if len(parts) != 3:
        return None
    try:
        # Parse inside the try: previously int() ran outside it, so a
        # non-numeric component raised ValueError instead of yielding None
        # like every other invalid date.
        month, day, year = (int(part) for part in parts)
        # Years below 100 are shifted into the 2000s.
        if year < 100:
            year += 2000
        return datetime(year, month, day).strftime('%Y%m%d')
    except ValueError:
        return None

def convert_to_datetime_slash(match):
    """
    Convert a regex match holding ``month/day/year`` into a ``YYYYMMDD`` string.

    :param match: Match object whose group 1 is the slash-separated date.
    :return: The date formatted as ``%Y%m%d``, or None when group 1 does not
        have exactly three parts, a part is not an integer, or the parts do
        not form a valid calendar date.
    """
    parts = match.group(1).split('/')
    if len(parts) != 3:
        return None
    try:
        # Parse inside the try so non-numeric components yield None rather
        # than an uncaught ValueError.
        month, day, year = (int(part) for part in parts)
        # Match convert_to_datetime_underscore: shift years below 100 into
        # the 2000s. Without this "7/4/21" was treated as year 21 AD.
        if year < 100:
            year += 2000
        return datetime(year, month, day).strftime('%Y%m%d')
    except ValueError:
        return None

def process_csv(file_path):
    """
    Read one CSV and return its thaw-probe-depth readings in long form.

    The file is read as UTF-8, falling back to ISO-8859-1 on a decode error.
    Files missing any of the identifying columns are skipped.

    :param file_path: Path of the CSV file to read.
    :return: DataFrame with the identifying columns plus
        ``thawProbeDepthType`` (source column name) and ``thawProbeDepth``
        (value), or None when an identifying column is absent.
    """
    id_columns = ['collectDate', 'siteID', 'decimalLatitude', 'decimalLongitude']

    try:
        frame = pd.read_csv(file_path, encoding='utf-8')
    except UnicodeDecodeError:
        frame = pd.read_csv(file_path, encoding='ISO-8859-1')

    # Nothing to extract unless every identifying column is present.
    if any(name not in frame.columns for name in id_columns):
        return None

    # Every column whose name mentions thawProbeDepth is a measurement column.
    probe_columns = [name for name in frame.columns if 'thawProbeDepth' in name]
    subset = frame[id_columns + probe_columns]

    # Wide -> long: one row per (record, probe column).
    return subset.melt(
        id_vars=id_columns,
        value_vars=probe_columns,
        var_name='thawProbeDepthType',
        value_name='thawProbeDepth'
    )

def decimal_year_to_datetime(decimal_years):
    """
    Convert decimal years (e.g. 2020.5) to ``datetime`` objects.

    The fractional part is scaled by the number of days in that calendar year
    and added to 1 January of the year.

    :param decimal_years: Iterable of numeric decimal years.
    :return: NumPy array with one ``datetime`` per input value.
    """
    def _to_datetime(value):
        whole_year = int(value)
        jan_first = datetime(whole_year, 1, 1)
        # 365 or 366 depending on the year.
        year_length = (datetime(whole_year + 1, 1, 1) - jan_first).days
        return jan_first + timedelta(days=(value - whole_year) * year_length)

    return np.array([_to_datetime(value) for value in decimal_years])

def enhanced_save_as_xlsx(file_path, df_dict):
    """
    Write several DataFrames to one Excel workbook, one sheet per entry.

    :param file_path: Destination path of the ``.xlsx`` file.
    :param df_dict: Mapping of sheet name to DataFrame; each frame is written
        without its index, using the ``xlsxwriter`` engine.
    """
    workbook = pd.ExcelWriter(file_path, engine='xlsxwriter')
    with workbook:
        for title in df_dict:
            df_dict[title].to_excel(workbook, sheet_name=title, index=False)
            
def move_files(source_folder, destination_folder):
    """
    Move every entry found directly inside ``source_folder`` into ``destination_folder``.

    :param source_folder: Directory whose entries are moved.
    :param destination_folder: Directory receiving the entries under the same names.
    """
    entries = os.listdir(source_folder)
    for entry in entries:
        origin = os.path.join(source_folder, entry)
        target = os.path.join(destination_folder, entry)
        shutil.move(origin, target)

# def extract_keywords_from_excel_robust(file_path):
#     try:
#         # Load the Excel file
#         xls = pd.ExcelFile(file_path)
#         # Get all sheet names
#         sheets = xls.sheet_names
#         # Container for keywords
#         keywords = set()
#         # Loop over all sheets
#         for sheet in sheets:
#             # Read the sheet
#             df = pd.read_excel(xls, sheet_name=sheet)
#             # Extract column names (assuming they are keywords)
#             keywords.update(df.columns.str.strip().tolist())
#     except Exception as e:
#         print(f"Error processing {file_path}: {str(e)}")
#         return None
#     return keywords

def extract_keywords_from_excel_robust(file_path):
    """
    Collect the column headers of every sheet in an Excel workbook.

    :param file_path: Path of the workbook to inspect.
    :return: Set of header names (each converted to ``str``) across all
        sheets, or None if the workbook could not be opened or parsed; in
        that case the error is printed.
    """
    try:
        # The context manager closes the workbook handle even if a sheet
        # fails to parse; the handle was previously left open.
        with pd.ExcelFile(file_path) as xls:
            keywords = set()
            for sheet in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet)
                # Headers may be numbers or dates; normalise them to strings.
                keywords.update(map(str, df.columns))
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip
        # this file.
        print(f"Error processing {file_path}: {str(e)}")
        return None
    return keywords

def convert_to_snake_case(name):
    """
    Normalise a file or directory name to lowercase snake_case.

    Runs of whitespace and hyphens collapse to a single underscore; any
    character other than ``a-z``, ``0-9``, underscore, or dot (kept for file
    extensions) is removed.

    :param name: Name to normalise.
    :return: The normalised name.
    """
    lowered = name.lower()
    underscored = re.sub(r'[\s\-]+', '_', lowered)
    return re.sub(r'[^a-z0-9_\.]', '', underscored)

def _rename_entry_to_snake_case(parent, name, label):
    """Rename ``parent/name`` to its snake_case form without clobbering another entry."""
    new_name = convert_to_snake_case(name)
    old_path = os.path.join(parent, name)
    new_path = os.path.join(parent, new_name)

    if old_path == new_path:
        return

    # Two different names can normalise to the same snake_case name, and
    # os.rename silently replaces an existing file on POSIX. Skip the rename
    # in that case. The samefile check still lets case-only renames through
    # on case-insensitive filesystems, where the target "exists" as the
    # source itself.
    if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
        print(f"Skipped {label} (target already exists): {old_path} -> {new_path}")
        return

    os.rename(old_path, new_path)
    print(f"Renamed {label}: {old_path} -> {new_path}")


def rename_files_in_directory(root_dir):
    """
    Recursively rename every file and directory under ``root_dir`` to snake_case.

    The tree is walked bottom-up so a directory's contents are renamed before
    the directory itself, keeping the paths produced by ``os.walk`` valid.
    Each rename is printed. An entry whose snake_case name is already taken
    by a different entry is left unchanged and reported.

    :param root_dir: Directory whose descendants are renamed; ``root_dir``
        itself is not renamed.
    """
    for root, dirs, files in os.walk(root_dir, topdown=False):
        for filename in files:
            _rename_entry_to_snake_case(root, filename, "file")
        for dirname in dirs:
            _rename_entry_to_snake_case(root, dirname, "directory")

def extract_site_id(filename):
    """
    Return the second underscore-separated token of a file's base name.

    :param filename: File name or path.
    :return: The token after the first underscore of the base name.
    :raises IndexError: If the base name contains no underscore.
    """
    _, leaf = os.path.split(filename)
    tokens = leaf.split('_')
    return tokens[1]
    
def get_replicate_columns(base_var, df):
    """
    List the columns of ``df`` that are ``base_var`` or one of its suffixed variants.

    A column matches when it is ``base_var`` followed by zero or more
    ``_<UPPERCASE/DIGITS>`` groups and optionally trailing ``_PI_F`` groups.
    ``base_var`` is interpolated into the regular expression as-is.

    :param base_var: Base variable name (used verbatim as a regex fragment).
    :param df: DataFrame whose column names are tested.
    :return: Matching column names, in column order.
    """
    matcher = re.compile(rf'^{base_var}(_[A-Z0-9]+)*(_PI_F)*$')
    return [name for name in df.columns if matcher.match(name)]

In [None]:
# import os
# import sys
# import math
# import time
# import warnings
# import shutil
# import subprocess
# from datetime import datetime, timedelta
# from collections import defaultdict
# from functools import partial
# from itertools import chain
# from pathlib import Path
# from tqdm import tqdm

# # Environmental settings
# os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
# os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'

# # Suppress warnings
# warnings.filterwarnings("ignore", category=RuntimeWarning)

# # General-purpose libraries
# import numpy as np
# import pandas as pd
# import pickle
# import csv
# import io
# import glob
# import re

# # Visualization libraries
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set(font_scale=1.5, style="white")
# %matplotlib inline

# # Geospatial and data processing
# import geopandas as gpd
# import rasterio
# import rioxarray as rxr
# import xarray as xr
# import dask
# import dask.dataframe as dd
# from dask import delayed, compute
# from dask.diagnostics import ProgressBar
# ProgressBar().register()

# # Scientific computation and interpolation
# from scipy.interpolate import griddata, Rbf, interp1d
# from scipy.ndimage import zoom
# from scipy.stats import bootstrap, linregress
# from scipy.spatial import cKDTree

# # TensorFlow and machine learning
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers, models, callbacks, optimizers
# from keras_tuner import HyperModel, RandomSearch, BayesianOptimization, Hyperband
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# from sklearn.metrics import mean_squared_error, r2_score

# # Statsmodels and spatial statistics
# import statsmodels.api as sm
# from statsmodels.tsa.stattools import adfuller, kpss
# from esda.moran import Moran
# from libpysal.weights import lat2W

# # Geospatial utilities
# from shapely.geometry import Point, box
# from pyproj import Transformer, CRS

# # File handling
# import zipfile
# from osgeo import gdal

# # Debugging and system utilities
# import gc
# import psutil

# # Print TensorFlow info
# print(f"TensorFlow version: {tf.__version__}")
# print(f"Num GPUs Available: {len(tf.config.list_physical_devices('GPU'))}")

# # Recursion limit setup
# print("Default Recursion Limit:", sys.getrecursionlimit())
# sys.setrecursionlimit(10**6)
# print("Updated Recursion Limit:", sys.getrecursionlimit())

# # Function definitions
# def divisorGenerator(n):
#     for i in range(1, int(math.sqrt(n)) + 1):
#         if n % i == 0:
#             yield i
#             if i * i != n:
#                 yield n // i

# def series_to_supervised(data, lags=1, forecasting_steps=1, dropna=True):
#     df = pd.DataFrame(data)
#     cols, names = [], []
#     for i in range(lags, 0, -1):
#         cols.append(df.shift(i))
#         names += [(df.columns[j], f't-{i}') for j in range(df.shape[1])]
#     for i in range(forecasting_steps):
#         cols.append(df.shift(-i))
#         names += [(df.columns[j], f't+{i}') if i else (df.columns[j], 't') for j in range(df.shape[1])]
#     agg = pd.concat(cols, axis=1)
#     agg.columns = names
#     if dropna:
#         agg.dropna(inplace=True)
#     return agg

# def haversine(lon1, lat1, lon2, lat2):
#     R = 6371000  # Earth's radius in meters
#     lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])
#     dlon, dlat = lon2 - lon1, lat2 - lat1
#     a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
#     return 2 * R * math.atan2(math.sqrt(a), math.sqrt(1 - a))

# def fast_flatten(input_list):
#     return list(chain.from_iterable(input_list))

# def convert_to_snake_case(name):
#     return re.sub(r'[^a-z0-9_\.]', '', re.sub(r'[\s\-]+', '_', name.lower()))

# def rename_files_in_directory(root_dir):
#     for root, dirs, files in os.walk(root_dir):
#         for filename in files:
#             new_filename = convert_to_snake_case(filename)
#             os.rename(os.path.join(root, filename), os.path.join(root, new_filename))

# def decimal_year_to_datetime(decimal_years):
#     dates = []
#     for decimal_year in decimal_years:
#         year = int(decimal_year)
#         remainder = decimal_year - year
#         start_of_year = datetime(year, 1, 1)
#         days_in_year = (datetime(year + 1, 1, 1) - start_of_year).days
#         date = start_of_year + timedelta(days=remainder * days_in_year)
#         dates.append(date)
#     return np.array(dates)

# def process_csv(file_path):
#     try:
#         df = pd.read_csv(file_path, encoding='utf-8')
#     except UnicodeDecodeError:
#         df = pd.read_csv(file_path, encoding='ISO-8859-1')
#     required_columns = ['collectDate', 'siteID', 'decimalLatitude', 'decimalLongitude']
#     thaw_depth_columns = [col for col in df.columns if 'thawProbeDepth' in col]
#     if not all(col in df.columns for col in required_columns):
#         return None
#     columns_to_extract = required_columns + thaw_depth_columns
#     df_extracted = df[columns_to_extract]
#     df_melted = df_extracted.melt(
#         id_vars=required_columns,
#         value_vars=thaw_depth_columns,
#         var_name='thawProbeDepthType',
#         value_name='thawProbeDepth'
#     )
#     return df_melted

# def enhanced_save_as_xlsx(file_path, df_dict):
#     with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
#         for sheet_name, df in df_dict.items():
#             df.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Clear the output of this notebook cell (e.g. the import/log noise above).
from IPython.display import clear_output
clear_output(wait=True)

### Other Functionality Settings

In [None]:
# Set the global Keras dtype policy to plain 'float32', i.e. mixed precision off.
# NOTE(review): float32 is the full-precision policy; it is the mixed policies
# ('mixed_float16' / 'mixed_bfloat16') that reduce compute and memory cost, so
# the earlier "saves computational cost" remark looks inverted — confirm intent.
mixed_precision.set_global_policy('float32')

In [None]:
# CPU option: hide all CUDA devices ("-1") and list the devices TensorFlow reports.
# Run this cell OR the GPU cell that follows, not both.
# NOTE(review): TensorFlow was already imported and queried for GPUs in the
# import cell, so changing CUDA_VISIBLE_DEVICES here may come too late to take
# effect — verify with the printed device list, or set it before importing
# TensorFlow.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
print(device_lib.list_local_devices())

In [None]:
# GPU option: expose only CUDA device 0 and list the devices TensorFlow reports.
# NOTE(review): as with any CUDA_VISIBLE_DEVICES change, this presumably must
# happen before TensorFlow initialises its devices — confirm with the printed list.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(device_lib.list_local_devices())

In [None]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Best-effort: a RuntimeError is printed and the notebook continues.
        print(e)

In [None]:
### Clean up filenaming nomenclature ###
########################################

#root_dir='/Users/bgay/NEON_permafrost-measures/'
#rename_files_in_directory(root_dir)

#root_dir='/Volumes/JPL/geocryoai/insitu/alt/calm/source/'
#rename_files_in_directory(root_dir)

#root_dir='/Volumes/JPL/geocryoai/insitu/alt/gtnp/source/'
#rename_files_in_directory(root_dir)

# Tuning

## Cleaning datasets

In [None]:
# Load the combined dataset. The path is specific to the author's machine;
# point it at your local copy of the parquet file.
df = pd.read_parquet('/Users/bgay/Downloads/final_fcfch4alt_monthly_1km_ds.parquet')

In [None]:
# NaN counts per target column, recorded from a previous run:
#df.alt.isna().sum()
#96654894
#df.fc.isna().sum()
#3163257965
#df.fch4.isna().sum()
#3163257965

In [None]:
# Disabled: min-max normalisation of all three targets on the combined frame.
# Normalisation is instead applied per target after NaN removal (cells below).
# for col in ['alt', 'fc', 'fch4']:
#     df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
# df

In [None]:
# Split the combined frame into one parquet file per target variable, each
# keeping the datetime/lat/lon coordinates.
df[['datetime','lat','lon','alt']].to_parquet('alt_df.parquet')

In [None]:
df[['datetime','lat','lon','fc']].to_parquet('fc_df.parquet')

In [None]:
df[['datetime','lat','lon','fch4']].to_parquet('fch4_df.parquet')

In [None]:
# Drop the reference to the combined frame so its memory can be reclaimed.
del df

In [None]:
# --- ALT: reload, drop rows containing NaN, min-max scale, save ---
alt_df = pd.read_parquet('alt_df.parquet')

In [None]:
alt_df

In [None]:
# Removes any row with a NaN in any column (coordinates included).
alt_df = alt_df.dropna()

In [None]:
# Min-max scale to [0, 1] using the min/max of the remaining rows.
# The denominator is zero if every remaining value is identical.
alt_df['alt'] = (alt_df['alt'] - alt_df['alt'].min()) / (alt_df['alt'].max() - alt_df['alt'].min())

In [None]:
alt_df.to_parquet('alt_df_nonan_norm.parquet')

In [None]:
del alt_df

In [None]:
# --- FC: same steps as ALT ---
fc_df = pd.read_parquet('fc_df.parquet')

In [None]:
fc_df = fc_df.dropna()

In [None]:
fc_df['fc'] = (fc_df['fc'] - fc_df['fc'].min()) / (fc_df['fc'].max() - fc_df['fc'].min())

In [None]:
fc_df.to_parquet('fc_df_nonan_norm.parquet')

In [None]:
del fc_df

In [None]:
# --- FCH4: same steps as ALT ---
fch4_df = pd.read_parquet('fch4_df.parquet')

In [None]:
fch4_df = fch4_df.dropna()

In [None]:
fch4_df['fch4'] = (fch4_df['fch4'] - fch4_df['fch4'].min()) / (fch4_df['fch4'].max() - fch4_df['fch4'].min())

In [None]:
fch4_df.to_parquet('fch4_df_nonan_norm.parquet')

In [None]:
del fch4_df

## Reshaping datasets

### ALT

In [None]:
# Reload the cleaned, normalised ALT table produced in "Cleaning datasets".
import pandas as pd
alt_df = pd.read_parquet('alt_df_nonan_norm.parquet')
#alt_df = pd.read_parquet('/Volumes/JPL/geocryoai/modeling/data/input/alt/alt_df_nonan_norm.parquet')
alt_df

In [None]:
alt_df['datetime'] = pd.to_datetime(alt_df['datetime'])

In [None]:
# Replace each coordinate by an integer code so the table can later be
# scattered into a dense (time, lat, lon) array.
# NOTE(review): pandas Categorical presumably assigns codes in sorted order of
# the unique values (unlike factorize, which uses order of appearance) — confirm.
# NOTE(review): unlike the FC section, lat/lon are not rounded before coding
# here — confirm this difference is intended.
alt_df['datetime_index'] = pd.Categorical(alt_df['datetime']).codes
alt_df['lat_index'] = pd.Categorical(alt_df['lat']).codes
alt_df['lon_index'] = pd.Categorical(alt_df['lon']).codes

In [None]:
# Disabled alternative: codes in order of first appearance.
# alt_df['datetime_index'] = pd.factorize(alt_df['datetime'])[0]
# alt_df['lat_index'] = pd.factorize(alt_df['lat'])[0]
# alt_df['lon_index'] = pd.factorize(alt_df['lon'])[0]

In [None]:
alt_df[['datetime_index','lat_index','lon_index','alt']]

In [None]:
# Save the datetime <-> index lookup so indices can be decoded later.
datetime_mapping = alt_df[['datetime', 'datetime_index']].drop_duplicates().reset_index(drop=True)

In [None]:
datetime_mapping.to_parquet('alt_datetime_mapping.parquet')

In [None]:
# Checkpoint 1: drop the raw datetime column now that its mapping is saved.
alt_df = alt_df[['datetime_index','lat','lon','lat_index','lon_index','alt']]

In [None]:
alt_df.to_parquet('alt_first.parquet')

In [None]:
import pandas as pd
alt_df = pd.read_parquet('alt_first.parquet')

In [None]:
# Narrow dtypes to reduce memory before building the mappings and the tensor.
alt_df['alt'] = alt_df['alt'].astype('float32')
alt_df['datetime_index'] = alt_df['datetime_index'].astype('int32')
alt_df['lat_index'] = alt_df['lat_index'].astype('int32')
alt_df['lon_index'] = alt_df['lon_index'].astype('int32')

In [None]:
# Save the latitude <-> index lookup.
lat_mapping = alt_df[['lat', 'lat_index']].drop_duplicates().reset_index(drop=True)

In [None]:
lat_mapping.to_parquet('alt_lat_mapping.parquet')

In [None]:
# Checkpoint 2: drop the raw latitude column.
alt_df = alt_df[['datetime_index','lat_index','lon','lon_index','alt']]

In [None]:
alt_df.to_parquet('alt_second.parquet')

In [None]:
# Disabled: single-pass longitude mapping; the chunked cell that follows is
# used in its place.
# import pandas as pd
# alt_df = pd.read_parquet('alt_second.parquet')

# lon_mapping = alt_df[['lon', 'lon_index']].drop_duplicates()
# lon_mapping = lon_mapping.reset_index(drop=True)
# lon_mapping.to_parquet('alt_lon_mapping.parquet')
# alt_df = alt_df[['datetime_index','lat_index','lon_index','alt']]

In [None]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# Build the longitude <-> index lookup. Only the two mapping columns are read,
# and duplicates are removed chunk by chunk to bound peak memory.
alt_df = pd.read_parquet('alt_second.parquet', columns=['lon', 'lon_index'])

chunk_size = 10_000_000
unique_chunks = []

for start in tqdm(range(0, len(alt_df), chunk_size), desc="Processing Chunks"):
    chunk = alt_df.iloc[start:start + chunk_size]
    chunk = chunk.drop_duplicates()
    # Collect the de-duplicated chunks and concatenate once after the loop;
    # calling pd.concat inside the loop re-copied the growing result on every
    # iteration.
    unique_chunks.append(chunk)

if unique_chunks:
    # A final drop_duplicates removes pairs that appeared in more than one chunk.
    lon_mapping = pd.concat(unique_chunks).drop_duplicates().reset_index(drop=True)
else:
    # Empty input: keep the column schema instead of failing in pd.concat.
    lon_mapping = alt_df.iloc[:0].reset_index(drop=True)

In [None]:
# NOTE(review): the other ALT lookups are saved with an 'alt_' prefix
# (alt_datetime_mapping, alt_lat_mapping); this one is written as
# 'lon_mapping.parquet' — confirm the name expected downstream.
lon_mapping.to_parquet('lon_mapping.parquet', index=False)

In [None]:
import pandas as pd
alt_df = pd.read_parquet('alt_second.parquet')

In [None]:
# Checkpoint 3: index columns plus the ALT value only.
alt_df = alt_df[['datetime_index','lat_index','lon_index','alt']]

In [None]:
alt_df.to_parquet('alt_third.parquet')

In [None]:
import pandas as pd
import numpy as np
from scipy.ndimage import gaussian_filter
import h5py
from tqdm import tqdm

alt_df = pd.read_parquet('alt_third.parquet')
alt_df

In [None]:
# Unique index values per axis.
# NOTE(review): lat_indices and lon_indices are reassigned a few cells later
# and time_indices is not used again in the visible cells.
time_indices = alt_df['datetime_index'].unique()
lat_indices = alt_df['lat_index'].unique()
lon_indices = alt_df['lon_index'].unique()

In [None]:
# Axis lengths: codes start at 0, so size = max code + 1.
time_steps = alt_df['datetime_index'].max() + 1
lat_steps = alt_df['lat_index'].max() + 1
lon_steps = alt_df['lon_index'].max() + 1

In [None]:
# Dense (time, lat, lon) float32 array, NaN wherever no observation exists.
tensor = np.full((time_steps, lat_steps, lon_steps), np.nan, dtype=np.float32)

In [None]:
# Per-row index and value arrays used for the scatter assignment.
t_indices = alt_df['datetime_index'].to_numpy(dtype=np.int32)
lat_indices = alt_df['lat_index'].to_numpy(dtype=np.int32)
lon_indices = alt_df['lon_index'].to_numpy(dtype=np.int32)
alt_values = alt_df['alt'].to_numpy(dtype=np.float32)

In [None]:
# Scatter every observation into its (time, lat, lon) cell in one vectorised
# assignment.
tensor[t_indices, lat_indices, lon_indices] = alt_values

In [None]:
# Disabled: whole-tensor Gaussian fill; the per-time-step version that writes
# to HDF5 is used in its place.
# tensor_filled = np.where(np.isnan(tensor), gaussian_filter(tensor, sigma=1), tensor)

In [None]:
# Fill and smooth the tensor one time step at a time, writing each slice
# straight into the HDF5 dataset 'alt'.
with h5py.File('alt_tensor.h5', 'w') as f:
    dset = f.create_dataset('alt', shape=tensor.shape, dtype=np.float32)
    for t in tqdm(range(time_steps), desc='Processing time steps in tensor...'):
        if not np.isnan(tensor[t]).all():
            # Replace NaNs with the slice minimum so the Gaussian filter has
            # no NaNs to propagate, then smooth the whole slice (sigma=1).
            slice_min = np.nanmin(tensor[t])
            slice_to_filter = np.where(np.isnan(tensor[t]), slice_min, tensor[t])
            dset[t] = gaussian_filter(slice_to_filter, sigma=1)
        else:
            # A slice with no observations at all is written unchanged (all NaN).
            dset[t] = tensor[t]

In [None]:
# Read the filled tensor back from disk into memory.
with h5py.File('alt_tensor.h5', 'r') as f:
    tensor_filled_from_file = f['alt'][:]

In [None]:
# NaN counts recorded from earlier runs (kept for reference):
# nan_count = np.isnan(tensor_filled_from_file).sum()
# print(f"Number of NaN values in reloaded tensor: {nan_count}")
# 6653413736

In [None]:
# nan_count = np.isnan(tensor).sum()
# print(f"Number of NaN values in reloaded tensor: {nan_count}")
# 4836086858

In [None]:
# After interpolating via nanmin() and Gaussian filtration
# nan_count = np.isnan(tensor_filled_from_file).sum()
# print(f"Number of NaN values in reloaded tensor: {nan_count}")
# 0

In [None]:
# Append a trailing channel axis: (time, lat, lon) -> (time, lat, lon, 1).
# The shape is derived from the array instead of the hard-coded
# (6708, 1092, 1092, 1), so other grid sizes work. The ndim check keeps the
# cell idempotent when alt_tensor.h5 already holds the 4-D tensor.
if tensor_filled_from_file.ndim == 3:
    tensor_filled_from_file = tensor_filled_from_file[..., np.newaxis]

In [None]:
# Rebind `tensor` to the filled array and drop the temporary name.
tensor = tensor_filled_from_file; del tensor_filled_from_file

In [None]:
# Overwrite alt_tensor.h5 (mode 'w') with the final tensor.
with h5py.File('alt_tensor.h5', 'w') as f:
    f.create_dataset('alt', data=tensor, dtype=np.float32)

### FC

In [None]:
# Reload the cleaned, normalised FC table produced in "Cleaning datasets".
import pandas as pd

fc_df = pd.read_parquet('fc_df_nonan_norm.parquet')
#fc_df = pd.read_parquet('/Volumes/JPL/geocryoai/modeling/data/input/fc/fc_df_nonan_norm.parquet')
fc_df

In [None]:
fc_df['datetime'] = pd.to_datetime(fc_df['datetime'])

In [None]:
# Round coordinates to 4 decimals before coding them, so values that differ
# only beyond the 4th decimal share one index.
# NOTE(review): the ALT section codes lat/lon without rounding — confirm the
# two grids are meant to differ.
fc_df['lat'] = fc_df['lat'].round(4)
fc_df['lon'] = fc_df['lon'].round(4)

In [None]:
# Replace each coordinate by an integer code.
fc_df['datetime_index'] = pd.Categorical(fc_df['datetime']).codes
fc_df['lat_index'] = pd.Categorical(fc_df['lat']).codes
fc_df['lon_index'] = pd.Categorical(fc_df['lon']).codes

In [None]:
# Disabled alternative: codes in order of first appearance.
# fc_df['datetime_index'] = pd.factorize(fc_df['datetime'])[0]
# fc_df['lat_index'] = pd.factorize(fc_df['lat'])[0]
# fc_df['lon_index'] = pd.factorize(fc_df['lon'])[0]

In [None]:
fc_df[['datetime_index','lat_index','lon_index','fc']]

In [None]:
# Save the datetime <-> index lookup so indices can be decoded later.
datetime_mapping = fc_df[['datetime', 'datetime_index']].drop_duplicates().reset_index(drop=True)

In [None]:
datetime_mapping.to_parquet('fc_datetime_mapping.parquet')

In [None]:
# Checkpoint 1: drop the raw datetime column now that its mapping is saved.
fc_df = fc_df[['datetime_index','lat','lon','lat_index','lon_index','fc']]

In [None]:
fc_df.to_parquet('fc_first.parquet')

In [None]:
import pandas as pd
fc_df = pd.read_parquet('fc_first.parquet')

In [None]:
# Narrow dtypes to reduce memory.
fc_df['fc'] = fc_df['fc'].astype('float32')
fc_df['datetime_index'] = fc_df['datetime_index'].astype('int32')
fc_df['lat_index'] = fc_df['lat_index'].astype('int32')
fc_df['lon_index'] = fc_df['lon_index'].astype('int32')

In [None]:
# Save the latitude <-> index lookup.
lat_mapping = fc_df[['lat', 'lat_index']].drop_duplicates().reset_index(drop=True)

In [None]:
lat_mapping.to_parquet('fc_lat_mapping.parquet')

In [None]:
# Checkpoint 2: drop the raw latitude column.
fc_df = fc_df[['datetime_index','lat_index','lon','lon_index','fc']]

In [None]:
fc_df.to_parquet('fc_second.parquet')

In [None]:
import pandas as pd
fc_df = pd.read_parquet('fc_second.parquet')

In [None]:
# Save the longitude <-> index lookup, then keep only indices and the FC value.
lon_mapping = fc_df[['lon', 'lon_index']].drop_duplicates()
lon_mapping = lon_mapping.reset_index(drop=True)
lon_mapping.to_parquet('fc_lon_mapping.parquet')
fc_df = fc_df[['datetime_index','lat_index','lon_index','fc']]

In [None]:
# Checkpoint 3: index columns plus the FC value only.
fc_df.to_parquet('fc_third.parquet')

In [None]:
import pandas as pd
import numpy as np
from scipy.ndimage import gaussian_filter
import h5py
from tqdm import tqdm

In [None]:
fc_df = pd.read_parquet('fc_third.parquet')
fc_df

In [None]:
time_indices = fc_df['datetime_index'].unique()
lat_indices = fc_df['lat_index'].unique()
lon_indices = fc_df['lon_index'].unique()

In [None]:
time_steps = fc_df['datetime_index'].max() + 1
lat_steps = fc_df['lat_index'].max() + 1
lon_steps = fc_df['lon_index'].max() + 1

In [None]:
time_steps, lat_steps, lon_steps

In [None]:
from scipy.sparse import coo_matrix

# Store the FC observations as one COO triplet (data / row / col) per time
# step inside the HDF5 group 'sparse_tensor'.
#
# The columns are extracted once and the row positions of every time step are
# obtained from a single groupby pass; the previous version re-scanned the
# whole dataframe with a boolean mask for each time step.
fc_lat_all = fc_df['lat_index'].to_numpy(dtype=np.int32)
fc_lon_all = fc_df['lon_index'].to_numpy(dtype=np.int32)
fc_val_all = fc_df['fc'].to_numpy(dtype=np.float32)
rows_by_time = fc_df.groupby('datetime_index').indices  # {datetime_index: row positions}
no_rows = np.empty(0, dtype=np.intp)

with h5py.File('fc_tensor_sparse.h5', 'w') as f:
    grp = f.create_group('sparse_tensor')
    for t in tqdm(range(time_steps), desc='Processing time slices'):
        # Time steps without any rows still get (empty) datasets so that a
        # reader can iterate over range(time_steps) without missing keys.
        rows = rows_by_time.get(t, no_rows)
        lat_indices = fc_lat_all[rows]
        lon_indices = fc_lon_all[rows]
        fc_values = fc_val_all[rows]
        sparse_matrix = coo_matrix((fc_values, (lat_indices, lon_indices)), shape=(lat_steps, lon_steps))
        grp.create_dataset(f'time_{t}_data', data=sparse_matrix.data)
        grp.create_dataset(f'time_{t}_row', data=sparse_matrix.row)
        grp.create_dataset(f'time_{t}_col', data=sparse_matrix.col)

In [None]:
# Grid dimensions of the FC tensor, hard-coded so this cell can be run
# without the dataframe in memory.
# NOTE(review): presumably these match the values computed from fc_df in the
# earlier cell - confirm before relying on them.
time_steps = 314
lat_steps = 95896
lon_steps = 194761

# NOTE: a dense float32 array of this shape needs roughly 23 TB
# (314 * 95896 * 194761 * 4 bytes).
tensor = np.full((time_steps, lat_steps, lon_steps), np.nan, dtype=np.float32)

with h5py.File('fc_tensor_sparse.h5', 'r') as f:
    for t in tqdm(range(time_steps), desc="Reconstructing dense tensor"):
        data = f[f'sparse_tensor/time_{t}_data'][:]
        row = f[f'sparse_tensor/time_{t}_row'][:]
        col = f[f'sparse_tensor/time_{t}_col'][:]
        # Scatter only the stored entries. Going through
        # coo_matrix(...).toarray() would write 0.0 into every cell without an
        # observation, replacing the NaN "missing" marker that the later
        # NaN-based gap filling relies on, and would allocate a full dense
        # slice per step. If a cell appears more than once the last stored
        # value wins (toarray() would have summed the duplicates).
        tensor[t, row, col] = data

In [None]:
tensor.shape

In [None]:
t_indices = fc_df['datetime_index'].to_numpy(dtype=np.int32)
lat_indices = fc_df['lat_index'].to_numpy(dtype=np.int32)
lon_indices = fc_df['lon_index'].to_numpy(dtype=np.int32)
fc_values = fc_df['fc'].to_numpy(dtype=np.float32)

In [None]:
tensor[t_indices, lat_indices, lon_indices] = fc_values

In [None]:
# tensor_filled = np.where(np.isnan(tensor), gaussian_filter(tensor, sigma=1), tensor)

In [None]:
with h5py.File('alt_tensor.h5', 'w') as f:
    dset = f.create_dataset('alt', shape=tensor.shape, dtype=np.float32)
    for t in tqdm(range(time_steps), desc='Processing time steps in tensor...'):
        if not np.isnan(tensor[t]).all():
            slice_min = np.nanmin(tensor[t])
            slice_to_filter = np.where(np.isnan(tensor[t]), slice_min, tensor[t])
            dset[t] = gaussian_filter(slice_to_filter, sigma=1)
        else:
            dset[t] = tensor[t]

In [None]:
# nan_count = np.isnan(tensor_filled_from_file).sum()
# print(f"Number of NaN values in reloaded tensor: {nan_count}")
# 6653413736

In [None]:
# nan_count = np.isnan(tensor).sum()
# print(f"Number of NaN values in reloaded tensor: {nan_count}")
# 4836086858

In [None]:
# After interpolating via nanmin() and Gaussian filtration
# nan_count = np.isnan(tensor_filled_from_file).sum()
# print(f"Number of NaN values in reloaded tensor: {nan_count}")
# 0

In [None]:
tensor_filled_from_file = tensor_filled_from_file.reshape(6708,1092,1092,1)

In [None]:
tensor = tensor_filled_from_file; del tensor_filled_from_file

In [None]:
with h5py.File('alt_tensor.h5', 'w') as f:
    f.create_dataset('alt', data=tensor, dtype=np.float32)

### FCH4

In [None]:
import pandas as pd
fch4_df = pd.read_parquet('fch4_df_nonan_norm.parquet')
#fch4_df = pd.read_parquet('/Volumes/JPL/geocryoai/modeling/data/input/fch4/fch4_df_nonan_norm.parquet')
fch4_df

In [None]:
alt_df['datetime'] = pd.to_datetime(alt_df['datetime'])

In [None]:
alt_df['datetime_index'] = pd.Categorical(alt_df['datetime']).codes
alt_df['lat_index'] = pd.Categorical(alt_df['lat']).codes
alt_df['lon_index'] = pd.Categorical(alt_df['lon']).codes

In [None]:
# alt_df['datetime_index'] = pd.factorize(alt_df['datetime'])[0]
# alt_df['lat_index'] = pd.factorize(alt_df['lat'])[0]
# alt_df['lon_index'] = pd.factorize(alt_df['lon'])[0]

In [None]:
alt_df[['datetime_index','lat_index','lon_index','alt']]

In [None]:
datetime_mapping = alt_df[['datetime', 'datetime_index']].drop_duplicates().reset_index(drop=True)

In [None]:
datetime_mapping.to_parquet('alt_datetime_mapping.parquet')

In [None]:
alt_df = alt_df[['datetime_index','lat','lon','lat_index','lon_index','alt']]

In [None]:
alt_df.to_parquet('alt_first.parquet')

In [None]:
import pandas as pd
alt_df = pd.read_parquet('alt_first.parquet')

In [None]:
alt_df['alt'] = alt_df['alt'].astype('float32')
alt_df['datetime_index'] = alt_df['datetime_index'].astype('int32')
alt_df['lat_index'] = alt_df['lat_index'].astype('int32')
alt_df['lon_index'] = alt_df['lon_index'].astype('int32')

In [None]:
lat_mapping = alt_df[['lat', 'lat_index']].drop_duplicates().reset_index(drop=True)

In [None]:
lat_mapping.to_parquet('alt_lat_mapping.parquet')

In [None]:
alt_df = alt_df[['datetime_index','lat_index','lon','lon_index','alt']]

In [None]:
alt_df.to_parquet('alt_second.parquet')

In [None]:
# import pandas as pd
# alt_df = pd.read_parquet('alt_second.parquet')

# lon_mapping = alt_df[['lon', 'lon_index']].drop_duplicates()
# lon_mapping = lon_mapping.reset_index(drop=True)
# lon_mapping.to_parquet('alt_lon_mapping.parquet')
# alt_df = alt_df[['datetime_index','lat_index','lon_index','alt']]

In [None]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

alt_df = pd.read_parquet('alt_second.parquet', columns=['lon', 'lon_index'])

chunk_size = 10_000_000
lon_mapping = pd.DataFrame()

for start in tqdm(range(0, len(alt_df), chunk_size), desc="Processing Chunks"):
    chunk = alt_df.iloc[start:start + chunk_size]
    chunk = chunk.drop_duplicates()
    lon_mapping = pd.concat([lon_mapping, chunk])

lon_mapping = lon_mapping.drop_duplicates().reset_index(drop=True)

In [None]:
lon_mapping.to_parquet('lon_mapping.parquet', index=False)

In [None]:
import pandas as pd
alt_df = pd.read_parquet('alt_second.parquet')

In [None]:
alt_df = alt_df[['datetime_index','lat_index','lon_index','alt']]

In [None]:
alt_df.to_parquet('alt_third.parquet')

In [None]:
import pandas as pd
import numpy as np
from scipy.ndimage import gaussian_filter
import h5py
from tqdm import tqdm

alt_df = pd.read_parquet('alt_third.parquet')
alt_df

In [None]:
time_indices = alt_df['datetime_index'].unique()
lat_indices = alt_df['lat_index'].unique()
lon_indices = alt_df['lon_index'].unique()

In [None]:
time_steps = alt_df['datetime_index'].max() + 1
lat_steps = alt_df['lat_index'].max() + 1
lon_steps = alt_df['lon_index'].max() + 1

In [None]:
tensor = np.full((time_steps, lat_steps, lon_steps), np.nan, dtype=np.float32)

In [None]:
t_indices = alt_df['datetime_index'].to_numpy(dtype=np.int32)
lat_indices = alt_df['lat_index'].to_numpy(dtype=np.int32)
lon_indices = alt_df['lon_index'].to_numpy(dtype=np.int32)
alt_values = alt_df['alt'].to_numpy(dtype=np.float32)

In [None]:
tensor[t_indices, lat_indices, lon_indices] = alt_values

In [None]:
# tensor_filled = np.where(np.isnan(tensor), gaussian_filter(tensor, sigma=1), tensor)

In [None]:
with h5py.File('alt_tensor.h5', 'w') as f:
    dset = f.create_dataset('alt', shape=tensor.shape, dtype=np.float32)
    for t in tqdm(range(time_steps), desc='Processing time steps in tensor...'):
        if not np.isnan(tensor[t]).all():
            slice_min = np.nanmin(tensor[t])
            slice_to_filter = np.where(np.isnan(tensor[t]), slice_min, tensor[t])
            dset[t] = gaussian_filter(slice_to_filter, sigma=1)
        else:
            dset[t] = tensor[t]

In [None]:
with h5py.File('alt_tensor.h5', 'r') as f:
    tensor_filled_from_file = f['alt'][:]

In [None]:
# nan_count = np.isnan(tensor_filled_from_file).sum()
# print(f"Number of NaN values in reloaded tensor: {nan_count}")
# 6653413736

In [None]:
# nan_count = np.isnan(tensor).sum()
# print(f"Number of NaN values in reloaded tensor: {nan_count}")
# 4836086858

In [None]:
# After interpolating via nanmin() and Gaussian filtration
# nan_count = np.isnan(tensor_filled_from_file).sum()
# print(f"Number of NaN values in reloaded tensor: {nan_count}")
# 0

In [None]:
tensor_filled_from_file = tensor_filled_from_file.reshape(6708,1092,1092,1)

In [None]:
tensor = tensor_filled_from_file; del tensor_filled_from_file

In [None]:
with h5py.File('alt_tensor.h5', 'w') as f:
    f.create_dataset('alt', data=tensor, dtype=np.float32)

### Mapping indices back to original values

In [None]:
# Reload the lookup tables and key each one by its integer index, so a tensor
# index can be translated back to the original datetime / lat / lon value.
datetime_mapping = pd.read_parquet('datetime_mapping.parquet')
lat_mapping = pd.read_parquet('lat_mapping.parquet')
lon_mapping = pd.read_parquet('lon_mapping.parquet')

datetime_mapping = datetime_mapping.set_index('datetime_index')
lat_mapping = lat_mapping.set_index('lat_index')
lon_mapping = lon_mapping.set_index('lon_index')

# Example lookups: index -> original coordinate value
original_datetime = datetime_mapping.loc[0, 'datetime']
original_lat = lat_mapping.loc[915, 'lat']
original_lon = lon_mapping.loc[635, 'lon']

### All

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.sparse import coo_matrix
from scipy.sparse import coo_matrix
from scipy.ndimage import gaussian_filter

alt_df = pd.read_parquet('alt_df_nonan_norm.parquet')
fch4_df = pd.read_parquet('fch4_df_nonan_norm.parquet')
fc_df = pd.read_parquet('fc_df_nonan_norm.parquet')
#merged = pd.read_parquet('/Volumes/JPL/geocryoai/preprocessing/data/ds/merged/final_fcfch4alt_monthly_1km_ds.parquet')

In [None]:
# print(alt_df.head())
# print(fc_df.head())
# print(fch4_df.head())

In [None]:
for df in [alt_df, fc_df, fch4_df]:
    df['datetime'] = pd.to_datetime(df['datetime'])

In [None]:
alt_time = alt_df['datetime'].to_numpy()
fc_time = fc_df['datetime'].to_numpy()
fch4_time = fch4_df['datetime'].to_numpy()

In [None]:
# Timestamps present in all three datasets (np.intersect1d returns them sorted
# and unique), converted back to a pandas DatetimeIndex.
common_time = pd.to_datetime(
    np.intersect1d(np.intersect1d(alt_time, fc_time), fch4_time)
)

In [None]:
pd.DataFrame(common_time).to_parquet('common_time.parquet')

In [None]:
# common_time #1994-06-01 to 2022-12-01 (28 years), i.e., 314 values
# DatetimeIndex(['1994-06-01', '1994-07-01', '1994-08-01', '1994-09-01',
#                '1994-10-01', '1994-11-01', '1994-12-01', '1995-01-01',
#                '1995-02-01', '1995-03-01',
#                ...
#                '2022-03-01', '2022-04-01', '2022-05-01', '2022-06-01',
#                '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01',
#                '2022-11-01', '2022-12-01'],
#               dtype='datetime64[ns]', length=314, freq=None)

In [None]:
alt_df = alt_df[alt_df['datetime'].isin(common_time)].reset_index(drop=True)
fc_df = fc_df[fc_df['datetime'].isin(common_time)].reset_index(drop=True)
fch4_df = fch4_df[fch4_df['datetime'].isin(common_time)].reset_index(drop=True)
print(f"Common time steps: {len(common_time)}")

In [None]:
alt_df

In [None]:
lat_min = min(alt_df['lat'].min(), fc_df['lat'].min(), fch4_df['lat'].min())
lat_max = max(alt_df['lat'].max(), fc_df['lat'].max(), fch4_df['lat'].max())
lon_min = min(alt_df['lon'].min(), fc_df['lon'].min(), fch4_df['lon'].min())
lon_max = max(alt_df['lon'].max(), fc_df['lon'].max(), fch4_df['lon'].max())

In [None]:
grid_spacing = 1 / 111  # 1 km grid in degrees
lat_grid = np.arange(lat_min, lat_max + grid_spacing, grid_spacing)
lon_grid = np.arange(lon_min, lon_max + grid_spacing, grid_spacing)

In [None]:
grid_lat, grid_lon = np.meshgrid(lat_grid, lon_grid, indexing='ij')

In [None]:
grid_lat = grid_lat.round(3)
grid_lon = grid_lon.round(3)

In [None]:
def align_and_interpolate(df, var, method='mean'):
    """Interpolate one variable onto the shared grid for every common time step.

    For each timestamp in the module-level ``common_time``, the rows of ``df``
    with that ``datetime`` are linearly interpolated with ``griddata`` onto the
    module-level ``grid_lat`` / ``grid_lon`` mesh. Grid cells that come back as
    NaN are then filled from the interpolated field itself.

    Args:
        df: DataFrame with ``datetime``, ``lat``, ``lon`` and ``var`` columns.
        var: Name of the value column to interpolate.
        method: ``'min'`` fills NaN cells with the minimum of the interpolated
            field, ``'mean'`` with its mean; any other value leaves them NaN.

    Returns:
        The per-time-step grids stacked along a new leading axis.
    """
    per_step_fields = []
    for dt in tqdm(common_time, desc=f"Processing {var} data"):
        rows = df[df['datetime'] == dt]

        # Interpolate onto the grid
        field = griddata(
            rows[['lat', 'lon']].values,
            rows[var].values,
            (grid_lat, grid_lon),
            method='linear',
        )

        # Handle missing values
        if method in ('min', 'mean'):
            reducer = np.nanmin if method == 'min' else np.nanmean
            fill = reducer(field)
            field[np.isnan(field)] = fill

        per_step_fields.append(field)

    return np.stack(per_step_fields)

In [None]:
from scipy.interpolate import griddata
alt_tensor = align_and_interpolate(alt_df, 'alt', method='min')

In [None]:
def align_and_interpolate(df, var, method='mean'):
    """Interpolate one variable onto the shared grid for every common time step.

    For each timestamp in the module-level ``common_time``, the rows of ``df``
    with that ``datetime`` are linearly interpolated with ``griddata`` onto the
    module-level ``grid_lat`` / ``grid_lon`` mesh. Grid cells that cannot be
    interpolated (NaN) are filled with a statistic of that time step's
    observed values. Time steps with fewer than four observations are not
    interpolated; the whole grid is filled with that statistic instead
    (0 when the time step has no observations at all).

    Args:
        df: DataFrame with ``datetime``, ``lat``, ``lon`` and ``var`` columns.
        var: Name of the value column to interpolate.
        method: ``'min'`` or ``'mean'`` - statistic of the observed values
            used as the fill value.

    Returns:
        The per-time-step grids stacked along a new leading axis.

    Raises:
        ValueError: If ``method`` is not ``'min'`` or ``'mean'``. Previously
            this surfaced as an UnboundLocalError in the fallback branch or
            silently left NaNs in the interpolated branch.
    """
    reducers = {'min': np.nanmin, 'mean': np.nanmean}
    if method not in reducers:
        raise ValueError(f"method must be 'min' or 'mean', got {method!r}")
    reduce_values = reducers[method]

    tensors = []
    for dt in tqdm(common_time, desc=f"Processing {var} data"):
        time_slice = df[df['datetime'] == dt]
        points = time_slice[['lat', 'lon']].values
        values = time_slice[var].values

        # One fill value per time step, shared by both branches below.
        fill_value = reduce_values(values) if len(values) > 0 else 0

        if len(points) < 4:  # Not enough points for triangulation
            print(f"Skipping datetime {dt} due to insufficient points.")
            # Explicit float dtype keeps the fallback grid consistent with the
            # float output of griddata even when the fill value is the int 0.
            interpolated = np.full(grid_lat.shape, fill_value, dtype=np.float64)
        else:
            # Interpolate onto the grid
            interpolated = griddata(
                points, values, (grid_lat, grid_lon), method='linear'
            )
            # Cells outside the convex hull of the observations come back NaN.
            interpolated[np.isnan(interpolated)] = fill_value

        tensors.append(interpolated)

    return np.stack(tensors)

In [None]:
fc_tensor = align_and_interpolate(fc_df, 'fc', method='mean')

In [None]:
fch4_tensor = align_and_interpolate(fch4_df, 'fch4', method='mean')

In [None]:
# Append a channel axis; the shape is derived from the array rather than
# hard-coded, and the cell is a no-op if re-run.
alt_tensor = alt_tensor.reshape(*alt_tensor.shape[:3], 1)

In [None]:
# Append a channel axis; the shape is derived from the array rather than
# hard-coded, and the cell is a no-op if re-run.
fc_tensor = fc_tensor.reshape(*fc_tensor.shape[:3], 1)

In [None]:
# Append a channel axis; the shape is derived from the array rather than
# hard-coded, and the cell is a no-op if re-run.
fch4_tensor = fch4_tensor.reshape(*fch4_tensor.shape[:3], 1)

In [None]:
print(f"alt_tensor shape: {alt_tensor.shape}")
print(f"fc_tensor shape: {fc_tensor.shape}")
print(f"fch4_tensor shape: {fch4_tensor.shape}")

In [None]:
import h5py

# Write each variable to its own '<name>_tensor.h5' file, stored as float32
# in a dataset named after the variable.
for dataset_name, dataset_values in (
    ('alt', alt_tensor),
    ('fc', fc_tensor),
    ('fch4', fch4_tensor),
):
    with h5py.File(f'{dataset_name}_tensor.h5', 'w') as f:
        f.create_dataset(dataset_name, data=dataset_values, dtype=np.float32)

In [None]:
with h5py.File('ensemble_tensor.h5', 'w') as f:
    f.create_dataset('alt', data=alt_tensor, dtype=np.float32)
    f.create_dataset('fc', data=fc_tensor, dtype=np.float32)
    f.create_dataset('fch4', data=fch4_tensor, dtype=np.float32)
print("Tensors saved successfully.")

#### Archived

In [None]:
alt_df['lat'] = alt_df.lat.round(3)
alt_df['lon'] = alt_df.lon.round(3)

In [None]:
fc_df['lat'] = fc_df.lat.round(3)
fc_df['lon'] = fc_df.lon.round(3)

In [None]:
fch4_df['lat'] = fch4_df.lat.round(3)
fch4_df['lon'] = fch4_df.lon.round(3)

In [None]:
alt_lat_lon = pd.concat([alt_df['lat'], alt_df['lon']], axis=1).drop_duplicates()
fc_lat_lon = pd.concat([fc_df['lat'], fc_df['lon']], axis=1).drop_duplicates()
fch4_lat_lon = pd.concat([fch4_df['lat'], fch4_df['lon']], axis=1).drop_duplicates()

In [None]:
lat_mapping = pd.Categorical(pd.concat([alt_df['lat'], fc_df['lat'], fch4_df['lat']])).codes
lon_mapping = pd.Categorical(pd.concat([alt_df['lon'], fc_df['lon'], fch4_df['lon']])).codes

In [None]:
for df in [alt_df, fc_df, fch4_df]:
    df['lat_index'] = pd.Categorical(df['lat']).codes
    df['lon_index'] = pd.Categorical(df['lon']).codes

In [None]:
# Combine all unique latitudes and longitudes
lat_union = sorted(set(alt_df['lat']).union(fc_df['lat']).union(fch4_df['lat']))
lon_union = sorted(set(alt_df['lon']).union(fc_df['lon']).union(fch4_df['lon']))

print(f"Unified latitudes: {len(lat_union)}")
print(f"Unified longitudes: {len(lon_union)}")

In [None]:
lat_mapping = {lat: idx for idx, lat in enumerate(lat_union)}
lon_mapping = {lon: idx for idx, lon in enumerate(lon_union)}

In [None]:
for df in [alt_df, fc_df, fch4_df]:
    df['lat_index'] = df['lat'].map(lat_mapping)
    df['lon_index'] = df['lon'].map(lon_mapping)

print("Lat/Lon indexing complete.")

In [None]:
from scipy.sparse import coo_matrix

# Function to create a sparse matrix
def to_sparse_matrix(df, var, time_steps, lat_steps, lon_steps):
    """Pack one variable into a (time, lat * lon) sparse COO matrix.

    Rows are the categorical codes of ``df['datetime']``; columns are the
    flattened spatial index ``lat_index * lon_steps + lon_index``.

    Args:
        df: DataFrame with ``datetime``, ``lat_index``, ``lon_index`` and
            ``var`` columns.
        var: Name of the value column.
        time_steps: Number of rows of the resulting matrix.
        lat_steps: Number of latitude indices.
        lon_steps: Number of longitude indices (row stride of the flattening).

    Returns:
        ``scipy.sparse.coo_matrix`` of shape
        ``(time_steps, lat_steps * lon_steps)``.
    """
    t_indices = pd.Categorical(df['datetime']).codes
    # Widen to int64 before flattening: with int8/int16/int32 index columns
    # (e.g. categorical codes) lat_index * lon_steps can wrap around and
    # produce wrong or negative column indices.
    lat_indices = df['lat_index'].to_numpy(dtype='int64')
    lon_indices = df['lon_index'].to_numpy(dtype='int64')
    values = df[var].to_numpy()

    flat_indices = lat_indices * lon_steps + lon_indices
    return coo_matrix(
        (values, (t_indices, flat_indices)),
        shape=(time_steps, lat_steps * lon_steps)
    )

# Define grid dimensions
time_steps = len(pd.unique(alt_df['datetime']))
lat_steps = len(lat_union)
lon_steps = len(lon_union)

# Convert datasets to sparse matrices
alt_sparse = to_sparse_matrix(alt_df, 'alt', time_steps, lat_steps, lon_steps)
fc_sparse = to_sparse_matrix(fc_df, 'fc', time_steps, lat_steps, lon_steps)
fch4_sparse = to_sparse_matrix(fch4_df, 'fch4', time_steps, lat_steps, lon_steps)

print("Sparse matrices created.")

In [None]:
# print(alt_df[['lat', 'lon']].drop_duplicates().head())
# print(fc_df[['lat', 'lon']].drop_duplicates().head())
# print(fch4_df[['lat', 'lon']].drop_duplicates().head())

In [None]:
for df in [alt_df, fc_df, fch4_df]:
    df['lat_index'] = pd.Categorical(df['lat']).codes
    df['lon_index'] = pd.Categorical(df['lon']).codes

In [None]:
#del df, alt_time, fc_time, fch4_time, common_time, alt_lat_lon, fc_lat_lon, fch4_lat_lon

In [None]:
# Define tensor dimensions
time_steps = len(pd.unique(alt_df['datetime']))
lat_steps = len(pd.unique(lat_mapping))
lon_steps = len(pd.unique(lon_mapping))

print(f"Tensor dimensions: time_steps={time_steps}, lat_steps={lat_steps}, lon_steps={lon_steps}")

In [None]:
# Define tensor dimensions
time_steps = len(pd.unique(fc_df['datetime']))
lat_steps = len(pd.unique(lat_mapping))
lon_steps = len(pd.unique(lon_mapping))

print(f"Tensor dimensions: time_steps={time_steps}, lat_steps={lat_steps}, lon_steps={lon_steps}")

In [None]:
# Define tensor dimensions (FCH4).
# This cell was an exact duplicate of the FC cell before it; the archived copy
# of this sequence further down checks ALT, FC and then FCH4.
time_steps = len(pd.unique(fch4_df['datetime']))
lat_steps = len(pd.unique(lat_mapping))
lon_steps = len(pd.unique(lon_mapping))

print(f"Tensor dimensions: time_steps={time_steps}, lat_steps={lat_steps}, lon_steps={lon_steps}")

In [None]:
assert np.array_equal(pd.unique(alt_df['datetime']), pd.unique(fc_df['datetime'])), "Temporal misalignment between ALT and FC datasets."
assert np.array_equal(pd.unique(alt_df['datetime']), pd.unique(fch4_df['datetime'])), "Temporal misalignment between ALT and FCH4 datasets."
assert np.array_equal(pd.unique(fc_df['datetime']), pd.unique(fch4_df['datetime'])), "Temporal misalignment between FC and FCH4 datasets."

assert np.array_equal(alt_df['lat'], fc_df['lat']), "Spatial misalignment between ALT and FC in latitude."
assert np.array_equal(alt_df['lon'], fc_df['lon']), "Spatial misalignment between ALT and FC in longitude."
assert np.array_equal(alt_df['lat'], fch4_df['lat']), "Spatial misalignment between ALT and FCH4 in latitude."
assert np.array_equal(alt_df['lon'], fch4_df['lon']), "Spatial misalignment between ALT and FCH4 in longitude."
assert np.array_equal(fc_df['lat'], fch4_df['lat']), "Spatial misalignment between FC and FCH4 in latitude."
assert np.array_equal(fc_df['lon'], fch4_df['lon']), "Spatial misalignment between FC and FCH4 in longitude."

In [None]:
# import pandas as pd
# import numpy as np
# from tqdm import tqdm
# from scipy.sparse import coo_matrix
# from scipy.sparse import coo_matrix
# from scipy.ndimage import gaussian_filter

# alt_df = pd.read_parquet('alt_df_nonan_norm.parquet')
# fch4_df = pd.read_parquet('fch4_df_nonan_norm.parquet')
# fc_df = pd.read_parquet('fc_df_nonan_norm.parquet')

# for df in [alt_df, fc_df, fch4_df]:
#     df['datetime'] = pd.to_datetime(df['datetime'])

# alt_time = alt_df['datetime'].to_numpy()
# fc_time = fc_df['datetime'].to_numpy()
# fch4_time = fch4_df['datetime'].to_numpy()

# common_time = np.intersect1d(alt_time, fc_time)
# common_time = np.intersect1d(common_time, fch4_time)
# common_time = pd.to_datetime(common_time)

# alt_df = alt_df[alt_df['datetime'].isin(common_time)]
# fc_df = fc_df[fc_df['datetime'].isin(common_time)]
# fch4_df = fch4_df[fch4_df['datetime'].isin(common_time)]
# print(f"Common time steps: {len(common_time)}")

# Common time steps: 314

# alt_df = alt_df.reset_index(drop=True)
# fc_df = fc_df.reset_index(drop=True)
# fch4_df = fch4_df.reset_index(drop=True)

# grid_spacing = 1 / 111

# chunk_size = 1_000_000
# for df in [alt_df, fc_df, fch4_df]:
#     chunks = [df[i:i + chunk_size] for i in tqdm(range(0, len(df), chunk_size), desc='Processing chunks...')]
#     aligned_chunks = []
#     for chunk in chunks:
#         chunk['lat'] = np.floor(chunk['lat'] / grid_spacing) * grid_spacing
#         chunk['lon'] = np.floor(chunk['lon'] / grid_spacing) * grid_spacing
#         aligned_chunks.append(chunk)
#     df = pd.concat(aligned_chunks, ignore_index=True)

# alt_lat_lon = pd.concat([alt_df['lat'], alt_df['lon']], axis=1).drop_duplicates()
# fc_lat_lon = pd.concat([fc_df['lat'], fc_df['lon']], axis=1).drop_duplicates()
# fch4_lat_lon = pd.concat([fch4_df['lat'], fch4_df['lon']], axis=1).drop_duplicates()

# lat_mapping = pd.Categorical(pd.concat([alt_df['lat'], fc_df['lat'], fch4_df['lat']])).codes
# lon_mapping = pd.Categorical(pd.concat([alt_df['lon'], fc_df['lon'], fch4_df['lon']])).codes

# for df in [alt_df, fc_df, fch4_df]:
#     df['lat_index'] = pd.Categorical(df['lat']).codes
#     df['lon_index'] = pd.Categorical(df['lon']).codes

# # Define tensor dimensions
# time_steps = len(pd.unique(alt_df['datetime']))
# lat_steps = len(pd.unique(lat_mapping))
# lon_steps = len(pd.unique(lon_mapping))

# Tensor dimensions: time_steps=314, lat_steps=431300, lon_steps=467680

# # Define tensor dimensions
# time_steps = len(pd.unique(fc_df['datetime']))
# lat_steps = len(pd.unique(lat_mapping))
# lon_steps = len(pd.unique(lon_mapping))

# print(f"Tensor dimensions: time_steps={time_steps}, lat_steps={lat_steps}, lon_steps={lon_steps}")

# print(f"Tensor dimensions: time_steps={time_steps}, lat_steps={lat_steps}, lon_steps={lon_steps}")

# Tensor dimensions: time_steps=314, lat_steps=431300, lon_steps=467680

# # Define tensor dimensions
# time_steps = len(pd.unique(fch4_df['datetime']))
# lat_steps = len(pd.unique(lat_mapping))
# lon_steps = len(pd.unique(lon_mapping))

# print(f"Tensor dimensions: time_steps={time_steps}, lat_steps={lat_steps}, lon_steps={lon_steps}")

# Tensor dimensions: time_steps=314, lat_steps=431300, lon_steps=467680

# assert np.array_equal(pd.unique(alt_df['datetime']), pd.unique(fc_df['datetime'])), "Temporal misalignment between ALT and FC datasets."
# assert np.array_equal(pd.unique(alt_df['datetime']), pd.unique(fch4_df['datetime'])), "Temporal misalignment between ALT and FCH4 datasets."
# assert np.array_equal(pd.unique(fc_df['datetime']), pd.unique(fch4_df['datetime'])), "Temporal misalignment between FC and FCH4 datasets."

# assert np.array_equal(alt_df['lat'], fc_df['lat']), "Spatial misalignment between ALT and FC in latitude."
# assert np.array_equal(alt_df['lon'], fc_df['lon']), "Spatial misalignment between ALT and FC in longitude."
# assert np.array_equal(alt_df['lat'], fch4_df['lat']), "Spatial misalignment between ALT and FCH4 in latitude."
# assert np.array_equal(alt_df['lon'], fch4_df['lon']), "Spatial misalignment between ALT and FCH4 in longitude."
# assert np.array_equal(fc_df['lat'], fch4_df['lat']), "Spatial misalignment between FC and FCH4 in latitude."
# assert np.array_equal(fc_df['lon'], fch4_df['lon']), "Spatial misalignment between FC and FCH4 in longitude."

# ---------------------------------------------------------------------------
# AssertionError                            Traceback (most recent call last)
# Cell In[18], line 5
#       2 assert np.array_equal(pd.unique(alt_df['datetime']), pd.unique(fch4_df['datetime'])), "Temporal misalignment between ALT and FCH4 datasets."
#       3 assert np.array_equal(pd.unique(fc_df['datetime']), pd.unique(fch4_df['datetime'])), "Temporal misalignment between FC and FCH4 datasets."
# ----> 5 assert np.array_equal(alt_df['lat'], fc_df['lat']), "Spatial misalignment between ALT and FC in latitude."
#       6 assert np.array_equal(alt_df['lon'], fc_df['lon']), "Spatial misalignment between ALT and FC in longitude."
#       7 assert np.array_equal(alt_df['lat'], fch4_df['lat']), "Spatial misalignment between ALT and FCH4 in latitude."

# AssertionError: Spatial misalignment between ALT and FC in latitude.

In [None]:
# Drop repeated (datetime, lat, lon) rows from each dataset.
# Assigning to the loop variable only rebinds `df`, so the results are
# collected and written back to alt_df / fc_df / fch4_df explicitly;
# otherwise the frames saved afterwards would still contain the duplicates.
deduplicated = []
for df in [alt_df, fc_df, fch4_df]:
    print(f"Before deduplication: {len(df)} rows")
    df = df.drop_duplicates(subset=['datetime', 'lat', 'lon']).reset_index(drop=True)
    print(f"After deduplication: {len(df)} rows")
    deduplicated.append(df)
alt_df, fc_df, fch4_df = deduplicated

In [None]:
alt_df.to_parquet('alt_new_df.parquet')
fc_df.to_parquet('fc_new_df.parquet')
fch4_df.to_parquet('fch4_new_df.parquet')

In [None]:
import pandas as pd
import numpy as np

alt_df = pd.read_parquet('alt_new_df.parquet')
fc_df = pd.read_parquet('fc_new_df.parquet')
fch4_df = pd.read_parquet('fch4_new_df.parquet')
common_time = pd.to_datetime(pd.read_parquet('common_time.parquet')[0].values)

In [None]:
print(f"Unique latitudes in ALT: {alt_df['lat'].nunique()}")
print(f"Unique latitudes in FC: {fc_df['lat'].nunique()}")
print(f"Unique latitudes in FCH4: {fch4_df['lat'].nunique()}")

print(f"Unique longitudes in ALT: {alt_df['lon'].nunique()}")
print(f"Unique longitudes in FC: {fc_df['lon'].nunique()}")
print(f"Unique longitudes in FCH4: {fch4_df['lon'].nunique()}")

In [None]:
missing_lat_fc = set(alt_df['lat']) - set(fc_df['lat'])
missing_lon_fc = set(alt_df['lon']) - set(fc_df['lon'])

print(f"Missing latitudes in FC compared to ALT: {missing_lat_fc}")
print(f"Missing longitudes in FC compared to ALT: {missing_lon_fc}")

In [None]:
grid_spacing = 1 / 111

# Snap every coordinate down to the grid_spacing lattice, then collapse the
# rows that now share a (datetime, lat, lon) cell: minimum for ALT, mean for
# FC and FCH4.
# The aggregated frame has to be written back to alt_df / fc_df / fch4_df;
# assigning it to the loop variable alone discarded the groupby result and
# left the frames snapped but not aggregated.
aggregated = []
for df in [alt_df, fc_df, fch4_df]:
    df['lat'] = (df['lat'] // grid_spacing) * grid_spacing
    df['lon'] = (df['lon'] // grid_spacing) * grid_spacing
    if df is alt_df:
        df = df.groupby(['datetime', 'lat', 'lon'], as_index=False)['alt'].min()
    else:
        df = df.groupby(['datetime', 'lat', 'lon'], as_index=False).mean()
    aggregated.append(df)
alt_df, fc_df, fch4_df = aggregated

In [None]:
assert np.array_equal(pd.unique(alt_df['datetime']), pd.unique(fc_df['datetime'])), "Temporal misalignment between ALT and FC datasets."
assert np.array_equal(pd.unique(alt_df['datetime']), pd.unique(fch4_df['datetime'])), "Temporal misalignment between ALT and FCH4 datasets."
assert np.array_equal(pd.unique(fc_df['datetime']), pd.unique(fch4_df['datetime'])), "Temporal misalignment between FC and FCH4 datasets."

assert np.array_equal(alt_df['lat'], fc_df['lat']), "Spatial misalignment between ALT and FC in latitude."
assert np.array_equal(alt_df['lon'], fc_df['lon']), "Spatial misalignment between ALT and FC in longitude."
assert np.array_equal(alt_df['lat'], fch4_df['lat']), "Spatial misalignment between ALT and FCH4 in latitude."
assert np.array_equal(alt_df['lon'], fch4_df['lon']), "Spatial misalignment between ALT and FCH4 in longitude."
assert np.array_equal(fc_df['lat'], fch4_df['lat']), "Spatial misalignment between FC and FCH4 in latitude."
assert np.array_equal(fc_df['lon'], fch4_df['lon']), "Spatial misalignment between FC and FCH4 in longitude."

In [None]:
# Drop repeated (datetime, lat, lon) rows from each dataset.
# Assigning to the loop variable only rebinds `df`, so the results are
# collected and written back to alt_df / fc_df / fch4_df explicitly;
# otherwise the frames saved afterwards would still contain the duplicates.
deduplicated = []
for df in [alt_df, fc_df, fch4_df]:
    print(f"Before deduplication: {len(df)} rows")
    df = df.drop_duplicates(subset=['datetime', 'lat', 'lon']).reset_index(drop=True)
    print(f"After deduplication: {len(df)} rows")
    deduplicated.append(df)
alt_df, fc_df, fch4_df = deduplicated

In [None]:
print(f"Unique latitudes in ALT: {alt_df['lat'].nunique()}")
print(f"Unique latitudes in FC: {fc_df['lat'].nunique()}")
print(f"Unique latitudes in FCH4: {fch4_df['lat'].nunique()}")

print(f"Unique longitudes in ALT: {alt_df['lon'].nunique()}")
print(f"Unique longitudes in FC: {fc_df['lon'].nunique()}")
print(f"Unique longitudes in FCH4: {fch4_df['lon'].nunique()}")

In [None]:
missing_lat_fc = set(alt_df['lat']) - set(fc_df['lat'])
missing_lon_fc = set(alt_df['lon']) - set(fc_df['lon'])

print(f"Missing latitudes in FC compared to ALT: {missing_lat_fc}")
print(f"Missing longitudes in FC compared to ALT: {missing_lon_fc}")

In [None]:
alt_df.to_parquet('alt_new_df_2.parquet')
fc_df.to_parquet('fc_new_df_2.parquet')
fch4_df.to_parquet('fch4_new_df_2.parquet')

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

alt_df = pd.read_parquet('alt_new_df_2.parquet')
fc_df = pd.read_parquet('fc_new_df_2.parquet')
fch4_df = pd.read_parquet('fch4_new_df_2.parquet')
common_time = pd.to_datetime(pd.read_parquet('common_time.parquet')[0].values)

In [None]:
alt = alt_df.sample(n=100)
fc = fc_df.sample(n=100)
fch4 = fch4_df.sample(n=100)

In [None]:
alt = alt[alt['datetime'].isin(common_time)].reset_index(drop=True)
fc = fc[fc['datetime'].isin(common_time)].reset_index(drop=True)
fch4 = fch4[fch4['datetime'].isin(common_time)].reset_index(drop=True)

print(f"Common time steps: {len(common_time)}")

In [None]:
# Combine all unique latitudes and longitudes
lat_union = sorted(set(alt['lat']).union(fc['lat']).union(fch4['lat']))
lon_union = sorted(set(alt['lon']).union(fc['lon']).union(fch4['lon']))

print(f"Unified latitudes: {len(lat_union)}")
print(f"Unified longitudes: {len(lon_union)}")

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Create the full spatial grid
full_grid = pd.DataFrame(
    [(lat, lon) for lat in lat_union for lon in lon_union],
    columns=['lat', 'lon']
)

# Add temporal dimension to the grid
full_grid = full_grid.assign(key=1)
common_time_df = pd.DataFrame({'datetime': common_time, 'key': 1})
full_grid = pd.merge(full_grid, common_time_df, on='key').drop('key', axis=1)

# Merge each dataset with the full grid
def expand_to_full_grid(df, full_grid, value_column):
    """Left-join ``df`` onto ``full_grid`` so every grid row is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations carrying 'datetime', 'lat' and 'lon' join keys.
    full_grid : pandas.DataFrame
        Target grid carrying the same three join keys.
    value_column : str
        Name of the measurement column expected in the merged result.

    Returns
    -------
    pandas.DataFrame
        One row per ``full_grid`` row (more if ``df`` has duplicate keys);
        grid rows without a matching observation hold NaN in the columns
        that come from ``df``.

    Raises
    ------
    KeyError
        If ``value_column`` is absent from the merged frame.
    """
    merged = pd.merge(full_grid, df, on=['datetime', 'lat', 'lon'], how='left')
    # The left join already leaves NaN for unmatched rows, so no fill is
    # needed; only keep the check that the requested column really exists.
    if value_column not in merged.columns:
        raise KeyError(value_column)
    return merged

alt = expand_to_full_grid(alt, full_grid, 'alt')
fc = expand_to_full_grid(fc, full_grid, 'fc')
fch4 = expand_to_full_grid(fch4, full_grid, 'fch4')

print(f"ALT shape after merging: {alt.shape}")
print(f"FC shape after merging: {fc.shape}")
print(f"FCH4 shape after merging: {fch4.shape}")

In [None]:
from scipy.interpolate import griddata

# Temporal interpolation for each grid point
for df, var in zip([alt, fc, fch4], ['alt', 'fc', 'fch4']):
    df[var] = df.groupby(['lat', 'lon'])[var].transform(
        lambda x: x.interpolate(method='linear', limit_direction='both')
    )

# Spatial interpolation for remaining NaNs
def spatial_interpolate(df, var, lat_union, lon_union):
    """Fill remaining NaNs in ``df[var]`` by linear spatial interpolation.

    For every time step in the notebook-level ``common_time``, the rows of
    ``df`` at that time that have a value are used as scattered samples for
    ``scipy.interpolate.griddata`` on the (lat_union x lon_union) grid, and
    the interpolated surface is written back into the rows at that time
    whose value is still NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'datetime', 'lat', 'lon' and ``var`` columns. It is
        modified in place. Assumes a unique row index (e.g. the default
        RangeIndex produced by a merge) -- TODO confirm for other callers.
    var : str
        Name of the column to fill.
    lat_union, lon_union : sequence of float
        Unique grid coordinates; row coordinates are looked up in these.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.
    """
    # The target grid and coordinate lookups do not depend on the time step.
    grid_lat, grid_lon = np.meshgrid(lat_union, lon_union, indexing='ij')
    lat_lookup = pd.Index(lat_union)
    lon_lookup = pd.Index(lon_union)

    for t in tqdm(common_time, desc=f"Interpolating {var} spatially"):

        mask = df['datetime'] == t
        time_slice = df[mask]

        # Take coordinates and values from the SAME rows so that griddata
        # receives one value per point (dropping NaNs separately leaves the
        # two arrays with different lengths).
        known = time_slice.dropna(subset=['lat', 'lon', var])
        points = known[['lat', 'lon']].values
        print(f"Number of points: {len(points)}")
        values = known[var].values
        print(f"Number of values: {len(values)}")

        if len(points) == 0 or len(values) == 0:
            print(f"Skipping datetime {t}: insufficient data for interpolation.")
            continue

        print(f"Lat range in points: {points[:, 0].min()} to {points[:, 0].max()}")
        print(f"Lon range in points: {points[:, 1].min()} to {points[:, 1].max()}")
        print(f"Grid lat range: {lat_union[0]} to {lat_union[-1]}")
        print(f"Grid lon range: {lon_union[0]} to {lon_union[-1]}")

        interpolated = griddata(points, values, (grid_lat, grid_lon), method='linear')

        # Update the dataframe: look each missing row up on the grid by its
        # own (lat, lon) instead of relying on positional/index alignment.
        missing = time_slice[var].isna()
        if not missing.any():
            continue
        targets = time_slice.loc[missing]
        lat_pos = lat_lookup.get_indexer(targets['lat'])
        lon_pos = lon_lookup.get_indexer(targets['lon'])
        on_grid = (lat_pos >= 0) & (lon_pos >= 0)  # -1 marks an off-grid coordinate

        # Cells griddata could not reach stay NaN in `interpolated`, so
        # writing them back leaves those rows unfilled.
        df.loc[targets.index[on_grid], var] = interpolated[lat_pos[on_grid], lon_pos[on_grid]]
    return df

alt = spatial_interpolate(alt, 'alt', lat_union, lon_union)
fc = spatial_interpolate(fc, 'fc', lat_union, lon_union)
fch4 = spatial_interpolate(fch4, 'fch4', lat_union, lon_union)

In [None]:
# Validate alignment of the three grid-expanded frames.
# The shape check compares the expanded `fc` frame (not the raw `fc_df`
# loaded from parquet), matching the key-column checks below.
assert alt.shape == fc.shape == fch4.shape, "Datasets are misaligned!"
assert alt[['datetime', 'lat', 'lon']].equals(fc[['datetime', 'lat', 'lon']]), "Spatial-temporal misalignment!"
assert fc[['datetime', 'lat', 'lon']].equals(fch4[['datetime', 'lat', 'lon']]), "Spatial-temporal misalignment!"

# Check for remaining NaNs
print(f"Remaining NaNs in ALT: {alt['alt'].isna().sum()}")
print(f"Remaining NaNs in FC: {fc['fc'].isna().sum()}")
print(f"Remaining NaNs in FCH4: {fch4['fch4'].isna().sum()}")

In [None]:
from scipy.interpolate import griddata

# Create a grid for interpolation
lat_grid, lon_grid = np.meshgrid(
    sorted(master_spatial['lat'].unique()),
    sorted(master_spatial['lon'].unique())
)

# Function to perform spatial interpolation for a single time step
def spatial_interpolate(df, var, lat_grid, lon_grid):
    """Linearly interpolate ``df[var]`` onto a coordinate grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples for a single time step with 'lat', 'lon' and ``var`` columns.
    var : str
        Name of the column holding the values to interpolate.
    lat_grid, lon_grid : array-like
        Target coordinates, passed to ``griddata`` as ``(lat_grid, lon_grid)``.

    Returns
    -------
    numpy.ndarray
        Interpolated values as returned by ``scipy.interpolate.griddata``
        with ``method='linear'``.
    """
    # Drop incomplete rows jointly so every retained point keeps its own
    # value; dropping NaNs per column would desynchronise the two arrays.
    known = df.dropna(subset=['lat', 'lon', var])
    points = known[['lat', 'lon']].values
    values = known[var].values
    grid_values = griddata(points, values, (lat_grid, lon_grid), method='linear')
    return grid_values

# Apply spatial interpolation for each time step
interpolated_alt = []
for t in tqdm(common_time, desc="Interpolating ALT spatially"):
    time_slice = alt_df[alt_df['datetime'] == t]
    interpolated_slice = spatial_interpolate(time_slice, 'alt', lat_grid, lon_grid)
    interpolated_alt.append(interpolated_slice)

# Convert back to DataFrame if needed
# Similarly for fc_df and fch4_df


In [None]:
# Last-resort fill: propagate the previous value forward, then the next value
# backward for any leading NaNs. `.ffill()` / `.bfill()` replace the deprecated
# `fillna(method=...)` spelling and produce the same result.
# NOTE(review): the fill runs down the whole column in row order, so it does
# not respect grid-cell boundaries -- confirm that is acceptable here.
alt_df['alt'] = alt_df['alt'].ffill().bfill()  # Last resort
fc_df['fc'] = fc_df['fc'].ffill().bfill()
fch4_df['fch4'] = fch4_df['fch4'].ffill().bfill()


In [None]:
# Check for NaNs
print(f"Remaining NaNs in ALT: {alt_df['alt'].isna().sum()}")
print(f"Remaining NaNs in FC: {fc_df['fc'].isna().sum()}")
print(f"Remaining NaNs in FCH4: {fch4_df['fch4'].isna().sum()}")


In [None]:
chunk_size = 10_000

def fill_grid_in_chunks(df, full_grid, chunk_size):
    """Left-join ``df`` onto ``full_grid`` one slice of grid rows at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations; must expose 'datetime', 'lat' and 'lon' as columns or
        index level names.
    full_grid : pandas.DataFrame
        Target grid exposing the same three keys.
    chunk_size : int
        Number of grid rows merged per step; must be positive.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-chunk merge results with a fresh index.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    keys = ['datetime', 'lat', 'lon']
    starts = range(0, len(full_grid), chunk_size)

    # The progress bar wraps the merges (the expensive step), and each chunk
    # is sliced positionally just before it is merged.
    filled_chunks = [
        pd.merge(full_grid.iloc[start:start + chunk_size], df, on=keys, how='left')
        for start in tqdm(starts, desc='Processing grid chunks...')
    ]

    if not filled_chunks:
        # Empty grid: pd.concat([]) would raise, so return the (empty) merge.
        return pd.merge(full_grid, df, on=keys, how='left')
    return pd.concat(filled_chunks, ignore_index=True)

In [None]:
# Build the full (time x lat x lon) grid directly with repeat/tile:
# time varies slowest, longitude fastest.
n_times = len(common_time)
n_lats = len(lat_union)
n_lons = len(lon_union)

full_grid2 = pd.DataFrame({
    'datetime': np.repeat(common_time, n_lats * n_lons),
    'lat': np.tile(np.repeat(lat_union, n_lons), n_times),
    'lon': np.tile(lon_union, n_lats * n_times)
})

In [None]:
full_grid.shape, full_grid2.shape

In [None]:
alt_df.set_index(['datetime', 'lat', 'lon'], inplace=True)
fc_df.set_index(['datetime', 'lat', 'lon'], inplace=True)
fch4_df.set_index(['datetime', 'lat', 'lon'], inplace=True)

full_grid.set_index(['datetime', 'lat', 'lon'], inplace=True)

In [None]:
alt_df = fill_grid_in_chunks(alt_df, full_grid, chunk_size)
fc_df = fill_grid_in_chunks(fc_df, full_grid, chunk_size)
fch4_df = fill_grid_in_chunks(fch4_df, full_grid, chunk_size)

In [None]:
print(f"ALT shape after merging: {alt_df.shape}")
print(f"FC shape after merging: {fc_df.shape}")
print(f"FCH4 shape after merging: {fch4_df.shape}")

In [None]:
alt_df.to_parquet('alt_new_df_3.parquet')
fc_df.to_parquet('fc_new_df_3.parquet')
fch4_df.to_parquet('fch4_new_df_3.parquet')

In [None]:
# Collapse duplicate (datetime, lat, lon) rows: ALT keeps the minimum of its
# 'alt' column, while FC and FCH4 average every remaining column.
grid_keys = ['datetime', 'lat', 'lon']
alt_df = alt_df.groupby(grid_keys, as_index=False)['alt'].min()
fc_df = fc_df.groupby(grid_keys, as_index=False).mean()
fch4_df = fch4_df.groupby(grid_keys, as_index=False).mean()

In [None]:
# Fill missing data
alt_df['alt'] = alt_df['alt'].fillna(alt_df['alt'].min())  # Fill with min value
fc_df.fillna(fc_df.mean(), inplace=True)  # Fill with column mean
fch4_df.fillna(fch4_df.mean(), inplace=True)  # Fill with column mean


In [None]:
# All three frames must have the same shape and identical key columns,
# compared element-wise against ALT.
grid_keys = ['datetime', 'lat', 'lon']
assert alt_df.shape == fc_df.shape == fch4_df.shape, "Shape mismatch after alignment"
for other_df in (fc_df, fch4_df):
    assert (alt_df[grid_keys] == other_df[grid_keys]).all().all(), "Spatial or temporal misalignment"


In [None]:
import numpy as np
import pandas as pd

# Define tensor dimensions
time_steps = len(common_time)
lat_steps = len(lat_union)
lon_steps = len(lon_union)

# Initialize tensors (NaN marks cells that receive no value below)
alt_tensor = np.full((time_steps, lat_steps, lon_steps), np.nan, dtype=np.float32)
fc_tensor = np.full((time_steps, lat_steps, lon_steps), np.nan, dtype=np.float32)
fch4_tensor = np.full((time_steps, lat_steps, lon_steps), np.nan, dtype=np.float32)

# Populate tensors.
# Coordinates are translated to positions once per frame with
# Index.get_indexer rather than once per row with `.index()`: a DatetimeIndex
# has no `.index()` method, and list.index is a linear scan per row.
# NOTE(review): get_indexer requires the lookup values to be unique --
# confirm common_time, lat_union and lon_union contain no duplicates.
time_lookup = pd.Index(common_time)
lat_lookup = pd.Index(lat_union)
lon_lookup = pd.Index(lon_union)

for df, tensor, var in zip([alt_df, fc_df, fch4_df], [alt_tensor, fc_tensor, fch4_tensor], ['alt', 'fc', 'fch4']):
    t_idx = time_lookup.get_indexer(df['datetime'])
    lat_idx = lat_lookup.get_indexer(df['lat'])
    lon_idx = lon_lookup.get_indexer(df['lon'])

    # get_indexer returns -1 for values that are not on the grid; fail loudly
    # (as a list.index lookup would) instead of writing to the last cell.
    if (t_idx < 0).any() or (lat_idx < 0).any() or (lon_idx < 0).any():
        raise ValueError(f"{var}: found rows whose datetime/lat/lon are not on the target grid")

    tensor[t_idx, lat_idx, lon_idx] = df[var].to_numpy(dtype=np.float32)


In [None]:
alt_df = alt_df[alt_df['lat'].isin(common_lat) & alt_df['lon'].isin(common_lon)]
fc_df = fc_df[fc_df['lat'].isin(common_lat) & fc_df['lon'].isin(common_lon)]
fch4_df = fch4_df[fch4_df['lat'].isin(common_lat) & fch4_df['lon'].isin(common_lon)]

In [None]:
alt_df.to_parquet('alt_new_df.parquet')
fc_df.to_parquet('fc_new_df.parquet')
fch4_df.to_parquet('fch4_new_df.parquet')

In [None]:
import pandas as pd
import numpy as np

alt_df = pd.read_parquet('alt_new_df.parquet')
fc_df = pd.read_parquet('fc_new_df.parquet')
fch4_df = pd.read_parquet('fch4_new_df.parquet')

In [None]:
# Verify unique lat/lon values
print(f"Unique latitudes in ALT: {alt_df['lat'].nunique()}")
print(f"Unique latitudes in FC: {fc_df['lat'].nunique()}")
print(f"Unique latitudes in FCH4: {fch4_df['lat'].nunique()}")

print(f"Unique longitudes in ALT: {alt_df['lon'].nunique()}")
print(f"Unique longitudes in FC: {fc_df['lon'].nunique()}")
print(f"Unique longitudes in FCH4: {fch4_df['lon'].nunique()}")

# Check for alignment
assert np.array_equal(alt_df['lat'], fc_df['lat']), "Spatial misalignment between ALT and FC in latitude."
assert np.array_equal(alt_df['lon'], fc_df['lon']), "Spatial misalignment between ALT and FC in longitude."
assert np.array_equal(alt_df['lat'], fch4_df['lat']), "Spatial misalignment between ALT and FCH4 in latitude."
assert np.array_equal(alt_df['lon'], fch4_df['lon']), "Spatial misalignment between ALT and FCH4 in longitude."

In [None]:
alt_sparse = []
fc_sparse = []
fch4_sparse = []

# The unique timestamps of each frame do not change inside the loop, so
# compute them once instead of three times per iteration.
alt_times = pd.unique(alt_df['datetime'])
fc_times = pd.unique(fc_df['datetime'])
fch4_times = pd.unique(fch4_df['datetime'])

# One (lat_steps x lon_steps) COO matrix per time step and variable, placed
# by the precomputed 'lat_index' / 'lon_index' columns.
for t in tqdm(range(time_steps), desc='Processing sparse tensors...'):
    alt_slice = alt_df[alt_df['datetime'] == alt_times[t]]
    fc_slice = fc_df[fc_df['datetime'] == fc_times[t]]
    fch4_slice = fch4_df[fch4_df['datetime'] == fch4_times[t]]

    alt_sparse.append(coo_matrix((alt_slice['alt'], (alt_slice['lat_index'], alt_slice['lon_index'])), shape=(lat_steps, lon_steps)))
    fc_sparse.append(coo_matrix((fc_slice['fc'], (fc_slice['lat_index'], fc_slice['lon_index'])), shape=(lat_steps, lon_steps)))
    fch4_sparse.append(coo_matrix((fch4_slice['fch4'], (fch4_slice['lat_index'], fch4_slice['lon_index'])), shape=(lat_steps, lon_steps)))

In [None]:
for t, sparse_tensor in enumerate(alt_sparse):
    print(f"ALT Sparse Tensor - Time {t}: Shape = {sparse_tensor.shape}, Non-Zero Count = {sparse_tensor.nnz}")
for t, sparse_tensor in enumerate(fc_sparse):
    print(f"FC Sparse Tensor - Time {t}: Shape = {sparse_tensor.shape}, Non-Zero Count = {sparse_tensor.nnz}")
for t, sparse_tensor in enumerate(fch4_sparse):
    print(f"FCH4 Sparse Tensor - Time {t}: Shape = {sparse_tensor.shape}, Non-Zero Count = {sparse_tensor.nnz}")

In [None]:
smoothed_alt_sparse = []
smoothed_fc_sparse = []
smoothed_fch4_sparse = []

# For every time step: densify each variable's sparse slice, apply a
# Gaussian filter with sigma=1, and store the result as a COO matrix again.
for alt, fc, fch4 in tqdm(zip(alt_sparse, fc_sparse, fch4_sparse), total=len(alt_sparse), desc="Smoothing Sparse Tensors"):
    for layer, sink in (
        (alt, smoothed_alt_sparse),
        (fc, smoothed_fc_sparse),
        (fch4, smoothed_fch4_sparse),
    ):
        sink.append(coo_matrix(gaussian_filter(layer.toarray(), sigma=1)))

In [None]:
import matplotlib.pyplot as plt

# Visualize a slice from the ALT tensor (e.g., first time step)
plt.imshow(alt_tensor[0, :, :], cmap='viridis')
plt.colorbar(label='ALT Value')
plt.title('ALT Tensor - Time Step 0')
plt.xlabel('Longitude Index')
plt.ylabel('Latitude Index')
plt.show()

In [None]:
from scipy.sparse import hstack

ensemble_sparse = []
for alt, fc, fch4 in zip(smoothed_alt_sparse, smoothed_fc_sparse, smoothed_fch4_sparse):
    ensemble_sparse.append(hstack([alt, fc, fch4]))

In [None]:
# Validate ensemble tensor shape: axes are expected to be
# (time, latitude, longitude, feature).
print(f"Ensemble Tensor Shape: {ensemble_tensor.shape}")
for axis, expected, message in (
    (0, time_steps, "Mismatch in time steps."),
    (1, lat_steps, "Mismatch in latitude steps."),
    (2, lon_steps, "Mismatch in longitude steps."),
    (3, 3, "Mismatch in feature count (ALT, FC, FCH4)."),
):
    assert ensemble_tensor.shape[axis] == expected, message

In [None]:
import h5py

with h5py.File('ensemble_tensor_sparse.h5', 'w') as f:
    grp = f.create_group('sparse_tensor')
    for t, sparse_slice in enumerate(ensemble_sparse):
        grp.create_dataset(f'time_{t}_data', data=sparse_slice.data)
        grp.create_dataset(f'time_{t}_row', data=sparse_slice.row)
        grp.create_dataset(f'time_{t}_col', data=sparse_slice.col)
        grp.attrs['shape'] = sparse_slice.shape

In [None]:
# Reconstruct dense tensor for a specific time step
t = 0  # Example: first time step
with h5py.File('ensemble_tensor_sparse.h5', 'r') as f:
    data = f[f'sparse_tensor/time_{t}_data'][:]
    row = f[f'sparse_tensor/time_{t}_row'][:]
    col = f[f'sparse_tensor/time_{t}_col'][:]
    shape = f['sparse_tensor'].attrs['shape']

    # Reconstruct sparse matrix and convert to dense
    sparse_slice = coo_matrix((data, (row, col)), shape=shape)
    dense_slice = sparse_slice.toarray()

In [None]:
# Define tensor dimensions
time_steps = len(common_time)
lat_steps = len(lat_mapping)
lon_steps = len(lon_mapping)

# Initialize tensors (NaN marks cells that receive no value below)
alt_tensor = np.full((time_steps, lat_steps, lon_steps), np.nan, dtype=np.float32)
fc_tensor = np.full((time_steps, lat_steps, lon_steps), np.nan, dtype=np.float32)
fch4_tensor = np.full((time_steps, lat_steps, lon_steps), np.nan, dtype=np.float32)

# Populate tensors.
# The value column is selected by name: picking "the last column" breaks as
# soon as index columns such as 'datetime_index' are appended to the frame.
# NOTE(review): the int32 conversions raise if an index column still holds
# NaN -- rows without a 'datetime_index' must be dropped before this cell.
for df, tensor, var in zip([alt_df, fc_df, fch4_df], [alt_tensor, fc_tensor, fch4_tensor], ['alt', 'fc', 'fch4']):
    t_indices = df['datetime_index'].to_numpy(dtype=np.int32)
    lat_indices = df['lat_index'].to_numpy(dtype=np.int32)
    lon_indices = df['lon_index'].to_numpy(dtype=np.int32)
    values = df[var].to_numpy(dtype=np.float32)
    tensor[t_indices, lat_indices, lon_indices] = values

In [None]:
# Check tensor shapes
print(f"ALT Tensor: {alt_tensor.shape}")
print(f"FC Tensor: {fc_tensor.shape}")
print(f"FCH4 Tensor: {fch4_tensor.shape}")

In [None]:
time_mapping = {dt: idx for idx, dt in enumerate(common_time)}

In [None]:
for df in [alt_df, fc_df, fch4_df]:
    df['datetime_index'] = df['datetime'].map(time_mapping)

In [None]:
alt_df = alt_df[alt_df['datetime_index'].notna()]
fc_df = fc_df[fc_df['datetime_index'].notna()]
fch4_df = fch4_df[fch4_df['datetime_index'].notna()]

#### Archived

In [None]:
merged_df = pd.merge(lat_lon_grid, alt_df, on=['datetime_index', 'lat_index', 'lon_index'], how='left')

In [None]:
merged_df = merged_df.sort_values(by=['datetime_index', 'lat_index', 'lon_index'])
merged_df['alt'] = merged_df['alt'].interpolate(method='nearest')

In [None]:
merged_df = pd.merge(merged_df, datetime_mapping, on='datetime_index', how='left')
merged_df = pd.merge(merged_df, lat_mapping, on='lat_index', how='left')
merged_df = pd.merge(merged_df, lon_mapping, on='lon_index', how='left')

In [None]:
merged_df = merged_df[['datetime', 'lat', 'lon', 'alt', 'datetime_index', 'lat_index', 'lon_index']]

In [None]:
print(merged_df.head())
print(merged_df.tail())

In [None]:
alt_df['time_index'] = alt_df['datetime'].dt.to_period('M').astype('category').cat.codes

In [None]:
lat_grid = np.linspace(alt_df['lat'].min(), alt_df['lat'].max(), 100)  # Adjust resolution
lon_grid = np.linspace(alt_df['lon'].min(), alt_df['lon'].max(), 100)

In [None]:
time_indices = alt_df['time_index'].unique()

In [None]:
time_indices = alt_df['time_index'].unique()

In [None]:
datetime_mapping = alt_df[['time_index', 'datetime']].drop_duplicates()  # Map time_index back to datetime

In [None]:
# Unique (time_index, datetime) pairs from alt_df.
# NOTE(review): this is the same expression as the `datetime_mapping`
# assignment, yet another cell in this notebook merges `lat_mapping` on
# 'lat_index', a column this frame does not contain. Presumably a
# lat_index -> lat table was intended -- confirm before relying on it.
lat_mapping = alt_df[['time_index', 'datetime']].drop_duplicates()  # Map time_index back to datetime

In [None]:
lat_lon_grid = pd.DataFrame(
    [(t, lat, lon) for t in time_indices for lat in lat_grid for lon in lon_grid],
    columns=['time_index', 'lat', 'lon']
)

In [None]:
lat_grid

In [None]:
merged_df = pd.merge(lat_lon_grid, alt_df, on=['time_index', 'lat', 'lon'], how='left')

In [None]:
print(merged_df.head())
print(merged_df.tail())

In [None]:
merged_df = pd.merge(merged_df, datetime_mapping, on='time_index', how='left', suffixes=('', '_mapped'))

In [None]:
print(merged_df.head())
print(merged_df.tail())

In [None]:
merged_df['datetime'] = merged_df['datetime'].fillna(merged_df['datetime_mapped'])

In [None]:
print(merged_df.head())
print(merged_df.tail())

In [None]:
merged_df = merged_df.drop(columns=['datetime_mapped'])  # Clean up

In [None]:
print(merged_df.head())
print(merged_df.tail())

In [None]:
merged_df = merged_df.sort_values(by=['time_index', 'lat', 'lon'])

In [None]:
print(merged_df.head())
print(merged_df.tail())

In [None]:
merged_df['alt'] = merged_df['alt'].interpolate(method='nearest')

In [None]:
print(merged_df.head())
print(merged_df.tail())

In [None]:
merged_df = merged_df[['time_index','lat','lon','alt']]

In [None]:
tensor = merged_df.pivot_table(
    index='time_index', columns=['lat', 'lon'], values='alt', aggfunc='min'
).fillna(0).values

In [None]:
#tensor_reshaped = tensor.reshape(len(pd.unique(alt_df['time_index'])), len(lat_grid), len(lon_grid))

In [None]:
print(f"Tensor Shape: {tensor.shape}")

In [None]:
# Reshape the pivoted (time, lat*lon) matrix to (time, lat, lon, 1), taking
# the sizes from the data rather than the hard-coded 701 x 100 x 100.
# NOTE(review): assumes `lat_grid` / `lon_grid` are the 1-D linspace axes
# defined earlier in this section and that the pivot kept every (lat, lon)
# column -- reshape raises if the element counts do not match.
reshaped_tensor = tensor.reshape(tensor.shape[0], len(lat_grid), len(lon_grid), 1)

In [None]:
alt_df['timestamp'] = alt_df['datetime'].astype('int64')//10**9 #datetime timestamp encoding | conversion to seconds from epoch for DL methods

In [None]:
alt_df = alt_df[['timestamp','lat','lon','alt']]

In [None]:
alt_df.columns = ['datetime','lat','lon','alt']

In [None]:
alt_df.alt = alt_df.alt.astype('float16')

In [None]:
#6708 alt_df.datetime.nunique()
#1092 alt_df.lat.nunique()
#1092 alt_df.lon.nunique()
#238860 alt_df.alt.nunique()

In [None]:
#3163257965+55-6708 (gives us the next factoring opportunity to divide dataset into nunique() values of datetime)
#so, subtract 6653 from 3163257965
#3163251312

In [None]:
print(alt_df.dtypes)
print(alt_df[['lat', 'lon']].isnull().sum())

In [None]:
alt_df['lat'] = pd.to_numeric(alt_df['lat'], errors='coerce')
alt_df['lon'] = pd.to_numeric(alt_df['lon'], errors='coerce')
alt_df = alt_df.dropna(subset=['lat', 'lon'])

In [None]:
alt_df

In [None]:
alt_df = alt_df.sort_values(by=['datetime', 'lat', 'lon'])

In [None]:
with h5py.File('alt_df_nonan_norm_sorted.h5', 'w') as f:
    f.create_dataset('alt', data=alt_df)

In [None]:
# duplicates = alt_df.duplicated(subset=['lat', 'lon'])
# print("Number of duplicates:", duplicates.sum())
# #Number of duplicates: 3162257873

In [None]:
# alt_df = alt_df.groupby(['datetime', 'lat', 'lon'])['alt'].min().reset_index()

In [None]:
import dask.dataframe as dd

dask_df = dd.read_parquet('alt_df_nonan_norm_dtencoded_tonumeric_nonan.parquet')
grouped_dask_df = dask_df.groupby(['datetime', 'lat', 'lon'])['alt'].min()

In [None]:
# Materialise the grouped minimum. The result of a grouped column reduction
# is a Series indexed by (datetime, lat, lon); reset_index() turns the keys
# back into columns so later cells can use alt_df['lat'], alt_df['lon'] and
# alt_df['datetime'] (as the commented-out pandas version above also does).
alt_df = grouped_dask_df.compute().reset_index(); del dask_df, grouped_dask_df

In [None]:
#If duplicates exist:
temp_df = temp_df.groupby(['lat', 'lon'])['alt'].min().reset_index()

In [None]:
all_lats = np.linspace(alt_df['lat'].min(), alt_df['lat'].max(), 1092)
all_lons = np.linspace(alt_df['lon'].min(), alt_df['lon'].max(), 1092)

In [None]:
grid = grid.reindex(index=all_lats, columns=all_lons)
grid = grid.interpolate(method='linear', axis=0).interpolate(method='linear', axis=1).fillna(0)

In [None]:
if temp_df.empty:
    spatiotemporal_data[i, :, :] = 0
    continue

In [None]:
timestamps = alt_df['datetime'].unique()
spatiotemporal_data = np.zeros((len(timestamps), 1092, 1092), dtype=np.float16)

In [None]:
all_lats = np.linspace(alt_df['lat'].min(), alt_df['lat'].max(), 1092)
all_lons = np.linspace(alt_df['lon'].min(), alt_df['lon'].max(), 1092)

In [None]:
# Fill spatiotemporal_data[i] with a gridded ALT field for each timestamp.
for i, timestamp in enumerate(timestamps):
    temp_df = alt_df[alt_df['datetime'] == timestamp]
    # Collapse duplicate (lat, lon) rows to their minimum ALT so pivot() below
    # sees unique index/column pairs.
    temp_df = temp_df.groupby(['lat', 'lon'])['alt'].min().reset_index()
    if temp_df.empty:
        # No observations at this timestamp: store an all-zero field.
        spatiotemporal_data[i, :, :] = 0
        continue
    grid = temp_df.pivot(index='lat', columns='lon', values='alt')
    # NOTE(review): reindex without `method=` keeps only coordinates that are
    # exactly equal to the linspace values in all_lats / all_lons; any other
    # observation is dropped and becomes NaN. The later HDF5 cells use
    # method='nearest' for the same step -- confirm which is intended.
    grid = grid.reindex(index=all_lats, columns=all_lons)
    # Linear interpolation down rows, then across columns; whatever is still
    # NaN afterwards is set to 0.
    grid = grid.interpolate(method='linear', axis=0).interpolate(method='linear', axis=1).fillna(0)
    spatiotemporal_data[i, :, :] = grid.values

In [None]:
print("Spatiotemporal data shape:", spatiotemporal_data.shape)

#### Archived

In [None]:
import pandas as pd
alt_df = pd.read_parquet('alt_df_nonan_norm.parquet')
#alt_df = pd.read_parquet('/Volumes/JPL/geocryoai/modeling/data/input/alt/alt_df_nonan_norm.parquet')

In [None]:
# Settings for the chunked ALT tensor export.
TIME_STEP_CHUNK = 30  # time steps per HDF5 chunk (also reused as the window length in later cells)
LAT_DIM = 1092  # number of latitude cells in the output grid
LON_DIM = 1092  # number of longitude cells in the output grid
OUTPUT_PATH = "processed_tensor_alt.h5"

# Open the output file in 'w' mode and allocate a gzip-compressed float32
# dataset named 'tensor' with one (LAT_DIM x LON_DIM) plane per unique
# datetime in alt_df.
# NOTE(review): the chunk shape uses TIME_STEP_CHUNK on the first axis, so
# this presumably requires at least TIME_STEP_CHUNK unique datetimes -- confirm.
with h5py.File(OUTPUT_PATH, 'w') as f:
    unique_datetimes = alt_df['datetime'].unique()
    total_time_steps = len(unique_datetimes)
    f.create_dataset(
        'tensor', 
        shape=(total_time_steps, LAT_DIM, LON_DIM), 
        dtype='float32', 
        chunks=(TIME_STEP_CHUNK, LAT_DIM, LON_DIM),
        compression='gzip'
    )

In [None]:
chunked_datetimes = [
    unique_datetimes[i:i + TIME_STEP_CHUNK]
    for i in range(0, len(unique_datetimes), TIME_STEP_CHUNK)
]

In [None]:
# Rasterise ALT one block of datetimes at a time and append each block to the
# 'tensor' dataset in OUTPUT_PATH.
current_idx = 0  # next free position along the dataset's first axis
for chunk in tqdm(chunked_datetimes, desc="Processing datetime chunks"):
    # Restrict to this block's rows once, then split per datetime below.
    chunk_data = alt_df[alt_df['datetime'].isin(chunk)]
    grids = []
    for dt in tqdm(chunk, desc="Processing spatial grids", leave=False):
        # lat x lon table for this datetime; duplicate cells keep the minimum ALT.
        grid = chunk_data[chunk_data['datetime'] == dt].pivot_table(
            index='lat',
            columns='lon',
            values='alt',
            aggfunc='min'
        )
        # Resample onto LAT_DIM x LON_DIM evenly spaced coordinates by
        # nearest-label lookup; remaining NaNs become 0.
        # NOTE(review): the linspace bounds come from THIS datetime's own
        # min/max, so planes for different datetimes may cover different
        # extents -- confirm the planes are meant to be comparable cell by cell.
        grid = grid.reindex(index=np.linspace(grid.index.min(), grid.index.max(), LAT_DIM),
                            columns=np.linspace(grid.columns.min(), grid.columns.max(), LON_DIM),
                            method='nearest').fillna(0)
        grids.append(grid.values)
    # Stack the planes of this block along a new leading axis.
    tensor_chunk = np.stack(grids, axis=0)
    # Reopen the file in append mode and write the block at its offset.
    with h5py.File(OUTPUT_PATH, 'a') as f:
        f['tensor'][current_idx:current_idx + tensor_chunk.shape[0], :, :] = tensor_chunk
        current_idx += tensor_chunk.shape[0]

In [None]:
with h5py.File(OUTPUT_PATH, 'r') as f:
    tensor = f['tensor'][:]

In [None]:
tensor = tensor[..., np.newaxis]

In [None]:
samples = tensor.shape[0] // TIME_STEP_CHUNK

In [None]:
reshaped_tensor = tensor[:samples * TIME_STEP_CHUNK].reshape(
    samples, TIME_STEP_CHUNK, LAT_DIM, LON_DIM, 1
)

In [None]:
reshaped_tensor.shape

In [None]:
with h5py.File('alt_df_nonan_norm_5dtensor.h5', 'w') as f:
    f.create_dataset('alt_5dtensor', data=reshaped_tensor)

In [None]:
alt = reshaped_tensor

In [None]:
del tensor, samples, reshaped_tensor, current_idx, tensor_chunk, grids, grid, chunk, chunk_data, chunked_datetimes, unique_datetimes, total_time_steps, alt_df, OUTPUT_PATH, TIME_STEP_CHUNK, LAT_DIM, LON_DIM

In [None]:
fc_df = pd.read_parquet('fc_df_nonan_norm.parquet')
#fc_df = pd.read_parquet('/Volumes/JPL/geocryoai/modeling/data/input/fc/fc_df_nonan_norm.parquet')
fc_df

In [None]:
#314 fc_df.datetime.nunique() #--> unique values of datetime
#430493 fc_df.lat.nunique() #--> unique values of latitude
#466928 fc_df.lon.nunique() #--> unique values of longitude
#2229070 fc_df.fc.nunique() #--> unique values of fc 

In [None]:
# Integer timestamp encoding of 'datetime': int64 epoch value floor-divided by 10**6.
# NOTE(review): the ALT cell divides by 10**9. If 'datetime' is datetime64[ns],
# dividing by 10**6 yields milliseconds rather than the seconds the original
# comment states -- confirm the column's resolution and the intended unit.
fc_df['timestamp'] = fc_df['datetime'].astype('int64')//10**6 #datetime timestamp encoding | conversion to seconds from epoch for DL methods

In [None]:
fc_df = fc_df[['timestamp','lat','lon','fc']]

In [None]:
fc_df.columns = ['datetime','lat','lon','fc']
fc_df

In [None]:
TIME_STEP_CHUNK = 30
LAT_DIM = 1092
LON_DIM = 1092
OUTPUT_PATH = "processed_tensor_fc.h5"

In [None]:
with h5py.File(OUTPUT_PATH, 'w') as f:
    unique_datetimes = fc_df['datetime'].unique()
    total_time_steps = len(unique_datetimes)
    f.create_dataset(
        'tensor', 
        shape=(total_time_steps, LAT_DIM, LON_DIM), 
        dtype='float32', 
        chunks=(TIME_STEP_CHUNK, LAT_DIM, LON_DIM),
        compression='gzip'
    )

In [None]:
chunked_datetimes = [
    unique_datetimes[i:i + TIME_STEP_CHUNK]
    for i in range(0, len(unique_datetimes), TIME_STEP_CHUNK)
]

In [None]:
# Rasterise FC one block of datetimes at a time and append each block to the
# 'tensor' dataset in OUTPUT_PATH.
current_idx = 0  # next free position along the dataset's first axis
for chunk in tqdm(chunked_datetimes, desc="Processing datetime chunks"):
    # Restrict to this block's rows once, then split per datetime below.
    chunk_data = fc_df[fc_df['datetime'].isin(chunk)]
    grids = []
    for dt in tqdm(chunk, desc="Processing spatial grids", leave=False):
        # lat x lon table for this datetime; duplicate cells are summed.
        grid = chunk_data[chunk_data['datetime'] == dt].pivot_table(
            index='lat',
            columns='lon',
            values='fc',
            aggfunc='sum'
        )
        # Resample onto LAT_DIM x LON_DIM evenly spaced coordinates by
        # nearest-label lookup; remaining NaNs become 0.
        # NOTE(review): the linspace bounds come from THIS datetime's own
        # min/max, so planes for different datetimes may cover different
        # extents -- confirm the planes are meant to be comparable cell by cell.
        grid = grid.reindex(index=np.linspace(grid.index.min(), grid.index.max(), LAT_DIM),
                            columns=np.linspace(grid.columns.min(), grid.columns.max(), LON_DIM),
                            method='nearest').fillna(0)
        grids.append(grid.values)
    # Stack the planes of this block along a new leading axis.
    tensor_chunk = np.stack(grids, axis=0)
    # Reopen the file in append mode and write the block at its offset.
    with h5py.File(OUTPUT_PATH, 'a') as f:
        f['tensor'][current_idx:current_idx + tensor_chunk.shape[0], :, :] = tensor_chunk
        current_idx += tensor_chunk.shape[0]

In [None]:
with h5py.File(OUTPUT_PATH, 'r') as f:
    tensor = f['tensor'][:]

In [None]:
tensor = tensor[..., np.newaxis]

In [None]:
samples = tensor.shape[0] // TIME_STEP_CHUNK

In [None]:
reshaped_tensor = tensor[:samples * TIME_STEP_CHUNK].reshape(
    samples, TIME_STEP_CHUNK, LAT_DIM, LON_DIM, 1
)

In [None]:
reshaped_tensor.shape

In [None]:
with h5py.File('fc_df_nonan_norm_5dtensor.h5', 'w') as f:
    f.create_dataset('fc_5dtensor', data=reshaped_tensor)

In [None]:
fc = reshaped_tensor

In [None]:
del tensor, samples, reshaped_tensor, current_idx, tensor_chunk, grids, grid, chunk, chunk_data, chunked_datetimes, unique_datetimes, total_time_steps, fc_df, OUTPUT_PATH, TIME_STEP_CHUNK, LAT_DIM, LON_DIM

In [None]:
fc

In [None]:
fch4_df = pd.read_parquet('fch4_df_nonan_norm.parquet')
#fch4_df = pd.read_parquet('/Volumes/JPL/geocryoai/modeling/data/input/fch4/fch4_df_nonan_norm.parquet')
fch4_df

In [None]:
#314 fch4_df.datetime.nunique() #--> unique values of datetime
#430493 fch4_df.lat.nunique() #--> unique values of latitude
#466928 fch4_df.lon.nunique() #--> unique values of longitude
#4085540 fch4_df.fch4.nunique() #--> unique values of fch4

In [None]:
# Integer timestamp encoding of 'datetime': int64 epoch value floor-divided by 10**6.
# NOTE(review): the ALT cell divides by 10**9. If 'datetime' is datetime64[ns],
# dividing by 10**6 yields milliseconds rather than the seconds the original
# comment states -- confirm the column's resolution and the intended unit.
fch4_df['timestamp'] = fch4_df['datetime'].astype('int64')//10**6 #datetime timestamp encoding | conversion to seconds from epoch for DL methods

In [None]:
fch4_df = fch4_df[['timestamp','lat','lon','fch4']]

In [None]:
fch4_df.columns = ['datetime','lat','lon','fch4']
fch4_df

In [None]:
TIME_STEP_CHUNK = 30
LAT_DIM = 1092
LON_DIM = 1092
OUTPUT_PATH = "fch4_processed_tensors.h5" 

In [None]:
with h5py.File(OUTPUT_PATH, 'w') as f:
    unique_datetimes = fch4_df['datetime'].unique()
    total_time_steps = len(unique_datetimes)
    f.create_dataset(
        'tensor', 
        shape=(total_time_steps, LAT_DIM, LON_DIM), 
        dtype='float32', 
        chunks=(TIME_STEP_CHUNK, LAT_DIM, LON_DIM), 
        compression='gzip'
    )

In [None]:
chunked_datetimes = [
    unique_datetimes[i:i + TIME_STEP_CHUNK]
    for i in range(0, len(unique_datetimes), TIME_STEP_CHUNK)
]

In [None]:
# Rasterise FCH4 one block of datetimes at a time and append each block to
# the 'tensor' dataset in OUTPUT_PATH.
current_idx = 0  # next free position along the dataset's first axis
for chunk in tqdm(chunked_datetimes, desc="Processing datetime chunks"):
    # Restrict to this block's rows once, then split per datetime below.
    chunk_data = fch4_df[fch4_df['datetime'].isin(chunk)]
    grids = []
    for dt in tqdm(chunk, desc="Processing spatial grids", leave=False):
        # lat x lon table for this datetime; duplicate cells are summed.
        grid = chunk_data[chunk_data['datetime'] == dt].pivot_table(
            index='lat',
            columns='lon',
            values='fch4',
            aggfunc='sum'
        )
        # Resample onto LAT_DIM x LON_DIM evenly spaced coordinates by
        # nearest-label lookup; remaining NaNs become 0.
        # NOTE(review): the linspace bounds come from THIS datetime's own
        # min/max, so planes for different datetimes may cover different
        # extents -- confirm the planes are meant to be comparable cell by cell.
        grid = grid.reindex(index=np.linspace(grid.index.min(), grid.index.max(), LAT_DIM),
                            columns=np.linspace(grid.columns.min(), grid.columns.max(), LON_DIM),
                            method='nearest').fillna(0)
        grids.append(grid.values)
    # Stack the planes of this block along a new leading axis.
    tensor_chunk = np.stack(grids, axis=0)
    # Reopen the file in append mode and write the block at its offset.
    with h5py.File(OUTPUT_PATH, 'a') as f:
        f['tensor'][current_idx:current_idx + tensor_chunk.shape[0], :, :] = tensor_chunk
        current_idx += tensor_chunk.shape[0]

In [None]:
with h5py.File(OUTPUT_PATH, 'r') as f:
    tensor = f['tensor'][:]

In [None]:
tensor = tensor[..., np.newaxis]

In [None]:
samples = tensor.shape[0] // TIME_STEP_CHUNK

In [None]:
reshaped_tensor = tensor[:samples * TIME_STEP_CHUNK].reshape(
    samples, TIME_STEP_CHUNK, LAT_DIM, LON_DIM, 1
)

In [None]:
reshaped_tensor.shape

In [None]:
with h5py.File('fch4_df_nonan_norm_5dtensor.h5', 'w') as f:
    f.create_dataset('fch4_5dtensor', data=reshaped_tensor)

In [None]:
fch4 = reshaped_tensor

In [None]:
del tensor, samples, reshaped_tensor, current_idx, tensor_chunk, grids, grid, chunk, chunk_data, chunked_datetimes, unique_datetimes, total_time_steps, fch4_df, OUTPUT_PATH, TIME_STEP_CHUNK, LAT_DIM, LON_DIM

In [None]:
alt.shape

In [None]:
fc.shape

In [None]:
fch4.shape

#### Archived

In [None]:
#alt_df.alt.shape
#(3163257965,)
#fc_df.fc.shape
#(96654894,)
#fch4_df.fch4.shape
#(96654894,)

In [None]:
alt_df.alt.values.shape

In [None]:
print(list(divisorGenerator(3163257965)))

In [None]:
# padded_alt_df = np.pad(alt_df.alt.values, (0, 9825), mode='constant')

# # Check the new shape of the padded array
# print(f"Padded array shape: {padded_alt_df.shape}")

In [None]:
# reshaped_array = padded_alt_df.reshape(325, 100, 100, 973)

# # Check the new shape of the reshaped array
# print(f"New reshaped array shape: {reshaped_array.shape}")

In [None]:
alt_df.alt.values.reshape(-1, 1).shape

In [None]:
#6708 alt_df.datetime.nunique() --> (unique values of datetime)
#1092 alt_df.lat.nunique() --> (unique values of lat)
#1092 alt_df.lon.nunique() --> (unique values of lon)
#238860 alt_df.alt.nunique() --> (unique values of alt)

In [None]:
# List the divisors of the ALT element count that do not exceed its
# square root (candidate leading dimensions for a reshape).
total_elements = 3163257965
search_limit = int(total_elements ** 0.5) + 1
factors = [
    candidate
    for candidate in range(1, search_limit)
    if total_elements % candidate == 0
]

print(factors)

In [None]:
# import numpy as np

# # Total number of elements in the original array
# total_elements = 3163257965

# # Choose DIM1, DIM2, DIM3 such that their product divides total_elements
# DIM1 = 325
# DIM2 = 100
# DIM3 = 100

# # Calculate the product of the first three dimensions
# product_of_first_three = DIM1 * DIM2 * DIM3

# # Calculate the target size (next multiple of product_of_first_three)
# target_size = (total_elements // product_of_first_three) * product_of_first_three
# if total_elements % product_of_first_three != 0:
#     target_size += product_of_first_three

# # Calculate the padding required
# padding_needed = target_size - total_elements

# # If the original array size is smaller than the target size, pad it
# if total_elements < target_size:
#     padded_array = np.pad(alt_df.alt.values, (0, padding_needed), mode='constant')
#     print(f"Padded array size: {padded_array.size}")
# else:
#     padded_array = alt_df.alt.values
#     print(f"No padding needed")

# # Check the original and target sizes
# print(f"Original size: {total_elements}, Target size: {target_size}")

# # Reshape the padded array into the desired 4D shape with a singleton dimension at the end
# reshaped_array = padded_array.reshape(DIM1, DIM2, DIM3, padded_array.size // (DIM1 * DIM2 * DIM3), 1)

# # Output the new shape
# print(f"Reshaped array shape: {reshaped_array.shape}")


# # Padded array size: 3165500000
# # Original size: 3163257965, Target size: 3165500000
# # Reshaped array shape: (325, 100, 100, 974, 1)

In [None]:
# import h5py

# # Save the reshaped array as an HDF5 file
# with h5py.File('alt_df_nonan_norm_5dtensor_padded.h5', 'w') as f:
#     f.create_dataset('alt_5dtensor', data=reshaped_array)

In [None]:
# alt = reshaped_array; del alt_df, reshaped_arrays

In [None]:
# del padded_alt_df, padded_array, padding_needed

In [None]:
import numpy as np

# Use the actual number of ALT values instead of a hard-coded count, so the
# trim and reshape below stay valid if alt_df changes.
total_elements = alt_df.alt.values.size

# Leading dimensions of the reshaped tensor.
DIM1 = 7
DIM2 = 100
DIM3 = 100

product_of_first_three = DIM1 * DIM2 * DIM3

# Largest multiple of DIM1*DIM2*DIM3 that fits in the data.
target_size = (total_elements // product_of_first_three) * product_of_first_three

if total_elements > target_size:
    trimmed_array = alt_df.alt.values[:target_size]  # Trim the array to the target size
    print(f"Trimmed array size: {trimmed_array.size}")
else:
    trimmed_array = alt_df.alt.values
    print(f"No trimming needed")

print(f"Original size: {total_elements}, Target size: {target_size}")

# The fourth axis absorbs whatever remains after the three fixed dimensions;
# the trailing 1 adds a singleton axis.
reshaped_array = trimmed_array.reshape(DIM1, DIM2, DIM3, trimmed_array.size // (DIM1 * DIM2 * DIM3), 1)

print(f"Reshaped array shape: {reshaped_array.shape}")

In [None]:
reshaped_array

In [None]:
import h5py

# Save the reshaped array as an HDF5 file
with h5py.File('alt_df_nonan_norm_5dtensor_trimmed_2.h5', 'w') as f:
    f.create_dataset('alt_5dtensor', data=reshaped_array)

In [None]:
alt = reshaped_array; del alt_df, reshaped_array

In [None]:
fc_df.fc.values.shape

In [None]:
print(list(divisorGenerator(96654894)))

In [None]:
fc_df.fc.values.reshape(-1, 1).shape

In [None]:
total_elements = 96654894

# Every divisor of total_elements from 1 up to and including floor(sqrt(total_elements)).
sqrt_bound = int(total_elements ** 0.5)
factors = [candidate for candidate in range(1, sqrt_bound + 1) if total_elements % candidate == 0]

print(factors)

In [None]:
import numpy as np

# Sizes of the first three tensor axes; the fourth axis is derived from the data length.
DIM1 = 7
DIM2 = 100
DIM3 = 100

# Flattened CO2-flux values. The element count is measured rather than hard-coded so the
# trim/reshape below stays valid if fc_df is regenerated with a different row count.
fc_values = fc_df.fc.values
total_elements = fc_values.size

product_of_first_three = DIM1 * DIM2 * DIM3

# Largest multiple of DIM1 * DIM2 * DIM3 that fits in the available data.
target_size = (total_elements // product_of_first_three) * product_of_first_three

if total_elements > target_size:
    trimmed_array = fc_values[:target_size]  # Drop the trailing remainder that does not fill a full slab
    print(f"Trimmed array size: {trimmed_array.size}")
else:
    trimmed_array = fc_values
    print("No trimming needed")

print(f"Original size: {total_elements}, Target size: {target_size}")

# 5D layout: (DIM1, DIM2, DIM3, derived length, 1) with a trailing singleton axis.
reshaped_array = trimmed_array.reshape(DIM1, DIM2, DIM3, trimmed_array.size // product_of_first_three, 1)

print(f"Reshaped array shape: {reshaped_array.shape}")

In [None]:
reshaped_array

In [None]:
import h5py

# Save the reshaped array as an HDF5 file
with h5py.File('fc_df_nonan_norm_5dtensor_trimmed_2.h5', 'w') as f:
    f.create_dataset('fc_5dtensor', data=reshaped_array)

In [None]:
fc = reshaped_array; del fc_df, reshaped_array

In [None]:
fch4_df.fch4.values.shape

In [None]:
print(list(divisorGenerator(96654894)))

In [None]:
fch4_df.fch4.values.reshape(-1, 1).shape

In [None]:
total_elements = 96654894

# Divisors of total_elements that do not exceed its integer square root.
largest_candidate = int(total_elements ** 0.5)
factors = [d for d in range(1, largest_candidate + 1) if not total_elements % d]

print(factors)

In [None]:
import numpy as np

# Sizes of the first three tensor axes; the fourth axis is derived from the data length.
DIM1 = 7
DIM2 = 100
DIM3 = 100

# Flattened CH4-flux values. The element count is measured rather than hard-coded so the
# trim/reshape below stays valid if fch4_df is regenerated with a different row count.
fch4_values = fch4_df.fch4.values
total_elements = fch4_values.size

product_of_first_three = DIM1 * DIM2 * DIM3

# Largest multiple of DIM1 * DIM2 * DIM3 that fits in the available data.
target_size = (total_elements // product_of_first_three) * product_of_first_three

if total_elements > target_size:
    trimmed_array = fch4_values[:target_size]  # Drop the trailing remainder that does not fill a full slab
    print(f"Trimmed array size: {trimmed_array.size}")
else:
    trimmed_array = fch4_values
    print("No trimming needed")

print(f"Original size: {total_elements}, Target size: {target_size}")

# 5D layout: (DIM1, DIM2, DIM3, derived length, 1) with a trailing singleton axis.
reshaped_array = trimmed_array.reshape(DIM1, DIM2, DIM3, trimmed_array.size // product_of_first_three, 1)

print(f"Reshaped array shape: {reshaped_array.shape}")

In [None]:
reshaped_array

In [None]:
import h5py

# Save the reshaped array as an HDF5 file
with h5py.File('fch4_df_nonan_norm_5dtensor_trimmed_2.h5', 'w') as f:
    f.create_dataset('fch4_5dtensor', data=reshaped_array)

In [None]:
fch4 = reshaped_array; del fch4_df, reshaped_array

In [None]:
# # To load it back:
# with h5py.File('alt_df_nonan_norm_5dtensor_padded.h5', 'r') as f:
#     loaded_array = f['reshaped_array'][:]
# print(f"Loaded array shape: {loaded_array.shape}")

In [None]:
# To load it back:
# Reads the full 'reshaped_array' dataset from the HDF5 file into memory as `alt`.
# NOTE(review): the save cell earlier in this notebook writes
# 'alt_df_nonan_norm_5dtensor_trimmed_2.h5' with dataset name 'alt_5dtensor';
# this cell reads a different file name and key -- confirm the file on disk matches.
with h5py.File('alt_df_nonan_norm_5dtensor_trimmed.h5', 'r') as f:
    alt = f['reshaped_array'][:]
print(f"Loaded array shape: {alt.shape}")

In [None]:
# To load it back:
# Reads the full 'reshaped_array' dataset from the HDF5 file into memory as `fc`.
# NOTE(review): the save cell earlier in this notebook writes
# 'fc_df_nonan_norm_5dtensor_trimmed_2.h5' with dataset name 'fc_5dtensor';
# this cell reads a different file name and key -- confirm the file on disk matches.
with h5py.File('fc_df_nonan_norm_5dtensor_trimmed.h5', 'r') as f:
    fc = f['reshaped_array'][:]
print(f"Loaded array shape: {fc.shape}")

In [None]:
# To load it back:
# Reads the full 'reshaped_array' dataset from the HDF5 file into memory as `fch4`.
# NOTE(review): the save cell earlier in this notebook writes
# 'fch4_df_nonan_norm_5dtensor_trimmed_2.h5' with dataset name 'fch4_5dtensor';
# this cell reads a different file name and key -- confirm the file on disk matches.
with h5py.File('fch4_df_nonan_norm_5dtensor_trimmed.h5', 'r') as f:
    fch4 = f['reshaped_array'][:]
print(f"Loaded array shape: {fch4.shape}")

In [None]:
alt.shape

In [None]:
fc.shape

In [None]:
fch4.shape

In [None]:
# If my tensors are shaped like the following:
# alt.shape
# (325, 100, 100, 973, 1)
# fc.shape
# (42, 100, 100, 230, 1)
# fch4.shape
# (42, 100, 100, 230, 1)

# How can we reshape them so that they are identical to the following tensor shape formats?
# alt = (X, Y, 100, 100, 1)
# fc = (X, Y, 1)
# fch4 = (X, Y, 1)
# target = (X, 1)
# whereby X should be the same number and Y should be the same number...?

In [None]:
# If data_format='channels_last': 5D tensor with shape: (samples, time, rows, cols, channels)

# Model

**Remember to reverse normalization of alt, fc, and fch4 values in each coincident column after training and validation to confirm values.** </br>
**Normalization was conducted with min-max normalization methodology, e.g., alt_df['alt'] = (alt_df['alt'] - alt_df['alt'].min()) / (alt_df['alt'].max() - alt_df['alt'].min())**

In [None]:
# Load tensors

In [None]:
with h5py.File('ensemble_tensor.h5', 'r') as f:
    alt = f['alt'][:].astype('float32')
    fc = f['fc'][:].astype('float32')
    fch4 = f['fch4'][:].astype('float32')

In [None]:
# Split `alt` along its first axis: entries 0-179 for training, 180-199 for
# validation, and everything from index 200 onward for testing.
train_alt = alt[:180]
val_alt = alt[180:200]
test_alt = alt[200:]

In [None]:
# def downsample(data, target_size=(256, 256)):
#     shape = data.shape
#     reshaped = tf.image.resize(data.reshape(-1, shape[2], shape[3], shape[4]), target_size)
#     return reshaped.numpy().reshape(shape[0], shape[1], target_size[0], target_size[1], shape[4])

In [None]:
# train_alt = downsample(train_alt)
# val_alt = downsample(val_alt)
# test_alt = downsample(test_alt)
# print(f"New Shape: {train_alt.shape}")

In [None]:
from tensorflow.keras import layers, models

def spatial_model(input_shape):
    """Build the per-frame feature extractor: Conv2D -> MaxPooling2D -> Flatten.

    Args:
        input_shape: Shape handed to ``layers.Input`` (batch dimension excluded).

    Returns:
        A ``models.Model`` mapping the input to the flattened, pooled feature map.
    """
    frame_in = layers.Input(shape=input_shape)
    conv = layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu', padding='same')
    pool = layers.MaxPooling2D(pool_size=(2, 2))
    flatten = layers.Flatten()
    features = flatten(pool(conv(frame_in)))
    return models.Model(frame_in, features)

def temporal_model(input_shape):
    """Build the sequence encoder: a single 64-unit LSTM returning its final output.

    Args:
        input_shape: Shape handed to ``layers.Input`` (batch dimension excluded).

    Returns:
        A ``models.Model`` mapping the input sequence to the LSTM's last output.
    """
    sequence_in = layers.Input(shape=input_shape)
    lstm = layers.LSTM(units=64, activation='relu', return_sequences=False)
    return models.Model(sequence_in, lstm(sequence_in))

def build_combined_model(spatial_shape, temporal_steps):
    """Build and compile a per-frame CNN + LSTM regression model.

    Each time step is encoded by ``spatial_model`` through ``TimeDistributed``,
    the resulting feature sequence is summarised by ``temporal_model``, and a
    single linear unit produces the prediction.

    Args:
        spatial_shape: Shape of one frame, e.g. ``(rows, cols, channels)``.
        temporal_steps: Number of frames per input sequence.

    Returns:
        A compiled ``models.Model`` (Adam, lr=1e-3, MSE loss, MAE metric) taking
        one input of shape ``(temporal_steps, *spatial_shape)``.
    """
    # TimeDistributed treats axis 1 of its input as time, so the input must
    # carry an explicit time axis in front of the per-frame shape.
    sequence_input = layers.Input(shape=(temporal_steps, *spatial_shape))

    spatial_features = layers.TimeDistributed(spatial_model(spatial_shape))(sequence_input)
    temporal_output = temporal_model((temporal_steps, spatial_features.shape[-1]))(spatial_features)

    outputs = layers.Dense(units=1, activation='linear')(temporal_output)
    # Only the sequence input feeds the graph; the previous second input was
    # never connected to the output and is no longer registered.
    model = models.Model(inputs=sequence_input, outputs=outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='mse',
                  metrics=['mae'])
    return model

# spatial_shape = (1092, 1092, 1)
# temporal_steps = 30

# model = build_combined_model(spatial_shape, temporal_steps)
# model.summary()

In [None]:
spatial_model((8,8,1))

In [None]:
plot_model(model, to_file='model_architecture.png', show_shapes=True)

In [None]:
def create_dataset(data, batch_size=4):
    """Wrap an array in a batched, prefetching ``tf.data.Dataset``.

    Along axis 1, every slice except the last becomes the feature tensor and
    the last slice becomes the target.

    Args:
        data: Array indexable as ``data[:, i]``.
        batch_size: Number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, target)`` batches.
    """
    features, target = data[:, :-1], data[:, -1]
    pipeline = tf.data.Dataset.from_tensor_slices((features, target))
    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(tf.data.AUTOTUNE)

train_dataset = create_dataset(train_alt)
val_dataset = create_dataset(val_alt)
test_dataset = create_dataset(test_alt)

In [None]:
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint_cb = callbacks.ModelCheckpoint('best_model.h5', save_best_only=True)

history = model.fit(train_dataset, 
                    validation_data=val_dataset, 
                    epochs=50, 
                    callbacks=[early_stopping, checkpoint_cb])

# Evaluate the model
results = model.evaluate(test_dataset)
print(f"Test Loss: {results[0]}, Test MAE: {results[1]}")

# Save the final model
model.save('final_spatiotemporal_model.h5')

In [None]:
def build_model(hp=None):
    """Build and compile the Conv3D + ConvLSTM2D regression network.

    Args:
        hp: Optional hyperparameter source.

            * ``None`` (default): use the fixed configuration -- 64 Conv3D
              filters, 64 ConvLSTM2D filters, 128 dense units, learning rate 1e-4.
            * An object exposing ``Int`` and ``Choice`` (e.g. a KerasTuner
              ``HyperParameters``): sample each value from the search space
              defined below, so the function can be passed to a tuner.
            * Any other 4-item sequence: unpacked as
              ``(conv3d_filters, convlstm_filters, dense_units, learning_rate)``.
              NOTE(review): this ordering is assumed from the existing call
              ``build_model([64, 64, 128, 1e-4])`` -- confirm.

    Returns:
        A compiled ``models.Sequential`` (Adam, MSE loss, MAE metric).
    """
    if hp is None:
        conv3d_filters, convlstm_filters, dense_units, learning_rate = 64, 64, 128, 1e-4
    elif hasattr(hp, 'Int') and hasattr(hp, 'Choice'):
        conv3d_filters = hp.Int('conv3d_filters', min_value=16, max_value=64, step=16)
        convlstm_filters = hp.Int('convlstm_filters', min_value=16, max_value=64, step=16)
        dense_units = hp.Int('dense_units', min_value=32, max_value=128, step=32)
        learning_rate = hp.Choice('learning_rate', [1e-3, 1e-4, 1e-5])
    else:
        conv3d_filters, convlstm_filters, dense_units, learning_rate = hp

    model = models.Sequential()

    # Conv3D layers for spatiotemporal feature extraction
    model.add(layers.Conv3D(
        filters=conv3d_filters,
        kernel_size=(3, 3, 3),
        activation='relu',
        padding='same',
        input_shape=(30, 1092, 1092, 1)))
    model.add(layers.MaxPooling3D(pool_size=(2, 2, 2)))

    # ConvLSTM2D for temporal-spatial dependencies
    model.add(layers.ConvLSTM2D(
        filters=convlstm_filters,
        kernel_size=(3, 3),
        activation='relu',
        padding='same',
        return_sequences=False))

    # Fully connected layers for prediction
    model.add(layers.Flatten())
    model.add(layers.Dense(
        units=dense_units,
        activation='relu'))
    model.add(layers.Dense(1, activation='linear'))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae']
    )
    return model

In [None]:
model = build_model()

In [None]:
model.summary()

In [None]:
checkpoint_cb = callbacks.ModelCheckpoint('best_model.keras', save_best_only=True)
earlystop_cb = callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [None]:
history = model.fit(
    train_data, train_data[:, 0],
    validation_data=(val_data, val_data[:, 0]),
    epochs=100,
    callbacks=[checkpoint_cb, earlystop_cb]
)

In [None]:
results = model.evaluate(test_data, test_data[:, 0])
print("Test Loss, Test MAE:", results)

In [None]:
model.save('final_alt_model.h5')

In [None]:
train_alt_verify = train_alt[-10:]
valid_alt_verify = val_alt[-10:]
test_alt_verify = test_alt[-10:]

In [None]:
def run_tuner_with_progress(tuner, train_data, val_data):
    """Run ``tuner.search`` on the supplied data with a tqdm progress bar.

    The first slice along axis 1 (``data[:, 0]``) is used as the target for
    both the training and validation sets.

    Args:
        tuner: Tuner exposing ``oracle.max_trials`` and ``search``.
        train_data: Training array; also the source of the training target.
        val_data: Validation array; also the source of the validation target.
    """
    progress_bar = tqdm(total=tuner.oracle.max_trials, desc="Tuning Progress")

    class TQDMCallback(callbacks.Callback):
        # NOTE(review): on_trial_begin/on_trial_end do not appear to be hooks
        # that Keras invokes on Callback objects, so the bar may never advance
        # -- confirm against the KerasTuner version in use.
        def on_trial_begin(self, trial, logs=None):
            progress_bar.set_description(f"Running Trial {trial.trial_id}")

        def on_trial_end(self, trial, logs=None):
            progress_bar.update(1)

    try:
        # Search on the arrays passed in by the caller rather than on
        # notebook-level globals, so the arguments are actually honoured.
        tuner.search(
            train_data, train_data[:, 0],
            validation_data=(val_data, val_data[:, 0]),
            epochs=50,
            callbacks=[TQDMCallback()]
        )
    finally:
        # Always release the bar, even if the search raises.
        progress_bar.close()

In [None]:
def create_tuner_with_progress():
    """Create the BayesianOptimization tuner while showing a one-step tqdm bar.

    Returns:
        A ``BayesianOptimization`` tuner over ``build_model`` minimising
        ``val_loss`` with 2 trials, logging under ``tuner_logs/alt_model_tuning``.
    """
    tuner_options = {
        'objective': 'val_loss',
        'max_trials': 2,
        'directory': 'tuner_logs',
        'project_name': 'alt_model_tuning',
        'executions_per_trial': 1,
        'overwrite': True,
    }
    with tqdm(total=1, desc="Creating BayesianOptimization Tuner") as bar:
        tuner = BayesianOptimization(build_model, **tuner_options)
        bar.update(1)
    return tuner

In [None]:
num_parallel_trials = 2
tuner = create_tuner_with_progress()

In [None]:
# #OR TRY:
# tuner = BayesianOptimization(
#     build_model,
#     objective='val_loss',
#     max_trials=10,
#     directory='tuner_logs',
#     project_name='alt_model_tuning',
#     executions_per_trial=1,
#     overwrite=True
# )

In [None]:
# #OR TRY:
# from kerastuner.tuners import RandomSearch

# def create_tuner_with_progress():
#     tuner_creation_desc = "Creating Random Search Tuner"
#     with tqdm(total=1, desc=tuner_creation_desc) as pbar:
#         tuner = RandomSearch(
#             build_model,
#             objective='val_loss',
#             max_trials=2,
#             directory='tuner_logs',
#             project_name='alt_model_tuning',
#             overwrite=True
#         )
#         pbar.update(1)
#     return tuner

In [None]:
# num_parallel_trials = 2
# tuner = create_tuner_with_progress()

In [None]:
tuner

In [None]:
checkpoint_cb = callbacks.ModelCheckpoint('best_model.keras', save_best_only=True)
earlystop_cb = callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [None]:
run_tuner_with_progress(tuner, train_data, val_data)

In [None]:
best_hps = tuner.get_best_hyperparameters(1)[0]
model = tuner.hypermodel.build(best_hps)

In [None]:
build_model([64, 64, 128, 1e-4])

In [None]:
history = model.fit(
    train_data, train_data[:, 0],
    validation_data=(val_data, val_data[:, 0]),
    epochs=100,
    callbacks=[checkpoint_cb, earlystop_cb]
)

In [None]:
results = model.evaluate(test_data, test_data[:, 0])
print("Test Loss, Test MAE:", results)

In [None]:
model.save('final_alt_model.h5')

In [None]:
# import tensorflow as tf
# from tensorflow.keras import layers, models, callbacks
# from kerastuner.tuners import BayesianOptimization

# # Define modular ConvLSTM networks
# def build_alt_model(input_shape):
#     """Model for active layer thickness (ALT)."""
#     model = models.Sequential([
#         layers.ConvLSTM2D(32, (3, 3), activation='relu', return_sequences=True, input_shape=input_shape),
#         layers.BatchNormalization(),
#         layers.ConvLSTM2D(64, (3, 3), activation='relu', return_sequences=False),
#         layers.Flatten(),
#         layers.Dense(128, activation='relu'),
#         layers.Dropout(0.3),
#         layers.Dense(1)  # Final output for ALT predictions
#     ])
#     return model

# def build_fc_model(input_shape):
#     """Model for carbon dioxide flux (FC)."""
#     model = models.Sequential([
#         layers.Conv3D(32, (3, 3, 3), activation='relu', input_shape=input_shape),
#         layers.MaxPooling3D((2, 2, 2)),
#         layers.Conv3D(64, (3, 3, 3), activation='relu'),
#         layers.GlobalAveragePooling3D(),
#         layers.Dense(128, activation='relu'),
#         layers.Dropout(0.3),
#         layers.Dense(1)  # Final output for FC predictions
#     ])
#     return model

# def build_fch4_model(input_shape):
#     """Model for methane flux (FCH4)."""
#     model = models.Sequential([
#         layers.Conv3D(32, (3, 3, 3), activation='relu', input_shape=input_shape),
#         layers.MaxPooling3D((2, 2, 2)),
#         layers.Conv3D(64, (3, 3, 3), activation='relu'),
#         layers.GlobalAveragePooling3D(),
#         layers.Dense(128, activation='relu'),
#         layers.Dropout(0.3),
#         layers.Dense(1)  # Final output for FCH4 predictions
#     ])
#     return model

# # Combine modular networks into an ensemble
# def build_ensemble_model(alt_input_shape, fc_input_shape, fch4_input_shape):
#     alt_model = build_alt_model(alt_input_shape)
#     fc_model = build_fc_model(fc_input_shape)
#     fch4_model = build_fch4_model(fch4_input_shape)
    
#     combined = layers.Concatenate()([alt_model.output, fc_model.output, fch4_model.output])
#     x = layers.Dense(128, activation='relu')(combined)
#     x = layers.Dropout(0.3)(x)
#     output = layers.Dense(1)(x)  # Final combined prediction

#     model = models.Model(inputs=[alt_model.input, fc_model.input, fch4_model.input], outputs=output)
#     return model

In [None]:
with h5py.File('fc_df_nonan_norm_5dtensor.h5', 'r') as f:
    fc = f['fc_5dtensor'][:]
print(f"Loaded fc tensor shape: {fc.shape}")

with h5py.File('fch4_df_nonan_norm_5dtensor.h5', 'r') as f:
    fch4 = f['fch4_5dtensor'][:]
print(f"Loaded fch4 tensor shape: {fch4.shape}")

#### Dataframes by feature

In [None]:
with open('/Volumes/JPL/alt/alt.parquet','rb') as f:
    alt=pd.read_parquet(f)

In [None]:
with open('/Volumes/JPL/fch4/fch4.parquet','rb') as f:
    ch4=pd.read_parquet(f)

In [None]:
with open('/Volumes/JPL/fc/fc.parquet','rb') as f:
    co2=pd.read_parquet(f)

In [None]:
merged_df = pd.merge(alt, ch4, on=['datetime', 'lat', 'lon'], how='outer')
del alt, ch4

In [None]:
merged_df = pd.merge(merged_df, co2, on=['datetime', 'lat', 'lon'], how='outer')
del co2

In [None]:
merged_df.columns = ['datetime', 'lat', 'lon', 'alt', 'fch4', 'fc']

# Group the raw rows once and derive both statistics from that grouping.
# Taking sem() of the already-averaged frame would see one row per group and
# return NaN everywhere, wiping out the means.
grouped = merged_df.groupby(['datetime', 'lat', 'lon'])
merged_sem_df = grouped.sem().reset_index()  # standard error of the mean per (datetime, lat, lon)
merged_df = grouped.mean().reset_index()     # mean per (datetime, lat, lon)

# Inspect the result
print(merged_df.head())

In [None]:
#alt.groupby(['datetime','lat','lon'])['alt'].mean().plot();
#ch4.groupby(['datetime','lat','lon'])['fch4'].mean().plot();
#co2.groupby(['datetime','lat','lon'])['fc'].mean().plot();

In [None]:
fig, ax = plt.subplots(figsize=(10,6), dpi=200)
ax.set_title('Spatial aggregation of Active Layer Thickness (ALT), '+r'$\mathregular{1 km^{2}}$', fontsize=12)
ax.plot(alt.groupby(['lat','lon'])['alt'].mean().values.tolist(), label='Monthly mean, ALT')
ax.set_ylabel(r'$\mathregular{m}$', fontsize=12, labelpad=10)
ax.grid(True)
ax.legend(loc='best', fontsize=10)
ax.ticklabel_format(useOffset=False, style='sci')
ax.set_xlabel('Years elapsed, 1800-2100', fontsize=12, labelpad=10)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10,6), dpi=200)
ax.set_title('Spatial aggregation of Methane '+r'($\mathregular{CH}_{4}$) flux, 1 $\mathregular{km^{2}}$', fontsize=12)
ax.plot(ch4.groupby(['lat','lon'])['fch4'].mean().values.tolist(), label="Monthly mean, "+r'$\mathregular{CH_4} {\phi}$',color='magenta')
ax.set_ylabel(r'$\mathregular{nmol CH_4 mol^{-1} km^{-2} month^{-1}}$', fontsize=12, labelpad=10)
ax.grid(True)
ax.legend(loc='best', fontsize=10)
ax.ticklabel_format(useOffset=False, style='sci')
ax.set_xlabel('Years elapsed, 1996-2022', fontsize=12, labelpad=10)
plt.show()

In [None]:
# Plot the per-(lat, lon) mean of the CO2 flux column as a single line.
fig, ax = plt.subplots(figsize=(10,6), dpi=200)
ax.set_title('Spatial aggregation of Carbon Dioxide '+r'($\mathregular{CO}_{2}$) flux, 1 $\mathregular{km^{2}}$', fontsize=12)
# Legend label spelling corrected ("Monthy" -> "Monthly") to match the ALT and CH4 figures.
ax.plot(co2.groupby(['lat','lon'])['fc'].mean().values.tolist(), label="Monthly mean, "+r'$\mathregular{CO_2} {\phi}$',color='springgreen')
ax.set_ylabel(r'$\mathregular{umol CO_2 mol^{-1} km^{-2} month^{-1}}$', fontsize=12, labelpad=10)
ax.grid(True)
ax.legend(loc='best', fontsize=10)
ax.ticklabel_format(useOffset=False, style='sci')
ax.set_xlabel('Years elapsed, 1996-2022', fontsize=12, labelpad=10)
plt.show()

In [None]:
ch4

### ALT

#### Clean

In [None]:
#df=df.replace(0,np.nan).dropna()
# Keep rows with a non-zero ALT value, parse the timestamps, order the rows
# chronologically, and renumber the index from 0.
alt = (
    alt.loc[alt.alt != 0]
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .sort_values(by='datetime', ascending=True)
    .reset_index(drop=True)
)
alt

In [None]:
#alt.to_parquet('/Volumes/JPL/alt.parquet',engine='pyarrow',compression='snappy')

#### Split

In [None]:
# Assuming 'datetime' is of type datetime64 in the dataframe
train = alt[(alt['datetime'] >= '1800-01-01') & (alt['datetime'] <= '2018-12-31')]
valid = alt[(alt['datetime'] >= '2019-01-01') & (alt['datetime'] <= '2021-12-31')]
test = alt[(alt['datetime'] >= '2022-01-01') & (alt['datetime'] <= '2100-12-31')]

In [None]:
# alt[(alt.datetime >='2017-01-01') & (alt.datetime <= '2022-12-31')]

In [None]:
# df_resampled = alt.set_index('datetime').resample('M').mean().reset_index()  # Resample to monthly mean

In [None]:
# train = df_resampled[(df_resampled['datetime'] >= '1800-01-01') & (df_resampled['datetime'] <= '1980-12-31')]
# valid = df_resampled[(df_resampled['datetime'] >= '1981-01-01') & (df_resampled['datetime'] <= '2020-12-31')]
# test = df_resampled[(df_resampled['datetime'] >= '2021-01-01') & (df_resampled['datetime'] <= '2100-12-31')]

In [None]:
train
#plt.plot(train.datetime, train.alt);

In [None]:
valid
#plt.plot(valid.datetime, valid.alt);

In [None]:
test
#plt.plot(test.datetime, test.alt);

In [None]:
#training
alt[(alt['datetime'] >= '1800-01-01') & (alt['datetime'] <= '2018-12-31')]

In [None]:
902418/1167089

In [None]:
start_date = '1800-01-01'
end_date = '2018-12-31'

plt.figure(figsize=(12, 6))
plt.plot(alt['datetime'], alt['alt'], linestyle='-', marker='o', markersize=1, color='blue')

# Set the x-axis limit to the specified date range
plt.xlim(pd.to_datetime(start_date), pd.to_datetime(end_date))

plt.xlabel('Datetime')
plt.ylabel('ALT')
plt.title('ALT over Time')
plt.grid(True)
plt.show()

In [None]:
#validation
alt[(alt['datetime'] >= '2019-01-01') & (alt['datetime'] < '2021-12-31')]

In [None]:
136275/1167089

In [None]:
start_date = '2019-01-01'
end_date = '2021-12-31'

plt.figure(figsize=(12, 6))
plt.plot(alt['datetime'], alt['alt'], linestyle='-', marker='o', markersize=1, color='blue')

# Set the x-axis limit to the specified date range
plt.xlim(pd.to_datetime(start_date), pd.to_datetime(end_date))

plt.xlabel('Datetime')
plt.ylabel('ALT')
plt.title('ALT over Time')
plt.grid(True)
plt.show()

In [None]:
#testing
alt[(alt['datetime'] >= '2022-01-01')]

In [None]:
128396/1167089

In [None]:
start_date = '2021-01-01'
end_date = '2100-12-31'

plt.figure(figsize=(12, 6))
plt.plot(alt['datetime'], alt['alt'], linestyle='-', marker='o', markersize=1, color='blue')

# Set the x-axis limit to the specified date range
plt.xlim(pd.to_datetime(start_date), pd.to_datetime(end_date))

plt.xlabel('Datetime')
plt.ylabel('ALT')
plt.title('ALT over Time')
plt.grid(True)
plt.show()

In [None]:
# Chronological split using contiguous half-open windows so that every row from
# 1800-01-01 onward lands in exactly one subset. The previous validation bound
# (< '2021-12-31') left rows dated 2021-12-31 out of all three subsets.
alt_train=alt[(alt['datetime'] >= '1800-01-01') & (alt['datetime'] < '2019-01-01')]
alt_valid=alt[(alt['datetime'] >= '2019-01-01') & (alt['datetime'] < '2022-01-01')]
alt_test=alt[(alt['datetime'] >= '2022-01-01')]

In [None]:
alt_train=alt_train[alt_train.alt!=0]
alt_train['datetime']=pd.to_datetime(alt_train['datetime'])
alt_train=alt_train.sort_values(by='datetime',ascending=True)
alt_train=alt_train.reset_index(drop=True)
alt_train

In [None]:
alt_train.to_parquet('/Volumes/JPL/alt_train.parquet',engine='pyarrow',compression='snappy')

In [None]:
alt_valid=alt_valid[alt_valid.alt!=0]
alt_valid['datetime']=pd.to_datetime(alt_valid['datetime'])
alt_valid=alt_valid.sort_values(by='datetime',ascending=True)
alt_valid=alt_valid.reset_index(drop=True)
alt_valid

In [None]:
alt_valid.to_parquet('/Volumes/JPL/alt_valid.parquet',engine='pyarrow',compression='snappy')

In [None]:
alt_test=alt_test[alt_test.alt!=0]
alt_test['datetime']=pd.to_datetime(alt_test['datetime'])
alt_test=alt_test.sort_values(by='datetime',ascending=True)
alt_test=alt_test.reset_index(drop=True)
alt_test

In [None]:
alt_test.to_parquet('/Volumes/JPL/alt_test.parquet',engine='pyarrow',compression='snappy')

### CH4

#### Clean

In [None]:
ch4=ch4[ch4.fch4!=0]

In [None]:
ch4['datetime']=pd.to_datetime(ch4['datetime'])

In [None]:
ch4=ch4.sort_values(by='datetime',ascending=True)

In [None]:
ch4=ch4.reset_index(drop=True)
ch4

In [None]:
#ch4.to_parquet('/Volumes/JPL/fch4.parquet',engine='pyarrow',compression='snappy')

#### Split

In [None]:
#training
ch4[ch4['datetime'] < '2012-01-01']

In [None]:
1472576874/1972395203

In [None]:
start_date = '1996-06-01'
end_date = '2011-12-01'

plt.figure(figsize=(12, 6))
plt.plot(ch4['datetime'], ch4['fch4'], linestyle='-', marker='o', markersize=1, color='blue')

# Set the x-axis limit to the specified date range
plt.xlim(pd.to_datetime(start_date), pd.to_datetime(end_date))

plt.xlabel('Datetime')
plt.ylabel('FCH4')
plt.title('FCH4 over Time')
plt.grid(True)
plt.show()

In [None]:
#validation
ch4[(ch4['datetime'] >= '2012-01-01') & (ch4['datetime'] < '2013-10-01')]

In [None]:
281618588/1972395203

In [None]:
start_date = '2012-01-01'
end_date = '2013-09-01'

plt.figure(figsize=(12, 6))
plt.plot(ch4['datetime'], ch4['fch4'], linestyle='-', marker='o', markersize=1, color='blue')

# Set the x-axis limit to the specified date range
plt.xlim(pd.to_datetime(start_date), pd.to_datetime(end_date))

plt.xlabel('Datetime')
plt.ylabel('FCH4')
plt.title('FCH4 over Time')
plt.grid(True)
plt.show()

In [None]:
#testing
ch4[(ch4['datetime'] >= '2013-10-01')]

In [None]:
218199741/1972395203

In [None]:
start_date = '2013-10-01'
end_date = '2022-10-01'

plt.figure(figsize=(12, 6))
plt.plot(ch4['datetime'], ch4['fch4'], linestyle='-', marker='o', markersize=1, color='blue')

# Set the x-axis limit to the specified date range
plt.xlim(pd.to_datetime(start_date), pd.to_datetime(end_date))

plt.xlabel('Datetime')
plt.ylabel('FCH4')
plt.title('FCH4 over Time')
plt.grid(True)
plt.show()

In [None]:
ch4_train=ch4[ch4['datetime'] < '2012-01-01']

In [None]:
ch4_train=ch4_train[ch4_train.fch4!=0]
ch4_train['datetime']=pd.to_datetime(ch4_train['datetime'])
ch4_train=ch4_train.sort_values(by='datetime',ascending=True)
ch4_train=ch4_train.reset_index(drop=True)
ch4_train

In [None]:
ch4_train.to_parquet('/Volumes/JPL/fch4_train.parquet',engine='pyarrow',compression='snappy')

In [None]:
del ch4_train

In [None]:
ch4_valid=ch4[(ch4['datetime'] >= '2012-01-01') & (ch4['datetime'] < '2013-10-01')]

In [None]:
ch4_valid=ch4_valid[ch4_valid.fch4!=0]
ch4_valid['datetime']=pd.to_datetime(ch4_valid['datetime'])
ch4_valid=ch4_valid.sort_values(by='datetime',ascending=True)
ch4_valid=ch4_valid.reset_index(drop=True)
ch4_valid

In [None]:
ch4_valid.to_parquet('/Volumes/JPL/fch4_valid.parquet',engine='pyarrow',compression='snappy')

In [None]:
del ch4_valid

In [None]:
ch4_test=ch4[(ch4['datetime'] >= '2013-10-01')]

In [None]:
ch4_test=ch4_test[ch4_test.fch4!=0]
ch4_test['datetime']=pd.to_datetime(ch4_test['datetime'])
ch4_test=ch4_test.sort_values(by='datetime',ascending=True)
ch4_test=ch4_test.reset_index(drop=True)
ch4_test

In [None]:
ch4_test.to_parquet('/Volumes/JPL/fch4_test.parquet',engine='pyarrow',compression='snappy')

In [None]:
del ch4_test

### CO2

#### Clean

In [None]:
co2=co2[co2['datetime'] < '2023-01-01']
co2

In [None]:
co2=co2[co2.fc!=0]

In [None]:
co2

In [None]:
co2['datetime']=pd.to_datetime(co2['datetime'])

In [None]:
co2.to_parquet('/Volumes/JPL/fc.parquet',engine='pyarrow',compression='snappy')

In [None]:
import pandas as pd
import numpy as np
with open('/Volumes/JPL/fc.parquet','rb') as f:
    co2=pd.read_parquet(f)

In [None]:
co2=co2.sort_values(by='datetime',ascending=True)

In [None]:
co2.to_parquet('/Volumes/JPL/fc2.parquet',engine='pyarrow',compression='snappy')

In [None]:
with open('/Volumes/JPL/fc2.parquet','rb') as f:
    co2=pd.read_parquet(f)

In [None]:
co2=co2.reset_index(drop=True)
co2

In [None]:
co2.to_parquet('/Volumes/JPL/fc3.parquet',engine='pyarrow',compression='snappy')

In [None]:
with open('/Volumes/JPL/newdf_fc3.parquet','rb') as f:
    co2=pd.read_parquet(f)

#### Split

In [None]:
co2

In [None]:
#training
co2[co2['datetime'] < '2012-01-01']

In [None]:
3147412758/4210946142

In [None]:
# Time window of the CO2 training split shown on the x-axis.
start_date = '1996-06-01'
end_date = '2012-01-01'

plt.figure(figsize=(12, 6))
# Plot the CO2 flux column 'fc' (the column used by the other CO2 cells and
# named in the axis label); the previous 'fch4' lookup is the methane column.
plt.plot(co2['datetime'], co2['fc'], linestyle='-', marker='o', markersize=1, color='blue')

# Set the x-axis limit to the specified date range
plt.xlim(pd.to_datetime(start_date), pd.to_datetime(end_date))

plt.xlabel('Datetime')
plt.ylabel('FC')
plt.title('FC over Time')
plt.grid(True)
plt.show()

In [None]:
#validation
co2[(co2['datetime'] >= '2012-01-01') & (co2['datetime'] < '2013-10-01')]

In [None]:
610055462/4210946142

In [None]:
start_date = '2012-01-01'
end_date = '2013-10-01'

plt.figure(figsize=(12, 6))
plt.plot(co2['datetime'], co2['fc'], linestyle='-', marker='o', markersize=1, color='blue')

# Set the x-axis limit to the specified date range
plt.xlim(pd.to_datetime(start_date), pd.to_datetime(end_date))

plt.xlabel('Datetime')
plt.ylabel('FC')
plt.title('FC over Time')
plt.grid(True)
plt.show()

In [None]:
#testing
co2[(co2['datetime'] >= '2013-10-01')]

In [None]:
453477922/4210946142

In [None]:
start_date = '2018-08-01'
end_date = '2100-12-01'

plt.figure(figsize=(12, 6))
plt.plot(co2['datetime'], co2['fc'], linestyle='-', marker='o', markersize=1, color='blue')

# Set the x-axis limit to the specified date range
plt.xlim(pd.to_datetime(start_date), pd.to_datetime(end_date))

plt.xlabel('Datetime')
plt.ylabel('FC')
plt.title('FC over Time')
plt.grid(True)
plt.show()

In [None]:
co2_train=co2[co2['datetime'] < '2012-01-01']

In [None]:
co2_train=co2_train[co2_train.fc!=0]
co2_train['datetime']=pd.to_datetime(co2_train['datetime'])
co2_train=co2_train.sort_values(by='datetime',ascending=True)
co2_train=co2_train.reset_index(drop=True)
co2_train

In [None]:
co2_train.to_parquet('/Volumes/JPL/fc_train.parquet',engine='pyarrow',compression='snappy')

In [None]:
del co2_train

In [None]:
co2_valid=co2[(co2['datetime'] >= '2012-01-01') & (co2['datetime'] < '2013-10-01')]

In [None]:
co2_valid=co2_valid[co2_valid.fc!=0]
co2_valid['datetime']=pd.to_datetime(co2_valid['datetime'])
co2_valid=co2_valid.sort_values(by='datetime',ascending=True)
co2_valid=co2_valid.reset_index(drop=True)
co2_valid

In [None]:
co2_valid.to_parquet('/Volumes/JPL/fc_valid.parquet',engine='pyarrow',compression='snappy')

In [None]:
del co2_valid

In [None]:
co2_test=co2[(co2['datetime'] >= '2013-10-01')]

In [None]:
co2_test=co2_test[co2_test.fc!=0]
co2_test['datetime']=pd.to_datetime(co2_test['datetime'])
co2_test=co2_test.sort_values(by='datetime',ascending=True)
co2_test=co2_test.reset_index(drop=True)
co2_test

In [None]:
co2_test.to_parquet('/Volumes/JPL/fc_test.parquet',engine='pyarrow',compression='snappy')

In [None]:
del co2_test

### Standardize

In [None]:
# with open('/Volumes/JPL/alt_train.parquet','rb') as f:
#     alt_train=pd.read_parquet(f)
# with open('/Volumes/JPL/fc_train.parquet','rb') as f:
#     fc_train=pd.read_parquet(f)
# with open('/Volumes/JPL/fch4_train.parquet','rb') as f:
#     fch4_train=pd.read_parquet(f)

In [None]:
with open('/Volumes/JPL/alt_train.parquet','rb') as f:
    alt_train=pd.read_parquet(f)
with open('/Volumes/JPL/alt_valid.parquet','rb') as f:
    alt_valid=pd.read_parquet(f)
with open('/Volumes/JPL/alt_test.parquet','rb') as f:
    alt_test=pd.read_parquet(f)

In [None]:
# with open('/Volumes/JPL/alt_valid.parquet','rb') as f:
#     alt_valid=pd.read_parquet(f)
# with open('/Volumes/JPL/fc_valid.parquet','rb') as f:
#     fc_valid=pd.read_parquet(f)
# with open('/Volumes/JPL/fch4_valid.parquet','rb') as f:
#     fch4_valid=pd.read_parquet(f)

In [None]:
with open('/Volumes/JPL/fc_train.parquet','rb') as f:
    fc_train=pd.read_parquet(f)
with open('/Volumes/JPL/fc_valid.parquet','rb') as f:
    fc_valid=pd.read_parquet(f)
with open('/Volumes/JPL/fc_test.parquet','rb') as f:
    fc_test=pd.read_parquet(f)

In [None]:
# with open('/Volumes/JPL/alt_test.parquet','rb') as f:
#     alt_test=pd.read_parquet(f)
# with open('/Volumes/JPL/fc_test.parquet','rb') as f:
#     fc_test=pd.read_parquet(f)
# with open('/Volumes/JPL/fch4_test.parquet','rb') as f:
#     fch4_test=pd.read_parquet(f)

In [None]:
with open('/Volumes/JPL/fch4_train.parquet','rb') as f:
    fch4_train=pd.read_parquet(f)
with open('/Volumes/JPL/fch4_valid.parquet','rb') as f:
    fch4_valid=pd.read_parquet(f)
with open('/Volumes/JPL/fch4_test.parquet','rb') as f:
    fch4_test=pd.read_parquet(f)

In [None]:
# tf.convert_to_tensor(X.index.values.astype(np.int64))

In [None]:
# def process_dataframe(df, variable_name):
#     # Mask out rows where alt is 0 or NaN
#     mask = (df['alt'] != 0) & (~df['alt'].isna())
#     df_filtered = df[mask]
    
#     # Ensure no NaN or 0 values in `alt`
#     assert df_filtered['alt'].isna().sum() == 0
#     assert (df_filtered['alt'] == 0).sum() == 0
    
#     # Convert to array: datetime, lat, lon, variable_name
#     array = df_filtered[['datetime', 'lat', 'lon', variable_name]].values
#     return array

In [None]:
# process_dataframe(alt_train,'datetime').shape

In [None]:
# Rounded ALT values with zeros (after rounding to 4 decimals) and NaNs removed.
# Select the 'alt' column explicitly: rounding the whole frame and passing it as
# a single dict value does not yield a one-column frame.
filtered_alt = pd.DataFrame({'alt': alt['alt'].round(4)}).replace(0, np.nan).dropna()
# Inner join on the row index keeps only rows that survived the filter; the
# original ALT column becomes 'alt_x' and the rounded one 'alt_y'.
filtered_df = pd.merge(alt, filtered_alt, left_index=True, right_index=True)
# Mean of the original ALT values per (datetime, lat, lon).
aggregated_df = filtered_df.groupby(['datetime', 'lat', 'lon']).agg({'alt_x': 'mean'}).reset_index()
aggregated_df.rename(columns={'alt_x': 'alt'}, inplace=True)
# Wide layout: one row per datetime, one column per (lat, lon) pair.
pivot_df = aggregated_df.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
pivot_df

In [None]:
pivot_df_numeric = pivot_df.apply(pd.to_numeric, errors='coerce')
pivot_df_flat = pivot_df_numeric.copy()
pivot_df_flat.columns = ['_'.join(map(str, col)) for col in pivot_df.columns]
interpolated_df_flat = pivot_df_flat.interpolate(method='linear', axis=0).interpolate(method='linear', axis=1)
interpolated_df_flat = interpolated_df_flat.ffill().bfill()
remaining_nans = interpolated_df_flat.isna().sum().sum()

In [None]:
# Split the flattened "lat_lon" column names back into 2-tuples.
# Note: str.split returns strings, so both rebuilt levels hold string labels,
# not the floats the pivot originally had.
multiindex_tuples = [tuple(col.split('_')) for col in interpolated_df_flat.columns]
interpolated_df = interpolated_df_flat.copy()
interpolated_df.columns = pd.MultiIndex.from_tuples(multiindex_tuples)
#print(interpolated_df.head())

In [None]:
# Create a full index with all combinations of datetime, lat, and lon:
# the Cartesian product of the unique row datetimes and the unique labels of
# column levels 0 and 1.
full_index = pd.MultiIndex.from_product(
    [interpolated_df.index.get_level_values('datetime').unique(),
     interpolated_df.columns.get_level_values(level=0).unique(),
     interpolated_df.columns.get_level_values(level=1).unique()],
    names=['datetime', 'lat', 'lon']
)

# Reindex the DataFrame rows against that product (reindex targets the row
# index by default); labels without a match are filled with NaN.
# NOTE(review): interpolated_df appears to carry a single-level 'datetime' row
# index from the pivot, while full_index has three levels — confirm this
# reindex yields the intended table rather than an all-NaN frame.
interpolated_df_reindexed = interpolated_df.reindex(full_index, fill_value=np.nan)

# Verify the new shape of the DataFrame
print(interpolated_df_reindexed.shape)

In [None]:
# nan_mask = nan_mask.reindex_like(interpolated_df)
# assert nan_mask.shape == interpolated_df.shape, "Shapes of nan_mask and interpolated_df do not match!"

In [None]:
# Generate Gaussian noise
noise_std = 0.01  # Adjust the standard deviation as needed
gaussian_noise = np.random.normal(0, noise_std, interpolated_df_reindexed.shape)

# Put the noise only where NaNs are present. Adding noise to a NaN cell
# (`frame += isna * noise`) leaves it NaN, so the gaps are filled by explicit
# replacement instead; non-NaN cells keep their values untouched.
nan_mask = interpolated_df_reindexed.isna().to_numpy()
interpolated_df_reindexed = pd.DataFrame(
    np.where(nan_mask, gaussian_noise, interpolated_df_reindexed.to_numpy()),
    index=interpolated_df_reindexed.index,
    columns=interpolated_df_reindexed.columns,
)

In [None]:
# Report the grid dimensions and compare the frame's cell count with the size
# of a dense (datetime x level-0 x level-1) cube.
column_index = interpolated_df.columns
n_datetime = interpolated_df.index.get_level_values('datetime').unique().size
print(f"Number of unique datetime values: {n_datetime}")
n_lat = column_index.get_level_values(level=0).unique().size
n_lon = column_index.get_level_values(level=1).unique().size
print(f"Number of unique lat values: {n_lat}")
print(f"Number of unique lon values: {n_lon}")
n_elements = interpolated_df.size
print(f"Total number of elements in the DataFrame: {n_elements}")
expected_size = n_datetime * n_lat * n_lon
print(f"Expected total size: {expected_size}")

In [None]:
# Reshape the reindexed DataFrame to a 4D array (datetime, lat, lon, channel).
# reshape only succeeds when the frame's total cell count equals the product
# of the three unique-label counts below.
array_data = interpolated_df_reindexed.values.reshape(
    len(interpolated_df_reindexed.index.get_level_values('datetime').unique()),  # Number of unique datetime values
    len(interpolated_df_reindexed.columns.get_level_values(level=0).unique()),   # Number of unique latitude values
    len(interpolated_df_reindexed.columns.get_level_values(level=1).unique()),   # Number of unique longitude values
    1  # Single value channel holding the 'alt' variable
)

# Verify the shape of the resulting array
print(array_data.shape)  # Expected shape should now match (3795, 363, 367, 1)

In [None]:
# Chronological 70 / 15 / 15 split along the first (time) axis.
n_timesteps = array_data.shape[0]
train_size = int(0.7 * n_timesteps)
valid_size = int(0.15 * n_timesteps)
test_size = n_timesteps - train_size - valid_size

valid_end = train_size + valid_size
X_train, X_valid, X_test = (
    array_data[:train_size],
    array_data[train_size:valid_end],
    array_data[valid_end:],
)
# Targets are the same frames with the trailing channel axis indexed away.
y_train, y_valid, y_test = (
    X_train[:, :, :, 0],
    X_valid[:, :, :, 0],
    X_test[:, :, :, 0],
)

# Wrap each split as a batched TensorFlow dataset of (input, target) pairs.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)
valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

In [None]:
# Parse the 'datetime' column once, then derive the calendar 'year' and
# 'month' columns from it. (The conversion is idempotent, so doing it a single
# time gives the same frame as repeating it.)
aggregated_df['datetime'] = pd.to_datetime(aggregated_df['datetime'])
for calendar_part in ('year', 'month'):
    aggregated_df[calendar_part] = getattr(aggregated_df['datetime'].dt, calendar_part)

In [None]:
# Keep only rows whose alt, rounded to 4 decimals, is neither 0 nor NaN.
# filtered_alt is built from raw values, so it carries positional labels; the
# index-on-index inner merge therefore assumes aggregated_df has a default
# 0..n-1 index. 'alt_x' is the original value, 'alt_y' the rounded one.
filtered_alt = pd.DataFrame({'alt': aggregated_df['alt'].values.round(4)}).replace(0, np.nan).dropna()
filtered_df = pd.merge(aggregated_df, filtered_alt, left_index=True, right_index=True)
aggregated_df = filtered_df.groupby(['year', 'month', 'lat', 'lon']).agg({'alt_x': 'mean'}).reset_index()
aggregated_df.rename(columns={'alt_x': 'alt'}, inplace=True)

# Get unique years, months, latitudes, and longitudes (each sorted ascending)
unique_years = np.sort(aggregated_df['year'].unique())
unique_months = np.arange(1, 13)  # Since months should always be from 1 to 12
unique_lats = np.sort(aggregated_df['lat'].unique())
unique_lons = np.sort(aggregated_df['lon'].unique())

# Initialize the ndarray with zeros; cells with no observation stay 0
ndarray = np.zeros((len(unique_years), len(unique_months), len(unique_lats), len(unique_lons)))

# Create mappings from years, months, latitudes, and longitudes to indices
year_to_idx = {year: i for i, year in enumerate(unique_years)}
month_to_idx = {month: i for i, month in enumerate(unique_months)}
lat_to_idx = {lat: i for i, lat in enumerate(unique_lats)}
lon_to_idx = {lon: i for i, lon in enumerate(unique_lons)}

# Populate the ndarray with alt values in one vectorised assignment.
# Every value is present in its sorted unique array, so searchsorted returns
# its exact 0-based position. The month position is used as-is: subtracting 1
# again would send January to index -1 (December) and shift every other month.
year_pos = np.searchsorted(unique_years, aggregated_df['year'].to_numpy())
month_pos = np.searchsorted(unique_months, aggregated_df['month'].to_numpy())
lat_pos = np.searchsorted(unique_lats, aggregated_df['lat'].to_numpy())
lon_pos = np.searchsorted(unique_lons, aggregated_df['lon'].to_numpy())
ndarray[year_pos, month_pos, lat_pos, lon_pos] = aggregated_df['alt'].to_numpy()

# Add an additional axis to the ndarray to represent the 'alt' dimension
ndarray = ndarray[..., np.newaxis]

print(f"Final ndarray shape: {ndarray.shape}")  # Should be (years, 12, lats, lons, 1)

# # Filter and aggregate the data
# filtered_alt = pd.DataFrame({'alt': aggregated_df['alt'].values.round(4)}).replace(0, np.nan).dropna()
# filtered_df = pd.merge(aggregated_df, filtered_alt, left_index=True, right_index=True)
# aggregated_df = filtered_df.groupby(['year', 'month', 'lat', 'lon']).agg({'alt_x': 'mean'}).reset_index()
# aggregated_df.rename(columns={'alt_x': 'alt'}, inplace=True)

# # Get unique years, months, latitudes, and longitudes
# unique_years = np.sort(aggregated_df['year'].unique())
# unique_months = np.arange(1, 13)  # Since months should always be from 1 to 12
# unique_lats = np.sort(aggregated_df['lat'].unique())
# unique_lons = np.sort(aggregated_df['lon'].unique())

# # Initialize the ndarray with zeros
# ndarray = np.zeros((len(unique_years), len(unique_months), len(unique_lats), len(unique_lons)))

# # Create mappings from years, months, latitudes, and longitudes to indices
# year_to_idx = {year: i for i, year in enumerate(unique_years)}
# month_to_idx = {month: i for i, month in enumerate(unique_months)}
# lat_to_idx = {lat: i for i, lat in enumerate(unique_lats)}
# lon_to_idx = {lon: i for i, lon in enumerate(unique_lons)}

# # Populate the ndarray with alt values
# for _, row in aggregated_df.iterrows():
#     year_idx = year_to_idx[row['year']]
#     month_idx = month_to_idx[row['month']]
#     lat_idx = lat_to_idx[row['lat']]
#     lon_idx = lon_to_idx[row['lon']]
#     ndarray[year_idx, month_idx - 1, lat_idx, lon_idx] = row['alt']  # month - 1 for 0-indexing

# # Add an additional axis to the ndarray to represent the 'alt' dimension
# ndarray = ndarray[..., np.newaxis]

# print(f"Final ndarray shape: {ndarray.shape}")  # Should be (years, 12, lats, lons, 1)


In [None]:
# Snap coordinates to 4 decimals so nearby points share a grid cell
# (adjust the rounding to the grid resolution).
alt['lat_bin'] = alt['lat'].round(4)
alt['lon_bin'] = alt['lon'].round(4)

# Wide table: one row per datetime, one column per (lat_bin, lon_bin); cells
# with no observation are filled with 0.
alt_pivot = alt.pivot_table(index=['datetime'], columns=['lat_bin', 'lon_bin'], values='alt', fill_value=0)

# Sliding windows of `sequence_length` consecutive steps taken from `grid`.
# NOTE(review): `grid` is not defined in this cell — presumably the array form
# of alt_pivot; confirm where it comes from.
sequence_length = 12  # Adjust based on your time series needs
X = np.array([
    grid[start:start + sequence_length]
    for start in range(len(grid) - sequence_length)
])


# Mean alt per (datetime, lat, lon) in long format.
df = alt.groupby(['datetime', 'lat', 'lon']).agg({'alt': 'mean'}).reset_index()

In [None]:
df.set_index('datetime').resample('M').mean().reset_index()

In [None]:
alt.replace(0, np.nan).dropna()

In [None]:
df.iloc[df.set_index('datetime').resample('M').mean().reset_index().alt.replace(0, np.nan).dropna().index.values.tolist()]

In [None]:
#alt.shape, co2.shape
ch4.shape

In [None]:
# alt=co2; del co2
# alt.columns=['datetime','lat','lon','alt']

In [None]:
#unc = alt.groupby(['datetime', 'lat', 'lon']).agg({'alt': 'sem'}).reset_index()

In [None]:
# df=alt.groupby(['datetime','lat','lon']).agg({'alt': 'mean'}).reset_index()
# df.iloc[df.set_index('datetime').resample('M').mean().reset_index().alt.replace(0,np.nan).dropna().index.values.tolist()].sort_values(by='datetime').reset_index(drop=True)

In [None]:
# Shape of ALT: (1167089, 4)
# datetime	lat	lon	alt
# 0	1800-01-01	68.623871	-149.619370	1.263625e-01
# 1	1800-01-01	69.399323	-150.949722	1.670335e-02
# 2	1800-01-01	65.165298	-164.821640	-1.262362e-01
# 3	1800-01-01	64.874695	-147.681366	1.989791e-02
# 4	1800-01-01	68.623871	-149.619370	1.253314e-01
# ...	...	...	...	...
# 1167084	2100-12-01	63.815472	-144.956192	-5.960464e-08
# 1167085	2100-12-01	68.714645	-149.028976	-5.960464e-08
# 1167086	2100-12-01	68.929497	-150.280441	5.960464e-08
# 1167087	2100-12-01	65.567215	-148.925171	5.960464e-08
# 1167088	2100-12-01	64.869881	-147.739990	5.960464e-08
# 1167089 rows × 4 columns
# Shape of FCH4: (1972395203, 4)
# datetime	lat	lon	fch4
# 0	1996-06-01	69.509682	-148.587189	37.477615
# 1	1996-06-01	69.509682	-148.587189	10.290923
# 2	1996-06-01	69.509682	-148.587189	13.020323
# 3	1996-06-01	69.509682	-148.587189	-7.242540
# 4	1996-06-01	69.509682	-148.587189	5.294002
# ...	...	...	...	...
# 1972395198	2022-10-01	61.268654	-163.228394	0.000449
# 1972395199	2022-10-01	61.268654	-163.228394	0.011420
# 1972395200	2022-10-01	61.268654	-163.228394	-0.010974
# 1972395201	2022-10-01	61.268654	-163.228394	-0.248331
# 1972395202	2022-10-01	61.268654	-163.228394	-0.540498
# 1972395203 rows × 4 columns
# Shape of FC: (4210946142, 4)
# datetime	lat	lon	fc
# 0	1996-06-01	69.509682	-148.587189	3.251806
# 1	1996-06-01	69.509682	-148.587189	3.854896
# 2	1996-06-01	69.509682	-148.587189	-1.799835
# 3	1996-06-01	69.509682	-148.587189	0.554885
# 4	1996-06-01	69.509682	-148.587189	0.926192
# ...	...	...	...	...
# 4210946137	2022-12-01	61.251190	-163.266663	33.813412
# 4210946138	2022-12-01	61.251190	-163.266663	1.768532
# 4210946139	2022-12-01	61.251190	-163.266663	-6.183669
# 4210946140	2022-12-01	61.251190	-163.266663	-29.775229
# 4210946141	2022-12-01	61.268654	-163.228394	1.443291
# 4210946142 rows × 4 columns

In [None]:
def handle_missing_data(df):
    """
    Separate the complete rows of a dataframe from the rows containing NaN.

    Parameters:
    - df: The dataframe to process (it is not modified).

    Returns:
    - A tuple (df_clean, mask):
      df_clean is a copy holding only the rows without any NaN, re-indexed from 0;
      mask is a boolean Series on df's original index, True where the row had a NaN.
    """
    has_nan = df.isna().any(axis=1)
    complete_rows = df.loc[~has_nan].reset_index(drop=True)
    return complete_rows, has_nan

In [None]:
def clean_data(df, value_column):
    """
    Drop rows whose value is 0 or NaN and return the rest in a canonical order.

    Parameters:
    - df: The dataframe to clean; must contain 'datetime', 'lat' and 'lon' columns.
    - value_column: The name of the column checked for 0 or NaN.

    Returns:
    - A new dataframe with the offending rows removed, sorted by
      ('datetime', 'lat', 'lon') and re-indexed from 0.
    """
    values = df[value_column]
    keep = values.ne(0) & values.notna()
    ordered = df[keep].sort_values(by=['datetime', 'lat', 'lon'])
    return ordered.reset_index(drop=True)

In [None]:
def pivot_to_sequences(df, n_months=12, value_columns=None):
    """
    Converts the cleaned dataframe into overlapping sequences suitable for machine learning models.

    The frame is pivoted to one row per 'datetime' and one column per
    (value column, lat, lon); every run of `n_months` consecutive rows then
    becomes one sequence. Missing cells are emitted as 0.

    Parameters:
    - df: The cleaned dataframe with 'datetime', 'lat', 'lon' and the value column(s).
    - n_months: Number of consecutive pivot rows to include in each sequence.
    - value_columns: Name or list of names of the column(s) to pivot.
      Defaults to ['alt'], the column this function originally hard-coded.

    Returns:
    - Array of shape (n_sequences, n_months, n_pivot_columns); an empty
      array of shape (0,) when there are fewer than `n_months` distinct datetimes.
    """
    if value_columns is None:
        value_columns = ['alt']
    elif isinstance(value_columns, str):
        value_columns = [value_columns]
    else:
        value_columns = list(value_columns)

    # Pivot the dataframe (the datetime index of the result is unique and sorted)
    pivot_df = df.pivot_table(index='datetime', columns=['lat', 'lon'], values=value_columns)

    # NaN cells become 0 once for the whole grid instead of once per window.
    # (Turning zeros into NaN first would be a no-op, since they map back to 0.)
    grid = np.nan_to_num(pivot_df.to_numpy(), nan=0)

    sequences = []
    for start in range(grid.shape[0] - n_months + 1):
        window = grid[start:start + n_months]
        if window.shape[0] == n_months:
            sequences.append(window)

    return np.array(sequences)

In [None]:
alt=pivot_to_sequences(alt)

In [None]:
pd.DataFrame(alt[0,0,:]).replace(0,np.nan).dropna()

In [None]:
# Parse timestamps, then collapse duplicate (datetime, lat, lon) observations
# to their mean; reset_index leaves a default 0..n-1 index.
alt['datetime'] = pd.to_datetime(alt['datetime'])
alt = alt.groupby(['datetime', 'lat', 'lon']).agg({'alt': 'mean'}).reset_index()

# Rows whose alt rounds (4 decimals) to exactly 0, or is NaN, are dropped from
# this helper frame; its remaining index labels mark the rows to keep.
filtered_alt = pd.DataFrame({'alt': alt.alt.values.round(4)}).replace(0, np.nan).dropna()
# Index-on-index inner merge keeps only those rows. The clashing 'alt' columns
# are suffixed: 'alt_x' = original value, 'alt_y' = rounded value.
filtered_df = pd.merge(alt, filtered_alt, left_index=True, right_index=True)
aggregated_df = filtered_df.groupby(['datetime', 'lat', 'lon']).agg({'alt_x': 'mean'}).reset_index()
aggregated_df.rename(columns={'alt_x': 'alt'}, inplace=True)
aggregated_df

In [None]:
# grid_resolution = 1.0  # km

# aggregated_df['lat_bin'] = np.floor(aggregated_df['lat'] / grid_resolution)
# aggregated_df['lon_bin'] = np.floor(aggregated_df['lon'] / grid_resolution)

# tensor_df = aggregated_df.pivot_table(
#     index=['lat_bin', 'lon_bin', 'datetime'],
#     values=['alt'],
#     aggfunc='mean'
# ).fillna(0)  # Replace NaN with 0 (you can choose another strategy here)

In [None]:
# tensor_df.stack()

In [None]:
# # Converting to 5D tensor (samples, timesteps, height, width, channels)
# # Assuming the dataset is sorted by datetime, we can reshape directly
# tensor = tensor_df.values.reshape(
#     (-1, len(aggregated_df['datetime'].unique()), len(aggregated_df['lat_bin'].unique()), \
#      len(aggregated_df['lon_bin'].unique()), 3))

In [None]:
filtered_df

In [None]:
aggregated_df.alt.plot();

In [None]:
aggregated_df=aggregated_df.set_index('datetime').resample('M').mean().reset_index()
aggregated_df.alt.plot();
aggregated_df

In [None]:
#SHAPE, ALT
# aggregated_df[:int(3565*0.7)]
# aggregated_df[int(3565*0.7):(int(3565*0.7)+int(3565*.15))]
# aggregated_df[int(3565*0.7)+int(3565*.15):]

In [None]:
#SHAPE, FCH4
# aggregated_df[:int(317*0.7)]
# aggregated_df[(int(317*0.7)):(int(317*0.7)+int(317*.15))]
# aggregated_df[int(317*0.7)+int(317*.15):]

# aggregated_df[:int(317*0.7)]
# aggregated_df[int(317*0.7):(int(317*0.7)+int(317*.15))]
# aggregated_df[int(317*0.7)+int(317*.15):];

In [None]:
#SHAPE, FC
#aggregated_df[:int(319*0.7)]
#aggregated_df[(int(319*0.7)):(int(319*0.7)+int(319*.15))]
#aggregated_df[int(319*0.7)+int(319*.15):]

# aggregated_df[:int(319*0.7)]
# aggregated_df[int(319*0.7):(int(319*0.7)+int(319*.15))]
# aggregated_df[int(319*0.7)+int(319*.15):];

In [None]:
aggregated_df['year'] = aggregated_df['datetime'].dt.year
aggregated_df['month'] = aggregated_df['datetime'].dt.month

In [None]:
# Indexing a frame with a boolean frame is elementwise: cells where the mask is
# False are set to NaN and no rows are removed. The mask is False only where
# the cell is already NaN, so the values come out unchanged.
aggregated_df=aggregated_df[aggregated_df.isna()!=True]
# The index is renumbered before sorting, so after the sort it follows the
# pre-sort row order rather than running 0..n-1.
aggregated_df=aggregated_df.reset_index(drop=True)
aggregated_df=aggregated_df.sort_values(by='datetime')
# Per-column NaN counts.
print("Checking for NaN values before processing:")
print(aggregated_df.isna().sum())

In [None]:
aggregated_df = aggregated_df.dropna(subset=['lat', 'lon', 'alt'])
aggregated_df

In [None]:
# Elementwise boolean-frame indexing: it blanks cells where the mask is False
# (only cells that are already NaN) and does not drop any rows.
aggregated_df=aggregated_df[aggregated_df.isna()!=True]
# Renumber the index, then order the rows chronologically (the index labels
# keep their pre-sort numbering).
aggregated_df=aggregated_df.reset_index(drop=True)
aggregated_df=aggregated_df.sort_values(by='datetime')
# Per-column NaN counts.
print("Checking for NaN values before processing:")
print(aggregated_df.isna().sum())

In [None]:
aggregated_df

In [None]:
# #SHAPE, ALT
# aggregated_df[:int(3241*0.7)]
# aggregated_df[int(3241*0.7):(int(3241*0.7)+int(3241*.15))]
# aggregated_df[int(3241*0.7)+int(3241*.15):];

In [None]:
#SHAPE, FC
# aggregated_df[:int(244*0.7)]
# aggregated_df[int(244*0.7):(int(244*0.7)+int(244*.15))]
# aggregated_df[int(244*0.7)+int(244*.15):];

In [None]:
#SHAPE, FCH4
#aggregated_df[:int(226*0.7)]
#aggregated_df[int(226*0.7):(int(226*0.7)+int(226*.15))]
#aggregated_df[int(226*0.7)+int(226*.15):]

In [None]:
#unique_dts = aggregated_df['datetime'].unique()
unique_years = np.sort(aggregated_df['year'].unique())
unique_months = np.sort(aggregated_df['month'].unique())
unique_lats = np.sort(aggregated_df['lat'].unique())
unique_lons = np.sort(aggregated_df['lon'].unique())

In [None]:
# Check for NaN values in the merged dataframe
print(aggregated_df.isna().sum())

# Fill or handle NaN values appropriately
aggregated_df.fillna(0, inplace=True)  # Example: filling NaN with 0

In [None]:
aggregated_df.interpolate(method='linear', inplace=True)

In [None]:
# Warn if the 'alt' column contains negative values before the transform.
min_value = aggregated_df[['alt']].min()
has_negative = bool((min_value < 0).any())
if has_negative:
    print("Warning: Negative values found before log transformation!")

# Shift the column so its minimum becomes 1, then apply log1p, i.e.
# log(alt - min(alt) + 2); the argument is always positive.
alt_shifted = aggregated_df['alt'] - aggregated_df['alt'].min() + 1
aggregated_df['alt'] = np.log1p(alt_shifted)

In [None]:
ndarray = np.zeros((len(unique_years), len(unique_months), len(unique_lats), len(unique_lons)))

# Create mappings from years, months, latitudes, and longitudes to indices
year_to_idx = {year: i for i, year in enumerate(unique_years)}
month_to_idx = {month: i for i, month in enumerate(unique_months)}
lat_to_idx = {lat: i for i, lat in enumerate(unique_lats)}
lon_to_idx = {lon: i for i, lon in enumerate(unique_lons)}

In [None]:
# Populate the ndarray with alt values.
# Each *_to_idx mapping already yields a 0-based position, so the looked-up
# month index is used as-is; subtracting 1 again would write the first month
# to index -1 (the last slot) and shift every other month down by one.
for year, month, lat, lon, value in zip(
    aggregated_df['year'],
    aggregated_df['month'],
    aggregated_df['lat'],
    aggregated_df['lon'],
    aggregated_df['alt'],
):
    ndarray[year_to_idx[year], month_to_idx[month], lat_to_idx[lat], lon_to_idx[lon]] = value

In [None]:
ndarray = ndarray[..., np.newaxis]
print(f"Final ndarray shape: {ndarray.shape}")

In [None]:
# 70 / 15 / 15 split along the first axis of the 5-D cube, kept in order.
n_timesteps = ndarray.shape[0]
train_size = int(0.7 * n_timesteps)
valid_size = int(0.15 * n_timesteps)
test_size = n_timesteps - train_size - valid_size

valid_end = train_size + valid_size
X_train, X_valid, X_test = (
    ndarray[:train_size],
    ndarray[train_size:valid_end],
    ndarray[valid_end:],
)
# Targets are the inputs with the trailing channel axis indexed away.
y_train, y_valid, y_test = (
    X_train[:, :, :, :, 0],
    X_valid[:, :, :, :, 0],
    X_test[:, :, :, :, 0],
)

In [None]:
X_train.shape, X_valid.shape, X_test.shape

In [None]:
y_train.shape, y_valid.shape, y_test.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler

# A single MinMaxScaler shared by all three splits: it is fitted on the
# training data only and then applied unchanged to validation and test.
scaler = MinMaxScaler()

# FOR ALT
# Flatten each split to one column so a single global min/max is learned
X_train_flat = X_train.reshape(-1, 1)
X_valid_flat = X_valid.reshape(-1, 1)
X_test_flat = X_test.reshape(-1, 1)
# Fit the scaler on the training data and transform all datasets, restoring
# each split's original shape afterwards.
# NOTE(review): the *_scaled arrays are not what the dataset-building cell
# that follows consumes (it uses X_train / X_valid / X_test) — confirm whether
# the scaled inputs were meant to be used.
X_train_scaled = scaler.fit_transform(X_train_flat).reshape(X_train.shape)
X_valid_scaled = scaler.transform(X_valid_flat).reshape(X_valid.shape)
X_test_scaled = scaler.transform(X_test_flat).reshape(X_test.shape)
#uncertainty = np.abs(1/alt.alt.std())

In [None]:
# Pair every input cube with its target and expose the splits as tf.data pipelines.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

batch_size = 1
autotune = tf.data.experimental.AUTOTUNE
# Only the training stream is shuffled. Validation and test keep their
# original order so that the rows returned by `predict(test_dataset)` line up
# index-for-index with `y_test` when they are compared afterwards.
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size).prefetch(autotune)
valid_dataset = valid_dataset.batch(batch_size).prefetch(autotune)
test_dataset = test_dataset.batch(batch_size).prefetch(autotune)

# train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(1)
# valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(1)
# test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(1)
# # train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train)).batch(32)
# # valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid_scaled, y_valid)).batch(32)
# # test_dataset = tf.data.Dataset.from_tensor_slices((X_test_scaled, y_test)).batch(32)

train_dataset.element_spec[0].shape, valid_dataset.element_spec[0].shape, test_dataset.element_spec[0].shape

# Example of iterating through the dataset
for batch in train_dataset.take(1):
    X_train_batch, y_train_batch = batch
    print(X_train_batch.shape, y_train_batch.shape)  # This should be the batched shapes
for batch in valid_dataset.take(1):
    X_valid_batch, y_valid_batch = batch
    print(X_valid_batch.shape, y_valid_batch.shape)  # This should be the batched shapes
for batch in test_dataset.take(1):
    X_test_batch, y_test_batch = batch
    print(X_test_batch.shape, y_test_batch.shape)  # This should be the batched shapes

In [None]:
# import tensorflow as tf
# from tensorflow.keras import layers, models

# # Build the model
# def model(hp):
#     model = models.Sequential()
    
#     # ConvLSTM2D layer
#     model.add(layers.ConvLSTM2D(
#         filters=32,
#         kernel_size=(3, 3),
#         activation='relu',
#         padding='same',
#         return_sequences=True,
#         #input_shape=(12, 559, 561, 1)
#         input_shape=(12, 186, 186, 1)
#     ))
    
#     # Another ConvLSTM2D layer
#     model.add(layers.ConvLSTM2D(
#         filters=32,
#         kernel_size=(3, 3),
#         activation='relu',
#         padding='same',
#         return_sequences=True
#     ))
    
#     model.add(layers.BatchNormalization())
        
#     model.add(layers.Dropout(0.2))
    
#     model.add(layers.ConvLSTM2D(
#         filters=hp.Int('filters_2', min_value=16, max_value=64, step=16), 
#         kernel_size=hp.Choice('kernel_size_2', values=[1, 3]),
#         padding='same', 
#         return_sequences=True,
#         activation=hp.Choice('activation_function_2', values=['relu','sigmoid', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'swish','leaky_relu']), #'tanh','softmax',
#     ))
        
#     model.add(layers.BatchNormalization())
        
#     model.add(layers.Dropout(0.2))
    
#     # Final Conv3D layer to match the output dimensions
#     model.add(layers.Conv3D(
#         filters=1,  # Single channel output
#         kernel_size=(3, 3, 3),
#         activation='linear',
#         padding='same'
#     ))
    
#     # Compile the model
#     model.compile(optimizer='adam', loss='mse')
    
#     # Summary of the model
#     model.summary()

In [None]:
# from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# # Callbacks setup
# callbacks = [
#     ModelCheckpoint('model_checkpoint.keras', save_best_only=True),
#     EarlyStopping(monitor='val_loss', patience=5)
# ]

In [None]:
# # Train the model
# history = model.fit(
#     train_dataset, 
#     epochs=5, 
#     validation_data=valid_dataset, 
#     callbacks=callbacks
# )

In [None]:
# plt.plot(history.history['loss']);
# #plt.plot(history.history['val_loss']);

In [None]:
# plt.plot(history.history['loss']);
# #plt.plot(history.history['val_loss']);

In [None]:
#aggregated_df[168:]
#aggregated_df[168:168+36]
aggregated_df[168+36:]

In [None]:
X_train.shape

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam

def build_model(hp):
    """
    Build and compile the tunable ConvLSTM network used by the hyperparameter search.

    Architecture: two ConvLSTM2D blocks (each followed by batch normalisation
    and 20% dropout, both returning full sequences) and a single-filter Conv3D
    output layer. Filters, kernel sizes and activations are drawn from `hp`.

    Parameters:
    - hp: keras-tuner hyperparameter container; it registers 'filters_N',
      'kernel_size_N' and 'activation_function_N' in the order they are read below.

    Returns:
    - The compiled Sequential model (its summary is printed as a side effect).
    """
    activation_options = ('relu', 'sigmoid', 'gelu', 'elu', 'linear', 'selu', 'swish', 'leaky_relu')  # 'tanh','softmax' excluded

    # Hyperparameters are read in the same order the layers are stacked.
    filters_1 = hp.Int('filters_1', min_value=16, max_value=64, step=16)
    kernel_1 = hp.Choice('kernel_size_1', values=[1, 3])
    activation_1 = hp.Choice('activation_function_1', values=list(activation_options))

    filters_2 = hp.Int('filters_2', min_value=16, max_value=64, step=16)
    kernel_2 = hp.Choice('kernel_size_2', values=[1, 3])
    activation_2 = hp.Choice('activation_function_2', values=list(activation_options))

    activation_3 = hp.Choice('activation_function_3', values=list(activation_options))

    model = models.Sequential([
        # Input: variable-length sequences of 559 x 561 single-channel frames.
        layers.ConvLSTM2D(
            filters=filters_1,
            kernel_size=kernel_1,
            input_shape=(None, 559, 561, 1),
            padding='same',
            return_sequences=True,
            activation=activation_1,
        ),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        layers.ConvLSTM2D(
            filters=filters_2,
            kernel_size=kernel_2,
            padding='same',
            return_sequences=True,
            activation=activation_2,
        ),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        # Collapse the feature maps to one output channel per time step.
        layers.Conv3D(
            filters=1,
            kernel_size=(3, 3, 3),
            activation=activation_3,
            padding='same',
            data_format='channels_last',
        ),
    ])

    # Gradient values are clipped to [-1, 1] by the optimizer.
    model.compile(
        optimizer=Adam(clipvalue=1.0),
        loss='mse',
        metrics=['mae', 'mse', 'mape', 'accuracy'],
    )

    model.summary()

    return model

In [None]:
#batch_size = 32  # Adjust based on memory and GPU capability
#train_dataset = train_dataset.cache().shuffle(1000).batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
#valid_dataset = valid_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
#test_dataset = test_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Define callbacks (all keyed on validation loss):
# - stop after 5 epochs without improvement and roll back to the best weights
# - halve the learning rate after 5 epochs without improvement
# - write 'model_checkpoint.keras' only when the monitored value improves
# NOTE(review): early stopping and the LR scheduler share patience=5, so the
# learning-rate cut may never get an epoch to take effect — confirm intent.
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
checkpoint_cb = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

In [None]:
from keras_tuner import BayesianOptimization

# Define the tuner: Bayesian optimisation over build_model's hyperparameters,
# minimising validation loss across at most 10 trials (one model fit per trial).
# Trial state is stored under <directory>/<project_name>; with overwrite=False
# an existing project at that location is reused rather than cleared.
tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    #directory='/Volumes/JPL/alt_train_new',
    directory='/Volumes/JPL/altnew_train_new',
    #project_name='alt_train_conv3dlstm_tuning',
    project_name='alt_train_conv3dlstm_tuning',
    overwrite=False,
)

# Print the registered hyperparameters and their ranges.
tuner.search_space_summary()

In [None]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# from tensorflow.keras.mixed_precision import set_global_policy
# set_global_policy('float32')

#X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1, X_train.shape[3]))
#X_valid = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1, 1, X_valid.shape[3]))
#X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1, X_test.shape[3]))
#tuner.search(train_dataset, epochs=10, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb])
#tuner.search(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
tuner.search(train_dataset, epochs=10, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
best_model = tuner.get_best_models(num_models=1)[0]

In [None]:
#plot_model(best_model, to_file='/Volumes/JPL/alt_model.png', show_shapes=True, show_layer_names=True, dpi=300)
plot_model(best_model, to_file='/Volumes/JPL/co2_model.png', show_shapes=True, show_layer_names=True, dpi=300)

In [None]:
best_model.summary()

In [None]:
tuner.results_summary()

In [None]:
# Evaluate the best model on the test data
test_loss = best_model.evaluate(test_dataset)
print(f'Test Loss: {test_loss}')

In [None]:
# Fresh callback instances for the final training run (all watch val_loss).
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
checkpoint_cb = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

# Continue training the tuner's best model for up to 100 epochs.
# NOTE(review): checkpoint_cb is created above but is not in the callbacks
# list, so no checkpoint file is written by this fit — confirm intent.
history = best_model.fit(train_dataset, validation_data=valid_dataset, epochs=100, callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
#history = best_model.fit(train_dataset, epochs=10, validation_data=valid_dataset, callbacks=[tensorboard_callback])

In [None]:
# Score the trained model on the held-out test split.
# evaluate() returns the loss followed by every compiled metric, so unpacking
# it into two names fails for a model compiled with more than one metric;
# the values are read by name from a dict instead.
final_metrics = best_model.evaluate(test_dataset, return_dict=True)
final_test_loss = final_metrics['loss']
final_test_mae = final_metrics['mae']
print(f"Final Test Loss: {final_test_loss}, Final Test MAE: {final_test_mae}")

In [None]:
best_model.evaluate(train_dataset)

In [None]:
predictions = best_model.predict(test_dataset)

In [None]:
# If using a scaler, inverse transform the predictions and actual values:
# flatten to one column, undo the min-max scaling, restore the original shape.
# NOTE(review): this only returns original units if the model's inputs and
# targets were scaled with this same `scaler`; if the datasets were built from
# the unscaled arrays, this step distorts both arrays — confirm.
predictions_original = scaler.inverse_transform(predictions.reshape(-1, 1)).reshape(predictions.shape)
actual_values_original = scaler.inverse_transform(y_test.reshape(-1, 1)).reshape(y_test.shape)

In [None]:
# Compare first few predictions with actual values.
# The predictions keep a trailing channel axis that the targets do not have,
# so `[i, ..., 0]` removes that last axis for sample i and the target is
# taken whole; a fixed `[i, :, :, 0]` would select a spatial column instead.
# The count is capped so a short test split cannot raise an IndexError.
n_preview = min(4, len(predictions_original), len(actual_values_original))
for i in range(n_preview):
    print(f"Prediction {i+1}: {predictions_original[i, ..., 0]}")
    print(f"Actual Value {i+1}: {actual_values_original[i]}")

In [None]:
predictions_original.shape, actual_values_original.shape

In [None]:
import matplotlib.pyplot as plt

# Side-by-side maps of the first test sample at sequence position 7:
# model output on the left, ground truth on the right.
panels = (
    ('Predicted (Original Scale)', predictions_original[0, 7, :, :, 0]),
    ('Actual (Original Scale)', actual_values_original[0, 7, :, :]),
)

plt.figure(figsize=(12, 6))
for position, (panel_title, frame) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(panel_title)
    plt.imshow(frame, cmap='plasma')
    plt.colorbar()

plt.show()

In [None]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras import backend as K

def model_with_dropout(hp):
    """
    Build and compile a tunable ConvLSTM network with heavy dropout.

    Three ConvLSTM2D blocks (each followed by batch normalisation and 50%
    dropout) feed a single-filter 1x1x1 Conv3D output layer. The dropout layers
    are what Monte Carlo sampling relies on when the model is called in
    training mode at prediction time.

    Parameters:
    - hp: keras-tuner hyperparameter container supplying 'filters_N',
      'kernel_size_N' and 'activation_function_N'.

    Returns:
    - The compiled Sequential model, emitting one channel per time step.
    """
    model = Sequential()

    # ConvLSTM2D layer to process the sequence data
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_1', min_value=16, max_value=64, step=16),
        kernel_size=hp.Choice('kernel_size_1', values=[1, 3]),
        padding='same',
        return_sequences=True,
        activation=hp.Choice('activation_function_1', values=['relu','sigmoid', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'swish','leaky_relu']), #'tanh','softmax',
        input_shape=(12, 165, 165, 1),
    ))

    model.add(BatchNormalization())
    model.add(Dropout(0.5))  # Dropout layer for uncertainty

    # Another ConvLSTM2D layer
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_2', min_value=16, max_value=64, step=16),
        kernel_size=hp.Choice('kernel_size_2', values=[1, 3]),
        padding='same',
        activation=hp.Choice('activation_function_2', values=['relu','sigmoid', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'swish','leaky_relu']), #'tanh','softmax',
        return_sequences=True
    ))

    model.add(BatchNormalization())
    model.add(Dropout(0.5))  # Dropout layer for uncertainty

    # Final ConvLSTM2D layer. It must keep the time axis (return_sequences=True):
    # the Conv3D head below needs a 5-D (batch, time, height, width, channels)
    # input, and dropping the sequence would hand it a 4-D tensor.
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_3', min_value=16, max_value=64, step=16),
        kernel_size=hp.Choice('kernel_size_3', values=[1, 3]),
        padding='same',
        activation=hp.Choice('activation_function_3', values=['relu','sigmoid', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'swish','leaky_relu']), #'tanh','softmax',
        return_sequences=True
    ))

    model.add(BatchNormalization())
    model.add(Dropout(0.5))  # Dropout layer for uncertainty

    # Output head. It reuses the 'activation_function_3' hyperparameter name,
    # so it shares that hyperparameter with the layer above.
    model.add(layers.Conv3D(
        filters=1,  # Single channel output
        kernel_size=(1, 1, 1),
        activation=hp.Choice('activation_function_3', values=['relu','sigmoid', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'swish','leaky_relu']), #'tanh','softmax',
        padding='same',
        data_format='channels_last'
    ))

    # # Conv2D to reduce dimensions
    # model.add(Conv2D(
    #     filters=1,
    #     kernel_size=1,
    #     activation='sigmoid',
    #     padding='same'))

    # model.add(Flatten())
    # model.add(Dense(1, activation='linear'))

    # `learning_rate` is the supported argument name; the old `lr` alias is
    # rejected by current Keras optimizers.
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='mse', metrics=['mae','mse','accuracy'])

    return model

In [None]:
# Using Monte Carlo Dropout
def monte_carlo_prediction(model, x_data, n_samples=50):
    """
    Estimate a prediction and its uncertainty with Monte Carlo dropout.

    The model is called `n_samples` times in training mode so its dropout
    layers stay active and every pass is a different stochastic sample.
    (`model.predict` runs in inference mode, where dropout is disabled, so
    repeated calls would be identical and the spread always zero.)

    Parameters:
    - model: A callable Keras-style model accepting `training=True`.
    - x_data: Input batch as an array/tensor; it is passed to the model whole,
      so it must fit in memory for a single forward pass.
    - n_samples: Number of stochastic forward passes.

    Returns:
    - (mean_prediction, uncertainty): the per-element mean and standard
      deviation across the passes, each shaped like one model output.

    Note: training mode also makes batch-normalisation layers use batch statistics.
    """
    samples = np.stack([
        np.asarray(model(x_data, training=True)) for _ in range(n_samples)
    ])

    # Mean and spread across the sampling axis
    mean_prediction = samples.mean(axis=0)
    uncertainty = samples.std(axis=0)

    return mean_prediction, uncertainty

In [None]:
zip(*[monte_carlo_prediction(model, X_test, n_samples=50) for model in models])

In [None]:
tunerX.oracle.trials.values()

In [None]:
def ensemble_predictions(models, x_data):
    """Combine the predictions of several models on the same inputs.

    Parameters:
    - models: Iterable of objects exposing ``predict(x_data)``.
    - x_data: Inputs forwarded unchanged to every model.

    Returns:
    - (mean_prediction, uncertainty): element-wise mean and standard
      deviation taken across the models.
    """
    member_outputs = []
    for member in models:
        member_outputs.append(member.predict(x_data))

    # Axis 0 indexes the ensemble members.
    stacked = np.array(member_outputs)
    return np.mean(stacked, axis=0), np.std(stacked, axis=0)

In [None]:
def aggregate_ensemble_uncertainty(mean_preds, uncertainties):
    """
    Pool per-model predictions and uncertainties into one estimate.

    Parameters:
    - mean_preds: Sequence of mean predictions, one entry per ensemble model.
    - uncertainties: Sequence of uncertainties, one entry per ensemble model.

    Returns:
    - (combined_mean, combined_uncertainty): the element-wise average of the
      means, and the root of the averaged squared uncertainties.
    """
    n_members = len(uncertainties)
    squared = np.square(uncertainties)
    # Root-mean-square of the member uncertainties along the model axis.
    pooled = np.sqrt(np.sum(squared, axis=0) / n_members)
    averaged = np.mean(mean_preds, axis=0)
    return averaged, pooled

In [None]:
# Assuming models is a list of trained models
# Each model contributes one (mean, uncertainty) pair; zip(*) regroups them
# into a tuple of means and a tuple of uncertainties before pooling.
mean_preds, uncertainties = zip(*[monte_carlo_prediction(model, X_test, n_samples=50) for model in models])
combined_mean, combined_uncertainty = aggregate_ensemble_uncertainty(mean_preds, uncertainties)

In [None]:
# import tensorflow as tf
# import numpy as np
# import matplotlib.pyplot as plt

# # Function to compute Grad-CAM
# def make_gradcam_heatmap(img_array, model, last_conv_layer_name, classifier_layer_names):
#     grad_model = tf.keras.models.Model(
#         [model.inputs], [model.get_layer(last_conv_layer_name).output, model.output]
#     )
#     with tf.GradientTape() as tape:
#         conv_outputs, predictions = grad_model(img_array)
#         loss = predictions[:, tf.argmax(predictions[0])]
#     grads = tape.gradient(loss, conv_outputs)
#     pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))
#     conv_outputs = conv_outputs[0]
#     heatmap = conv_outputs @ pooled_grads[..., tf.newaxis]
#     heatmap = tf.squeeze(heatmap)
#     heatmap = tf.maximum(heatmap, 0) / tf.math.reduce_max(heatmap)
#     return heatmap.numpy()
    
# # Assume the model is already defined and trained
# img = np.random.random((1, 559, 561, 1))  # Example 2D slice of input
# last_conv_layer_name = "conv3d_layer"  # Replace with your Conv3D layer name
# classifier_layer_names = ["dense"]  # Replace with your dense layers

# heatmap = make_gradcam_heatmap(img, model, last_conv_layer_name, classifier_layer_names)

# # Plot the heatmap
# plt.matshow(heatmap)
# plt.show()

In [None]:
# import pandas as pd

# # Collect the layer information
# layer_info = []
# for layer in model.layers:
#     layer_info.append({
#         "Layer Name": layer.name,
#         "Layer Type": layer.__class__.__name__,
#         "Input Shape": layer.input_shape,
#         "Output Shape": layer.output_shape,
#         "Number of Parameters": layer.count_params(),
#         "Activation": layer.activation.__name__ if hasattr(layer, 'activation') else 'N/A'
#     })

# # Create a DataFrame
# df = pd.DataFrame(layer_info)
# print(df)

# # Optionally, save it to a CSV
# df.to_csv("model_summary.csv", index=False)

In [None]:
# NOTE(review): the five constants below are not referenced anywhere in this
# cell; the model's input shape is taken from X instead.
num_steps = 50
lats = 128
lons = 128
features = 4
out_feats = 3

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, Dense, Flatten

# Example: ConvLSTM2D Model
# One ConvLSTM2D layer, flattened and reduced to a single linear output.
# NOTE(review): X and Y must already exist in the kernel; Dense(1) emits one
# scalar per sample — confirm Y has a matching shape.
model = Sequential([
    ConvLSTM2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=(X.shape[1], X.shape[2], X.shape[3], X.shape[4])),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# validation_split=0.2 holds out the last 20% of the samples for validation.
model.fit(X, Y, epochs=10, batch_size=32, validation_split=0.2)

In [None]:
# Chronological 70 / 15 / 15 split along axis 0 (no shuffling); the test part
# receives whatever remains after the integer truncation of the other two.
n_timesteps = ndarray.shape[0]
train_size = int(0.7 * n_timesteps)
valid_size = int(0.15 * n_timesteps)
test_size = n_timesteps - train_size - valid_size

X_train, X_valid, X_test = (
    ndarray[:train_size],
    ndarray[train_size:train_size + valid_size],
    ndarray[train_size + valid_size:],
)
# Targets are index 0 of the last axis of the corresponding inputs.
y_train, y_valid, y_test = (
    part[:, :, :, :, 0] for part in (X_train, X_valid, X_test)
)

# One sample per batch for every split.
train_dataset, valid_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(pair).batch(1)
    for pair in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

# X_train = tf.convert_to_tensor(X_train, dtype=tf.float32)
# y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
# X_valid = tf.convert_to_tensor(X_valid, dtype=tf.float32)
# y_valid = tf.convert_to_tensor(y_valid, dtype=tf.float32)
# X_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
# y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)

# X_train = X_train[..., np.newaxis]  # Shape becomes (batch_size, time_steps, rows, cols, 1)
# X_valid = X_valid[..., np.newaxis]
# X_test = X_test[..., np.newaxis]
# y_train = y_train[..., np.newaxis]
# y_valid = y_valid[..., np.newaxis]
# y_test = y_test[..., np.newaxis]

# X_train = tf.transpose(X_train, perm=[0, 4, 1, 2, 3])  # Shape becomes (batch_size, 1, rows, cols, channels)
# X_valid = tf.transpose(X_valid, perm=[0, 4, 1, 2, 3])
# X_test = tf.transpose(X_test, perm=[0, 4, 1, 2, 3])

# train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))#.batch(32)
# valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))#.batch(32)
# test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))#.batch(32)

# train_dataset = train_dataset.batch(32)
# valid_dataset = valid_dataset.batch(32)
# test_dataset = test_dataset.batch(32)

# #X = ndarray[..., :-1]
# #y = ndarray[..., -1:]
# #X = X.reshape(X.shape[0], X.shape[1], X.shape[2], X.shape[3])
# #y = y.reshape(y.shape[0], y.shape[1], y.shape[2], y.shape[3])

# #X_tensor = tf.convert_to_tensor(X, dtype=tf.float32)
# #y_tensor = tf.convert_to_tensor(y, dtype=tf.float32)
# #dataset = tf.data.Dataset.from_tensor_slices((X_tensor, y_tensor))
# #batch_size = 32
# #dataset = dataset.shuffle(buffer_size=len(X)).batch(batch_size)
# #alt_dataset = dataset#.take(train_size)
# #alt_dataset

In [None]:
#train_dataset
# Print the first (inputs, targets) batch as a sanity check.
for data in train_dataset.take(1):
    print(data)

In [None]:
#valid_dataset
# Print the first (inputs, targets) batch as a sanity check.
for data in valid_dataset.take(1):
   print(data)

In [None]:
#test_dataset
# Print the first (inputs, targets) batch as a sanity check.
for data in test_dataset.take(1):
  print(data)

In [None]:
# Shape check of the gridded array currently bound to `ndarray`.
print(ndarray.shape)  # Ensure this outputs the correct shape

In [None]:
# filtered_alt = pd.DataFrame({'alt': alt_valid.alt.values.round(4)}).replace(0, np.nan).dropna()
# filtered_df = pd.merge(alt_valid, filtered_alt, left_index=True, right_index=True)
# aggregated_df = filtered_df.groupby(['datetime', 'lat', 'lon']).agg({'alt_x': 'mean'}).reset_index()
# aggregated_df.rename(columns={'alt_x': 'alt'}, inplace=True)
# pivot_df = aggregated_df.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
# unique_dts = aggregated_df['datetime'].unique()
# unique_lats = np.sort(aggregated_df['lat'].unique())
# unique_lons = np.sort(aggregated_df['lon'].unique())
# ndarray = np.zeros((len(unique_dts), len(unique_lats), len(unique_lons)))
# lat_to_idx = {lat: i for i, lat in enumerate(unique_lats)}
# lon_to_idx = {lon: i for i, lon in enumerate(unique_lons)}
# dt_to_idx = {dt: i for i, dt in enumerate(unique_dts)}
# for _, row in aggregated_df.iterrows():
#     dt_idx = dt_to_idx[row['datetime']]
#     lat_idx = lat_to_idx[row['lat']]
#     lon_idx = lon_to_idx[row['lon']]
#     ndarray[dt_idx, lat_idx, lon_idx] = row['alt']
# ndarray = ndarray[..., np.newaxis]
# X = ndarray[..., :-1]
# y = ndarray[..., -1:]
# X = X.reshape(X.shape[0], X.shape[1], X.shape[2], X.shape[3])
# y = y.reshape(y.shape[0], y.shape[1], y.shape[2], y.shape[3])
# X_tensor = tf.convert_to_tensor(X, dtype=tf.float32)
# y_tensor = tf.convert_to_tensor(y, dtype=tf.float32)
# dataset = tf.data.Dataset.from_tensor_slices((X_tensor, y_tensor))
# batch_size = 32
# dataset = dataset.shuffle(buffer_size=len(X)).batch(batch_size)
# valid_dataset = dataset#.take(train_size)
# valid_dataset

In [None]:
# Shape check of the gridded array currently bound to `ndarray`.
print(ndarray.shape)  # Ensure this outputs the correct shape

In [None]:
# filtered_alt = pd.DataFrame({'alt': alt_test.alt.values.round(4)}).replace(0, np.nan).dropna()
# filtered_df = pd.merge(alt_test, filtered_alt, left_index=True, right_index=True)
# aggregated_df = filtered_df.groupby(['datetime', 'lat', 'lon']).agg({'alt_x': 'mean'}).reset_index()
# aggregated_df.rename(columns={'alt_x': 'alt'}, inplace=True)
# pivot_df = aggregated_df.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
# unique_dts = aggregated_df['datetime'].unique()
# unique_lats = np.sort(aggregated_df['lat'].unique())
# unique_lons = np.sort(aggregated_df['lon'].unique())
# ndarray = np.zeros((len(unique_dts), len(unique_lats), len(unique_lons)))
# lat_to_idx = {lat: i for i, lat in enumerate(unique_lats)}
# lon_to_idx = {lon: i for i, lon in enumerate(unique_lons)}
# dt_to_idx = {dt: i for i, dt in enumerate(unique_dts)}
# for _, row in aggregated_df.iterrows():
#     dt_idx = dt_to_idx[row['datetime']]
#     lat_idx = lat_to_idx[row['lat']]
#     lon_idx = lon_to_idx[row['lon']]
#     ndarray[dt_idx, lat_idx, lon_idx] = row['alt']
# ndarray = ndarray[..., np.newaxis]
# X = ndarray[..., :-1]
# y = ndarray[..., -1:]
# X = X.reshape(X.shape[0], X.shape[1], X.shape[2], X.shape[3])
# y = y.reshape(y.shape[0], y.shape[1], y.shape[2], y.shape[3])
# X_tensor = tf.convert_to_tensor(X, dtype=tf.float32)
# y_tensor = tf.convert_to_tensor(y, dtype=tf.float32)
# dataset = tf.data.Dataset.from_tensor_slices((X_tensor, y_tensor))
# batch_size = 32
# dataset = dataset.shuffle(buffer_size=len(X)).batch(batch_size)
# test_dataset = dataset#.take(train_size)
# test_dataset

In [None]:
# Shape check of the gridded array currently bound to `ndarray`.
print(ndarray.shape)  # Ensure this outputs the correct shape

In [None]:
# Display the input shapes of the three splits.
X_train.shape, X_valid.shape, X_test.shape

In [None]:
# Display the target shapes of the three splits.
y_train.shape, y_valid.shape, y_test.shape

In [None]:
# Display the tensor spec of the input component of train_dataset.
train_dataset.element_spec[0]

In [None]:
# Baseline: with the ConvLSTM2D layer commented out, the network is only
# Flatten -> Dense(1), i.e. one scalar output per sample.
# NOTE(review): no input shape is declared, so the model is unbuilt when
# summary() is called below; depending on the Keras version this raises or
# prints unknown shapes — confirm.
model = tf.keras.Sequential([
    #tf.keras.layers.ConvLSTM2D(64, kernel_size=(3, 3), input_shape=(X_train.shape[1], X_train.shape[2], X_train.shape[3], 1), return_sequences=True),
    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

model.summary()

# Train the model using the training and validation datasets
history = model.fit(train_dataset, epochs=10, validation_data=valid_dataset)

# Evaluate the model on the test dataset
# evaluate() yields [loss, mae] here because exactly one metric was compiled.
test_loss, test_mae = model.evaluate(test_dataset)
print(f'Test Loss: {test_loss}, Test MAE: {test_mae}')

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, ConvLSTM2D, BatchNormalization, Conv2D, Flatten, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf

# Start from a clean Keras state, then declare the whole stack in one
# Sequential literal.
tf.keras.backend.clear_session()

model = models.Sequential([
    # Single ConvLSTM2D layer; return_sequences=False drops the time axis.
    layers.ConvLSTM2D(
        filters=32,
        kernel_size=(3, 3),
        padding="same",
        return_sequences=False,
        input_shape=(1, 363, 367, 1),  # timesteps, height, width, channels
    ),
    # Pins the feature map to (height, width, channels).
    layers.Reshape((363, 367, 32)),
    # Linear 3x3 convolution collapsing the 32 feature maps to one channel.
    layers.Conv2D(
        filters=1,
        kernel_size=(3, 3),
        activation="linear",
        padding="same",
    ),
])

def custom_loss(y_true, y_pred):
    """Return the squared error between targets and predictions, averaged over the last axis."""
    squared_error = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_error, axis=-1)

# Compile the model
# No `metrics` are passed, so the model reports the loss only.
model.compile(optimizer="adam", loss=custom_loss)

# Rebuild the three datasets with four samples per batch.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(4)
valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(4)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(4)

# Train the model
# NOTE(review): the model emits (batch, 363, 367, 1); confirm the y_* arrays
# have that exact shape, otherwise the subtraction in custom_loss may
# broadcast or fail.
history = model.fit(
    train_dataset,
    epochs=10,  # Reduce epochs for testing
    validation_data=valid_dataset
)

# model = Sequential([
#     layers.ConvLSTM2D(filters=32, kernel_size=(3, 3), input_shape=(1, 363, 367, 1), padding='same', return_sequences=True),
#     #layers.BatchNormalization(),
#     layers.ConvLSTM2D(filters=32, kernel_size=(3, 3), padding='same', return_sequences=True),
#     #layers.BatchNormalization(),
#     layers.ConvLSTM2D(filters=32, kernel_size=(3, 3), padding='same', return_sequences=False),
#     #layers.BatchNormalization(),
#     layers.Conv2D(filters=1, kernel_size=(3, 3), activation='linear', padding='same')
# ])

# Compile the model
#model.compile(optimizer='adam', loss='mse')#, metrics=['mae','mse','accuracy',\
                                            #         keras.metrics.RootMeanSquaredError(),\
                                             #        'mean_absolute_percentage_error','cosine_similarity'])

# def custom_loss(y_true, y_pred):
#     # Ensure that y_pred matches y_true's shape directly
#     return tf.reduce_mean(tf.square(y_true - y_pred), axis=-1)

# model.compile(optimizer="adam", loss=custom_loss)

# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Rebuild the three tf.data pipelines with eight samples per batch.
train_dataset, valid_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(pair).batch(8)
    for pair in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

In [None]:
# X_train = X_train[..., np.newaxis]  # Ensures that shape is (batch_size, time_steps, height, width, channels)
# y_train = y_train[..., np.newaxis]
# Display the input shapes of the three splits.
X_train.shape, X_valid.shape, X_test.shape

In [None]:
# Display the target shapes of the three splits.
y_train.shape, y_valid.shape, y_test.shape

In [None]:
# Check and print the shapes of the datasets
# Only the first batch of train_dataset is inspected.
for X_batch, y_batch in train_dataset.take(1):
    print(f'X_train batch shape: {X_batch.shape}')
    print(f'y_train batch shape: {y_batch.shape}')

In [None]:
# Training callbacks: stop after 5 epochs without val_loss improvement
# (restoring the best weights) and keep only the best checkpoint on disk.
# NOTE(review): ModelCheckpoint is given no `monitor`; it presumably falls
# back to the library default — confirm it tracks val_loss.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True)
]

In [None]:
# Set up callbacks for early stopping and model checkpointing
# callbacks = [
#     tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
#     tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss')
# ]
# callbacks = [
#     ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min'),
#     EarlyStopping(monitor='val_loss', patience=10, mode='min')
# ]

In [None]:
# Reset Keras' global state.
# NOTE(review): this runs after `model` was built and before it is trained
# again below — confirm that clearing the session here is intended.
tf.keras.backend.clear_session()

In [None]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# from tensorflow.keras.mixed_precision import set_global_policy
# set_global_policy('float32')

# train_dataset = train_dataset.batch(8)
# valid_dataset = valid_dataset.batch(8)

# Train the model
# Uses the `callbacks` list defined in an earlier cell (early stopping plus
# best-model checkpointing).
history = model.fit(
    train_dataset, 
    epochs=50,  # Increase this value if the model needs more training
    validation_data=valid_dataset, 
    callbacks=callbacks
)

In [None]:
# Evaluate the model on the test set
# evaluate() returns a bare scalar when the model was compiled without metrics
# and a list otherwise, so tuple-unpacking its result is fragile. Asking for a
# dict works for either compile configuration; the MAE is reported as None
# when it was not compiled in.
test_results = model.evaluate(test_dataset, return_dict=True)
test_loss = test_results['loss']
test_mae = test_results.get('mae')
print(f'Test Loss: {test_loss}, Test MAE: {test_mae}')

In [None]:
# Make predictions
# Runs the current `model` over every batch of test_dataset.
predictions = model.predict(test_dataset)

In [None]:
import keras_tuner as kt

def model_builder(hp):
    """Build and compile a ConvLSTM2D regressor for a KerasTuner search.

    Parameters:
    - hp: KerasTuner ``HyperParameters`` object used to sample the layer
      widths.

    Returns:
    - A compiled ``Sequential`` model (Adam optimizer, MSE loss, MAE metric).

    Note: reads ``input_shape`` from the enclosing (notebook) scope.
    """
    model = models.Sequential()
    
    # Each layer gets its own hyperparameter name. Re-using one name makes
    # KerasTuner hand back the first registered value, which would silently
    # tie all three layer widths together.
    # First ConvLSTM layer keeps the time axis for the next recurrent layer.
    model.add(layers.ConvLSTM2D(
        filters=hp.Int('filters_1', min_value=32, max_value=128, step=16),
        kernel_size=(3, 3), 
        input_shape=input_shape, 
        padding='same', 
        return_sequences=True))
    
    model.add(layers.BatchNormalization())
    
    # Second ConvLSTM layer collapses the time axis (return_sequences=False).
    model.add(layers.ConvLSTM2D(
        filters=hp.Int('filters_2', min_value=32, max_value=128, step=16),
        kernel_size=(3, 3), 
        padding='same', 
        return_sequences=False))
    
    model.add(layers.BatchNormalization())
    
    model.add(layers.Conv2D(
        filters=hp.Int('filters_3', min_value=32, max_value=128, step=16),
        kernel_size=(3, 3), 
        padding='same', 
        activation='relu'))
    
    # Regression head: flatten the feature maps down to one linear output.
    model.add(layers.Flatten())
    
    model.add(layers.Dense(hp.Int('units', min_value=32, max_value=128, step=16), activation='relu'))
    
    model.add(layers.Dense(1, activation='linear'))
    
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Hyperband search over model_builder, minimizing validation loss. Trial
# state is stored under my_dir/conv_lstm_tuning.
tuner = kt.Hyperband(
    model_builder,
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='my_dir',
    project_name='conv_lstm_tuning'
)

# NOTE(review): Hyperband schedules the per-trial epoch budget itself (bounded
# by max_epochs); confirm whether passing epochs=50 here has any effect.
tuner.search(train_dataset, epochs=50, validation_data=valid_dataset)

# Get the best model and hyperparameters
best_model = tuner.get_best_models(num_models=1)[0]
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Print the best hyperparameters
print(f'Best hyperparameters: {best_hps.values}')

# Evaluate the best model on the test set
best_model.evaluate(test_dataset)

In [None]:
# Unflatten the (lat, lon) column MultiIndex of pivot_df into two spatial axes
# and append a single channel axis.
# NOTE(review): this assumes the columns cover the full lat x lon product in
# level order and that `columns.levels` holds no unused entries — confirm.
alt_array = pivot_df.values.reshape((len(pivot_df.index), len(pivot_df.columns.levels[0]), len(pivot_df.columns.levels[1]), 1))

print(alt_array.shape)  # Expected shape: (n_datetimes, n_lats, n_lons, 1)

In [None]:
# Example: build (sequence -> next frame) training pairs from alt_array.
# A single-timestep input would instead be np.expand_dims(alt_array, axis=1),
# giving shape (n_datetimes, 1, n_lats, n_lons, 1); that assignment used to sit
# here but was immediately overwritten by the stacked version below.

# Stack sliding windows of 10 consecutive time steps.
# Only windows that are followed by a target frame are kept, so the range stops
# at len(alt_array) - 10; stopping at len(alt_array) - 9 produced one more
# sequence than there are targets in Y.
X = np.stack([alt_array[i:i+10] for i in range(len(alt_array)-10)], axis=0)  # Shape becomes (n_sequences, 10, n_lats, n_lons, 1)

# Y is the frame right after each window: X[i] covers steps i..i+9 and
# Y[i] is step i+10.
Y = alt_array[10:]  # Corresponding target values


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, Dense, Flatten

# Example: ConvLSTM2D Model
# One ConvLSTM2D layer, flattened and reduced to a single linear output.
# NOTE(review): Dense(1) emits one scalar per sample, while Y is sliced from
# alt_array in the preceding cell — confirm the target shape matches.
model = Sequential([
    ConvLSTM2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=(X.shape[1], X.shape[2], X.shape[3], X.shape[4])),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# validation_split=0.2 holds out the last 20% of the samples for validation.
model.fit(X, Y, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
# Point-wise (datetime, lat, lon) -> alt training pipeline built from alt_train.
# NOTE(review): the datetime column is cast to float32; if it holds
# nanosecond timestamps, float32 cannot represent them exactly — confirm the
# column's dtype and units.
dts = tf.convert_to_tensor(alt_train.datetime.values.astype(np.float32))
lat = tf.convert_to_tensor(alt_train.lat.values.astype(np.float32))
lon = tf.convert_to_tensor(alt_train.lon.values.astype(np.float32))
alt = tf.convert_to_tensor(alt_train.alt.values.astype(np.float32))

# One row per observation, three feature columns.
inputs = tf.stack([dts, lat, lon], axis=1)

dataset = tf.data.Dataset.from_tensor_slices((inputs, alt))

# Define batch size
batch_size = 32

# Shuffle and batch the dataset
dataset = dataset.shuffle(buffer_size=len(alt_train)).batch(batch_size)

# NOTE(review): take() is applied after batch(), so 1104980 counts batches,
# not rows — confirm the intended cap.
train_dataset = dataset.take(1104980)

In [None]:
# Point-wise (datetime, lat, lon) -> alt validation pipeline built from
# alt_valid.
# NOTE(review): the datetime column is cast to float32; if it holds
# nanosecond timestamps, float32 cannot represent them exactly — confirm the
# column's dtype and units.
dts = tf.convert_to_tensor(alt_valid.datetime.values.astype(np.float32))
lat = tf.convert_to_tensor(alt_valid.lat.values.astype(np.float32))
lon = tf.convert_to_tensor(alt_valid.lon.values.astype(np.float32))
alt = tf.convert_to_tensor(alt_valid.alt.values.astype(np.float32))

# One row per observation, three feature columns.
inputs = tf.stack([dts, lat, lon], axis=1)

dataset = tf.data.Dataset.from_tensor_slices((inputs, alt))

# Define batch size
batch_size = 32

# Shuffle and batch the dataset
dataset = dataset.shuffle(buffer_size=len(alt_valid)).batch(batch_size)

# NOTE(review): take() is applied after batch(), so 36708 counts batches,
# not rows — confirm the intended cap.
valid_dataset = dataset.take(36708)

In [None]:
# Define a simple model
# Fully connected 3 -> 64 -> 32 -> 1 regressor; the output layer has no
# activation (linear).
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(3,)),  # Assuming datetime, lat, lon as inputs
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)  # Output is alt
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
# Train the model using the training and validation datasets
# Uses whichever train_dataset / valid_dataset are currently bound in the kernel.
history = model.fit(train_dataset, epochs=10, validation_data=valid_dataset)

In [None]:
# Evaluate the model on the test dataset
# NOTE(review): the cells just above build only point-wise train and validation
# pipelines; confirm test_dataset has the same (3-feature) layout.
test_loss, test_mae = model.evaluate(test_dataset)
print(f'Test Loss: {test_loss}, Test MAE: {test_mae}')

In [None]:
def build_model(hp):
    """Build and compile a two-layer ConvLSTM2D regressor for KerasTuner.

    Parameters:
    - hp: KerasTuner ``HyperParameters`` object used to sample filters,
      kernel sizes, activations and the learning rate.

    Returns:
    - A compiled ``Sequential`` model producing one output channel per sample
      (Adam optimizer, MSE loss).
    """
    model = Sequential()
    
    # NOTE(review): both kernel_size choices list the value 1 twice, so the
    # kernel size is effectively fixed at 1.
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_1', min_value=16, max_value=64, step=16), 
        kernel_size=hp.Choice('kernel_size_1', values=[1, 1]),
        input_shape=(12, 1, 1, 360),
        #input_shape=(X_train.shape[1], 1, 1, X_train.shape[3]),
        padding='same', 
        return_sequences=True,
        activation=hp.Choice('activation_function_1', values=['relu','tanh','softmax','sigmoid','leaky_relu','swish'])
    ))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    model.add(ConvLSTM2D(
        filters=hp.Int('filters_2', min_value=16, max_value=64, step=16), 
        kernel_size=hp.Choice('kernel_size_2', values=[1, 1]),
        padding='same', 
        return_sequences=True,
        activation=hp.Choice('activation_function_2', values=['relu','tanh','softmax','sigmoid','leaky_relu','swish'])
    ))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    # Extract the last time step
    model.add(Lambda(lambda x: x[:, -1, :, :, :]))  # Shape becomes [batch_size, height, width, channels]

    # Conv2D Layer for output (single frame prediction)
    # 'softmax' is deliberately not offered here: it normalizes over the
    # channel axis, and with a single output filter that makes every
    # prediction exactly 1.0, so such a trial could never fit the targets.
    model.add(Conv2D(
        filters=1,  # Ensure only one output channel to match y_train's shape
        kernel_size=(1, 1),  # Use a 2D kernel size
        activation=hp.Choice('activation_function_3', values=['relu','tanh','sigmoid','leaky_relu','swish']),
        padding='same',
        data_format='channels_last'
    ))

    # Compile the model
    # NOTE(review): 'accuracy' is kept for compatibility with existing logs,
    # but it carries little meaning for a regression target.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
        ),
        loss='mse',
        metrics=['mae','mse','mape','accuracy']
    )
    
    return model
    
batch_size = 32  # Adjust based on memory and GPU capability

# Insert singleton height/width axes so the inputs match the 5-D layout that
# build_model declares (time, 1, 1, channels). This is done BEFORE the tf.data
# pipelines are created so that the datasets and the raw arrays share the same
# layout; building the datasets first left them with the pre-reshape arrays.
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1, X_train.shape[3]))
X_valid = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1, 1, X_valid.shape[3]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1, X_test.shape[3]))

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.cache().shuffle(1000).batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
valid_dataset = valid_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_dataset = test_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Define callbacks
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
# NOTE(review): checkpoint_cb is created but not passed to any fit/search call.
checkpoint_cb = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

from keras_tuner import BayesianOptimization

# Define the tuner
tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='alt_train',
    project_name='alt_train_conv3dlstm_tuning',
    #directory='ch4_train',
    #project_name='ch4_train_conv3dlstm_tuning',
    overwrite=True,
)

tuner.search_space_summary()

#tuner.search(train_dataset, epochs=100, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb])
tuner.search(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[early_stopping_cb, lr_scheduler_cb])

best_model = tuner.get_best_models(num_models=1)[0]

plot_model(best_model, to_file='/Volumes/JPL/alt_model.png', show_shapes=True, show_layer_names=True, dpi=300)

best_model.summary()

tuner.results_summary()

# Evaluate the best model on the test data
# (returns the loss followed by every compiled metric)
test_loss = best_model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')

# Continue training (fine-tuning) the best model for additional epochs.
# The previous call repeated the validation_data keyword (a SyntaxError) and
# passed `epochs2`, which Model.fit does not accept.
# NOTE(review): `additional_epochs` must be defined earlier in the notebook.
history = best_model.fit(
    X_train, y_train,
    epochs=additional_epochs,
    validation_data=(X_valid, y_valid),
)

# Final evaluation on the held-out TEST split (this previously ran on
# train_dataset). return_dict avoids unpacking a fixed number of values from a
# model that is compiled with several metrics.
final_metrics = best_model.evaluate(test_dataset, return_dict=True)
final_test_loss = final_metrics['loss']
final_test_mae = final_metrics.get('mae')
print(f"Final Test Loss: {final_test_loss}, Final Test MAE: {final_test_mae}")

predictions = best_model.predict(test_dataset)

# If using a scaler, inverse transform the predictions and actual values
# The scaler works on a single column, so flatten, transform, then restore
# the original shape.
predictions_original = scaler_alt.inverse_transform(predictions.reshape(-1, 1)).reshape(predictions.shape)
actual_values_original = scaler_alt.inverse_transform(y_test.reshape(-1, 1)).reshape(y_test.shape)

# Compare first few predictions with actual values
for i in range(5):
    print(f"Prediction {i+1}: {predictions_original[i, :, :, 0]}")
    print(f"Actual Value {i+1}: {actual_values_original[i, :, :, 0]}")
    
import matplotlib.pyplot as plt

# Side-by-side maps of the first test sample: prediction vs. ground truth.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Predicted (Original Scale)')
plt.imshow(predictions_original[0, :, :, 0], cmap='viridis')
plt.colorbar()

plt.subplot(1, 2, 2)
plt.title('Actual (Original Scale)')
plt.imshow(actual_values_original[0, :, :, 0], cmap='viridis')
plt.colorbar()

plt.show()

#### Old

In [None]:
def process_dataframe(df, variable_name):
    """Return the valid observations of one variable as a 2-D array.

    Rows whose ``variable_name`` value is NaN or exactly 0 are dropped. The
    filter is applied to the requested column rather than a hard-coded
    ``'alt'`` column, so the function also works for frames that carry a
    different variable.

    Parameters:
    - df: DataFrame with ``'datetime'``, ``'lat'``, ``'lon'`` and
      ``variable_name`` columns.
    - variable_name: Name of the value column to filter on and export.

    Returns:
    - Array with one row per kept observation and the columns
      (datetime, lat, lon, variable_name), in that order.

    Raises:
    - KeyError: if any of the required columns is missing.
    """
    values = df[variable_name]
    keep = values.notna() & (values != 0)
    df_filtered = df.loc[keep]

    # Convert to array: datetime, lat, lon, variable_name
    return df_filtered[['datetime', 'lat', 'lon', variable_name]].values

In [None]:
# Filter each split and export it as a (datetime, lat, lon, alt) array.
alt_train_array = process_dataframe(alt_train, 'alt')
alt_valid_array = process_dataframe(alt_valid, 'alt')
alt_test_array = process_dataframe(alt_test, 'alt')

In [None]:
# Concatenate the three splits row-wise, in train / valid / test order.
alt_combined_array = np.vstack([alt_train_array, alt_valid_array, alt_test_array])

In [None]:
# Display the shape of the stacked array.
alt_combined_array.shape

In [None]:
# 1. Combine the three splits into one frame with a fresh integer index
combined_df = pd.concat([alt_train, alt_valid, alt_test], ignore_index=True)

# 2. Mask out rows where alt is 0 or NaN
# .copy() makes filtered_df an independent frame, so the column assignments
# below do not write into a slice of combined_df (SettingWithCopyWarning).
filtered_df = combined_df[(combined_df['alt'] != 0) & (combined_df['alt'].notna())].copy()

# 3. Extract year and month from datetime (parse the column only once)
timestamps = pd.to_datetime(filtered_df['datetime'])
filtered_df['year'] = timestamps.dt.year
filtered_df['month'] = timestamps.dt.month

# 4. Group by year, month, lat, and lon
grouped_df = filtered_df.groupby(['year', 'month', 'lat', 'lon'])

# 5. Aggregate the alt values - here we take the mean as an example
aggregated_df = grouped_df['alt'].mean().reset_index()

In [None]:
# Export the aggregated frame (year, month, lat, lon, alt) as a NumPy array.
final_array = aggregated_df.to_numpy()

In [None]:
final_df = pd.DataFrame(final_array, columns=['year', 'month', 'lat', 'lon', 'alt'])

# Sorted coordinate axes that define the output grid.
unique_latitudes = sorted(final_df['lat'].unique())
unique_longitudes = sorted(final_df['lon'].unique())

# One row per (year, month), one column per (lat, lon) cell. The columns are
# reindexed to the full lat x lon product so every grid cell exists, and cells
# without observations are filled with 0.
pivot_df = (
    final_df
    .pivot_table(index=['year', 'month'], columns=['lat', 'lon'], values='alt')
    .reindex(
        pd.MultiIndex.from_product([unique_latitudes, unique_longitudes], names=['lat', 'lon']),
        axis=1,
    )
    .fillna(0)
)

print("Shape of pivot_df before reshaping:", pivot_df.shape)

# Grid dimensions; rows * cols must equal the number of pivot columns.
rows = len(unique_latitudes)
cols = len(unique_longitudes)

# (samples, rows, cols, channels=1)
data_4d = pivot_df.values.reshape((len(pivot_df), rows, cols, 1))

# (samples, timesteps=1, rows, cols, channels=1) as expected by ConvLSTM2D.
conv2dlstm_input = data_4d.reshape((len(pivot_df), 1, rows, cols, 1))

print("Final Conv2DLSTM input shape:", conv2dlstm_input.shape)

In [None]:
# Display the shape of the full ConvLSTM input tensor.
conv2dlstm_input.shape

In [None]:
# Training slice: the first 2628 rows of the pivot table.
pivot_df_train = pivot_df[:2628]
print("Shape of pivot_df before reshaping:", pivot_df_train.shape)

rows, cols = len(unique_latitudes), len(unique_longitudes)

# (samples, rows, cols, 1), then a singleton time axis inserted at position 1.
data_4d_train = pivot_df_train.values.reshape((len(pivot_df_train), rows, cols, 1))
conv2dlstm_input_train = np.expand_dims(data_4d_train, axis=1)
print("Final Conv2DLSTM input shape:", conv2dlstm_input_train.shape)

In [None]:
# Validation slice: rows 2628 up to (not including) 3039 of the pivot table.
pivot_df_valid = pivot_df[2628:3039]
print("Shape of pivot_df before reshaping:", pivot_df_valid.shape)

rows, cols = len(unique_latitudes), len(unique_longitudes)

# (samples, rows, cols, 1), then a singleton time axis inserted at position 1.
data_4d_valid = pivot_df_valid.values.reshape((len(pivot_df_valid), rows, cols, 1))
conv2dlstm_input_valid = np.expand_dims(data_4d_valid, axis=1)
print("Final Conv2DLSTM input shape:", conv2dlstm_input_valid.shape)

In [None]:
# Test slice: every row of the pivot table from position 3039 onward.
pivot_df_test = pivot_df[3039:]
print("Shape of pivot_df before reshaping:", pivot_df_test.shape)

rows, cols = len(unique_latitudes), len(unique_longitudes)

# (samples, rows, cols, 1), then a singleton time axis inserted at position 1.
data_4d_test = pivot_df_test.values.reshape((len(pivot_df_test), rows, cols, 1))
conv2dlstm_input_test = np.expand_dims(data_4d_test, axis=1)
print("Final Conv2DLSTM input shape:", conv2dlstm_input_test.shape)

In [None]:
# Targets are index 0 of the inputs' last axis, with a trailing channel axis
# appended again.
X_train = conv2dlstm_input_train
y_train = X_train[:, :, :, :, 0]
y_train = y_train.reshape(y_train.shape + (1,))

In [None]:
# Targets are index 0 of the inputs' last axis, with a trailing channel axis
# appended again.
X_valid = conv2dlstm_input_valid
y_valid = X_valid[:, :, :, :, 0]
y_valid = y_valid.reshape(y_valid.shape + (1,))

In [None]:
# Targets are index 0 of the inputs' last axis, with a trailing channel axis
# appended again.
X_test = conv2dlstm_input_test
y_test = X_test[:, :, :, :, 0]
y_test = y_test.reshape(y_test.shape + (1,))

In [None]:
# y_train = y_train.reshape((y_train.shape[0], 1, 363, 367, 1))
# y_valid = y_valid.reshape((y_valid.shape[0], 1, 363, 367, 1))
# y_test = y_test.reshape((y_test.shape[0], 1, 363, 367, 1))

# y_train = y_train.reshape(-1, 1)
# y_valid = y_valid.reshape(-1, 1)
# y_test = y_test.reshape(-1, 1)

# # Expanding the labels across the spatial dimensions
# y_train_tiled = y_train[:, np.newaxis, np.newaxis, np.newaxis, np.newaxis]
# y_train_tiled = np.tile(y_train_tiled, (1, 1, 363, 367, 1))

# y_valid_tiled = y_valid[:, np.newaxis, np.newaxis, np.newaxis, np.newaxis]
# y_valid_tiled = np.tile(y_valid_tiled, (1, 1, 363, 367, 1))

# y_test_tiled = y_test[:, np.newaxis, np.newaxis, np.newaxis, np.newaxis]
# y_test_tiled = np.tile(y_test_tiled, (1, 1, 363, 367, 1))

In [None]:
# Report the shape of every split, inputs first and then targets.
for _label, _array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_valid", X_valid),
    ("y_valid", y_valid),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{_label} shape:", _array.shape)

In [None]:
# # Assuming y_train, y_valid, y_test are scalar values per image
# y_train_reshaped = np.tile(y_train[:, np.newaxis, np.newaxis], (1, 363, 367))
# y_train_reshaped = y_train_reshaped.reshape(-1, 1, 363, 367, 1)

# y_valid_reshaped = np.tile(y_valid[:, np.newaxis, np.newaxis], (1, 363, 367))
# y_valid_reshaped = y_valid_reshaped.reshape(-1, 1, 363, 367, 1)

# y_test_reshaped = np.tile(y_test[:, np.newaxis, np.newaxis], (1, 363, 367))
# y_test_reshaped = y_test_reshaped.reshape(-1, 1, 363, 367, 1)

In [None]:
# # Use a small subset of the data for quick debugging
# X_train_small = X_train[:100]
# y_train_small = y_train[:100]
# X_valid_small = X_valid[:20]
# y_valid_small = y_valid[:20]

In [None]:
# import tensorflow as tf
# import tensorflow_probability as tfp

# tfpl = tfp.layers

# def build_bnn_model():
#     input_layer = tf.keras.layers.Input(shape=(X_train.shape[1:]))  # 5D input shape
    
#     # Use ConvLSTM2D directly for the 5D input
#     x = tf.keras.layers.ConvLSTM2D(filters=32, kernel_size=(3, 3), padding='same', return_sequences=False, activation='relu')(input_layer)
    
#     # Flatten the output before passing it to the DenseVariational layer
#     x = tf.keras.layers.Flatten()(x)
    
#     # Manually create a variational distribution
#     def posterior_mean_field(kernel_size, bias_size=0, dtype=None):
#         n = kernel_size + bias_size
#         c = np.log(np.expm1(1.))
#         return tf.keras.Sequential([
#             tfpl.VariableLayer(2 * n, dtype=dtype),
#             tfpl.DistributionLambda(lambda t: tfp.distributions.Independent(
#                 tfp.distributions.Normal(loc=t[..., :n],
#                                          scale=1e-5 + tf.nn.softplus(c + t[..., n:])),
#                 reinterpreted_batch_ndims=1)),
#         ])

#     def prior_trainable(kernel_size, bias_size=0, dtype=None):
#         n = kernel_size + bias_size
#         return tf.keras.Sequential([
#             tfpl.VariableLayer(n, dtype=dtype),
#             tfpl.DistributionLambda(lambda t: tfp.distributions.Independent(
#                 tfp.distributions.Normal(loc=t, scale=1),
#                 reinterpreted_batch_ndims=1)),
#         ])
    
#     output_layer = tfpl.DenseVariational(1, make_posterior_fn=posterior_mean_field, make_prior_fn=prior_trainable)(x)
    
#     model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    
#     model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
#     return model

# # Train the model
# bnn_model = build_bnn_model()
# bnn_model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10, batch_size=32)

In [None]:
# # Predict with the BNN
# n_samples = 100  # Number of samples from the posterior distribution
# predictions = np.array([bnn_model.predict(X_test, batch_size=32, verbose=0) for _ in range(n_samples)])

# # Calculate mean and uncertainty (standard deviation) of predictions
# predictions_mean = np.mean(predictions, axis=0)
# predictions_std = np.std(predictions, axis=0)  # Uncertainty estimate

In [None]:
# import tensorflow as tf
# from tensorflow.keras import layers

# # Define a sampling layer for the VAE
# class Sampling(layers.Layer):
#     def call(self, inputs):
#         z_mean, z_log_var = inputs
#         batch = tf.shape(z_mean)[0]
#         dim = tf.shape(z_mean)[1]
#         epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
#         return z_mean + tf.exp(0.5 * z_log_var) * epsilon

# # Build the VAE model
# def build_vae_model(latent_dim=2):
#     # Encoder
#     input_layer = tf.keras.layers.Input(shape=(X_train.shape[1:]))
    
#     x = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', strides=2, padding='same')(input_layer)
#     x = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', strides=2, padding='same')(x)
#     x = tf.keras.layers.Flatten()(x)
#     x = tf.keras.layers.Dense(16, activation='relu')(x)
    
#     z_mean = tf.keras.layers.Dense(latent_dim)(x)
#     z_log_var = tf.keras.layers.Dense(latent_dim)(x)
#     z = Sampling()([z_mean, z_log_var])
    
#     encoder = tf.keras.Model(input_layer, [z_mean, z_log_var, z], name="encoder")
    
#     # Decoder
#     latent_inputs = tf.keras.layers.Input(shape=(latent_dim,))
#     x = tf.keras.layers.Dense(7 * 7 * 64, activation='relu')(latent_inputs)
#     x = tf.keras.layers.Reshape((7, 7, 64))(x)
#     x = tf.keras.layers.Conv2DTranspose(64, (3, 3), activation='relu', strides=2, padding='same')(x)
#     x = tf.keras.layers.Conv2DTranspose(32, (3, 3), activation='relu', strides=2, padding='same')(x)
#     output_layer = tf.keras.layers.Conv2DTranspose(1, (3, 3), activation='sigmoid', padding='same')(x)
    
#     decoder = tf.keras.Model(latent_inputs, output_layer, name="decoder")
    
#     # VAE Model
#     outputs = decoder(encoder(input_layer)[2])
#     vae = tf.keras.Model(input_layer, outputs, name="vae")
    
#     reconstruction_loss = tf.keras.losses.mse(input_layer, outputs)
#     reconstruction_loss *= X_train.shape[1] * X_train.shape[2]
    
#     kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
#     kl_loss = tf.reduce_mean(kl_loss) * -0.5
    
#     vae_loss = tf.reduce_mean(reconstruction_loss + kl_loss)
#     vae.add_loss(vae_loss)
#     vae.compile(optimizer='adam')
    
#     return vae

# # Train the VAE model
# vae_model = build_vae_model()
# vae_model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10, batch_size=32)

# # Predict with the VAE and obtain uncertainty in the latent space
# z_mean, z_log_var, _ = vae_model.layers[1](X_test)  # Encoder output
# uncertainty = tf.exp(0.5 * z_log_var)  # This gives the uncertainty in the latent space


In [None]:
# def build_mc_dropout_model():
#     input_layer = tf.keras.layers.Input(shape=(X_train.shape[1:]))
    
#     # Example Conv2DLSTM Layer
#     x = tf.keras.layers.ConvLSTM2D(filters=32, kernel_size=(3, 3), activation='relu', return_sequences=True)(input_layer)
#     x = MCDropout(0.5)(x)
    
#     x = tf.keras.layers.ConvLSTM2D(filters=32, kernel_size=(3, 3), activation='relu')(x)
#     x = MCDropout(0.5)(x)
    
#     x = tf.keras.layers.Flatten()(x)
#     x = tf.keras.layers.Dense(1)(x)  # Output a single value
    
#     output_layer = tf.keras.layers.Reshape((1,))(x)  # Ensure the output shape is (batch_size, 1)
    
#     model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    
#     model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
#     return model

# # Now train the model
# mc_dropout_model = build_mc_dropout_model()
# mc_dropout_model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10, batch_size=32)

In [None]:
# n_samples = 100  # Number of stochastic forward passes
# predictions = np.array([mc_dropout_model.predict(X_test, batch_size=32, verbose=0) for _ in range(n_samples)])

# # Calculate mean and uncertainty (standard deviation) of predictions
# predictions_mean = np.mean(predictions, axis=0)
# predictions_std = np.std(predictions, axis=0)  # This is your uncertainty estimate

In [None]:
# # Example of combining predictions from the above models
# combined_predictions = np.stack([predictions_mc, predictions_bnn, predictions_vae], axis=-1)
# combined_mean = np.mean(combined_predictions, axis=-1)
# combined_std = np.std(combined_predictions, axis=-1)  # Combined uncertainty

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from kerastuner.tuners import BayesianOptimization
from sklearn.metrics import mean_squared_error
import numpy as np
import kerastuner as kt

In [None]:
# Baseline regressor: a single ConvLSTM2D layer with a 1x1 kernel that
# returns only its last output, flattened into a small dense head with
# one linear output unit.
model = models.Sequential([
    layers.ConvLSTM2D(
        filters=32,
        kernel_size=(1, 1),
        activation='relu',
        # Per-sample input shape: axes 1-4 of X_train (axis 0 is the sample axis).
        input_shape=tuple(X_train.shape[axis] for axis in (1, 2, 3, 4)),
        return_sequences=False,
    ),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='linear'),
])

# Mean-squared-error objective, with mean absolute error tracked alongside.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Ten epochs on the training split only; no validation data is passed here.
history = model.fit(X_train, y_train, epochs=10)

In [None]:
def build_simple_model(hp):
    """Return a compiled ConvLSTM2D regressor with a fixed architecture.

    ``hp`` is accepted but never read: no hyperparameter is sampled, so
    every call builds the same network (32 ConvLSTM filters with a 3x3
    kernel, a 64-unit dense layer, one linear output, Adam at 1e-3).
    The input shape is taken from axes 1-4 of the notebook-level
    ``X_train``.
    """
    sample_shape = tuple(X_train.shape[axis] for axis in (1, 2, 3, 4))

    recurrent_block = layers.ConvLSTM2D(
        filters=32,
        kernel_size=(3, 3),
        activation='relu',
        input_shape=sample_shape,
        return_sequences=False,
    )

    network = models.Sequential()
    network.add(recurrent_block)
    network.add(layers.Flatten())
    network.add(layers.Dense(64, activation='relu'))
    network.add(layers.Dense(1, activation='linear'))

    network.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss='mse',
        metrics=['mae'],
    )
    return network

In [None]:


# Bayesian hyperparameter search minimising validation loss over 20 trials.
# Trial results are written under the hard-coded `directory`/`project_name`.
# NOTE(review): `build_model` is not defined in this part of the notebook
# (the builders in the neighbouring cells are build_simple_model,
# build_complex_model and build_model_with_uncertainty) — confirm it is
# defined earlier, otherwise this cell raises NameError.
tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=20,
    directory='/Volumes/JPL/alt_train_new',
    project_name='conv_lstm_tuning'
)

# Print the search space summary
tuner.search_space_summary()

In [None]:
# Report the shape of the feature array of each split.
for _split_name, _split_array in (
    ("X_train", X_train),
    ("X_valid", X_valid),
    ("X_test", X_test),
):
    print(f"{_split_name} shape:", _split_array.shape)

In [None]:
# Run the hyperparameter search: each trial trains for 10 epochs on the
# training split and is scored on the validation split.
tuner.search(
    X_train, 
    y_train,
    epochs=10,
    validation_data=(X_valid, y_valid)
)

In [None]:
# Get the optimal hyperparameters (the single best trial found by the tuner)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Print the best hyperparameters
# NOTE(review): this reads the hyperparameters 'filters', 'units' and
# 'learning_rate'; the tuned builder must register exactly those names
# (build_model_with_uncertainty does) — verify against the builder used.
print(f"""
The optimal number of filters is {best_hps.get('filters')} 
The optimal number of units in the fully connected layer is {best_hps.get('units')}
The optimal learning rate for the optimizer is {best_hps.get('learning_rate')}
""")

In [None]:
# Build the model with the best hyperparameters
# (a fresh, untrained network from the tuner's builder function).
model = tuner.hypermodel.build(best_hps)

# Train the model
# 50 epochs, batches of 32 samples, monitored on the validation split.
history = model.fit(X_train, y_train, 
                    epochs=50, 
                    validation_data=(X_valid, y_valid),
                    batch_size=32)

# Evaluate on test data
# NOTE(review): unpacking into two values assumes the model was compiled
# with exactly one metric besides the loss — confirm for the builder used.
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}, Test MAE: {test_mae}')

In [None]:
def build_complex_model(hp):
    """Return a compiled ConvLSTM encoder / transposed-convolution decoder.

    Hyperparameters sampled from ``hp``:
      * ``encoder_filters``, ``encoder_filters2`` - filters of the two
        ConvLSTM2D layers (32-128, step 16)
      * ``decoder_filters`` - filters of the first Conv2DTranspose layer
        (32-128, step 16)
      * ``learning_rate`` - Adam learning rate (1e-3 or 1e-4)

    The input shape is taken from axes 1-4 of the notebook-level
    ``X_train``. The last layer is a single-filter Conv2DTranspose with a
    linear activation, so the network emits a map rather than a scalar.
    """
    sample_shape = tuple(X_train.shape[axis] for axis in (1, 2, 3, 4))

    # Hyperparameters are sampled in the same order as the layers are stacked.
    first_encoder_filters = hp.Int('encoder_filters', min_value=32, max_value=128, step=16)
    second_encoder_filters = hp.Int('encoder_filters2', min_value=32, max_value=128, step=16)
    first_decoder_filters = hp.Int('decoder_filters', min_value=32, max_value=128, step=16)
    learning_rate = hp.Choice('learning_rate', [1e-3, 1e-4])

    network = models.Sequential()

    # Encoder: the first ConvLSTM keeps the whole sequence, the second
    # collapses it to its final output.
    network.add(layers.ConvLSTM2D(
        filters=first_encoder_filters,
        kernel_size=(3, 3),
        activation='relu',
        input_shape=sample_shape,
        return_sequences=True,
    ))
    network.add(layers.ConvLSTM2D(
        filters=second_encoder_filters,
        kernel_size=(3, 3),
        activation='relu',
        return_sequences=False,
    ))

    # Decoder: two transposed convolutions ending in one linear channel.
    network.add(layers.Conv2DTranspose(
        filters=first_decoder_filters,
        kernel_size=(3, 3),
        activation='relu',
    ))
    network.add(layers.Conv2DTranspose(
        filters=1,
        kernel_size=(3, 3),
        activation='linear',
    ))

    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='mse',
        metrics=['mae'],
    )
    return network

In [None]:
def build_model_with_uncertainty(hp):
    """Return a compiled two-layer ConvLSTM regressor with dropout.

    Hyperparameters sampled from ``hp``:
      * ``filters``, ``filters2`` - filters of the two ConvLSTM2D layers
        (32-128, step 16)
      * ``units`` - width of the dense hidden layer (32-256, step 32)
      * ``learning_rate`` - Adam learning rate (1e-3 or 1e-4)

    A 50% Dropout layer follows each ConvLSTM2D layer. The input shape is
    taken from axes 1-4 of the notebook-level ``X_train``; the output is
    a single linear unit trained with MSE.
    """
    sample_shape = tuple(X_train.shape[axis] for axis in (1, 2, 3, 4))
    dropout_rate = 0.5

    # Sample the search space in the order the layers are stacked.
    first_filters = hp.Int('filters', min_value=32, max_value=128, step=16)
    second_filters = hp.Int('filters2', min_value=32, max_value=128, step=16)
    dense_units = hp.Int('units', min_value=32, max_value=256, step=32)
    learning_rate = hp.Choice('learning_rate', [1e-3, 1e-4])

    network = models.Sequential()
    network.add(layers.ConvLSTM2D(
        filters=first_filters,
        kernel_size=(3, 3),
        activation='relu',
        input_shape=sample_shape,
        return_sequences=True,
    ))
    network.add(layers.Dropout(dropout_rate))
    network.add(layers.ConvLSTM2D(
        filters=second_filters,
        kernel_size=(3, 3),
        activation='relu',
        return_sequences=False,
    ))
    network.add(layers.Dropout(dropout_rate))

    # Regression head.
    network.add(layers.Flatten())
    network.add(layers.Dense(dense_units, activation='relu'))
    network.add(layers.Dense(1, activation='linear'))

    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='mse',
        metrics=['mae'],
    )
    return network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, BatchNormalization, Conv3D, Conv2D
# Dense and GlobalAveragePooling2D are used by the builder below and were
# not imported in this cell.
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import GlobalAveragePooling3D

def model(hp):
    """Build and compile the tunable two-layer ConvLSTM2D regressor.

    Hyperparameters sampled from ``hp``:
      * ``filters_1`` - filters of the first ConvLSTM2D layer (8-32, step 8)
      * ``kernel_size_1`` - its kernel size (1 or 3)
      * ``activation_function_1`` - its activation
      * ``units`` - width of the dense hidden layer (64-256, step 64)
      * ``learning_rate`` - Adam learning rate (1e-3 or 1e-4)

    The second ConvLSTM2D layer is fixed (32 filters, 1x1 kernel,
    ``leaky_relu``); wider search ranges for it were tried and disabled.

    Parameters
    ----------
    hp
        Hyperparameter container exposing ``Int`` and ``Choice``.

    Returns
    -------
    A compiled ``Sequential`` model taking inputs of shape
    ``(12, 363, 367, 1)`` per sample and producing one linear output.
    """
    # A distinct local name avoids shadowing this function's own name.
    network = Sequential()

    # First ConvLSTM2D layer: processes the sequence and keeps every step
    # so the next recurrent layer still receives a sequence.
    network.add(ConvLSTM2D(
        filters=hp.Int('filters_1', min_value=8, max_value=32, step=8),
        kernel_size=hp.Choice('kernel_size_1', values=[1, 3]),
        padding='same',
        return_sequences=True,
        activation=hp.Choice(
            'activation_function_1',
            values=['relu', 'tanh', 'sigmoid', 'leaky_relu', 'swish'],
        ),
        input_shape=(12, 363, 367, 1),
    ))
    network.add(BatchNormalization())

    # Second ConvLSTM2D layer: returns only its final output, so the
    # sequence axis is gone after this point.
    network.add(ConvLSTM2D(
        filters=32,
        kernel_size=(1, 1),
        padding='same',
        activation='leaky_relu',
        return_sequences=False,
    ))
    network.add(BatchNormalization())

    # With return_sequences=False above there is no sequence axis left,
    # hence the 2D (not 3D) global pooling over the two spatial axes.
    network.add(GlobalAveragePooling2D())

    network.add(Dense(
        units=hp.Int('units', min_value=64, max_value=256, step=64),
        activation='relu',
    ))
    network.add(Dense(units=1, activation='linear'))

    # NOTE(review): 'accuracy' is not informative for an MSE regression
    # target; it is kept so the logged metric names stay unchanged.
    network.compile(
        optimizer=Adam(hp.Choice('learning_rate', values=[1e-3, 1e-4])),
        loss='mse',
        metrics=['mae', 'mse', 'accuracy'],
    )

    return network

In [None]:
# Training callbacks, all driven by the validation loss.
# Stop after 5 epochs without improvement and roll back to the best weights.
early_stopping_cb = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
# Halve the learning rate after 5 epochs without improvement.
lr_scheduler_cb = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
)
# Keep only the best model seen so far on disk.
checkpoint_cb = ModelCheckpoint(
    'model_checkpoint.keras',
    save_best_only=True,
)

In [None]:
from keras_tuner import BayesianOptimization

# Bayesian search over the builder bound to the name `model`, minimising
# validation loss: 5 trials, each executed 8 times. `overwrite=True` is
# passed, so results of an earlier project with the same name are not reused.
# NOTE(review): other cells rebind `model` to a built network; this cell
# needs `model` to be the builder *function*, so run order matters.
tuner = BayesianOptimization(
    model,
    objective='val_loss',
    max_trials=5,
    executions_per_trial=8,
    directory='/Volumes/JPL/alt_train_new',
    project_name='alt_train_conv3dlstm_tuning',
    #directory='ch4_train',
    #project_name='ch4_train_conv3dlstm_tuning',
    overwrite=True,
)

In [None]:
# Reshape y_train in place to (4, size // 4, 1, 1).
# NOTE(review): the reshape only succeeds when y_train.size is a multiple
# of 4, and it overwrites y_train, so re-running the cell reshapes the
# already-reshaped array.
original_size = y_train.size
new_shape = (4, original_size // (4 * 1 * 1), 1, 1)
y_train = y_train.reshape(new_shape)
y_train.shape

# batch_size = 4  # Number of samples per batch
# height = 363    # Height of the image
# width = 367     # Width of the image
# channels = 1    # Number of channels

# expected_size = batch_size * height * width * channels

# # Check if expected size matches actual size
# if expected_size == y_train.size:
#     y_train = y_train.reshape((batch_size, height, width, channels))
# else:
#     print("Reshape dimensions do not match the array size. Adjust the dimensions accordingly.")

In [None]:
# Only acts when y_train has exactly 4 entries along axis 0 and X_train has
# 2628 samples: each entry is repeated 657 times consecutively
# (4 * 657 = 2628) so the leading axes match.
# NOTE(review): this makes blocks of 657 consecutive samples share one
# target — confirm that is the intended sample/target pairing.
if y_train.shape[0] == 4 and X_train.shape[0] == 2628:
    y_train = np.repeat(y_train, 657, axis=0)  # Assuming 657 repetitions to match 2628 samples

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
# Hyperparameter search on the full training split: 5 epochs per execution,
# batches of 4 samples, scored on the validation split. The callbacks are
# currently disabled (commented out below).
tuner.search(
    X_train,
    y_train,
    epochs=5,
    batch_size = 4,
    validation_data=(X_valid, y_valid),
    verbose = 1
    #callbacks=[early_stopping_cb, lr_scheduler_cb]
)

In [None]:
# Same search on the reduced debugging subset.
# NOTE(review): the only visible definitions of X_train_small, y_train_small,
# X_valid_small and y_valid_small are in a commented-out cell — confirm they
# are defined before running this.
tuner.search(
    X_train_small,
    y_train_small,
    epochs=5,
    batch_size=4,
    validation_data=(X_valid_small, y_valid_small),
    verbose=1
)

In [None]:
import numpy as np

# Fail fast if any training or validation array holds NaN or infinite values.
for _array_name, _array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_valid", X_valid),
    ("y_valid", y_valid),
):
    assert not np.isnan(_array).any(), f"{_array_name} contains NaN values"
    assert not np.isinf(_array).any(), f"{_array_name} contains infinite values"


In [None]:
import tensorflow as tf
tf.keras.backend.clear_session()

In [None]:
alt
# Keep rows with a usable (non-zero, non-missing) 'alt', ordered in time.
reshaped_df = (
    alt[(alt['alt'] != 0) & (~alt['alt'].isna())]
    .sort_values(by='datetime')
    .reset_index(drop=True)
)
# Change in 'alt' relative to the previous row of the same (lat, lon) pair.
reshaped_df['monthly_diff'] = reshaped_df.groupby(['lat', 'lon'])['alt'].diff()
# Turn zeros (in any column) into NaN, drop every row holding a NaN (this
# includes the first row of each location, whose diff is undefined), then
# restore time order and a clean index.
reshaped_df = (
    reshaped_df.replace(0, np.nan)
    .dropna()
    .sort_values(by='datetime')
    .reset_index(drop=True)
)
reshaped_df

In [None]:
# Number of distinct values in each key column of reshaped_df.
tuple(
    len(np.unique(reshaped_df[column]))
    for column in ('datetime', 'lat', 'lon', 'alt', 'monthly_diff')
)

In [None]:
pd.DataFrame(pd.to_datetime(np.unique(reshaped_df.datetime)).strftime('%Y-%m'))

In [None]:
-1.135481e-0-2.092486e-01

In [None]:
# Triple-check for any remaining NaN values
assert not reshaped_df.isna().any().any(), "There are still NaN values in the DataFrame."

In [None]:
# Calendar components of each observation.
reshaped_df['year'] = reshaped_df['datetime'].dt.year
reshaped_df['month'] = reshaped_df['datetime'].dt.month

# Axis values of the target grid. Years keep their order of appearance,
# months are always 1..12, coordinates are sorted ascending.
unique_years = reshaped_df['year'].unique()
unique_months = np.arange(1, 13)  # Since months should always be from 1 to 12
unique_lats = np.sort(reshaped_df['lat'].unique())
unique_lons = np.sort(reshaped_df['lon'].unique())

# (year, month, lat, lon, 1) grid, NaN until a cell is filled.
ndarray = np.full(
    (len(unique_years), len(unique_months), len(unique_lats), len(unique_lons), 1),
    np.nan,
)

# Lookup tables from axis value to its 0-based position on that axis.
year_to_idx = dict(zip(unique_years, range(len(unique_years))))
month_to_idx = dict(zip(unique_months, range(len(unique_months))))
lat_to_idx = dict(zip(unique_lats, range(len(unique_lats))))
lon_to_idx = dict(zip(unique_lons, range(len(unique_lons))))

In [None]:
# Scatter every observation into the (year, month, lat, lon, 1) grid.
# The lookup tables already return 0-based positions; in particular
# month_to_idx maps calendar months 1..12 onto 0..11. The month position is
# therefore used as-is: subtracting 1 again would shift every month back by
# one slot and wrap January onto index -1, i.e. the December slot.
# When several rows hit the same cell, the last row wins.
for _, row in reshaped_df.iterrows():
    year_idx = year_to_idx[row['year']]
    month_idx = month_to_idx[row['month']]
    lat_idx = lat_to_idx[row['lat']]
    lon_idx = lon_to_idx[row['lon']]
    ndarray[year_idx, month_idx, lat_idx, lon_idx, 0] = row['monthly_diff']

In [None]:
assert not np.isnan(ndarray).any(), "There are still NaN values in the ndarray."

In [None]:
nan_indices = np.argwhere(np.isnan(ndarray))

In [None]:
# Translate each NaN position back into its (year, month, lat, lon) values.
missing_combinations = [
    (
        unique_years[position[0]],
        unique_months[position[1]],
        unique_lats[position[2]],
        unique_lons[position[3]],
    )
    for position in nan_indices
]

In [None]:
print("Some missing (year, month, lat, lon) combinations:")
for combo in missing_combinations[:10]:  # Only show the first 10 for brevity
    print(combo)

In [None]:
import scipy.ndimage

# Define a function to interpolate over missing values in a 4D ndarray
def interpolate_nan_values(ndarray):
    """Fill every NaN cell with the value of its nearest non-NaN cell.

    Despite the name this is a nearest-neighbour fill, not a linear
    interpolation: "nearest" is the Euclidean distance between array
    indices, taken across all axes of the array.

    Parameters
    ----------
    ndarray : numpy.ndarray
        Float array of any dimensionality in which NaN marks missing cells.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape. Valid cells keep their value and
        NaN cells take the value of the closest valid cell. If the input
        has no NaN, or consists only of NaN (nothing to fill from), an
        unchanged copy is returned.
    """
    nan_mask = np.isnan(ndarray)

    # Nothing to fill, or no valid cell to fill from.
    if not nan_mask.any() or nan_mask.all():
        return ndarray.copy()

    # distance_transform_edt treats non-zero cells as foreground and, with
    # return_indices=True, gives for each of them the index of the closest
    # zero (background) cell; background cells map to themselves. The NaN
    # mask itself must therefore be the input, so that missing cells are
    # the ones redirected to their nearest valid neighbour. Passing the
    # inverted mask would redirect valid cells to NaNs instead.
    nearest_valid = scipy.ndimage.distance_transform_edt(
        nan_mask, return_distances=False, return_indices=True
    )
    return ndarray[tuple(nearest_valid)]

In [None]:
# Perform the interpolation
ndarray_interpolated = interpolate_nan_values(ndarray)

# Ensure no NaNs are left
assert not np.isnan(ndarray_interpolated).any(), "There are still NaN values in the interpolated ndarray."

# Print the shape to confirm
print("Shape of the interpolated ndarray:", ndarray_interpolated.shape)

In [None]:
# Check for any remaining NaN values.
# Print a plain True/False: the previous np.cumsum(...) of a single boolean
# only produced a one-element array ([0] or [1]) and added nothing.
print("Any NaN values left?:", bool(np.isnan(ndarray_interpolated).any()))

In [None]:
# Perform the interpolation
ndarray_interpolated = interpolate_nan_values(ndarray_interpolated)

# Ensure no NaNs are left
assert not np.isnan(ndarray_interpolated).any(), "There are still NaN values in the interpolated ndarray."

# Print the shape to confirm
print("Shape of the interpolated ndarray:", ndarray_interpolated.shape)

In [None]:
# Boolean-mask indexing returns the non-NaN values as a flat 1-D array; the
# reshape then groups those values into rows of 5.
# NOTE(review): the reshape only succeeds when the number of valid values is
# a multiple of 5, and the rows do not correspond to the grid's five axes —
# confirm this is what is wanted.
valid_mask = ~np.isnan(ndarray)
ndarray_valid = ndarray[valid_mask].reshape(-1, 5)  # Adjust dimensions as needed

# Print the shape to confirm
print("Shape of the valid-only ndarray:", ndarray_valid.shape)

In [None]:
# Print the shape to confirm
print("Shape of the reshaped ndarray:", ndarray.shape)

In [None]:
ndarray_interpolated[0,0,:,0]

In [None]:
# Drop zero and missing 'alt' rows, order by time, then pivot to a wide
# table: one row per datetime, one column per (lat, lon) pair.
# pivot_table is called without aggfunc, so its default aggregation applies
# to rows sharing a datetime and location.
alt_clean = alt[(alt['alt'] != 0) & (~alt['alt'].isna())].sort_values(by='datetime').reset_index(drop=True)
alt_clean_pivot = alt_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')

def pivot_to_sequence_with_mask(pivot_df, n_months=12):
    """Cut a wide pivot table into overlapping windows plus validity masks.

    Zeros in ``pivot_df`` are treated as missing. A window of ``n_months``
    consecutive index values is taken at every possible start position
    (stride 1). Each window becomes one sample of shape
    ``(n_months, n_columns, 1)``.

    Parameters
    ----------
    pivot_df : pandas.DataFrame
        One row per time step, one column per location.
    n_months : int, default 12
        Number of consecutive rows per window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, masks)``, both of shape
        ``(n_windows, n_months, n_columns, 1)``. ``sequences`` has missing
        cells replaced by 0; ``masks`` is True where the cell held data.

    Raises
    ------
    ValueError
        If no window of ``n_months`` rows could be built.
    """
    masked = pivot_df.replace(0, np.nan)
    dates = masked.index.unique()

    windows = []
    validity = []
    for first in range(len(dates) - n_months + 1):
        block = masked.loc[dates[first:first + n_months]].values
        # Skip selections that did not yield exactly n_months rows.
        if block.shape[0] != n_months:
            continue
        # Add a leading sample axis and a trailing feature axis.
        block = block.reshape((1, n_months) + block.shape[1:] + (1,))
        validity.append(~np.isnan(block))
        windows.append(np.nan_to_num(block, nan=0))

    if not windows:
        raise ValueError("No valid sequences found. Check your data and preprocessing steps.")

    return np.concatenate(windows, axis=0), np.concatenate(validity, axis=0)

alt_seq, alt_mask = pivot_to_sequence_with_mask(alt_clean_pivot)

In [None]:
# Back from wide to long format: both column levels (lat, lon) are moved
# into the index and then flattened into ordinary columns.
# NOTE(review): depending on the pandas version, stack() may drop the NaN
# cells of the pivot table — verify the resulting row count.
alt_df=alt_clean_pivot.stack(level=[0, 1]).reset_index()
alt_df.columns = ['datetime', 'lat', 'lon', 'alt']

In [None]:
alt_seq.shape

In [None]:
alt_train.shape

In [None]:
def clean_data(df, value_column=None):
    """Drop rows whose value is zero or missing and sort the remainder.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``datetime``, ``lat``, ``lon`` and the value column.
    value_column : str, optional
        Name of the column to filter on. When omitted, the notebook-level
        variable ``value_column`` is used, which is what the body
        previously relied on implicitly.

    Returns
    -------
    pandas.DataFrame
        The kept rows sorted by ``datetime``, ``lat``, ``lon`` with a
        fresh 0-based index.

    Raises
    ------
    NameError
        If ``value_column`` is neither passed nor defined at notebook level.
    """
    if value_column is None:
        # Backward compatibility with calls that rely on the global name.
        try:
            value_column = globals()['value_column']
        except KeyError:
            raise NameError(
                "value_column was not passed and no notebook-level "
                "'value_column' is defined"
            ) from None

    values = df[value_column]
    keep = (values != 0) & (~values.isna())
    return df[keep].sort_values(by=['datetime', 'lat', 'lon']).reset_index(drop=True)

In [None]:
# For each split: keep rows with a non-zero, non-missing 'alt', order them
# by time and renumber the index.
alt_train_clean = (
    alt_train[alt_train['alt'].ne(0) & alt_train['alt'].notna()]
    .sort_values(by='datetime')
    .reset_index(drop=True)
)
alt_valid_clean = (
    alt_valid[alt_valid['alt'].ne(0) & alt_valid['alt'].notna()]
    .sort_values(by='datetime')
    .reset_index(drop=True)
)
alt_test_clean = (
    alt_test[alt_test['alt'].ne(0) & alt_test['alt'].notna()]
    .sort_values(by='datetime')
    .reset_index(drop=True)
)

In [None]:
alt_train_clean.shape

In [None]:
# Wide tables per split: one row per datetime, one column per (lat, lon)
# pair, values taken from 'alt'. No aggfunc is passed, so pivot_table's
# default aggregation applies to rows sharing a datetime and location.
# NOTE(review): each split is pivoted independently, so the three tables
# may end up with different column sets — verify before stacking them.
alt_train_pivot = alt_train_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
alt_valid_pivot = alt_valid_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
alt_test_pivot = alt_test_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')

In [None]:
alt_train_pivot.shape

In [None]:
def pivot_to_sequence_with_mask(pivot_df, n_months=12):
    """Slide an ``n_months`` window over a pivot table; return data and masks.

    Zeros are first converted to NaN so that zero and missing are handled
    alike. One window is taken per start position (stride 1) over the
    unique index values, and each is reshaped to
    ``(1, n_months, n_columns, 1)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, masks)`` stacked along axis 0, giving shape
        ``(n_windows, n_months, n_columns, 1)``. ``sequences`` has NaN
        replaced by 0; ``masks`` is True wherever the cell held data.

    Raises
    ------
    ValueError
        If not a single complete window could be built.
    """
    frame = pivot_df.replace(0, np.nan)
    index_values = frame.index.unique()
    n_windows = len(index_values) - n_months + 1

    filled_windows, mask_windows = [], []
    for offset in range(n_windows):
        window = frame.loc[index_values[offset:offset + n_months]].values
        # Only keep selections with exactly n_months rows.
        if window.shape[0] != n_months:
            continue
        window = window.reshape((1, n_months) + window.shape[1:] + (1,))
        mask_windows.append(~np.isnan(window))
        filled_windows.append(np.nan_to_num(window, nan=0))

    if not filled_windows:
        raise ValueError("No valid sequences found. Check your data and preprocessing steps.")

    sequences = np.concatenate(filled_windows, axis=0)
    masks = np.concatenate(mask_windows, axis=0)
    return sequences, masks

In [None]:
alt_train_seq, alt_train_mask = pivot_to_sequence_with_mask(alt_train_pivot)
alt_valid_seq, alt_valid_mask = pivot_to_sequence_with_mask(alt_valid_pivot)
alt_test_seq, alt_test_mask = pivot_to_sequence_with_mask(alt_test_pivot)

In [None]:
alt_train_seq.shape

In [None]:
plt.plot(alt_train_seq[-1,4,:,0]);
plt.plot(alt_train_seq[-1,7,:,0]);

In [None]:
plt.plot(alt_train_seq[0,4,:,0]);
plt.plot(alt_train_seq[0,7,:,0]);

In [None]:
alt_train_seq.shape, alt_valid_seq.shape, alt_test_seq.shape

In [None]:
alt_train.lat.nunique(), alt_train.lon.nunique()

In [None]:
unique_lats = alt_train_clean['lat'].unique()
unique_lons = alt_train_clean['lon'].unique()

lat_count = len(unique_lats)
lon_count = len(unique_lons)

print(f"Number of unique latitudes: {lat_count}")
print(f"Number of unique longitudes: {lon_count}")

In [None]:
import pandas as pd
import numpy as np

# Bin width in coordinate units for each axis
lat_bin_size = 0.001  # Adjust this based on the data
lon_bin_size = 0.001  # Adjust this based on the data

# Snap each training coordinate down to the lower edge of its bin
for _axis, _bin_size in (('lat', lat_bin_size), ('lon', lon_bin_size)):
    alt_train_clean[f'{_axis}_bin'] = (alt_train_clean[_axis] // _bin_size) * _bin_size

# Distinct bins that actually occur in the training split
unique_lat_bins = alt_train_clean['lat_bin'].unique()
unique_lon_bins = alt_train_clean['lon_bin'].unique()
lat_count, lon_count = len(unique_lat_bins), len(unique_lon_bins)

print(f"Number of unique lat bins: {lat_count}")
print(f"Number of unique lon bins: {lon_count}")
print(f"lat_count * lon_count = {lat_count * lon_count}")

In [None]:
import pandas as pd
import numpy as np

# Define the size of the bins
lat_bin_size = 0.001  # Adjust this based on the data
lon_bin_size = 0.001  # Adjust this based on the data

# Create new columns for binned latitude and longitude
# (each coordinate is snapped down to the lower edge of its bin)
alt_valid_clean['lat_bin'] = (alt_valid_clean['lat'] // lat_bin_size) * lat_bin_size
alt_valid_clean['lon_bin'] = (alt_valid_clean['lon'] // lon_bin_size) * lon_bin_size

# Now check the number of unique bins of the validation split itself.
# This previously read alt_train_clean, so it re-reported the training counts.
unique_lat_bins = alt_valid_clean['lat_bin'].unique()
unique_lon_bins = alt_valid_clean['lon_bin'].unique()

lat_count = len(unique_lat_bins)
lon_count = len(unique_lon_bins)

print(f"Number of unique lat bins: {lat_count}")
print(f"Number of unique lon bins: {lon_count}")
print(f"lat_count * lon_count = {lat_count * lon_count}")

In [None]:
import pandas as pd
import numpy as np

# Define the size of the bins
lat_bin_size = 0.001  # Adjust this based on the data
lon_bin_size = 0.001  # Adjust this based on the data

# Create new columns for binned latitude and longitude
# (each coordinate is snapped down to the lower edge of its bin)
alt_test_clean['lat_bin'] = (alt_test_clean['lat'] // lat_bin_size) * lat_bin_size
alt_test_clean['lon_bin'] = (alt_test_clean['lon'] // lon_bin_size) * lon_bin_size

# Now check the number of unique bins of the test split itself.
# This previously read alt_train_clean, so it re-reported the training counts.
unique_lat_bins = alt_test_clean['lat_bin'].unique()
unique_lon_bins = alt_test_clean['lon_bin'].unique()

lat_count = len(unique_lat_bins)
lon_count = len(unique_lon_bins)

print(f"Number of unique lat bins: {lat_count}")
print(f"Number of unique lon bins: {lon_count}")
print(f"lat_count * lon_count = {lat_count * lon_count}")

In [None]:
# Re-pivot the data using the new binned latitude and longitude
# (one row per datetime, one column per occurring (lat_bin, lon_bin) pair)
alt_train_pivot = alt_train_clean.pivot_table(index='datetime', columns=['lat_bin', 'lon_bin'], values='alt')
alt_valid_pivot = alt_valid_clean.pivot_table(index='datetime', columns=['lat_bin', 'lon_bin'], values='alt')
alt_test_pivot = alt_test_clean.pivot_table(index='datetime', columns=['lat_bin', 'lon_bin'], values='alt')

# Create sequences with the updated grid
alt_train_seq, alt_train_mask = pivot_to_sequence_with_mask(alt_train_pivot, n_months=12)
alt_valid_seq, alt_valid_mask = pivot_to_sequence_with_mask(alt_valid_pivot, n_months=12)
alt_test_seq, alt_test_mask = pivot_to_sequence_with_mask(alt_test_pivot, n_months=12)

# pivot_to_sequence_with_mask keeps the pivot columns as a single flattened
# axis, so these arrays are 4-D, not a 2-D lat x lon grid.
print(alt_train_seq.shape)  # Shape: (samples, 12, n_pivot_columns, 1)
print(alt_valid_seq.shape)
print(alt_test_seq.shape)

In [None]:
print(list(divisorGenerator(356)))

In [None]:
# Unfold axis 2 of the training sequences into a hard-coded 303 x 335 grid,
# keeping axis 3 as the last axis.
# NOTE(review): this only succeeds when axis 2 holds exactly
# 303 * 335 = 101505 entries, and the same reshape is repeated in a later
# cell — running both on the already-reshaped array will fail.
alt_train_seq = alt_train_seq.reshape((alt_train_seq.shape[0], alt_train_seq.shape[1], 303, 335, alt_train_seq.shape[3]))

In [None]:
# max_lat = 303
# max_lon = 335

# alt_train_seq_padded = pad_sequences(alt_train_seq, max_lat, max_lon)
# alt_valid_seq_padded = pad_sequences(alt_valid_seq, max_lat, max_lon)
# alt_test_seq_padded = pad_sequences(alt_test_seq, max_lat, max_lon)

In [None]:
# Unfold axis 2 of every split into a hard-coded 303 x 335 grid, keeping
# axis 3 as the last axis.
# NOTE(review): each reshape needs axis 2 to hold exactly 303 * 335 entries;
# the splits are pivoted independently, so confirm all three have that many
# columns. The arrays are overwritten, so this cell cannot be re-run on its
# own output.
alt_train_seq = alt_train_seq.reshape((alt_train_seq.shape[0], alt_train_seq.shape[1], 303, 335, alt_train_seq.shape[3]))
alt_valid_seq = alt_valid_seq.reshape((alt_valid_seq.shape[0], alt_valid_seq.shape[1], 303, 335, alt_valid_seq.shape[3]))
alt_test_seq = alt_test_seq.reshape((alt_test_seq.shape[0], alt_test_seq.shape[1], 303, 335, alt_test_seq.shape[3]))

In [None]:
# Split the last axis of each 5-D array: index 0 becomes the features,
# index 1 the targets; both results are 4-D.
# NOTE(review): this needs a last axis of size >= 2. The sequences built by
# pivot_to_sequence_with_mask carry a single trailing feature, in which case
# index 1 raises IndexError — confirm where a second channel is added.
X_alt_train = alt_train_seq[:, :, :, :, 0]  # Shape: first four axes of alt_train_seq
y_alt_train = alt_train_seq[:, :, :, :, 1]  # Shape: first four axes of alt_train_seq

X_alt_valid = alt_valid_seq[:, :, :, :, 0]
y_alt_valid = alt_valid_seq[:, :, :, :, 1]

X_alt_test = alt_test_seq[:, :, :, :, 0]
y_alt_test = alt_test_seq[:, :, :, :, 1]

In [None]:
# Insert a size-1 axis at position 3: the new shape is
# (shape[0], shape[1], shape[2], 1, shape[3]).
# NOTE(review): the element count only matches for a 4-D input; if the
# arrays were already reshaped to 5-D by an earlier cell this raises.
alt_train_seq = alt_train_seq.reshape((alt_train_seq.shape[0], alt_train_seq.shape[1], \
                                       alt_train_seq.shape[2], 1, alt_train_seq.shape[3]))
alt_valid_seq = alt_valid_seq.reshape((alt_valid_seq.shape[0], alt_valid_seq.shape[1], \
                                       alt_valid_seq.shape[2], 1, alt_valid_seq.shape[3]))
alt_test_seq = alt_test_seq.reshape((alt_test_seq.shape[0], alt_test_seq.shape[1], \
                                     alt_test_seq.shape[2], 1, alt_test_seq.shape[3]))

In [None]:
# Largest extent of axis 2 and of axis 3 across the three splits.
max_lat = max(split.shape[2] for split in (alt_train_seq, alt_valid_seq, alt_test_seq))
max_lon = max(split.shape[3] for split in (alt_train_seq, alt_valid_seq, alt_test_seq))

In [None]:
def pad_sequences(sequences, max_lat, max_lon):
    """Zero-pad axes 2 and 3 of every 5-D item up to ``max_lat`` x ``max_lon``.

    Parameters
    ----------
    sequences : iterable of numpy.ndarray
        Each item must be 5-D. Iterating a single 6-D array yields such
        items along its first axis.
    max_lat, max_lon : int
        Target sizes of axis 2 and axis 3.

    Returns
    -------
    numpy.ndarray
        The padded items stacked with ``np.array``. Padding is appended
        at the end of each axis and filled with 0.

    Raises
    ------
    ValueError
        If an item is not 5-D, or is already larger than the target size
        along axis 2 or 3.
    """
    padded = []
    for item in sequences:
        if item.ndim != 5:
            raise ValueError(f"Expected a 5D sequence, but got {item.ndim}D sequence.")

        # Cells still missing along each padded axis.
        extra_lat = max_lat - item.shape[2]
        extra_lon = max_lon - item.shape[3]
        if extra_lat < 0 or extra_lon < 0:
            raise ValueError("Padding dimensions cannot be negative. Check the dimensions of your sequences.")

        pad_widths = ((0, 0), (0, 0), (0, extra_lat), (0, extra_lon), (0, 0))
        padded.append(np.pad(item, pad_widths, 'constant', constant_values=0))

    return np.array(padded)

In [None]:
def convert_to_5d(sequences):
    """Append a trailing length-1 feature axis to ``sequences``.

    Intended for 4D input laid out as (samples, time_steps, lat, lon), which
    becomes 5D; the call itself adds exactly one axis whatever the input rank.
    """
    feature_axis = -1
    with_feature_axis = np.expand_dims(sequences, feature_axis)
    return with_feature_axis

In [None]:
alt_train_seq_5d = convert_to_5d(alt_train_seq)
alt_valid_seq_5d = convert_to_5d(alt_valid_seq)
alt_test_seq_5d = convert_to_5d(alt_test_seq)

In [None]:
# alt_train_mask_seq_5d = convert_to_5d(alt_train_mask)
# alt_valid_mask_seq_5d = convert_to_5d(alt_valid_mask)
# alt_test_mask_seq_5d = convert_to_5d(alt_test_mask)

In [None]:
max_lat = max([seq.shape[2] for seq in [alt_train_seq_5d, alt_valid_seq_5d, alt_test_seq_5d]])
max_lon = max([seq.shape[3] for seq in [alt_train_seq_5d, alt_valid_seq_5d, alt_test_seq_5d]])

In [None]:
# max_lat_masked = max([seq.shape[2] for seq in [alt_train_mask_seq_5d, \
#                                                alt_valid_mask_seq_5d, \
#                                                alt_test_mask_seq_5d]])
# max_lon_masked = max([seq.shape[3] for seq in [alt_train_mask_seq_5d, \
#                                                alt_valid_mask_seq_5d, \
#                                                alt_test_mask_seq_5d]])

In [None]:
alt_train_seq_padded = pad_sequences(alt_train_seq_5d, max_lat, max_lon)
alt_valid_seq_padded = pad_sequences(alt_valid_seq_5d, max_lat, max_lon)
alt_test_seq_padded = pad_sequences(alt_test_seq_5d, max_lat, max_lon)

In [None]:
#pd.DataFrame(alt_train_seq_padded[0,7,:,:,0,0].flatten()).replace(0,np.nan).dropna().plot();
alt_train_seq_padded.shape,alt_valid_seq_padded.shape,alt_test_seq_padded.shape

In [None]:
def adjust_shape(sequences):
    """Drop the last axis of ``sequences``.

    ``np.squeeze`` raises ``ValueError`` when that axis is not of length 1.
    """
    squeezed = np.squeeze(sequences, -1)
    return squeezed

alt_train_seq_padded = adjust_shape(alt_train_seq_padded)
alt_valid_seq_padded = adjust_shape(alt_valid_seq_padded)
alt_test_seq_padded = adjust_shape(alt_test_seq_padded)

# Verify the shapes
print(alt_train_seq_padded.shape)
print(alt_valid_seq_padded.shape)
print(alt_test_seq_padded.shape)

In [None]:
def split_X_y(sequences):
    """Return ``(X, y)`` slices of a 5D (or higher-rank) array.

    X is a full slice over the first four axes, i.e. every element of
    ``sequences`` is kept. y is the index-0 entry along the fifth axis
    (axis 4), so it has one axis fewer than ``sequences``.

    NOTE(review): X still contains the values returned as y — confirm that
    this overlap between inputs and targets is intended.
    """
    keep_all = slice(None)
    leading_axes = (keep_all,) * 4

    X = sequences[leading_axes]
    y = sequences[leading_axes + (0,)]
    return X, y

# Apply the function to each dataset
X_alt_train, y_alt_train = split_X_y(alt_train_seq_padded)
X_alt_valid, y_alt_valid = split_X_y(alt_valid_seq_padded)
X_alt_test, y_alt_test = split_X_y(alt_test_seq_padded)

print(X_alt_train.shape, y_alt_train.shape)
print(X_alt_valid.shape, y_alt_valid.shape)
print(X_alt_test.shape, y_alt_test.shape)

In [None]:
# Define callbacks
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
checkpoint_cb = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

In [None]:
from keras_tuner import BayesianOptimization

tuner = BayesianOptimization(
    model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='alt_train2',
    project_name='alt_train_conv3dlstm_tuning',
    #directory='ch4_train',
    #project_name='ch4_train_conv3dlstm_tuning',
    overwrite=True,
)

In [None]:
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1, X_train.shape[3]))
X_valid = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1, 1, X_valid.shape[3]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1, X_test.shape[3]))

#tuner.search(train_dataset, epochs=100, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb])
tuner.search(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
tuner.search(
    X_alt_train,
    y_alt_train,
    epochs=10,
    batch_size = 16,
    validation_data=(X_alt_valid, y_alt_valid),
    callbacks=[early_stopping_cb, lr_scheduler_cb]
)

In [None]:
fc_train_clean = clean_data(fc_train)
fc_valid_clean = clean_data(fc_valid)
fc_test_clean = clean_data(fc_test)

In [None]:
fc_train_pivot = fc_train_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='fc')
fc_valid_pivot = fc_valid_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='fc')
fc_test_pivot = fc_test_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='fc')

In [None]:
fch4_train_clean = clean_data(fch4_train)
fch4_valid_clean = clean_data(fch4_valid)
fch4_test_clean = clean_data(fch4_test)

In [None]:
fch4_train_pivot = fch4_train_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='fch4')
fch4_valid_pivot = fch4_valid_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='fch4')
fch4_test_pivot = fch4_test_clean.pivot_table(index='datetime', columns=['lat', 'lon'], values='fch4')

In [None]:
import numpy as np
import pandas as pd

def create_sequences(df, n_months=12):
    """Build overlapping ``n_months``-long gridded windows from a long-format table.

    The table must have ``datetime``, ``lat`` and ``lon`` columns; the value
    that fills the grid is taken from the fourth column (``df.columns[3]``).
    Values sharing the same (datetime, lat, lon) are averaged by
    ``pivot_table``; grid cells with no observation are NaN.

    Args:
        df: Long-format DataFrame. It is not modified: datetime parsing is
            done on a derived frame.
        n_months: Number of consecutive unique datetimes per window.

    Returns:
        Array of shape ``(n_windows, n_months, n_lat, n_lon, 1)`` where
        ``n_windows = n_unique_datetimes - n_months + 1`` and the lat / lon
        axes follow the sorted unique coordinate values.

    Raises:
        ValueError: if no complete window can be formed.
    """
    value_column = df.columns[3]

    # Derive a new frame instead of overwriting the caller's column / row order.
    frame = df.assign(datetime=pd.to_datetime(df['datetime']))

    unique_dates = sorted(frame['datetime'].unique())
    unique_lats = sorted(frame['lat'].unique())
    unique_lons = sorted(frame['lon'].unique())

    n_windows = len(unique_dates) - n_months + 1
    if n_months < 1 or n_windows < 1:
        raise ValueError("No valid sequences found. Check your data and preprocessing steps.")

    # Pivot once for the whole table; each window is then just a slice of
    # rows, so the per-window filter + pivot of the full table is avoided.
    full_grid = pd.MultiIndex.from_product([unique_lats, unique_lons])
    pivot = frame.pivot_table(index='datetime', columns=['lat', 'lon'], values=value_column)
    pivot = pivot.reindex(index=unique_dates, columns=full_grid)

    cube = pivot.to_numpy(dtype=float).reshape(
        len(unique_dates), len(unique_lats), len(unique_lons)
    )
    windows = np.stack([cube[start:start + n_months] for start in range(n_windows)])

    # Trailing length-1 feature axis, matching the 5D layout used downstream.
    return windows[..., np.newaxis]

In [None]:
# # Check for missing lat/lon combinations
# lat_lon_combinations = alt_train_clean.groupby(['lat', 'lon']).size()
# missing_combinations = lat_lon_combinations[lat_lon_combinations < 12]
# print(f"Number of missing combinations: {missing_combinations.count()}")

# # Check the number of unique datetime entries
# unique_dates_count = alt_train_clean['datetime'].nunique()
# print(f"Number of unique datetime entries: {unique_dates_count}")

# # Check the overall data distribution by datetime
# datetime_distribution = alt_train_clean['datetime'].value_counts()
# print(datetime_distribution)

In [None]:
alt_train_seq = create_sequences(alt_train_clean)
alt_valid_seq = create_sequences(alt_valid_clean)
alt_test_seq = create_sequences(alt_test_clean)

In [None]:
import h5py

# Save sequences to an HDF5 file
with h5py.File('/Volumes/JPL/alt_train_valid_test_sequences.h5', 'w') as hf:
    hf.create_dataset('alt_train', data=alt_train_seq)
    hf.create_dataset('alt_valid', data=alt_valid_seq)
    hf.create_dataset('alt_test', data=alt_test_seq)

In [None]:
np.save('/Volumes/JPL/alt_train_seq.npy',alt_train_seq)
np.save('/Volumes/JPL/alt_valid_seq.npy',alt_valid_seq)
np.save('/Volumes/JPL/alt_test_seq.npy',alt_test_seq)

In [None]:
fc_train_seq = create_sequences(fc_train_clean)
fc_valid_seq = create_sequences(fc_valid_clean)
fc_test_seq = create_sequences(fc_test_clean)

In [None]:
np.save('/Volumes/JPL/fc_train_seq.npy',fc_train_seq)
np.save('/Volumes/JPL/fc_valid_seq.npy',fc_valid_seq)
np.save('/Volumes/JPL/fc_test_seq.npy',fc_test_seq)

In [None]:
fch4_train_seq = create_sequences(fch4_train_clean)
fch4_valid_seq = create_sequences(fch4_valid_clean)
fch4_test_seq = create_sequences(fch4_test_clean)

In [None]:
# Persist the fch4 sequences built in the preceding cell. The fc arrays were
# already written under the fc_* names, so the fch4 arrays get their own files
# instead of re-saving fc.
np.save('/Volumes/JPL/fch4_train_seq.npy',fch4_train_seq)
np.save('/Volumes/JPL/fch4_valid_seq.npy',fch4_valid_seq)
np.save('/Volumes/JPL/fch4_test_seq.npy',fch4_test_seq)

In [None]:
# np.save appends ".npy" when the target name lacks that suffix, so the arrays
# saved under these ".parquet" names are on disk as "<name>.parquet.npy".
# The file's NumPy alias is `np`.
alt_train_sequences = np.load('/Volumes/JPL/alt_train_sequences.parquet.npy')
alt_train_targets = np.load('/Volumes/JPL/alt_train_targets.parquet.npy')

#### Merge

In [None]:
alt_train.shape, fc_train.shape, fch4_train.shape

In [None]:
alt_valid.shape, fc_valid.shape, fch4_valid.shape

In [None]:
alt_test.shape, fc_test.shape, fch4_test.shape

In [None]:
#alt_train=alt_train.interpolate(method='linear', limit_direction='forward').dropna()

In [None]:
merged_train.to_parquet('/Volumes/JPL/merged_train_altfc.parquet',engine='pyarrow',compression='snappy')

In [None]:
del merged_train

In [None]:
del alt_train, fc_train

In [None]:
with open('/Volumes/JPL/merged_train_altfc.parquet','rb') as f:
    merged_train=pd.read_parquet(f)

In [None]:
merged_train = pd.merge(merged_train, fch4_train, on=['datetime', 'lat', 'lon'], how='outer')
merged_train

In [None]:
merged_train.to_parquet('/Volumes/JPL/merged_train.parquet',engine='pyarrow',compression='snappy')

In [None]:
with open('/Volumes/JPL/merged_train.parquet','rb') as f:
    merged_train=pd.read_parquet(f)

In [None]:
merged_train = pd.merge(merged_train, fch4_train, on=['datetime', 'lat', 'lon'], how='outer')

In [None]:
del merged_train

In [None]:
with open('/Volumes/JPL/merged_train.parquet','rb') as f:
    merged_train=pd.read_parquet(f)

In [None]:
merged_valid = pd.merge(alt_valid, fc_valid, on=['datetime', 'lat', 'lon'], how='outer')
merged_valid = pd.merge(merged_valid, fch4_valid, on=['datetime', 'lat', 'lon'], how='outer')
merged_valid.fillna(0, inplace=True)

merged_test = pd.merge(alt_test, fc_test, on=['datetime', 'lat', 'lon'], how='outer')
merged_test = pd.merge(merged_test, fch4_test, on=['datetime', 'lat', 'lon'], how='outer')
merged_test.fillna(0, inplace=True)

merged_valid.to_parquet('/Volumes/JPL/merged_valid.parquet',engine='pyarrow',compression='snappy')

In [None]:
merged_test.to_parquet('/Volumes/JPL/merged_test.parquet',engine='pyarrow',compression='snappy')

In [None]:
merged_train = pd.merge(alt_train, fc_train, on=['datetime', 'lat', 'lon'], how='outer')
merged_train = pd.merge(merged_train, fch4_train, on=['datetime', 'lat', 'lon'], how='outer')
merged_train.fillna(0, inplace=True)
merged_train.to_parquet('/Volumes/JPL/merged_train.parquet',engine='pyarrow',compression='snappy')

In [None]:
merged_valid = pd.merge(alt_valid, fc_valid, on=['datetime', 'lat', 'lon'], how='outer')
merged_valid = pd.merge(merged_valid, fch4_valid, on=['datetime', 'lat', 'lon'], how='outer')
merged_valid.fillna(0, inplace=True)
merged_valid.to_parquet('/Volumes/JPL/merged_valid.parquet',engine='pyarrow',compression='snappy')

In [None]:
merged_test = pd.merge(alt_test, fc_test, on=['datetime', 'lat', 'lon'], how='outer')
merged_test = pd.merge(merged_test, fch4_test, on=['datetime', 'lat', 'lon'], how='outer')
merged_test.fillna(0, inplace=True)
merged_test.to_parquet('/Volumes/JPL/merged_test.parquet',engine='pyarrow',compression='snappy')

In [None]:

with open('/Volumes/JPL/merged_valid.parquet','rb') as f:
    merged_valid=pd.read_parquet(f)
with open('/Volumes/JPL/merged_test.parquet','rb') as f:
    merged_test=pd.read_parquet(f)

#### Individual

In [None]:
def _valid_alt_values(rows, lat_to_idx, lon_to_idx):
    """Return one time step's non-zero, non-NaN 'alt' values in (lat, lon) grid order."""
    grid = np.full((len(lat_to_idx), len(lon_to_idx)), np.nan)
    nonzero = rows[rows['alt'] != 0]  # zeros are treated as "no observation"
    if not nonzero.empty:
        lat_pos = nonzero['lat'].map(lat_to_idx).to_numpy(dtype=np.intp)
        lon_pos = nonzero['lon'].map(lon_to_idx).to_numpy(dtype=np.intp)
        # If a (lat, lon) pair repeats, the last row in `rows` wins.
        grid[lat_pos, lon_pos] = nonzero['alt'].to_numpy(dtype=float)
    return grid[~np.isnan(grid)]


def _pack_samples(samples):
    """Stack equally-shaped samples; fall back to a 1-D object array when lengths differ."""
    if len({sample.shape for sample in samples}) <= 1:
        return np.array(samples)
    packed = np.empty(len(samples), dtype=object)
    for pos, sample in enumerate(samples):
        packed[pos] = sample
    return packed


def create_sequences_with_masking(df, sequence_length=12):
    """Build (sequence, target) pairs of valid 'alt' values from a long-format table.

    For every run of ``sequence_length`` consecutive unique datetimes (in
    order of first appearance in ``df``), the 'alt' values are placed on a
    (lat, lon) grid per time step, zeros and NaNs are discarded, and the
    remaining values are flattened in (time, lat, lon) order. The target is
    built the same way from the datetime that follows the window.

    A pair is emitted only when BOTH the window and its target contain at
    least one valid value, so ``sequences[k]`` always corresponds to
    ``targets[k]``.

    Args:
        df: DataFrame with 'datetime', 'lat', 'lon' and 'alt' columns.
        sequence_length: Number of consecutive datetimes per input window.

    Returns:
        ``(sequences, targets)``. Each is a regular ndarray when all samples
        have the same number of valid values, otherwise a 1-D object array
        holding one 1-D float array per sample.

    Raises:
        ValueError: if ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer.")

    unique_times = df['datetime'].unique()
    lat_to_idx = {lat: idx for idx, lat in enumerate(df['lat'].unique())}
    lon_to_idx = {lon: idx for idx, lon in enumerate(df['lon'].unique())}

    # Each time step is gridded once. Boolean masking flattens time-major, so
    # a window's masked values are the concatenation of its per-step values.
    valid_by_time = [
        _valid_alt_values(df[df['datetime'] == time], lat_to_idx, lon_to_idx)
        for time in unique_times
    ]

    sequences = []
    targets = []
    for start in range(len(unique_times) - sequence_length):
        stop = start + sequence_length
        window = np.concatenate(valid_by_time[start:stop])
        target = valid_by_time[stop]
        # Keep inputs and targets aligned: skip the pair if either is empty.
        if window.size and target.size:
            sequences.append(window)
            targets.append(target)

    return _pack_samples(sequences), _pack_samples(targets)

In [None]:
alt_train_sequences, alt_train_targets = create_sequences(alt_train)
alt_train_sequences=alt_train_sequences.reshape(3121, 12, 361, 365,1)
alt_train_targets=alt_train_targets.reshape(3121, 361, 365, 1)
alt_train_sequences.shape, alt_train_targets.shape

In [None]:
alt_train_sequences

In [None]:
np.save('/Volumes/JPL/alt_train_sequences.parquet',alt_train_sequences)
np.save('/Volumes/JPL/alt_train_targets.parquet',alt_train_targets)

In [None]:
# The np.save calls in the preceding cell append ".npy" to these ".parquet"
# names, so the files on disk are "<name>.parquet.npy". The file's NumPy
# alias is `np`.
alt_train_sequences = np.load('/Volumes/JPL/alt_train_sequences.parquet.npy')
alt_train_targets = np.load('/Volumes/JPL/alt_train_targets.parquet.npy')

In [None]:
alt_valid_sequences, alt_valid_targets = create_sequences(alt_valid)
alt_valid_sequences=alt_valid_sequences.reshape(73, 12, 96, 96, 1)
alt_valid_targets=alt_valid_targets.reshape(73, 96, 96, 1)
alt_valid_sequences.shape, alt_valid_targets.shape

In [None]:
# These are NumPy arrays (they were just reshaped above) and have no
# to_parquet method, so they are stored with np.save exactly like the train
# arrays. np.save appends ".npy" to these names.
np.save('/Volumes/JPL/alt_valid_sequences.parquet',alt_valid_sequences)
np.save('/Volumes/JPL/alt_valid_targets.parquet',alt_valid_targets)

In [None]:
alt_test_sequences, alt_test_targets = create_sequences(alt_test)
alt_test_sequences=alt_test_sequences.reshape(566, 12, 41, 41, 1)
alt_test_targets=alt_test_targets.reshape(566, 41, 41, 1)
alt_test_sequences.shape, alt_test_targets.shape

In [None]:
# These are NumPy arrays (they were just reshaped above) and have no
# to_parquet method, so they are stored with np.save exactly like the train
# arrays. np.save appends ".npy" to these names.
np.save('/Volumes/JPL/alt_test_sequences.parquet',alt_test_sequences)
np.save('/Volumes/JPL/alt_test_targets.parquet',alt_test_targets)

In [None]:
# np.save appends ".npy" when the target name lacks that suffix, so the train
# arrays saved under these ".parquet" names are on disk as
# "<name>.parquet.npy". The file's NumPy alias is `np`.
alt_train_sequences = np.load('/Volumes/JPL/alt_train_sequences.parquet.npy')
alt_train_targets = np.load('/Volumes/JPL/alt_train_targets.parquet.npy')

In [None]:
# NOTE(review): assumes the validation arrays are written with np.save under
# these ".parquet" names (as the train arrays are), which puts them on disk
# as "<name>.parquet.npy" — confirm against the cell that saves them.
alt_valid_sequences = np.load('/Volumes/JPL/alt_valid_sequences.parquet.npy')
alt_valid_targets = np.load('/Volumes/JPL/alt_valid_targets.parquet.npy')

In [None]:
# NOTE(review): assumes the test arrays are written with np.save under these
# ".parquet" names (as the train arrays are), which puts them on disk as
# "<name>.parquet.npy" — confirm against the cell that saves them.
alt_test_sequences = np.load('/Volumes/JPL/alt_test_sequences.parquet.npy')
alt_test_targets = np.load('/Volumes/JPL/alt_test_targets.parquet.npy')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, BatchNormalization, Conv3D, Conv2D
from tensorflow.keras.optimizers import Adam

def model(hp):
    """Build and compile a tunable ConvLSTM2D network for one search trial.

    Three ConvLSTM2D layers (each followed by BatchNormalization) feed a
    single-filter Conv2D output layer. Filter counts, kernel sizes,
    activations and the Adam learning rate are drawn from ``hp``.

    Args:
        hp: Hyperparameter container exposing ``Int`` and ``Choice``; this
            function is handed to keras_tuner's ``BayesianOptimization``
            elsewhere in the notebook.

    Returns:
        The compiled ``Sequential`` model (MSE loss).
    """
    # NOTE: this local rebinds the name `model` inside the function only.
    model = Sequential()
    
    # ConvLSTM2D layer to process the sequence data
    # NOTE(review): the input shape is hard-coded to (12, 361, 365, 1), i.e.
    # presumably (time, lat, lon, channel) of the train grid — arrays with a
    # different grid size will not fit; confirm against the data fed to the tuner.
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_1', min_value=16, max_value=64, step=16), 
        kernel_size=hp.Choice('kernel_size_1', values=[1, 3]),
        padding='same', 
        return_sequences=True,
        activation=hp.Choice('activation_function_1', values=['relu','tanh','softmax','sigmoid','leaky_relu','swish']),
        input_shape=(12, 361, 365, 1)
    )
             )
    
    model.add(BatchNormalization())
    
    # Another ConvLSTM2D layer
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_2', min_value=16, max_value=64, step=16), 
        kernel_size=hp.Choice('kernel_size_2', values=[1, 3]),
        padding='same',
        activation=hp.Choice('activation_function_2', values=['relu','tanh','softmax','sigmoid','leaky_relu','swish']),
        return_sequences=True
    )
             )
    
    model.add(BatchNormalization())
    
    # Final ConvLSTM2D layer without return_sequences
    # (the time axis is dropped here, so the following Conv2D sees one frame per sample)
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_3', min_value=16, max_value=64, step=16), 
        kernel_size=hp.Choice('kernel_size_3', values=[1, 3]),
        padding='same', 
        activation=hp.Choice('activation_function_3', values=['relu','tanh','softmax','sigmoid','leaky_relu','swish']),
        return_sequences=False
    )
             )
    
    model.add(BatchNormalization())

    #model.add(Lambda(lambda x: x[:, -1, :, :, :]))  # Shape becomes [batch_size, height, width, channels]
    
    # Use Conv2D to reduce to the target output shape
    # NOTE(review): with filters=1, a 'softmax' activation would normalize over
    # a single channel — verify that this choice belongs in the search space.
    model.add(Conv2D(
            filters=1,
            kernel_size=hp.Choice('kernel_size_4', values=[1, 3]),
            activation=hp.Choice('activation_function_4', values=['relu','tanh','softmax','sigmoid','leaky_relu','swish']),
            padding='same'
            #data_format='channels_last'
    )
             )
    
    # Compile the model
    # NOTE(review): 'accuracy' is tracked alongside regression metrics with an
    # MSE loss — confirm it is meaningful for this target.
    model.compile(
        optimizer=Adam(
            hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
        ),
        loss='mse', 
        metrics=['mae','mse','mape','accuracy']
    )
    
    # Display the model's architecture
    #model.summary()

    return model

In [None]:
# Define callbacks
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
checkpoint_cb = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

In [None]:
from keras_tuner import BayesianOptimization

tuner = BayesianOptimization(
    model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='alt_train2',
    project_name='alt_train_conv3dlstm_tuning',
    #directory='ch4_train',
    #project_name='ch4_train_conv3dlstm_tuning',
    overwrite=True,
)

In [None]:
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1, X_train.shape[3]))
X_valid = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1, 1, X_valid.shape[3]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1, X_test.shape[3]))

#tuner.search(train_dataset, epochs=100, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb])
tuner.search(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
tuner.search(
    alt_train_sequences,
    alt_train_targets,
    epochs=10,
    batch_size = 16,
    validation_data=(alt_valid_sequences, alt_valid_targets),
    callbacks=[early_stopping_cb, lr_scheduler_cb]
)

In [None]:
fc_train_sequences, fc_train_targets = create_sequences(fc_train)
fc_valid_sequences, fc_valid_targets = create_sequences(fc_valid)
fc_test_sequences, fc_test_targets = create_sequences(fc_test)

In [None]:
fch4_train_sequences, fch4_train_targets = create_sequences(fch4_train)
fch4_valid_sequences, fch4_valid_targets = create_sequences(fch4_valid)
fch4_test_sequences, fch4_test_targets = create_sequences(fch4_test)

In [None]:
# Define the dimensions based on your data
num_lat = len(alt_train['lat'].unique())
num_lon = len(alt_train['lon'].unique())

# Reshape the sequences for each feature
alt_train_sequences = alt_train_sequences.reshape(alt_train_sequences.shape[0], 12, num_lat, num_lon, 1)
alt_valid_sequences = alt_valid_sequences.reshape(alt_valid_sequences.shape[0], 12, num_lat, num_lon, 1)
alt_test_sequences = alt_test_sequences.reshape(alt_test_sequences.shape[0], 12, num_lat, num_lon, 1)

fc_train_sequences = fc_train_sequences.reshape(fc_train_sequences.shape[0], 12, num_lat, num_lon, 1)
fc_valid_sequences = fc_valid_sequences.reshape(fc_valid_sequences.shape[0], 12, num_lat, num_lon, 1)
fc_test_sequences = fc_test_sequences.reshape(fc_test_sequences.shape[0], 12, num_lat, num_lon, 1)

fch4_train_sequences = fch4_train_sequences.reshape(fch4_train_sequences.shape[0], 12, num_lat, num_lon, 1)
fch4_valid_sequences = fch4_valid_sequences.reshape(fch4_valid_sequences.shape[0], 12, num_lat, num_lon, 1)
fch4_test_sequences = fch4_test_sequences.reshape(fch4_test_sequences.shape[0], 12, num_lat, num_lon, 1)


In [None]:
from scipy.interpolate import interp1d
from sklearn.utils import resample

# Parse timestamps, order every split by location and then time, and renumber
# the rows 0..N-1. Each split is rebuilt from itself: the test split must come
# from alt_test, not from alt_train.
alt_train['datetime'] = pd.to_datetime(alt_train['datetime'])
alt_valid['datetime'] = pd.to_datetime(alt_valid['datetime'])
alt_test['datetime'] = pd.to_datetime(alt_test['datetime'])

alt_train = alt_train.sort_values(by=['lat', 'lon', 'datetime']).reset_index(drop=True)
alt_valid = alt_valid.sort_values(by=['lat', 'lon', 'datetime']).reset_index(drop=True)
alt_test = alt_test.sort_values(by=['lat', 'lon', 'datetime']).reset_index(drop=True)

In [None]:
#alt_train=alt_train.groupby(['datetime', 'lat', 'lon']).agg({'alt': 'mean'}).reset_index()

In [None]:
from sklearn.utils import resample

def bootstrap_uncertainty(group, n_iterations=1000, random_state=None):
    """Estimate the spread of the mean 'alt' of ``group`` by bootstrapping.

    The non-NaN 'alt' values are resampled with replacement (same size as the
    data) ``n_iterations`` times; the standard deviation of the resampled
    means is returned. Since NaNs are dropped before resampling there is
    nothing left to interpolate, so the resampled values are averaged directly.

    Args:
        group: DataFrame (e.g. one groupby group) with an 'alt' column.
        n_iterations: Number of bootstrap resamples.
        random_state: ``None`` to draw from NumPy's global random state (so
            ``np.random.seed`` is honoured), an ``np.random.RandomState``, or
            an int seed for a reproducible result.

    Returns:
        Standard deviation of the bootstrapped means as a float; NaN when the
        group has no non-NaN 'alt' value (instead of raising on empty data).
    """
    values = group['alt'].dropna().to_numpy(dtype=float)
    if values.size == 0:
        return np.nan

    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # One row of indices per bootstrap iteration; this holds an
    # (n_iterations, n_values) integer array in memory.
    draws = rng.randint(0, values.size, size=(n_iterations, values.size))
    bootstrapped_means = values[draws].mean(axis=1)

    # Calculate the standard deviation of the bootstrapped means
    return np.std(bootstrapped_means)

def create_sequences(data, sequence_length=12):
    """Stack every run of ``sequence_length`` consecutive rows of ``data``.

    Window ``k`` is ``data[k:k + sequence_length]``; there are
    ``len(data) - sequence_length + 1`` windows (none if that is not positive).
    """
    n_windows = len(data) - sequence_length + 1
    windows = [data[first:first + sequence_length] for first in range(n_windows)]
    return np.array(windows)

In [None]:
# groupby(...).apply yields ONE value per (lat, lon) location. Look that value
# up for every row through its own (lat, lon) key; resetting the index and
# assigning would instead place location k's value on row k.
alt_train_location_uncertainty = alt_train.groupby(['lat', 'lon']).apply(bootstrap_uncertainty)
alt_train['bootstrap_uncertainty'] = alt_train_location_uncertainty.reindex(
    pd.MultiIndex.from_frame(alt_train[['lat', 'lon']])
).to_numpy()

In [None]:
# groupby(...).apply yields ONE value per (lat, lon) location. Look that value
# up for every row through its own (lat, lon) key; resetting the index and
# assigning would instead place location k's value on row k.
alt_valid_location_uncertainty = alt_valid.groupby(['lat', 'lon']).apply(bootstrap_uncertainty)
alt_valid['bootstrap_uncertainty'] = alt_valid_location_uncertainty.reindex(
    pd.MultiIndex.from_frame(alt_valid[['lat', 'lon']])
).to_numpy()

In [None]:
# groupby(...).apply yields ONE value per (lat, lon) location. Look that value
# up for every row through its own (lat, lon) key; resetting the index and
# assigning would instead place location k's value on row k.
alt_test_location_uncertainty = alt_test.groupby(['lat', 'lon']).apply(bootstrap_uncertainty)
alt_test['bootstrap_uncertainty'] = alt_test_location_uncertainty.reindex(
    pd.MultiIndex.from_frame(alt_test[['lat', 'lon']])
).to_numpy()

In [None]:
alt_train_df = alt_train[['datetime', 'lat', 'lon', 'alt', 'bootstrap_uncertainty']].copy()
alt_train_pivot_df = alt_train_df.pivot_table(index='datetime', columns=['lat', 'lon'], 
                                                    values=['alt', 'bootstrap_uncertainty'])
alt_train_sequences = create_sequences(alt_train_pivot_df['alt'].values, sequence_length=12)
alt_train_uncertainty_sequences = create_sequences(alt_train_pivot_df['bootstrap_uncertainty'].values, 
                                                   sequence_length=12)

In [None]:
alt_valid_df = alt_valid[['datetime', 'lat', 'lon', 'alt', 'bootstrap_uncertainty']].copy()
alt_valid_pivot_df = alt_valid_df.pivot_table(index='datetime', columns=['lat', 'lon'], 
                                                    values=['alt', 'bootstrap_uncertainty'])
alt_valid_sequences = create_sequences(alt_valid_pivot_df['alt'].values, sequence_length=12)
alt_valid_uncertainty_sequences = create_sequences(alt_valid_pivot_df['bootstrap_uncertainty'].values, 
                                                   sequence_length=12)

In [None]:
# Build the test pivot and sequences from the test split itself (alt_test),
# not from the training frame.
alt_test_df = alt_test[['datetime', 'lat', 'lon', 'alt', 'bootstrap_uncertainty']].copy()
alt_test_pivot_df = alt_test_df.pivot_table(index='datetime', columns=['lat', 'lon'], 
                                                  values=['alt', 'bootstrap_uncertainty'])
alt_test_sequences = create_sequences(alt_test_pivot_df['alt'].values, sequence_length=12)
alt_test_uncertainty_sequences = create_sequences(alt_test_pivot_df['bootstrap_uncertainty'].values, 
                                                  sequence_length=12)

In [None]:
alt_train_lat_unique = np.sort(alt_train['lat'].unique())
alt_train_lon_unique = np.sort(alt_train['lon'].unique())

In [None]:
alt_train_sequences.shape, alt_valid_sequences.shape, alt_test_sequences.shape

In [None]:
alt_train_array = alt_train_sequences.reshape(alt_train_sequences.shape[0], 12, len(alt_train_lat_unique), 
                                                    len(alt_train_lon_unique), 1)
alt_train_uncertainty_array = alt_train_uncertainty_sequences.reshape(alt_train_uncertainty_sequences.shape[0], 12, 
                                                            len(alt_train_lat_unique), len(alt_train_lon_unique), 1)
alt_train_combined_input = np.concatenate([alt_train_array, alt_train_uncertainty_array], axis=-1)

In [None]:
final_array = sequences.reshape(sequences.shape[0], 12, len(lat_unique), len(lon_unique), 1)
uncertainty_array = uncertainty_sequences.reshape(uncertainty_sequences.shape[0], 12, len(lat_unique), len(lon_unique), 1)

# If needed, you can concatenate both arrays to feed into a model
combined_input = np.concatenate([final_array, uncertainty_array], axis=-1)

In [None]:
final_array = sequences.reshape(sequences.shape[0], 12, len(lat_unique), len(lon_unique), 1)
uncertainty_array = uncertainty_sequences.reshape(uncertainty_sequences.shape[0], 12, len(lat_unique), len(lon_unique), 1)

# If needed, you can concatenate both arrays to feed into a model
combined_input = np.concatenate([final_array, uncertainty_array], axis=-1)

In [None]:
df_grouped = alt_train.groupby(['lat', 'lon', 'datetime']).apply(interpolate_with_uncertainty).reset_index(drop=False)

In [None]:
df_grouped

In [None]:
pivot_df = df_grouped.pivot_table(index='datetime', columns=['lat', 'lon'], values=['interpolated_alt', 'uncertainty'])

In [None]:
def create_sequences(data, sequence_length=12):
    """Return all overlapping windows of ``sequence_length`` consecutive rows.

    The result stacks ``data[start:start + sequence_length]`` for every
    ``start`` from 0 up to ``len(data) - sequence_length`` inclusive.
    """
    last_start = len(data) - sequence_length
    return np.array(
        [data[start:start + sequence_length] for start in range(last_start + 1)]
    )

In [None]:
sequences = create_sequences(pivot_df['interpolated_alt'].values, sequence_length=12)

In [None]:
uncertainty_sequences = create_sequences(pivot_df['uncertainty'].values, sequence_length=12)

In [None]:
final_array = sequences.reshape(sequences.shape[0], 12, len(lat_unique), len(lon_unique), 1)

In [None]:
uncertainty_array = uncertainty_sequences.reshape(uncertainty_sequences.shape[0], 12, len(lat_unique), len(lon_unique), 1)

In [None]:
print(final_array.shape)  # Shape of the interpolated alt data
print(uncertainty_array.shape)  # Shape of the associated uncertainty

In [None]:
combined_input = np.concatenate([final_array, uncertainty_array], axis=-1)

print(combined_input.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, BatchNormalization, Conv3D, Flatten, Dense, Dropout

# Assuming your input shape is (samples, 12, lat, lon, 1) for alt data
# If including uncertainty as a second channel, the shape would be (samples, 12, lat, lon, 2)

# Define the model
model = Sequential()

# ConvLSTM2D layer: This layer captures the spatiotemporal patterns in the data
model.add(ConvLSTM2D(filters=64, 
                     kernel_size=(3, 3), 
                     input_shape=(12, len(lat_unique), len(lon_unique), 1),  # Use (12, lat, lon, 2) if including uncertainty
                     padding='same', 
                     return_sequences=True))
model.add(BatchNormalization())  # Normalize after each ConvLSTM2D layer to stabilize training

# Adding more ConvLSTM2D layers for deeper modeling of the temporal-spatial data
model.add(ConvLSTM2D(filters=64, 
                     kernel_size=(3, 3), 
                     padding='same', 
                     return_sequences=True))
model.add(BatchNormalization())

# Final ConvLSTM2D layer before flattening the outputs
model.add(ConvLSTM2D(filters=64, 
                     kernel_size=(3, 3), 
                     padding='same', 
                     return_sequences=False))  # return_sequences=False to get a 3D output (lat, lon, filters)
model.add(BatchNormalization())

# Optionally, apply a 3D convolution to extract features across the depth of the ConvLSTM output
model.add(Conv3D(filters=64, 
                 kernel_size=(3, 3, 3), 
                 activation='relu', 
                 padding='same'))

# Flatten the 3D output to feed into dense layers
model.add(Flatten())

# Fully connected layers to learn from the high-level features
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='linear'))  # Final output layer (regression output for alt value)

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Print model summary to inspect architecture
model.summary()

In [None]:
# Train the model
history = model.fit(combined_input, y_train, 
                    validation_data=(X_valid, y_valid), 
                    epochs=100, 
                    batch_size=32, 
                    callbacks=[early_stopping_cb, lr_scheduler_cb])

# Evaluate on test set
test_loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')

In [None]:
alt_train=alt_train.interpolate(method='linear', limit_direction='forward').dropna()
alt_train_pivot = alt_train.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
alt_train_pivot = alt_train_pivot.fillna(0)
alt_valid=alt_valid.interpolate(method='linear', limit_direction='forward').dropna()
alt_valid_pivot = alt_valid.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
alt_valid_pivot = alt_valid_pivot.fillna(0)
alt_test=alt_test.interpolate(method='linear', limit_direction='forward').dropna()
alt_test_pivot = alt_test.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
alt_test_pivot = alt_test_pivot.fillna(0)

In [None]:
common_coords = set(alt_train_pivot.columns).intersection(set(alt_valid_pivot.columns)).intersection(set(alt_test_pivot.columns))

In [None]:
alt_train_pivot = alt_train_pivot[list(common_coords)]
spatial_coords = np.array(alt_train_pivot.columns.values.tolist())
alt_train_data = alt_train_pivot.values
alt_train_data = alt_train_data.reshape((alt_train_data.shape[0], alt_train_data.shape[1], 1))
spatial_coords_repeated = np.repeat(spatial_coords[np.newaxis, :, :], alt_train_data.shape[0], axis=0)
alt_train_with_coords = np.concatenate([alt_train_data, spatial_coords_repeated], axis=2)

In [None]:
alt_train_with_coords.shape

In [None]:
alt_valid_pivot = alt_valid_pivot[list(common_coords)]
spatial_coords = np.array(alt_valid_pivot.columns.values.tolist())
alt_valid_data = alt_valid_pivot.values
alt_valid_data = alt_valid_data.reshape((alt_valid_data.shape[0], alt_valid_data.shape[1], 1))
spatial_coords_repeated = np.repeat(spatial_coords[np.newaxis, :, :], alt_valid_data.shape[0], axis=0)
alt_valid_with_coords = np.concatenate([alt_valid_data, spatial_coords_repeated], axis=2)

In [None]:
alt_valid_with_coords.shape

In [None]:
alt_test_pivot = alt_test_pivot[list(common_coords)]
spatial_coords = np.array(alt_test_pivot.columns.values.tolist())
alt_test_data = alt_test_pivot.values
alt_test_data = alt_test_data.reshape((alt_test_data.shape[0], alt_test_data.shape[1], 1))
spatial_coords_repeated = np.repeat(spatial_coords[np.newaxis, :, :], alt_test_data.shape[0], axis=0)
alt_test_with_coords = np.concatenate([alt_test_data, spatial_coords_repeated], axis=2)

In [None]:
alt_test_with_coords.shape

In [None]:
def create_sequences_with_spatial(data, sequence_length):
    """Slice ``data`` into overlapping input windows and next-step targets.

    Each input window holds ``sequence_length`` consecutive entries along the
    first axis of ``data`` with all trailing axes (locations, channels) kept
    intact.  The matching target is channel 0 (the 'alt' value) of the entry
    that immediately follows the window.

    Returns:
        A ``(windows, targets)`` tuple of NumPy arrays.
    """
    window_starts = range(len(data) - sequence_length)
    # Inputs keep every channel (value + spatial coordinates).
    windows = [data[start:start + sequence_length] for start in window_starts]
    # Targets keep only channel 0 of the step right after each window.
    targets = [data[start + sequence_length, :, 0] for start in window_starts]
    return np.array(windows), np.array(targets)

In [None]:
sequence_length = 12
X_train, y_train = create_sequences_with_spatial(alt_train_with_coords, sequence_length)

In [None]:
sequence_length = 12
X_valid, y_valid = create_sequences_with_spatial(alt_valid_with_coords, sequence_length)

In [None]:
sequence_length = 12
X_test, y_test = create_sequences_with_spatial(alt_test_with_coords, sequence_length)

In [None]:
print("X_train shape:", X_train.shape)  # Expected: (num_samples, sequence_length, num_locations, 3)
print("y_train shape:", y_train.shape)  # Expected: (num_samples, num_locations)
print("X_valid shape:", X_valid.shape)
print("y_valid shape:", y_valid.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

In [None]:
X_train.reshape(3121,12,41,3)[:,:,:,1].shape #lat
X_train.reshape(3121,12,41,3)[:,:,:,2].shape #lon

In [None]:
X_train.reshape(3121,12,368,368,1)

In [None]:
final_array = np.zeros((3121, 12, 368, 368)) 

In [None]:
lat_values = X_train[:,:,:,1]
lon_values = X_train[:,:,:,2]

In [None]:
# Map every distinct coordinate value to a grid index.  np.unique returns the
# values sorted ascending, so index 0 corresponds to the smallest latitude /
# longitude.  (The mappings were previously left empty, so every lookup below
# raised KeyError.)
lat_index_mapping = {lat: idx for idx, lat in enumerate(np.unique(lat_values))}
lon_index_mapping = {lon: idx for idx, lon in enumerate(np.unique(lon_values))}

# Scatter each location's value onto the (sample, step, lat, lon) grid.
# The value is read from channel 0 of X_train, the same array the lat/lon
# channels were sliced from; alt_train_data is 3-D and cannot be indexed
# with four indices.
n_samples, n_steps, n_locations = lat_values.shape
for i in range(n_samples):
    for j in range(n_steps):
        for k in range(n_locations):  # Loop through the spatial locations
            lat_index = lat_index_mapping[lat_values[i, j, k]]  # Map the latitude to grid index
            lon_index = lon_index_mapping[lon_values[i, j, k]]  # Map the longitude to grid index
            final_array[i, j, lat_index, lon_index] = X_train[i, j, k, 0]

In [None]:
# plt.plot(alt_train_data[0,:,:]);
# plt.plot(alt_train_data[-1,:,:]);

In [None]:
# NOTE(review): a stray "184" token made this line a SyntaxError; it has been
# removed so the expression matches the reshape used when alt_train_data was
# first built.  Confirm no other target shape was intended.
alt_train_data.reshape((alt_train_data.shape[0], alt_train_data.shape[1], 1))

In [None]:
combined_df = pd.concat([alt_train, alt_valid, alt_test])

In [None]:
# alt_train['lat_grid'] = alt_train['lat'].round(3)  # or use a more sophisticated spatial binning technique
# alt_train['lon_grid'] = alt_train['lon'].round(3)
# alt_train_agg = alt_train.groupby(['datetime', 'lat_grid', 'lon_grid']).mean().reset_index()
# alt_train_agg

In [None]:
combined_df=combined_df.interpolate(method='linear', limit_direction='forward').dropna()
combined_df_pivot = combined_df.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
combined_df_pivot = combined_df_pivot.fillna(0)
combined_df_data = combined_df_pivot.values
combined_df_data = combined_df_data.reshape((combined_df_data.shape[0], combined_df_data.shape[1], 1))

In [None]:
combined_df = ch4.interpolate(method='linear', limit_direction='forward').dropna()
combined_df_pivot = combined_df.pivot_table(index='datetime', columns=['lat', 'lon'], values='fch4')
combined_df_pivot = combined_df_pivot.fillna(0)
combined_df_data = combined_df_pivot.values
combined_df_data = combined_df_data.reshape((combined_df_data.shape[0], combined_df_data.shape[1], 1))

In [None]:
ch4.shape

In [None]:
alt['lat_bin'] = alt['lat'].round(3)  # Adjust the rounding based on your grid resolution
alt['lon_bin'] = alt['lon'].round(3)

In [None]:
ch4['lat_bin'] = ch4['lat'].round(3)  # Adjust the rounding based on your grid resolution
ch4['lon_bin'] = ch4['lon'].round(3)

In [None]:
alt=alt.sort_values(by='datetime',ascending=True)
alt=alt.reset_index(drop=True)
alt

In [None]:
ch4=ch4.sort_values(by='datetime',ascending=True)
ch4=ch4.reset_index(drop=True)
ch4

In [None]:
alt_pivot = alt.pivot_table(index=['datetime'], columns=['lat_bin', 'lon_bin'], values='alt', fill_value=0)
alt_pivot

In [None]:
ch4_pivot = ch4.pivot_table(index=['datetime'], columns=['lat_bin', 'lon_bin'], values='fch4', fill_value=0)
ch4_pivot

In [None]:
grid = alt_pivot.values.reshape(alt_pivot.shape[0], 1, *alt_pivot.shape[1:])
grid

In [None]:
grid = ch4_pivot.values.reshape(ch4_pivot.shape[0], 1, *ch4_pivot.shape[1:])
grid

In [None]:
grid.shape

In [None]:
def create_sequences(data, sequence_length):
    """Build sliding-window samples for one-step-ahead forecasting.

    Every sample is ``sequence_length`` consecutive entries taken along the
    first axis of ``data``; its target is the single entry that follows.

    Returns:
        A ``(inputs, targets)`` tuple of NumPy arrays.
    """
    n_windows = len(data) - sequence_length
    inputs = [data[k:k + sequence_length] for k in range(n_windows)]
    targets = [data[k + sequence_length] for k in range(n_windows)]
    return np.array(inputs), np.array(targets)

# Assuming grid has the shape (time_steps, rows, columns, 1)
X, y = create_sequences(grid, sequence_length=12)

In [None]:
X.shape, y.shape

In [None]:
with open('/Volumes/JPL/alt_valid.parquet','rb') as f:
    alt_valid=pd.read_parquet(f)

In [None]:
train_size = int(0.7 * len(X))
valid_size = int(0.15 * len(X))
test_size = len(X) - train_size - valid_size

X_train = X[:train_size]
y_train = y[:train_size]

X_valid = X[train_size:train_size + valid_size]
y_valid = y[train_size:train_size + valid_size]

X_test = X[train_size + valid_size:]
y_test = y[train_size + valid_size:]

In [None]:
y_train = y_train.reshape((y_train.shape[0], 1, y_train.shape[1], y_train.shape[2], 1))
y_valid = y_valid.reshape((y_valid.shape[0], 1, y_valid.shape[1], y_valid.shape[2], 1))
y_test = y_test.reshape((y_test.shape[0], 1, y_test.shape[1], y_test.shape[2], 1))

In [None]:
# train_alt=alt[alt.datetime<='2017-06-19']
# valid_alt=alt[(alt.datetime >= '2017-07-01') & (alt.datetime <= '2021-12-01')]
# test_alt=alt[alt.datetime>='2022-01-01']

In [None]:
# train_alt.shape, valid_alt.shape, test_alt.shape

In [None]:
# train_alt['lat_bin'] = train_alt['lat'].round(3)
# train_alt['lon_bin'] = train_alt['lon'].round(3)
# valid_alt['lat_bin'] = valid_alt['lat'].round(3)
# valid_alt['lon_bin'] = valid_alt['lon'].round(3)
# test_alt['lat_bin'] = test_alt['lat'].round(3)
# test_alt['lon_bin'] = test_alt['lon'].round(3)

In [None]:
# train_alt_pivot = train_alt.pivot_table(index=['datetime'], columns=['lat_bin', 'lon_bin'], values='alt', fill_value=0)
# valid_alt_pivot = valid_alt.pivot_table(index=['datetime'], columns=['lat_bin', 'lon_bin'], values='alt', fill_value=0)
# test_alt_pivot = test_alt.pivot_table(index=['datetime'], columns=['lat_bin', 'lon_bin'], values='alt', fill_value=0)

In [None]:
# train_alt_grid = train_alt_pivot.values.reshape(train_alt_pivot.shape[0], 1, *train_alt_pivot.shape[1:])
# valid_alt_grid = valid_alt_pivot.values.reshape(valid_alt_pivot.shape[0], 1, *valid_alt_pivot.shape[1:])
# test_alt_grid = test_alt_pivot.values.reshape(test_alt_pivot.shape[0], 1, *test_alt_pivot.shape[1:])

In [None]:
def create_sequences(data, sequence_length):
    """Pair each run of ``sequence_length`` entries with the entry after it.

    Walks the index of every possible target (from ``sequence_length`` to the
    end of ``data``) and collects the window that precedes it.

    Returns:
        ``(X, y)`` as NumPy arrays: the stacked windows and their targets.
    """
    windows, targets = [], []
    for target_idx in range(sequence_length, len(data)):
        window_start = target_idx - sequence_length
        windows.append(data[window_start:target_idx])
        targets.append(data[target_idx])
    return np.array(windows), np.array(targets)

In [None]:
train_alt_X, train_alt_y = create_sequences(train_alt_grid, sequence_length=12)
valid_alt_X, valid_alt_y = create_sequences(valid_alt_grid, sequence_length=12)
test_alt_X, test_alt_y = create_sequences(test_alt_grid, sequence_length=12)

In [None]:
train_alt_X.shape, train_alt_y.shape, valid_alt_X.shape, valid_alt_y.shape, test_alt_X.shape, test_alt_y.shape

In [None]:
train_alt_y = train_alt_y.reshape((train_alt_y.shape[0], 1, train_alt_y.shape[1], train_alt_y.shape[2]))
valid_alt_y = valid_alt_y.reshape((valid_alt_y.shape[0], 1, valid_alt_y.shape[1], valid_alt_y.shape[2]))
test_alt_y = test_alt_y.reshape((test_alt_y.shape[0], 1, test_alt_y.shape[1], test_alt_y.shape[2]))

In [None]:
train_alt_X.shape, train_alt_y.shape, valid_alt_X.shape, valid_alt_y.shape, test_alt_X.shape, test_alt_y.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_valid_sc = scaler.transform(X_valid.reshape(-1, X_valid.shape[-1])).reshape(X_valid.shape)
X_test_sc = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

In [None]:
X_train_sc = X_train_sc.reshape((X_train_sc.shape[0], X_train_sc.shape[1], 1, 1, X_train_sc.shape[3]))
valid_alt_X = valid_alt_X.reshape((valid_alt_X.shape[0], valid_alt_X.shape[1], 1, 1, valid_alt_X.shape[3]))
test_alt_X = test_alt_X.reshape((test_alt_X.shape[0], test_alt_X.shape[1], 1, 1, test_alt_X.shape[3]))

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_sc = scaler.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

In [None]:
X_sc.shape

In [None]:
def build_model(hp):
    """Construct and compile the tunable ConvLSTM2D network for Keras Tuner.

    Layout: two ConvLSTM2D stages (each followed by batch normalisation and
    0.2 dropout), a Lambda that keeps only the last time step, and a
    single-filter 1x1 Conv2D output layer.  Filter counts, kernel sizes,
    activations and the Adam learning rate are all drawn from ``hp``.

    Args:
        hp: hyperparameter object supplied by the tuner (``Int`` / ``Choice``).

    Returns:
        The compiled Sequential model (MSE loss).
    """
    activation_options = ('relu', 'tanh', 'softmax', 'sigmoid', 'leaky_relu', 'swish')
    net = Sequential()

    # Stage 1 -- the input shape is hard-coded to (12, 1, 1, 360).
    stage1_filters = hp.Int('filters_1', min_value=16, max_value=64, step=16)
    stage1_kernel = hp.Choice('kernel_size_1', values=[1, 1])
    stage1_activation = hp.Choice('activation_function_1', values=list(activation_options))
    net.add(ConvLSTM2D(
        filters=stage1_filters,
        kernel_size=stage1_kernel,
        input_shape=(12, 1, 1, 360),
        padding='same',
        return_sequences=True,
        activation=stage1_activation,
    ))
    net.add(BatchNormalization())
    net.add(Dropout(0.2))

    # Stage 2 -- still returns the full sequence; the Lambda below trims it.
    stage2_filters = hp.Int('filters_2', min_value=16, max_value=64, step=16)
    stage2_kernel = hp.Choice('kernel_size_2', values=[1, 1])
    stage2_activation = hp.Choice('activation_function_2', values=list(activation_options))
    net.add(ConvLSTM2D(
        filters=stage2_filters,
        kernel_size=stage2_kernel,
        padding='same',
        return_sequences=True,
        activation=stage2_activation,
    ))
    net.add(BatchNormalization())
    net.add(Dropout(0.2))

    # Keep only the final time step: [batch, height, width, channels].
    net.add(Lambda(lambda x: x[:, -1, :, :, :]))

    # Single-frame output head: one channel, 1x1 kernel.
    head_activation = hp.Choice('activation_function_3', values=list(activation_options))
    net.add(Conv2D(
        filters=1,
        kernel_size=(1, 1),
        activation=head_activation,
        padding='same',
        data_format='channels_last',
    ))

    # Adam with a tunable learning rate; MSE is the optimisation target.
    learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='mse',
        metrics=['mae', 'mse', 'mape', 'accuracy'],
    )

    return net

In [None]:
print(X_train.shape)  # Check the shape of the training data
print(X_valid.shape) 

In [None]:
batch_size = 32  # Adjust based on memory and GPU capability

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.cache().shuffle(1000).batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
valid_dataset = valid_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_dataset = test_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
# Define callbacks
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
checkpoint_cb = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

In [None]:
from keras_tuner import BayesianOptimization

# Define the tuner
tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='alt_train',
    project_name='alt_train_conv3dlstm_tuning',
    #directory='ch4_train',
    #project_name='ch4_train_conv3dlstm_tuning',
    overwrite=True,
)

In [None]:
print(X_train.shape)  # Check the shape of the training data
print(X_valid.shape)  # Check the shape of the validation data

In [None]:
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1, X_train.shape[3]))
X_valid = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1, 1, X_valid.shape[3]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1, X_test.shape[3]))

In [None]:
print(X_train.shape)  # Check the shape of the training data
print(X_valid.shape)  # Check the shape of the validation data

In [None]:
#tuner.search(train_dataset, epochs=100, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb])
tuner.search(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
best_model = tuner.get_best_models(num_models=1)[0]

In [None]:
plot_model(best_model, to_file='/Volumes/JPL/alt_model.png', show_shapes=True, show_layer_names=True, dpi=300)

In [None]:
best_model.summary()

In [None]:
tuner.results_summary()

In [None]:
# Evaluate the best model on the test data
test_loss = best_model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')

In [None]:
# Continue training (fine-tuning) the best model for additional epochs.
# The original call repeated the `validation_data` keyword (a SyntaxError) and
# passed `epochs2`, which is not a Model.fit argument.
# NOTE(review): `additional_epochs` is not defined in the visible cells --
# define it before running this cell.
history = best_model.fit(
    X_train, y_train,
    epochs=additional_epochs,
    validation_data=(X_valid, y_valid),
)

In [None]:
# Report the final score on the held-out test split (the original evaluated
# `train_dataset` while labelling the result "Test").
# evaluate() returns the loss followed by one value per compiled metric, so
# only the first two (loss, mae) are named and the rest are discarded.
final_test_loss, final_test_mae, *_ = best_model.evaluate(X_test, y_test)
print(f"Final Test Loss: {final_test_loss}, Final Test MAE: {final_test_mae}")

In [None]:
predictions = best_model.predict(test_dataset)

# If using a scaler, inverse transform the predictions and actual values
predictions_original = scaler_alt.inverse_transform(predictions.reshape(-1, 1)).reshape(predictions.shape)
actual_values_original = scaler_alt.inverse_transform(y_test.reshape(-1, 1)).reshape(y_test.shape)

# Compare first few predictions with actual values
for i in range(5):
    print(f"Prediction {i+1}: {predictions_original[i, :, :, 0]}")
    print(f"Actual Value {i+1}: {actual_values_original[i, :, :, 0]}")
    
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Predicted (Original Scale)')
plt.imshow(predictions_original[0, :, :, 0], cmap='viridis')
plt.colorbar()

plt.subplot(1, 2, 2)
plt.title('Actual (Original Scale)')
plt.imshow(actual_values_original[0, :, :, 0], cmap='viridis')
plt.colorbar()

plt.show()

In [None]:
X_train
y_train

In [None]:
df_grouped = ch4.groupby(['datetime', 'lat', 'lon']).agg({'fch4': ['mean', 'std', 'min', 'max', 'count']}).reset_index()
df_grouped.columns = ['datetime', 'lat', 'lon', 'fch4_mean', 'fch4_std', 'fch4_min', 'fch4_max', 'fch4_count']

In [None]:
df_grouped = co2.groupby(['datetime', 'lat', 'lon']).agg({'fc': ['mean', 'std', 'min', 'max', 'count']}).reset_index()
df_grouped.columns = ['datetime', 'lat', 'lon', 'fc_mean', 'fc_std', 'fc_min', 'fc_max', 'fc_count']

In [None]:
df_pivoted = ch4.pivot_table(index=['lat', 'lon'], columns='datetime', values='fch4').reset_index()

In [None]:
df_pivoted = co2.pivot_table(index=['lat', 'lon'], columns='datetime', values='fc').reset_index()

In [None]:
df_pivoted.interpolate(method='linear', limit_direction='both', inplace=True)

In [None]:
df_pivoted

In [None]:
df_pivoted.iloc[:,174:223]

In [None]:
train_df=df_pivoted.iloc[:,:174]
valid_df=pd.concat([df_pivoted.iloc[:,:2],df_pivoted.iloc[:,174:223]],axis=1)
test_df=pd.concat([df_pivoted.iloc[:,:2],df_pivoted.iloc[:,223:]],axis=1)

In [None]:
lat_lon_columns = ['lat', 'lon']
time_series_columns = train_df.columns.difference(lat_lon_columns)
data = train_df[time_series_columns].values
sequence_length = 12  # for example, using 12 months to predict the next month
X_train = []
y_train = []

for i in range(len(data) - sequence_length):
    X_train.append(data[i:i+sequence_length])
    y_train.append(data[i+sequence_length])

X_train = np.array(X_train)
y_train = np.array(y_train)

lat_lon_values = train_df[lat_lon_columns].values[:len(X_train)]
lat_lon_repeated = np.repeat(lat_lon_values[:, np.newaxis, :], sequence_length, axis=1)

X_train = np.concatenate([lat_lon_repeated, X_train], axis=2)

In [None]:
lat_lon_columns = ['lat', 'lon']
time_series_columns = valid_df.columns.difference(lat_lon_columns)
data = valid_df[time_series_columns].values
sequence_length = 12  # for example, using 12 months to predict the next month
X_valid = []
y_valid = []

for i in range(len(data) - sequence_length):
    X_valid.append(data[i:i+sequence_length])
    y_valid.append(data[i+sequence_length])

X_valid = np.array(X_valid)
y_valid = np.array(y_valid)

lat_lon_values = valid_df[lat_lon_columns].values[:len(X_valid)]
lat_lon_repeated = np.repeat(lat_lon_values[:, np.newaxis, :], sequence_length, axis=1)

X_valid = np.concatenate([lat_lon_repeated, X_valid], axis=2)

In [None]:
lat_lon_columns = ['lat', 'lon']
time_series_columns = test_df.columns.difference(lat_lon_columns)
data = test_df[time_series_columns].values
sequence_length = 12  # for example, using 12 months to predict the next month
X_test = []
y_test = []

for i in range(len(data) - sequence_length):
    X_test.append(data[i:i+sequence_length])
    y_test.append(data[i+sequence_length])

X_test = np.array(X_test)
y_test = np.array(y_test)

lat_lon_values = test_df[lat_lon_columns].values[:len(X_test)]
lat_lon_repeated = np.repeat(lat_lon_values[:, np.newaxis, :], sequence_length, axis=1)

X_test = np.concatenate([lat_lon_repeated, X_test], axis=2)

In [None]:
print("X_train shape:", X_train.shape)  # Expected: (num_samples, sequence_length, num_features + 2)
print("y_train shape:", y_train.shape)  # Expected: (num_samples, num_features)

In [None]:
print("X_valid shape:", X_valid.shape)  # Expected: (num_samples, sequence_length, num_features + 2)
print("y_valid shape:", y_valid.shape)  # Expected: (num_samples, num_features)

In [None]:
print("X_test shape:", X_test.shape)  # Expected: (num_samples, sequence_length, num_features + 2)
print("y_test shape:", y_test.shape)  # Expected: (num_samples, num_features)

In [None]:
27*12

In [None]:
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1, X_train.shape[2]))
X_valid_reshaped = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1, 1, X_valid.shape[2]))
X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1, X_test.shape[2]))

In [None]:
def build_model(hp):
    """Create the Keras Tuner hypermodel: stacked ConvLSTM2D with a Conv2D head.

    Same layout as the earlier tunable model, except that the input shape is
    taken from the module-level ``X_train_reshaped`` array (its axis 1 and
    axis 4) instead of being hard-coded.

    Args:
        hp: hyperparameter object supplied by the tuner (``Int`` / ``Choice``).

    Returns:
        The compiled Sequential model (MSE loss).
    """
    candidate_activations = ('relu', 'tanh', 'softmax', 'sigmoid', 'leaky_relu', 'swish')
    network = Sequential()

    # First ConvLSTM2D stage; declares the model's input shape.
    filters_first = hp.Int('filters_1', min_value=16, max_value=64, step=16)
    kernel_first = hp.Choice('kernel_size_1', values=[1, 1])
    sequence_shape = (X_train_reshaped.shape[1], 1, 1, X_train_reshaped.shape[4])
    activation_first = hp.Choice('activation_function_1', values=list(candidate_activations))
    network.add(ConvLSTM2D(
        filters=filters_first,
        kernel_size=kernel_first,
        input_shape=sequence_shape,
        padding='same',
        return_sequences=True,
        activation=activation_first,
    ))
    network.add(BatchNormalization())
    network.add(Dropout(0.2))

    # Second ConvLSTM2D stage; the sequence is trimmed by the Lambda below.
    filters_second = hp.Int('filters_2', min_value=16, max_value=64, step=16)
    kernel_second = hp.Choice('kernel_size_2', values=[1, 1])
    activation_second = hp.Choice('activation_function_2', values=list(candidate_activations))
    network.add(ConvLSTM2D(
        filters=filters_second,
        kernel_size=kernel_second,
        padding='same',
        return_sequences=True,
        activation=activation_second,
    ))
    network.add(BatchNormalization())
    network.add(Dropout(0.2))

    # Drop the time axis by selecting the last step:
    # [batch_size, height, width, channels].
    network.add(Lambda(lambda x: x[:, -1, :, :, :]))

    # Output head producing a single-channel frame.
    activation_out = hp.Choice('activation_function_3', values=list(candidate_activations))
    network.add(Conv2D(
        filters=1,
        kernel_size=(1, 1),
        activation=activation_out,
        padding='same',
        data_format='channels_last',
    ))

    # Compile with a tunable Adam learning rate.
    adam = tf.keras.optimizers.Adam(
        hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
    )
    network.compile(optimizer=adam, loss='mse', metrics=['mae', 'mse', 'mape', 'accuracy'])

    return network

In [None]:
def build_model(hp):
    """Build the fixed-configuration ConvLSTM2D model.

    ``hp`` is accepted so the function can still be handed to a tuner, but it
    is not consulted: every hyperparameter is pinned (64 then 32 filters,
    1x1 kernels, swish / relu / tanh activations, Adam at 1e-4).  The input
    shape comes from the module-level ``X_train_reshaped`` array.

    Returns:
        The compiled Sequential model (MSE loss).
    """
    input_dims = (X_train_reshaped.shape[1], 1, 1, X_train_reshaped.shape[4])
    net = Sequential()

    # Stage 1: 64 filters with swish activation.
    net.add(ConvLSTM2D(
        filters=64,
        kernel_size=(1, 1),
        input_shape=input_dims,
        padding='same',
        return_sequences=True,
        activation='swish',
    ))
    net.add(BatchNormalization())
    net.add(Dropout(0.2))

    # Stage 2: 32 filters with relu activation.
    net.add(ConvLSTM2D(
        filters=32,
        kernel_size=(1, 1),
        padding='same',
        return_sequences=True,
        activation='relu',
    ))
    net.add(BatchNormalization())
    net.add(Dropout(0.2))

    # Keep the last time step only: [batch_size, height, width, channels].
    net.add(Lambda(lambda x: x[:, -1, :, :, :]))

    # Single-channel 1x1 convolution as the prediction head.
    net.add(Conv2D(
        filters=1,
        kernel_size=(1, 1),
        activation='tanh',
        padding='same',
        data_format='channels_last',
    ))

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    net.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=['mae', 'mse', 'mape', 'accuracy'],
    )
    return net

In [None]:
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1, X_train.shape[2]))
X_valid_reshaped = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1, 1, X_valid.shape[2]))
X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1, X_test.shape[2]))

In [None]:
batch_size = 32  # Adjust based on memory and GPU capability

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.cache().shuffle(1000).batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
valid_dataset = valid_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_dataset = test_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
# Define callbacks
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
checkpoint_cb = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

In [None]:
from keras_tuner import BayesianOptimization

# Define the tuner
tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=1,
    executions_per_trial=1,
    directory='ch4_train',
    project_name='ch4_train_conv3dlstm_tuning',
    overwrite=True,
)

In [None]:
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1, X_train.shape[2]))
X_valid_reshaped = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1, 1, X_valid.shape[2]))
X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1, X_test.shape[2]))

In [None]:
#tuner.search(train_dataset, epochs=10, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb])
tuner.search(X_train_reshaped, y_train, epochs=10, validation_data=(X_valid_reshaped, y_valid), callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
model = Sequential()

# ConvLSTM2D layer
model.add(ConvLSTM2D(filters=64, kernel_size=(1, 1), input_shape=(X_train_reshaped.shape[1], 1, 1, X_train_reshaped.shape[4]),
                     padding='same', return_sequences=True))
model.add(Dropout(0.2))

model.add(ConvLSTM2D(filters=64, kernel_size=(1, 1), padding='same', return_sequences=False))
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(1))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae','accuracy','mse'])

In [None]:
plot_model(model, to_file='/Volumes/JPL/ch4_model.png', show_shapes=True, show_layer_names=True, dpi=300)

In [None]:
model = Sequential()
model.add(LSTM(units=32, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=32, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint('/Volumes/JPL/ch4_train/model.keras', save_best_only=True, monitor='val_loss', mode='min')

tf.profiler.experimental.start('logdir')

history = model.fit(
    X_train_reshaped, y_train,
    epochs=10,
    batch_size=16,
    validation_data=(X_valid_reshaped, y_valid),
    callbacks=[early_stopping, checkpoint],
    verbose=1
)

tf.profiler.experimental.stop()

In [None]:
tf.profiler.experimental.stop()

In [None]:
# Evaluate the model on the test set
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}, Test MAE: {test_mae}')


In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# You may want to inverse the scaling if you applied scaling to your data


In [None]:
# alt_valid.interpolate(method='linear', limit_direction='forward').dropna()
# alt_valid_pivot = alt_valid.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
# alt_valid_pivot = alt_valid_pivot.fillna(0)
# alt_valid_data = alt_valid_pivot.values
# alt_valid_data = alt_valid_data.reshape((alt_valid_data.shape[0], alt_valid_data.shape[1], 1))

In [None]:
def create_sliding_windows(data, window_size):
    """Split ``data`` into fixed-length windows and their following values.

    For each start offset, the window covers ``window_size`` consecutive
    entries along the first axis and the label is the entry right after it.

    Returns:
        ``(X, y)`` as NumPy arrays.
    """
    pairs = [
        (data[offset:offset + window_size], data[offset + window_size])
        for offset in range(len(data) - window_size)
    ]
    windows = [window for window, _ in pairs]
    labels = [label for _, label in pairs]
    return np.array(windows), np.array(labels)

In [None]:
window_size = 12  # Adjust based on the length of the sequence you want to forecast
combined_df_X, combined_df_y = create_sliding_windows(combined_df_data, window_size)

In [None]:
# window_size = 12  # Adjust based on the length of the sequence you want to forecast
# alt_valid_X, alt_valid_y = create_sliding_windows(alt_valid_data, window_size)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(combined_df_X, combined_df_y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.15, random_state=42)
#X_train=alt_train_X; y_train=alt_train_y; X_valid=alt_valid_X; y_valid=alt_valid_y

In [None]:
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

In [None]:
pd.DataFrame(y_train.reshape(-1)).replace(0,np.nan).dropna().reset_index(drop=True).plot();

In [None]:
#combined_df_pivot.iloc[147] #1996-06-01 to 2015-04-01
#combined_df_pivot.iloc[147:147+44] #2015-05-01 to 2019-03-01
#combined_df_pivot.iloc[147+44:] #2019-04-01 to 2022-10-01

In [None]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler()
X_train_sc = X_scaler.fit_transform(X_train.reshape(-1,1)).reshape(X_train.shape)
X_valid_sc = X_scaler.transform(X_valid.reshape(-1,1)).reshape(X_valid.shape)
X_test_sc = X_scaler.transform(X_test.reshape(-1,1)).reshape(X_test.shape)

In [None]:
y_scaler = MinMaxScaler()
y_train_sc = y_scaler.fit_transform(y_train.reshape(-1, 1)).reshape(y_train.shape)
y_valid_sc = y_scaler.transform(y_valid.reshape(-1, 1)).reshape(y_valid.shape)
y_test_sc = y_scaler.transform(y_test.reshape(-1, 1)).reshape(y_test.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, Flatten, Dense
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, ConvLSTM2D, BatchNormalization, Lambda
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras_tuner import BayesianOptimization

window_size = 12

def build_model(hp):
    """Build and compile a tunable ConvLSTM2D regressor for Keras Tuner.

    The search space covers the ConvLSTM2D kernel height (1 or 3; the width
    is fixed at 1), the filter count, the activation and the Adam learning
    rate.  The input shape is derived from the module-level ``window_size``
    and ``combined_df_data``.

    Args:
        hp: hyperparameter object supplied by the tuner (``Int`` / ``Choice``).

    Returns:
        The compiled Sequential model (MSE loss).
    """
    model = Sequential()
    kernel_height = hp.Choice('kernel_height', values=[1, 3])
    kernel_width = hp.Choice('kernel_width', values=[1])
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_1', min_value=32, max_value=128, step=32),
        kernel_size=(kernel_height, kernel_width),
        activation=hp.Choice('activation_function_1', values=['relu','tanh','linear','sigmoid','leaky_relu','swish']),
        input_shape=(window_size, combined_df_data.shape[1], combined_df_data.shape[2], 1)))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1))
    # Compile exactly once.  A preliminary compile with the default 'adam'
    # optimizer used to precede this call and was immediately overridden.
    # NOTE(review): 'accuracy' carries little meaning for an MSE regression;
    # it is kept so the list returned by evaluate() keeps its length.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
        ),
        loss='mse',
        metrics=['mae','mape','accuracy']
    )
    return model

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))

In [None]:
valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))

In [None]:
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [None]:
batch_size = 32

In [None]:
train_dataset = train_dataset.cache().shuffle(1000).batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
valid_dataset = valid_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
# Define callbacks
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
checkpoint_cb = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

In [None]:
from keras_tuner import BayesianOptimization

# Define the tuner
tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='/Volumes/JPL/ch4_train',
    project_name='ch4_train_conv3dlstm_tuning',
    overwrite=True,
)

In [None]:
tuner.search(train_dataset, epochs=10, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
best_model = tuner.get_best_models(num_models=1)[0]

In [None]:
# evaluate() returns the loss followed by one value per compiled metric; the
# model is compiled with more than one metric, so a two-name unpack fails.
# Keep the first two values (loss, mae) and discard the rest.
test_loss, test_mae, *_ = best_model.evaluate(test_dataset)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

In [None]:
# Validate on the held-out validation split.  The original tuple
# `(, y_train)` was a SyntaxError and paired the missing inputs with the
# training targets.
tuner.search(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
# Make predictions on new data
# predictions = best_model.predict(new_data)

In [None]:
#plt.plot(history.history['loss']);
#plt.plot(history.history['val_loss']);

In [None]:
# Score the tuned model on the test split (the original evaluated
# `train_dataset` while labelling the result "Test").  evaluate() returns the
# loss plus one value per compiled metric, so the extras are discarded.
final_test_loss, final_test_mae, *_ = best_model.evaluate(test_dataset)
print(f"Final Test Loss: {final_test_loss}, Final Test MAE: {final_test_mae}")

In [None]:
y_pred = model.predict(X_test_sc)
y_pred_original = y_scaler.inverse_transform(y_pred)

In [None]:
y_test_rescaled = y_scaler.inverse_transform(y_test_sc.reshape(-1,1))

In [None]:
plt.plot(y_pred_original);

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Bring predictions and targets back to the original units.  y_scaler was
# fitted on a single flattened column, so both arrays are flattened to
# (-1, 1) before the inverse transform.
y_pred_rescaled = y_scaler.inverse_transform(y_pred.reshape(-1, 1))
# Invert the *scaled* targets: y_test itself is already in original units,
# so inverse-transforming it would apply the scaling a second time.
y_test_rescaled = y_scaler.inverse_transform(y_test_sc.reshape(-1, 1))

# Calculate the MSE and MAE on the rescaled data
# NOTE(review): this assumes the model emits one prediction per target
# element -- confirm the two arrays have the same length.
mse_rescaled = np.mean((y_pred_rescaled - y_test_rescaled) ** 2)
mae_rescaled = np.mean(np.abs(y_pred_rescaled - y_test_rescaled))


In [None]:
# `X_test_scaled` is never defined; the scaled test inputs produced by
# X_scaler are stored in `X_test_sc`.
y_pred = model.predict(X_test_sc)
y_pred_original = y_scaler.inverse_transform(y_pred)

In [None]:
alt_train['lat_grid'] = alt_train['lat'].round(3)  # or use a more sophisticated spatial binning technique
alt_train['lon_grid'] = alt_train['lon'].round(3)
alt_train_agg = alt_train.groupby(['datetime', 'lat_grid', 'lon_grid']).mean().reset_index()

In [None]:
def aggregate_temporal_features(df, past_n_months=12):
    """Attach a lagged copy of 'alt' to every grid cell's time series.

    Rows are ordered by grid cell and then by 'datetime' before the lag is
    taken, so 'alt_shifted' holds the value found ``past_n_months`` rows
    earlier in time for the same (lat_grid, lon_grid) cell.  The lag counts
    rows, so it equals months only when each cell has one row per month
    without gaps.

    The previous version computed the sorted frame but shifted the unsorted
    one, so the lag followed arbitrary row order.  It also wrote the new
    column into the caller's frame; the input is now left untouched.

    Args:
        df: frame with 'lat_grid', 'lon_grid', 'datetime' and 'alt' columns.
        past_n_months: number of rows to lag within each grid cell.

    Returns:
        A new DataFrame sorted by (lat_grid, lon_grid, datetime), with the
        'alt_shifted' column added and every row containing a NaN dropped.
    """
    group_keys = ['lat_grid', 'lon_grid']
    # sort_values returns a new frame, so adding a column does not alter df.
    df_sorted = df.sort_values(by=group_keys + ['datetime'])
    df_sorted['alt_shifted'] = df_sorted.groupby(group_keys)['alt'].shift(past_n_months)
    return df_sorted.dropna()

In [None]:
alt_train_features = aggregate_temporal_features(alt_train_agg)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
alt_train_features['alt_scaled'] = scaler.fit_transform(alt_train_features['alt'].values.reshape(-1, 1))

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
alt_train_features['alt_scaled'] = scaler.fit_transform(alt_train_features['alt'].values.reshape(-1, 1))

In [None]:
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler_alt = StandardScaler()
train_alt_scaled = scaler_alt.fit_transform(alt_train[['alt']])
val_alt_scaled = scaler_alt.transform(alt_valid[['alt']])
test_alt_scaled = scaler_alt.transform(alt_test[['alt']])
alt_train['alt']=train_alt_scaled
alt_valid['alt']=val_alt_scaled
alt_test['alt']=test_alt_scaled

In [None]:
#combined_df = pd.concat([alt_train, alt_valid, alt_test])

In [None]:
#combined_df=combined_df.groupby(['datetime','lat','lon'])['alt'].min().reset_index().set_index('datetime').resample('MS').min().dropna().reset_index()

In [None]:
alt_train

In [None]:
alt_train_lat_grid = np.sort(alt_train['lat'].unique())
alt_train_lon_grid = np.sort(alt_train['lon'].unique())

In [None]:
alt_valid_lat_grid = np.sort(alt_valid['lat'].unique())
alt_valid_lon_grid = np.sort(alt_valid['lon'].unique())

In [None]:
alt_test_lat_grid = np.sort(alt_test['lat'].unique())
alt_test_lon_grid = np.sort(alt_test['lon'].unique())

In [None]:
def pivot_dataframe(df, lat_grid, lon_grid):
    """Rasterise long-format ``alt`` records into a (time, lat, lon, 1) array.

    ``df`` needs ``datetime``, ``lat``, ``lon`` and ``alt`` columns. The result
    has one slice per unique ``datetime`` (in order of first appearance in
    ``df``), one row per entry of ``lat_grid`` and one column per entry of
    ``lon_grid``. Duplicate (datetime, lat, lon) records are averaged by
    ``pivot_table``; grid cells with no observation are filled with 0.
    """
    # Full lat x lon product, lat-major, so the flat columns reshape cleanly.
    full_columns = pd.MultiIndex.from_product([lat_grid, lon_grid])
    time_index = df['datetime'].unique()

    wide = (
        df.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')
        .reindex(index=time_index, columns=full_columns)
        .fillna(0)
    )

    cube = wide.values.reshape(len(wide), len(lat_grid), len(lon_grid))

    # Trailing singleton axis is the channel dimension (alt only).
    return np.expand_dims(cube, axis=-1)

In [None]:
# All three splits are rasterised onto the TRAINING grid so their arrays share
# one spatial shape; build_model sizes its input from lat_grid / lon_grid.
# Validation/test records at coordinates absent from the training grid are
# dropped by the reindex inside pivot_dataframe.
lat_grid, lon_grid = alt_train_lat_grid, alt_train_lon_grid
alt_train_array = pivot_dataframe(alt_train, lat_grid, lon_grid)
alt_valid_array = pivot_dataframe(alt_valid, lat_grid, lon_grid)
alt_test_array = pivot_dataframe(alt_test, lat_grid, lon_grid)

In [None]:
def create_sequences(data_array, time_steps):
    """Slice a time-ordered array into sliding windows and next-step targets.

    Window ``k`` covers ``data_array[k : k + time_steps]`` and its target is
    ``data_array[k + time_steps]``. Returns ``(X, y)`` as NumPy arrays; when
    ``data_array`` has ``time_steps`` or fewer entries both are empty.
    """
    n_windows = len(data_array) - time_steps
    starts = range(n_windows)

    windows = [data_array[k:k + time_steps] for k in starts]
    targets = [data_array[k + time_steps] for k in starts]

    return np.array(windows), np.array(targets)

In [None]:
time_steps = 12
X_train, y_train = create_sequences(alt_train_array, time_steps)
X_valid, y_valid = create_sequences(alt_valid_array, time_steps)
X_test, y_test = create_sequences(alt_test_array, time_steps)

# Print shapes for verification
# print("X_train shape:", X_train.shape)
# print("y_train shape:", y_train.shape)
# print("X_valid shape:", X_valid.shape)
# print("y_valid shape:", y_valid.shape)
# print("X_test shape:", X_test.shape)
# print("y_test shape:", y_test.shape)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, ConvLSTM2D, BatchNormalization, Lambda
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras_tuner import BayesianOptimization

# def residual_block(x, filters, kernel_size, hp):
#     """
#     A Residual Block with ConvLSTM2D layers.
#     """
#     shortcut = x
    
#     # First ConvLSTM2D layer
#     x = ConvLSTM2D(
#         filters=filters,
#         kernel_size=kernel_size,
#         padding='same',
#         return_sequences=True,
#         activation=hp.Choice('activation_function_1', values=['relu', 'tanh', 'sigmoid', 'leaky_relu', 'swish', 'linear'])
#     )(x)
    
#     x = BatchNormalization()(x)
    
#     # Second ConvLSTM2D layer
#     x = ConvLSTM2D(
#         filters=filters,
#         kernel_size=kernel_size,
#         padding='same',
#         return_sequences=True,
#         activation=hp.Choice('activation_function_2', values=['relu', 'tanh', 'sigmoid', 'leaky_relu', 'swish', 'linear'])
#     )(x)
    
#     x = BatchNormalization()(x)
    
#     # Convolutional layer in the shortcut path to match the filter size of x
#     if shortcut.shape[-1] != filters:
#         shortcut = ConvLSTM2D(
#             filters=filters,
#             kernel_size=(1, 1),
#             padding='same',
#             return_sequences=True,
#             activation='linear'
#         )(shortcut)
    
#     # Add the shortcut to the main path
#     x = Add()([x, shortcut])
#     x = Activation('relu')(x)
    
#     return x

# def build_model(hp):
#     input_shape = (time_steps, len(lat_grid), len(lon_grid), 1)
#     inputs = Input(shape=input_shape)
    
#     # Initial ConvLSTM2D layer
#     x = ConvLSTM2D(
#         filters=hp.Int('filters_1', min_value=64, max_value=128, step=32),
#         kernel_size=hp.Choice('kernel_size_1', values=[3, 5]),
#         padding='same',
#         return_sequences=True,
#         activation=hp.Choice('activation_function_3', values=['relu','tanh','sigmoid','leaky_relu','swish','linear']),
#     )(inputs)
    
#     x = BatchNormalization()(x)    
#     # Residual Block 1
#     x = residual_block(x, filters=hp.Int('filters_2', min_value=64, max_value=128, step=32), kernel_size=hp.Choice('kernel_size_2', values=[3, 5]), hp=hp)
    
#     # Residual Block 2
#     x = residual_block(x, filters=hp.Int('filters_3', min_value=64, max_value=128, step=32), kernel_size=hp.Choice('kernel_size_3', values=[3, 5]), hp=hp)
    
#     # Reduce the time dimension
#     x = Lambda(lambda x: x[:, -1, :, :, :])(x)
    
#     # 3D Convolutional Layer to combine features
#     x = Conv2D(
#         filters=hp.Int('filters_4', min_value=32, max_value=128, step=32),
#         kernel_size=(3, 3),
#         padding='same',
#         activation=hp.Choice('activation_function_4', values=['relu','tanh','sigmoid','leaky_relu','swish','linear']),
#     )(x)
    
#     x = BatchNormalization()(x)
    
#     # Final Conv2D layer for output
#     outputs = Conv2D(
#         filters=1,
#         kernel_size=(3, 3),
#         padding='same',
#         activation=hp.Choice('activation_function_5', values=['relu','tanh','sigmoid','leaky_relu','swish','linear']),
#     )(x)
    
#     # Compile the model
#     model = Model(inputs, outputs)
#     model.compile(
#         optimizer=tf.keras.optimizers.Adam(
#             hp.Choice('learning_rate', values=[1e-4, 1e-5])
#         ),
#         loss='mse',
#         metrics=['mae']
#     )
    
#     return model
    
# def build_model(hp):
#     model = Sequential()
    
#     # Add more layers or units
#     model.add(ConvLSTM2D(
#         filters=hp.Int('filters_1', min_value=32, max_value=128, step=32),
#         kernel_size=hp.Choice('kernel_size_1', values=[3, 5]),
#         input_shape=(time_steps, len(lat_grid), len(lon_grid), 1),
#         padding='same', 
#         return_sequences=True,
#         activation=hp.Choice('activation_function_1', values=['relu','tanh','sigmoid','leaky_relu','swish','linear']),
#     ))
#     model.add(BatchNormalization())
    
#     # More ConvLSTM layers
#     model.add(ConvLSTM2D(
#         filters=hp.Int('filters_2', min_value=32, max_value=128, step=32), 
#         kernel_size=hp.Choice('kernel_size_2', values=[3, 5]),
#         padding='same', 
#         return_sequences=False,
#         activation=hp.Choice('activation_function_1', values=['relu','tanh','sigmoid','leaky_relu','swish','linear']),
#     ))
#     model.add(BatchNormalization())

#     # Final output layer
#     model.add(Conv2D(
#         filters=1, 
#         kernel_size=(3, 3), 
#         activation=hp.Choice('activation_function_1', values=['relu','tanh','sigmoid','leaky_relu','swish','linear']),
#         padding='same',
#     ))

#     model.compile(
#         optimizer=tf.keras.optimizers.Adam(
#             hp.Choice('learning_rate', values=[1e-4, 1e-5])
#         ),
#         loss='mse',
#         metrics=['mae']
#     )
    
#     return model


def build_model(hp):
    """Build and compile a ConvLSTM2D next-frame regressor for keras_tuner.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Supplies the filter counts, kernel sizes, activations and learning
        rate sampled for the current trial.

    Returns
    -------
    A compiled ``Sequential`` model (MSE loss, MAE metric) taking input of
    shape ``(time_steps, len(lat_grid), len(lon_grid), 1)`` and emitting a
    single-channel frame. ``time_steps``, ``lat_grid`` and ``lon_grid`` are
    read from the notebook's global scope.
    """
    hidden_activations = ['relu', 'tanh', 'softmax', 'sigmoid', 'leaky_relu', 'swish']
    # Softmax normalises across the channel axis; with a single output filter
    # it would emit a constant 1.0 everywhere, so it is not offered for the
    # output layer. 'linear' is offered instead because the targets are
    # standardised and may be negative.
    output_activations = ['relu', 'tanh', 'linear', 'sigmoid', 'leaky_relu', 'swish']

    model = Sequential()

    model.add(ConvLSTM2D(
        filters=hp.Int('filters_1', min_value=16, max_value=64, step=16),
        kernel_size=hp.Choice('kernel_size_1', values=[3, 5]),
        input_shape=(time_steps, len(lat_grid), len(lon_grid), 1),
        padding='same',
        return_sequences=True,
        activation=hp.Choice('activation_function_1', values=hidden_activations)
    ))
    model.add(BatchNormalization())

    model.add(ConvLSTM2D(
        filters=hp.Int('filters_2', min_value=16, max_value=64, step=16),
        kernel_size=hp.Choice('kernel_size_2', values=[3, 5]),
        padding='same',
        return_sequences=True,
        activation=hp.Choice('activation_function_2', values=hidden_activations)
    ))
    model.add(BatchNormalization())

    # Keep only the last time step: [batch, time, H, W, C] -> [batch, H, W, C]
    model.add(Lambda(lambda x: x[:, -1, :, :, :]))

    # Single-filter Conv2D head so the output matches the one-channel target frame
    model.add(Conv2D(
        filters=1,
        kernel_size=(3, 3),
        activation=hp.Choice('activation_function_3', values=output_activations),
        padding='same',
        data_format='channels_last'
    ))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
        ),
        loss='mse',
        metrics=['mae']
    )

    return model

In [None]:
batch_size = 32  # Adjust based on memory and GPU capability

# Training pipeline: cache, shuffle with a 1000-element buffer, batch, prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .cache()
    .shuffle(1000)
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

# Validation and test pipelines keep the original sample order (no shuffle).
valid_dataset = (
    tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

test_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [None]:
# Define callbacks
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
checkpoint_cb = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

In [None]:
from keras_tuner import BayesianOptimization

# Define the tuner
tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='/Volumes/JPL/alt_train3',
    project_name='alt_train_conv3dlstm_tuning',
    overwrite=True,
)

In [None]:
tuner.search(train_dataset, epochs=10, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb])

In [None]:
best_model = tuner.get_best_models(num_models=1)[0]

In [None]:
test_loss, test_mae = best_model.evaluate(test_dataset)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

In [None]:
history=best_model.fit(train_dataset, epochs=10, validation_data=valid_dataset, callbacks=[early_stopping_cb, lr_scheduler_cb, checkpoint_cb])

In [None]:
# best_model.fit(np.concatenate([train_alt_scaled, val_alt_scaled]), np.concatenate([y_train_alt, y_val_alt]), 
#                epochs=100, validation_split=0.2)

In [None]:
# Metrics on the TRAINING pipeline (scaled units); the label says so to avoid
# confusing them with the held-out test metrics. Variable names are kept
# because later cells rebind them.
final_test_loss, final_test_mae = best_model.evaluate(train_dataset)
print(f"Final Train Loss: {final_test_loss}, Final Train MAE: {final_test_mae}")

In [None]:
import numpy as np

# train_dataset is built with .shuffle(...), so iterating it yields targets in
# a different order from X_train. Take the scaled targets straight from
# y_train so they stay aligned with X_train when the two are paired again.
y_train_scaled = np.asarray(y_train)
print("Scaled y_train shape:", y_train_scaled.shape)

In [None]:
# Undo the target scaling with scaler_alt: inverse_transform expects a single
# column, so flatten to (-1, 1) and then restore the original array shape.
y_train_original = scaler_alt.inverse_transform(y_train_scaled.reshape(-1, 1)).reshape(y_train_scaled.shape)
print("Original y_train shape:", y_train_original.shape)

In [None]:
# # Reconstruct the dataset with the original y values
train_dataset_inverse_transformed = tf.data.Dataset.from_tensor_slices((X_train, y_train_original))
train_dataset_inverse_transformed = train_dataset_inverse_transformed.batch(batch_size)

In [None]:
# best_model emits values in scaler_alt's scaled space, so scoring its raw
# output against original-unit targets mixes two scales. Bring predictions and
# targets back to original units first, then compute MSE / MAE directly.
# y_train is used (not a dataset iteration) so targets stay aligned with X_train.
train_pred_scaled = best_model.predict(X_train, batch_size=batch_size)
train_pred_original = scaler_alt.inverse_transform(train_pred_scaled.reshape(-1, 1)).reshape(train_pred_scaled.shape)
train_true_original = scaler_alt.inverse_transform(np.asarray(y_train).reshape(-1, 1)).reshape(np.shape(y_train))
final_test_loss = float(np.mean((train_pred_original - train_true_original) ** 2))
final_test_mae = float(np.mean(np.abs(train_pred_original - train_true_original)))
print(f"Final Train Loss (original units): {final_test_loss}, Final Train MAE (original units): {final_test_mae}")

In [None]:
# Metrics on the VALIDATION pipeline (scaled units); labelled accordingly.
# Variable names are kept because later cells rebind them.
final_test_loss, final_test_mae = best_model.evaluate(valid_dataset)
print(f"Final Validation Loss: {final_test_loss}, Final Validation MAE: {final_test_mae}")

In [None]:
import numpy as np

# Collect the scaled targets (y values) batch by batch from valid_dataset
y_valid_scaled = []

for _, y in valid_dataset:
    y_valid_scaled.append(y.numpy())

# Stitch the per-batch arrays back together along the sample axis
y_valid_scaled = np.concatenate(y_valid_scaled, axis=0)
print("Scaled y_valid shape:", y_valid_scaled.shape)

In [None]:
# Undo the target scaling with scaler_alt: inverse_transform expects a single
# column, so flatten to (-1, 1) and then restore the original array shape.
y_valid_original = scaler_alt.inverse_transform(y_valid_scaled.reshape(-1, 1)).reshape(y_valid_scaled.shape)
print("Original y_valid shape:", y_valid_original.shape)

In [None]:
# # Reconstruct the dataset with the original y values
valid_dataset_inverse_transformed = tf.data.Dataset.from_tensor_slices((X_valid, y_valid_original))
valid_dataset_inverse_transformed = valid_dataset_inverse_transformed.batch(batch_size)

In [None]:
# best_model emits values in scaler_alt's scaled space, so scoring its raw
# output against original-unit targets mixes two scales. Bring predictions and
# targets back to original units first, then compute MSE / MAE directly.
valid_pred_scaled = best_model.predict(X_valid, batch_size=batch_size)
valid_pred_original = scaler_alt.inverse_transform(valid_pred_scaled.reshape(-1, 1)).reshape(valid_pred_scaled.shape)
valid_true_original = scaler_alt.inverse_transform(np.asarray(y_valid).reshape(-1, 1)).reshape(np.shape(y_valid))
final_test_loss = float(np.mean((valid_pred_original - valid_true_original) ** 2))
final_test_mae = float(np.mean(np.abs(valid_pred_original - valid_true_original)))
print(f"Final Validation Loss (original units): {final_test_loss}, Final Validation MAE (original units): {final_test_mae}")

In [None]:
final_test_loss, final_test_mae = best_model.evaluate(test_dataset)
print(f"Final Test Loss: {final_test_loss}, Final Test MAE: {final_test_mae}")

In [None]:
import numpy as np

# Collect the scaled targets (y values) batch by batch from test_dataset
y_test_scaled = []

for _, y in test_dataset:
    y_test_scaled.append(y.numpy())

# Stitch the per-batch arrays back together along the sample axis
y_test_scaled = np.concatenate(y_test_scaled, axis=0)
print("Scaled y_test shape:", y_test_scaled.shape)

In [None]:
# Undo the target scaling with scaler_alt: inverse_transform expects a single
# column, so flatten to (-1, 1) and then restore the original array shape.
y_test_original = scaler_alt.inverse_transform(y_test_scaled.reshape(-1, 1)).reshape(y_test_scaled.shape)
print("Original y_test shape:", y_test_original.shape)

In [None]:
# # Reconstruct the dataset with the original y values
test_dataset_inverse_transformed = tf.data.Dataset.from_tensor_slices((X_test, y_test_original))
test_dataset_inverse_transformed = test_dataset_inverse_transformed.batch(batch_size)

In [None]:
# best_model emits values in scaler_alt's scaled space, so scoring its raw
# output against original-unit targets mixes two scales. Bring predictions and
# targets back to original units first, then compute MSE / MAE directly.
test_pred_scaled = best_model.predict(X_test, batch_size=batch_size)
test_pred_original = scaler_alt.inverse_transform(test_pred_scaled.reshape(-1, 1)).reshape(test_pred_scaled.shape)
test_true_original = scaler_alt.inverse_transform(np.asarray(y_test).reshape(-1, 1)).reshape(np.shape(y_test))
final_test_loss = float(np.mean((test_pred_original - test_true_original) ** 2))
final_test_mae = float(np.mean(np.abs(test_pred_original - test_true_original)))
print(f"Final Test Loss (original units): {final_test_loss}, Final Test MAE (original units): {final_test_mae}")

In [None]:
predictions = best_model.predict(test_dataset)

# If using a scaler, inverse transform the predictions and actual values
predictions_original = scaler_alt.inverse_transform(predictions.reshape(-1, 1)).reshape(predictions.shape)
actual_values_original = scaler_alt.inverse_transform(y_test.reshape(-1, 1)).reshape(y_test.shape)

# Compare first few predictions with actual values
for i in range(5):
    print(f"Prediction {i+1}: {predictions_original[i, :, :, 0]}")
    print(f"Actual Value {i+1}: {actual_values_original[i, :, :, 0]}")

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Predicted (Original Scale)')
plt.imshow(predictions_original[0, :, :, 0], cmap='viridis')
plt.colorbar()

plt.subplot(1, 2, 2)
plt.title('Actual (Original Scale)')
plt.imshow(actual_values_original[0, :, :, 0], cmap='viridis')
plt.colorbar()

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Flatten once and reuse; ravel() avoids the extra copies that repeated
# flatten() calls make on these large arrays.
actual_flat = y_test_original.ravel()
predicted_flat = predictions_original.ravel()

# Calculate residuals
residuals = actual_flat - predicted_flat

# Plot residuals
plt.figure(figsize=(10, 6))
plt.hist(residuals, bins=50, color='blue', alpha=0.7)
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()

# Scatter plot of actual vs predicted, with the 1:1 reference line in red.
# ndarray.min()/max() run in native code; the builtin min()/max() would walk
# every element through the Python interpreter.
actual_lo, actual_hi = actual_flat.min(), actual_flat.max()
plt.figure(figsize=(10, 6))
plt.scatter(actual_flat, predicted_flat, alpha=0.3)
plt.plot([actual_lo, actual_hi], [actual_lo, actual_hi], color='red')
plt.title('Actual vs Predicted')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()


In [None]:
actual_values_original.mean()

In [None]:
predictions.mean()

In [None]:
best_model.summary()

In [None]:
# Import here so this cell runs on its own; the only other visible import of
# plot_model sits in a later cell.
from tensorflow.keras.utils import plot_model

plot_model(best_model, to_file='/Volumes/JPL/alt_model.png', show_shapes=True, show_layer_names=True, dpi=300)

In [None]:
tuner.results_summary()

In [None]:
best_model.history.model.history.params

In [None]:
predictions_scaled = best_model.predict(test_dataset)

# 5. Inverse Transform the Predictions Back to the Original Scale
# Reshape predictions to be 2D for inverse transform, then reshape back to original shape
predictions_reshaped = predictions_scaled.reshape(-1, 1)
predictions_original = scaler_alt.inverse_transform(predictions_reshaped).reshape(predictions_scaled.shape)

# Example: Compare the first prediction with the actual value
print("First prediction (original scale):", predictions_original[0])
print("First actual value (original scale):", scaler_alt.inverse_transform(y_test.reshape(-1, 1)).reshape(y_test.shape)[0])

# Visualizing predictions and actual values
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.title('Predicted (Original Scale)')
plt.imshow(predictions_original[0, :, :, 0], cmap='viridis')
plt.colorbar()

plt.subplot(1, 2, 2)
plt.title('Actual (Original Scale)')
plt.imshow(scaler_alt.inverse_transform(y_test.reshape(-1, 1)).reshape(y_test.shape)[0, :, :, 0], cmap='viridis')
plt.colorbar()

plt.show()

In [None]:
predictions_scaled.shape

In [None]:
predictions_reshaped.shape

In [None]:
predictions_original.shape

In [None]:
# import shap

# # Assuming ensemble_model is already trained and X_train_scaled is your standardized data
# explainer = shap.KernelExplainer(best_model.predict, X_train)
# shap_values = explainer.shap_values(X_train.reshape(3068,363*367))

# # Global importance plot
# shap.summary_plot(shap_values, X_train, feature_names=['alt'])

# # Dependence plot for a specific feature (e.g., alt)
# shap.dependence_plot('alt', shap_values, X_train)


In [None]:
import shap
from shap import kmeans
import numpy as np

# Using shap.sample to reduce the background dataset
#background_data = shap.sample(X_train.reshape(3068,363*367))  # Reduce to 100 samples for background
# Flatten each sample to one row; the row/column counts are taken from X_train
# itself rather than hard-coded, then 50 rows are drawn as background.
background_data = shap.sample(X_train.reshape(X_train.shape[0], -1), 50)

# Or using shap.kmeans for clustering
#background_data = shap.kmeans(X_train.reshape(3068,363*367), k=100)  # K-means clustering to find 100 representative samples

In [None]:
X_test_flat = X_test.reshape(X_test.shape[0], -1)
background_data_flat = background_data.reshape(background_data.shape[0], -1)

In [None]:
# Restore the per-sample shape the model expects (everything after the sample
# axis of X_train) instead of hard-coding the sample count and grid size.
background_data = background_data.reshape((-1,) + X_train.shape[1:])

In [None]:
explainer = shap.KernelExplainer(best_model.predict, background_data)

In [None]:
shap_values = explainer.shap_values(X_test_flat)

In [None]:
shap.summary_plot(shap_values, X_test_flat)

In [None]:
shap_values = explainer.shap_values(X_test)

In [None]:
# Global interpretation
shap.summary_plot(shap_values, X_test.reshape(687,363*367), feature_names=['alt'])

# Local interpretation for the first instance in the test set
shap.force_plot(explainer.expected_value, shap_values[0], X_test[0])


In [None]:
shap_values = explainer.shap_values(X_test)

# Visualize the SHAP values with a summary plot
shap.summary_plot(shap_values, X_test, feature_names=['alt'])

# For local explanations, you can visualize specific instances
shap.force_plot(explainer.expected_value, shap_values[0], X_test[0])

In [None]:
explainer = shap.DeepExplainer(best_model, background_data)

# DeepExplainer feeds the samples through best_model itself, so they must keep
# the model's input shape; a flattened (n_samples, n_features) view of X_test
# does not match it.
# NOTE(review): assumes background_data has already been reshaped to the
# model's input shape - confirm.
shap_values = explainer.shap_values(X_test)

In [None]:
X_train.reshape(3068,363*367)

In [None]:
from tensorflow.keras.utils import plot_model

plot_model(best_model, to_file='alt_model.png', show_shapes=True, show_layer_names=True, dpi=300)

In [None]:
import tensorflow as tf

logdir = "logs/ensemble_model/"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

ensemble_model.fit(X_train, y_train, epochs=10, callbacks=[tensorboard_callback])

%load_ext tensorboard
%tensorboard --logdir logs/ensemble_model/

In [None]:
import shap

# Assuming ensemble_model is already trained and X_train_scaled is your standardized data
explainer = shap.KernelExplainer(ensemble_model.predict, X_train_scaled)
shap_values = explainer.shap_values(X_train_scaled)

# Global importance plot
shap.summary_plot(shap_values, X_train_scaled, feature_names=['alt', 'fch4', 'fc'])

# Dependence plot for a specific feature (e.g., alt)
shap.dependence_plot('alt', shap_values, X_train_scaled)


In [None]:
# Explain a single prediction
instance_index = 0
shap.force_plot(explainer.expected_value, shap_values[instance_index], X_train_scaled[instance_index])


In [None]:
import lime
import lime.lime_tabular

# Create a LIME explainer
explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=['alt', 'fch4', 'fc'], class_names=['target'], verbose=True, mode='regression')

# Explain a single prediction
instance_index = 0
exp = explainer.explain_instance(X_train_scaled[instance_index], ensemble_model.predict, num_features=5)

# Display the explanation
exp.show_in_notebook(show_table=True)


In [None]:
# SHAP force plot for a single instance
shap.force_plot(explainer.expected_value, shap_values[instance_index], X_train_scaled[instance_index])

# LIME explanation for the same instance
exp = explainer.explain_instance(X_train_scaled[instance_index], ensemble_model.predict, num_features=5)
exp.show_in_notebook(show_table=True)


In [None]:
# If the meta-model is an ensemble of sub-models, you can calculate SHAP values for each sub-model's output
meta_explainer = shap.KernelExplainer(ensemble_model.predict, [X_train_scaled_convlstm, X_train_scaled_transformer, X_train_scaled_vae])
meta_shap_values = meta_explainer.shap_values([X_test_scaled_convlstm, X_test_scaled_transformer, X_test_scaled_vae])

# Visualize the importance of each component model's contribution
shap.summary_plot(meta_shap_values, [X_test_scaled_convlstm, X_test_scaled_transformer, X_test_scaled_vae], feature_names=['ConvLSTM', 'Transformer', 'VAE'])


In [None]:
plot_model(ensemble_model, to_file='ensemble_model.png', show_shapes=True, show_layer_names=True)

In [None]:
alt_train.alt.plot();

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler_alt = MinMaxScaler()
train_alt_scaled = scaler_alt.fit_transform(alt_train[['alt']])
val_alt_scaled = scaler_alt.transform(alt_valid[['alt']])
test_alt_scaled = scaler_alt.transform(alt_test[['alt']])

In [None]:
alt_train['alt']=train_alt_scaled
alt_valid['alt']=val_alt_scaled
alt_test['alt']=test_alt_scaled

In [None]:
alt_train.alt.plot();

In [None]:
lat_grid = np.sort(alt_train['lat'].unique())
lon_grid = np.sort(alt_train['lon'].unique())

In [None]:
alt_train_pivot = alt_train.pivot_table(index='datetime', columns=['lat', 'lon'], values='alt')

In [None]:
alt_train_pivot = alt_train_pivot.reindex(index=alt_train['datetime'].unique(), columns=pd.MultiIndex.from_product([lat_grid, lon_grid]))

In [None]:
alt_train_pivot = alt_train_pivot.fillna(0)

In [None]:
alt_train_arr = alt_train_pivot.values.reshape(len(alt_train_pivot), len(lat_grid), len(lon_grid))

In [None]:
alt_train_arr = np.expand_dims(alt_train_arr, axis=-1)

In [None]:
alt_train_arr.shape

In [None]:
batch_size = 32 
time_steps = alt_train_arr.shape[0] // batch_size

In [None]:
alt_train_arr = alt_train_arr[:batch_size*time_steps].reshape(batch_size, time_steps, len(lat_grid), len(lon_grid), 1)

In [None]:
alt_train_arr.shape

In [None]:
time_steps = 10
batch_size, total_time_steps, height, width, depth = alt_train_arr.shape

X_train = []
y_train = []

for i in range(total_time_steps - time_steps):
    X_seq = alt_train_arr[:, i:i+time_steps, :, :, :]
    y_seq = alt_train_arr[:, i+time_steps, :, :, :]
    X_train.append(X_seq)
    y_train.append(y_seq)

X_train = np.array(X_train)
y_train = np.array(y_train)

X_train = X_train.reshape(-1, time_steps, height, width, depth)

y_train = y_train.reshape(-1, height, width, depth)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, ConvLSTM2D, BatchNormalization, Conv3DTranspose, TimeDistributed
from keras_tuner import BayesianOptimization

def build_model(hp):
    """Build and compile a ConvLSTM2D next-frame regressor for keras_tuner.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Supplies the filter counts, kernel sizes and learning rate sampled for
        the current trial.

    Returns
    -------
    A compiled ``Sequential`` model (MSE loss, MAE metric) that accepts
    variable-length sequences of ``(len(lat_grid), len(lon_grid), 1)`` frames
    and emits one single-channel frame with a sigmoid activation.
    ``lat_grid`` and ``lon_grid`` are read from the notebook's global scope.
    """
    # Imported locally because this cell's import line does not bring in the
    # two layers used for the output head.
    from tensorflow.keras.layers import Conv2D, Lambda

    model = Sequential()

    # ConvLSTM2D Layer 1. The spatial size follows the grid the training array
    # was built on rather than fixed numbers, so the model tracks the data.
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_1', min_value=16, max_value=64, step=16),
        kernel_size=hp.Choice('kernel_size_1', values=[3, 5]),
        input_shape=(None, len(lat_grid), len(lon_grid), 1),
        padding='same',
        return_sequences=True,
        activation='relu'
    ))
    model.add(BatchNormalization())

    # ConvLSTM2D Layer 2
    model.add(ConvLSTM2D(
        filters=hp.Int('filters_2', min_value=16, max_value=64, step=16),
        kernel_size=hp.Choice('kernel_size_2', values=[3, 5]),
        padding='same',
        return_sequences=True,
        activation='relu'
    ))
    model.add(BatchNormalization())

    # Keep only the last time step: [batch, time, H, W, C] -> [batch, H, W, C]
    model.add(Lambda(lambda x: x[:, -1, :, :, :]))

    # Conv2D Layer for output
    model.add(Conv2D(
        filters=1,
        #filters=hp.Int('filters_3', min_value=1, max_value=16, step=5), 
        kernel_size=(3, 3),
        activation='sigmoid',
        padding='same',
        data_format='channels_last'
    ))

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
        ),
        loss='mse',
        metrics=['mae']
    )

    return model


In [None]:
from keras_tuner import BayesianOptimization

# Define the tuner
tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=20,  # Number of hyperparameter combinations to try
    executions_per_trial=10,  # Number of times to train the model per trial
    directory='/Volumes/JPL/alt_train',
    project_name='alt_train_conv3dlstm_tuning'
)

In [None]:
tuner.search_space_summary()

In [None]:
# Splitting the data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Run the hyperparameter search
tuner.search(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=16)

In [None]:
best_model = tuner.get_best_models(num_models=1)[0]

# Summary of the best model
best_model.summary()

In [None]:
newalt=alt.groupby(['datetime','lat', 'lon'])['alt'].mean().reset_index().replace(0,np.nan).dropna()
newalt['datetime']=pd.to_datetime(newalt['datetime'])
newalt=newalt.sort_values(by='datetime',ascending=True)
newalt=newalt.reset_index(drop=True)
newalt=newalt[['datetime','lat','lon','alt']]
newalt

In [None]:
ch4.fch4.plot();

In [None]:
from sklearn.model_selection import train_test_split
# Random 60/20/20 split of `alt`, each part then ordered by time and re-indexed.
train_alt, test_alt = train_test_split(alt, test_size=0.2, random_state=42)
train_alt, val_alt = train_test_split(train_alt, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2
train_alt = train_alt.sort_values(by='datetime', ascending=True).reset_index(drop=True)
val_alt = val_alt.sort_values(by='datetime', ascending=True).reset_index(drop=True)
test_alt = test_alt.sort_values(by='datetime', ascending=True).reset_index(drop=True)

In [None]:
train_alt.to_parquet('/Volumes/JPL/alt_train_random.parquet',engine='pyarrow',compression='snappy')
val_alt.to_parquet('/Volumes/JPL/alt_valid_random.parquet',engine='pyarrow',compression='snappy')
test_alt.to_parquet('/Volumes/JPL/alt_test_random.parquet',engine='pyarrow',compression='snappy')

In [None]:
from sklearn.model_selection import train_test_split
# Random 60/20/20 split of `ch4`, each part then ordered by time and re-indexed.
train_fch4, test_fch4 = train_test_split(ch4, test_size=0.2, random_state=42)
train_fch4, val_fch4 = train_test_split(train_fch4, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2
train_fch4 = train_fch4.sort_values(by='datetime', ascending=True).reset_index(drop=True)
val_fch4 = val_fch4.sort_values(by='datetime', ascending=True).reset_index(drop=True)
test_fch4 = test_fch4.sort_values(by='datetime', ascending=True).reset_index(drop=True)

In [None]:
ch4.fch4.plot();

In [None]:
train_fch4.to_parquet('/Volumes/JPL/fch4_train_random.parquet',engine='pyarrow',compression='snappy')
val_fch4.to_parquet('/Volumes/JPL/fch4_valid_random.parquet',engine='pyarrow',compression='snappy')
test_fch4.to_parquet('/Volumes/JPL/fch4_test_random.parquet',engine='pyarrow',compression='snappy')

In [None]:
with open('/Volumes/JPL/alt_train.parquet','rb') as f:
    alt_train=pd.read_parquet(f)

In [None]:
with open('/Volumes/JPL/alt_valid.parquet','rb') as f:
    alt_valid=pd.read_parquet(f)

In [None]:
with open('/Volumes/JPL/alt_test.parquet','rb') as f:
    alt_test=pd.read_parquet(f)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler_alt = StandardScaler()
train_alt_scaled = scaler_alt.fit_transform(alt_train[['alt']])
val_alt_scaled = scaler_alt.transform(alt_valid[['alt']])
test_alt_scaled = scaler_alt.transform(alt_test[['alt']])

In [None]:
alt_train['year'] = alt_train['datetime'].dt.year
alt_train['month'] = alt_train['datetime'].dt.month
alt_train['day'] = alt_train['datetime'].dt.day

In [None]:
alt_valid['year'] = alt_valid['datetime'].dt.year
alt_valid['month'] = alt_valid['datetime'].dt.month
alt_valid['day'] = alt_valid['datetime'].dt.day

In [None]:
alt_test['year'] = alt_test['datetime'].dt.year
alt_test['month'] = alt_test['datetime'].dt.month
alt_test['day'] = alt_test['datetime'].dt.day

In [None]:
alt_train['month_sin'] = np.sin(2 * np.pi * alt_train['month'] / 12)
alt_train['month_cos'] = np.cos(2 * np.pi * alt_train['month'] / 12)

In [None]:
alt_valid['month_sin'] = np.sin(2 * np.pi * alt_valid['month'] / 12)
alt_valid['month_cos'] = np.cos(2 * np.pi * alt_valid['month'] / 12)

In [None]:
alt_test['month_sin'] = np.sin(2 * np.pi * alt_test['month'] / 12)
alt_test['month_cos'] = np.cos(2 * np.pi * alt_test['month'] / 12)

In [None]:
alt_train['lat_sin'] = np.sin(np.radians(alt_train['lat']))
alt_train['lat_cos'] = np.cos(np.radians(alt_train['lat']))
alt_train['lon_sin'] = np.sin(np.radians(alt_train['lon']))
alt_train['lon_cos'] = np.cos(np.radians(alt_train['lon']))

In [None]:
alt_valid['lat_sin'] = np.sin(np.radians(alt_valid['lat']))
alt_valid['lat_cos'] = np.cos(np.radians(alt_valid['lat']))
alt_valid['lon_sin'] = np.sin(np.radians(alt_valid['lon']))
alt_valid['lon_cos'] = np.cos(np.radians(alt_valid['lon']))

In [None]:
alt_test['lat_sin'] = np.sin(np.radians(alt_test['lat']))
alt_test['lat_cos'] = np.cos(np.radians(alt_test['lat']))
alt_test['lon_sin'] = np.sin(np.radians(alt_test['lon']))
alt_test['lon_cos'] = np.cos(np.radians(alt_test['lon']))

In [None]:
alt_train_features = alt_train[['month_sin', 'month_cos', 'lat_sin', 'lat_cos', 'lon_sin', 'lon_cos', 'alt']]

In [None]:
alt_valid_features = alt_valid[['month_sin', 'month_cos', 'lat_sin', 'lat_cos', 'lon_sin', 'lon_cos', 'alt']]

In [None]:
alt_test_features = alt_test[['month_sin', 'month_cos', 'lat_sin', 'lat_cos', 'lon_sin', 'lon_cos', 'alt']]

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Fit the standardizer on the training `alt` column only, then apply the same
# mean/variance to every split so validation/test statistics never leak in.
scaler_alt = StandardScaler().fit(alt_train[['alt']])
train_alt_scaled, val_alt_scaled, test_alt_scaled = (
    scaler_alt.transform(_split[['alt']])
    for _split in (alt_train, alt_valid, alt_test)
)

In [None]:
# # df2=df_aggregated[['datetime','fch4']].set_index('datetime')
# newaltmax=alt.groupby(['datetime','lat', 'lon'])['alt'].max().reset_index().replace(0,np.nan).dropna()
# newaltmin=alt.groupby(['datetime','lat', 'lon'])['alt'].min().reset_index().replace(0,np.nan).dropna()
# newaltmean=alt.groupby(['datetime','lat', 'lon'])['alt'].mean().reset_index().replace(0,np.nan).dropna()

In [None]:
# newaltmax.alt.plot(alpha=0.2);
# newaltmean.alt.plot(alpha=0.2);
# newaltmin.alt.plot(alpha=0.2);

In [None]:
# newalt=alt.groupby(['datetime','lat', 'lon'])['alt'].mean().reset_index().replace(0,np.nan).dropna()
# newalt['datetime']=pd.to_datetime(newalt['datetime'])
# newalt=newalt.sort_values(by='datetime',ascending=True)
# newalt=newalt.reset_index(drop=True)
# newalt=newalt[['datetime','lat','lon','alt']]
# newalt

In [None]:
# newalt=pd.concat([pd.DataFrame(alt.set_index('datetime').resample('M').mean().reset_index()['lat'].interpolate()),
#            pd.DataFrame(alt.set_index('datetime').resample('M').mean().reset_index()['lon'].interpolate()),
#            pd.DataFrame(alt.set_index('datetime').resample('M').mean().reset_index()['datetime'].interpolate()),
#            pd.DataFrame(alt.set_index('datetime').resample('M').mean().reset_index()['alt'].interpolate())],axis=1)

In [None]:
# Reshape the flat feature table into (n_time, n_lat * n_lon, n_features).
# n_time, n_lat and n_lon must already be defined by an earlier cell; the
# reshape only succeeds when the row count equals n_time * n_lat * n_lon.
# NOTE(review): this is a plain row-major reshape, so it presumably relies on
# the rows already being ordered time-major, then by grid cell -- confirm.
X = alt_train_features.values.reshape(n_time, n_lat * n_lon, alt_train_features.shape[1])

In [None]:
alt_train_features

In [None]:
# StandardScaler is the scaler actually used below, so import it here as well;
# previously this cell only worked if an earlier cell had imported it.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardize every feature column (zero mean, unit variance) and rebuild a
# DataFrame with the original column names. The result gets a fresh 0..n-1 index.
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(alt_train_features), columns=alt_train_features.columns)

In [None]:
scaled_df

In [None]:
# 11 evenly spaced edges (i.e. 10 intervals) spanning each column's observed range.
lat_bins, lon_bins = (
    np.linspace(scaled_df[_col].min(), scaled_df[_col].max(), 11)
    for _col in ('lat_sin', 'lon_sin')
)

In [None]:
# Assign each row an integer bin index along both axes.
# The first bin edge equals the column minimum and pd.cut uses right-closed
# intervals by default, so without include_lowest=True the rows holding the
# minimum value fall outside every interval and receive NaN.
scaled_df['lat_bin'] = pd.cut(scaled_df['lat_sin'], bins=lat_bins, labels=False, include_lowest=True)
scaled_df['lon_bin'] = pd.cut(scaled_df['lon_sin'], bins=lon_bins, labels=False, include_lowest=True)

In [None]:
scaled_df

In [None]:
# Attach the timestamps positionally. scaled_df was rebuilt from a NumPy array
# and therefore has a fresh 0..n-1 index; assigning the Series directly would
# align on alt_train's index and yield NaT wherever the labels differ.
scaled_df['datetime'] = pd.to_datetime(alt_train['datetime']).to_numpy()
scaled_df = scaled_df.sort_values('datetime')  # chronological order

In [None]:
scaled_df.shape

In [None]:
time_steps = 226

In [None]:
# Number of (time_steps x 11 x 11) samples the table holds; a non-integer value
# means the rows do not tile the grid exactly. (The original line ended in a
# dangling "/" and was a syntax error.)
scaled_df.shape[0] / (time_steps * 121)

In [None]:
# 121 = 11 x 11 grid cells per time step (adjust together with the grid size below).
samples = scaled_df.shape[0] // (time_steps * 121)
X_reshaped = scaled_df[['month_sin', 'month_cos', 'lat_sin', 'lat_cos', 'lon_sin', 'lon_cos', 'alt']].values
# Drop trailing rows that do not fill a complete sample so the reshape cannot
# fail on a remainder; this is a no-op when the rows tile the grid exactly.
X_reshaped = X_reshaped[:samples * time_steps * 121]
# (samples, time, lat, lon, features) -- use time_steps rather than a second
# hard-coded copy of its value.
X_reshaped = X_reshaped.reshape((samples, time_steps, 11, 11, X_reshaped.shape[1]))

In [None]:
X_reshaped.shape

In [None]:
# Target laid out on the same (samples, time, lat, lon) grid as the inputs.
# The sample count is inferred (-1) instead of being hard-coded to 33, so the
# cell keeps working when the number of rows changes.
y_train = scaled_df['alt'].values.reshape(-1, time_steps, 11, 11, 1)

In [None]:
# Flatten to one target value per row; the row count is inferred rather than
# hard-coded (902418 = 33 * 226 * 11 * 11 in the original run).
# NOTE(review): this yields one target per grid cell and time step, whereas the
# ConvLSTM model defined below emits a single value per sample -- confirm the
# intended target before fitting.
y_train = y_train.reshape(-1, 1)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, Conv3DTranspose, ConvLSTM2D, Flatten, Dense

# Two stacked ConvLSTM layers over (time, 11, 11, 7) inputs; the second one
# returns only its final state, which is flattened and regressed to one value.
model = Sequential([
    ConvLSTM2D(filters=64, kernel_size=(3, 3), padding='same', return_sequences=True,
               input_shape=(time_steps, 11, 11, 7)),
    ConvLSTM2D(filters=64, kernel_size=(3, 3), padding='same', return_sequences=False),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='linear'),
])

model.compile(loss='mse', optimizer='adam')

In [None]:
model.summary()

In [None]:
# NOTE(review): X_reshaped has `samples` entries on its first axis while y_train
# was reshaped to one row per grid cell and time step; Keras requires the same
# number of samples in x and y, so confirm the target shape before running.
model.fit(X_reshaped, y_train, epochs=10, batch_size=32)

In [None]:
# Prototype: Conv3D feature extractor followed by an LSTM stack that maps a
# (n_time, n_lat, n_lon, 3) volume to three output values.
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Conv3D, Flatten, Reshape, Conv3DTranspose

# Define the input shape
# n_time, n_lat and n_lon must already be defined by an earlier cell.
input_shape = (n_time, n_lat, n_lon, 3)  # 3 channels for alt, fch4, and fc

# Build the model
input_layer = Input(shape=input_shape)

# Convolutional layers to capture spatiotemporal patterns
x = Conv3D(32, (3, 3, 3), activation='relu', padding='same')(input_layer)
x = Conv3D(64, (3, 3, 3), activation='relu', padding='same')(x)
x = Flatten()(x)

# Recurrent layer to capture temporal dependencies
# The flattened volume is regrouped into n_time steps; each step carries every
# spatial position and channel as its feature vector.
x = Reshape((n_time, -1))(x)
x = LSTM(128, return_sequences=True)(x)
x = LSTM(64)(x)

# Dense layer for prediction
output_layer = Dense(3, activation='linear')(x)  # Output layer for alt, fch4, fc

# Define the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mse')

# Print the model summary
model.summary()

# Convert the dataframe to the input format for the model
# NOTE(review): this array has no leading sample axis, whereas the model was
# built with Input(shape=input_shape) and therefore expects
# (batch, n_time, n_lat, n_lon, 3) -- confirm and add the batch dimension.
input_data = combined_df[['alt', 'fch4', 'fc']].values.reshape((n_time, n_lat, n_lon, 3))

# Train the model
# NOTE(review): the same array is passed as the target, but the model output is
# (batch, 3); the target shape does not match the output -- confirm the
# intended label before running.
model.fit(input_data, input_data, epochs=10, batch_size=16)


In [None]:
#pd.Series(train_alt_scaled.reshape(700253)).plot();

In [None]:
import keras_tuner as kt
from tensorflow.keras import layers

def build_model(hp):
    """Build and compile a two-layer ConvLSTM2D regressor for KerasTuner.

    Args:
        hp: KerasTuner hyperparameter container. Registers ``'units'`` (filter
            count shared by both ConvLSTM layers, 32..512 in steps of 32) and
            ``'dropout'`` (0.0..0.5 in steps of 0.1).

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam, MSE loss, MAE metric)
        ending in a single linear output unit.
    """
    def _filters():
        # Both ConvLSTM layers request the same named hyperparameter.
        return hp.Int('units', min_value=32, max_value=512, step=32)

    conv_kwargs = dict(kernel_size=(3, 3), padding='same')
    network = tf.keras.Sequential([
        layers.ConvLSTM2D(filters=_filters(), return_sequences=True, **conv_kwargs),
        layers.BatchNormalization(),
        layers.Dropout(hp.Float('dropout', 0.0, 0.5, step=0.1)),
        layers.ConvLSTM2D(filters=_filters(), return_sequences=False, **conv_kwargs),
        layers.Flatten(),
        layers.Dense(1),
    ])
    network.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return network

# Bayesian search over build_model's hyperparameters: 10 trials, each trained
# 3 times, ranked by validation loss. Results are written under my_dir/bayesian_tuning.
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=3,
    directory='my_dir',
    project_name='bayesian_tuning',
)

# Run the search on the scaled ALT splits.
tuner.search(
    train_alt_scaled,
    y_train_alt,
    epochs=50,
    validation_data=(val_alt_scaled, y_val_alt),
)

# Keep the single best model found by the search.
best_model = tuner.get_best_models(num_models=1)[0]


In [None]:
# Continue training the best model on train + validation data combined;
# Keras holds out the last 20% of the combined arrays for validation.
_X_refit = np.concatenate([train_alt_scaled, val_alt_scaled])
_y_refit = np.concatenate([y_train_alt, y_val_alt])
best_model.fit(_X_refit, _y_refit, epochs=100, validation_split=0.2)

# Report loss and MAE on the held-out test split.
test_loss, test_mae = best_model.evaluate(test_alt_scaled, y_test_alt)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, ConvLSTM2D, BatchNormalization, Dropout, Flatten, Dense
from tensorflow.keras.models import Model

# ConvLSTM Model
def create_convlstm_model(input_shape):
    """Return an uncompiled two-layer ConvLSTM2D regressor.

    Args:
        input_shape: Per-sample shape passed to ``Input`` (batch axis excluded).

    Returns:
        A Keras ``Model`` mapping the input to a single output unit.
    """
    inputs = Input(shape=input_shape)
    stack = (
        ConvLSTM2D(filters=64, kernel_size=(3, 3), padding='same', return_sequences=True),
        BatchNormalization(),
        Dropout(0.2),
        ConvLSTM2D(filters=64, kernel_size=(3, 3), padding='same', return_sequences=False),
        Flatten(),
        Dense(1),
    )
    tensor = inputs
    for layer in stack:
        tensor = layer(tensor)
    return Model(inputs, tensor)

# Example usage
input_shape = (None, 10, 10, 1)  # Adjust based on data dimensions
convlstm_model = create_convlstm_model(input_shape)
convlstm_model.compile(optimizer='adam', loss='mse')
convlstm_model.summary()


In [None]:
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Dense, Input, Flatten
from tensorflow.keras.models import Model

# Transformer Block
def transformer_block(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one self-attention + feed-forward block with residual connections.

    Args:
        inputs: Input tensor; also used as both query and value for attention.
        head_size: ``key_dim`` of each attention head.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used after attention and inside the feed-forward part.

    Returns:
        A tensor with the same last dimension as ``inputs``.
    """
    # Self-attention branch: attention -> dropout -> layer norm, then residual add.
    attended = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(inputs, inputs)
    attended = Dropout(dropout)(attended)
    attended = LayerNormalization(epsilon=1e-6)(attended)
    skip = attended + inputs

    # Feed-forward branch projected back to the input width, then residual add.
    hidden = Dense(ff_dim, activation="relu")(skip)
    hidden = Dropout(dropout)(hidden)
    projected = Dense(inputs.shape[-1])(hidden)
    normalized = LayerNormalization(epsilon=1e-6)(projected)
    return normalized + skip

# Transformer Model
def create_transformer_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units, dropout=0):
    """Return an uncompiled transformer regressor with an MLP head.

    Args:
        input_shape: Per-sample shape passed to ``Input`` (batch axis excluded).
        head_size, num_heads, ff_dim, dropout: Forwarded to ``transformer_block``.
        num_transformer_blocks: How many blocks to stack.
        mlp_units: Iterable of hidden widths for the dense head; each dense
            layer is followed by dropout.

    Returns:
        A Keras ``Model`` ending in a single output unit.
    """
    inputs = Input(shape=input_shape)

    encoded = inputs
    block_index = 0
    while block_index < num_transformer_blocks:
        encoded = transformer_block(encoded, head_size, num_heads, ff_dim, dropout)
        block_index += 1

    head = Flatten()(encoded)
    for width in mlp_units:
        head = Dropout(dropout)(Dense(width, activation="relu")(head))

    return Model(inputs, Dense(1)(head))

# Example usage
# NOTE(review): the leading None leaves the first per-sample axis undefined, and
# create_transformer_model flattens before its Dense head; Dense needs a known
# last dimension, so this shape presumably has to be fully specified -- confirm.
input_shape = (None, 10, 10)  # Adjust based on data dimensions
transformer_model = create_transformer_model(input_shape, head_size=256, num_heads=4, ff_dim=4, num_transformer_blocks=4, mlp_units=[128, 64], dropout=0.1)
transformer_model.compile(optimizer='adam', loss='mse')
transformer_model.summary()


In [None]:
from tensorflow.keras.layers import Lambda, Conv2D, Conv2DTranspose, Reshape
from tensorflow.keras import backend as K

# Sampling function for VAE
def sampling(args):
    """Draw a latent sample via the reparameterization trick.

    Args:
        args: Pair ``(z_mean, z_log_var)`` of tensors.

    Returns:
        ``z_mean + exp(0.5 * z_log_var) * epsilon`` with ``epsilon`` drawn from
        a standard normal of shape (batch, latent_dim).
    """
    mu, log_var = args
    noise_shape = (K.shape(mu)[0], K.int_shape(mu)[1])
    noise = K.random_normal(shape=noise_shape)
    sigma = K.exp(0.5 * log_var)
    return mu + sigma * noise

# VAE Model
def create_vae(input_shape, latent_dim):
    """Build a convolutional variational autoencoder with its loss attached.

    The decoder previously hard-coded a 64x64 single-channel output, so any
    other ``input_shape`` produced a reconstruction whose shape did not match
    the input. The decoder dimensions are now derived from ``input_shape``;
    for ``(64, 64, 1)`` the network is identical to the original.

    Args:
        input_shape: ``(height, width, channels)`` of one input sample.
        latent_dim: Size of the latent vector.

    Returns:
        A Keras ``Model`` named ``'vae'`` mapping inputs to reconstructions,
        with the reconstruction + KL loss registered through ``add_loss``.
    """
    height, width, channels = input_shape[0], input_shape[1], input_shape[-1]

    # ----- Encoder: two 'same'-padded convolutions, then a dense bottleneck.
    inputs = Input(shape=input_shape)
    x = Conv2D(32, 3, padding='same', activation='relu')(inputs)
    x = Conv2D(64, 3, padding='same', activation='relu')(x)
    x = Flatten()(x)
    x = Dense(16, activation='relu')(x)

    z_mean = Dense(latent_dim)(x)
    z_log_var = Dense(latent_dim)(x)
    z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

    encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')

    # ----- Decoder: expand the latent vector to a (height, width, 64) feature
    # map; all transposed convolutions are stride-1 / 'same', so the spatial
    # size is preserved through to the output.
    latent_inputs = Input(shape=(latent_dim,))
    x = Dense(height * width * 64, activation='relu')(latent_inputs)
    x = Reshape((height, width, 64))(x)
    x = Conv2DTranspose(64, 3, padding='same', activation='relu')(x)
    x = Conv2DTranspose(32, 3, padding='same', activation='relu')(x)
    outputs = Conv2DTranspose(channels, 3, padding='same', activation='sigmoid')(x)

    decoder = Model(latent_inputs, outputs, name='decoder')

    # Index 2 of the encoder outputs is the sampled latent vector z.
    outputs = decoder(encoder(inputs)[2])
    vae = Model(inputs, outputs, name='vae')

    # VAE loss function: scaled MSE reconstruction term plus KL divergence.
    reconstruction_loss = tf.keras.losses.mse(K.flatten(inputs), K.flatten(outputs))
    reconstruction_loss *= input_shape[0] * input_shape[1]
    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5
    vae_loss = K.mean(reconstruction_loss + kl_loss)
    vae.add_loss(vae_loss)
    return vae

# Example usage
input_shape = (64, 64, 1)  # Adjust based on data dimensions
latent_dim = 2
vae_model = create_vae(input_shape, latent_dim)
vae_model.compile(optimizer='adam')
vae_model.summary()


In [None]:
from tensorflow.keras.layers import Concatenate

def create_ensemble_model(convlstm_model, transformer_model, vae_model):
    """Combine three sub-models into one regressor with a shared dense head.

    ``Concatenate`` needs inputs of equal rank, but the sub-model outputs can
    differ (e.g. a (batch, 1) regression head next to an image-shaped VAE
    reconstruction). Any output with more than two axes is flattened first;
    rank-2 outputs are passed through unchanged.

    Args:
        convlstm_model: Keras model providing the first branch.
        transformer_model: Keras model providing the second branch.
        vae_model: Keras model providing the third branch.

    Returns:
        An uncompiled Keras ``Model`` taking the three sub-model inputs (in the
        order above) and producing a single output unit.
    """
    # Local import keeps this cell runnable on its own.
    from tensorflow.keras.layers import Flatten

    branches = []
    for sub_model in (convlstm_model, transformer_model, vae_model):
        branch = sub_model.output
        if len(branch.shape) > 2:
            branch = Flatten()(branch)
        branches.append(branch)

    combined_input = Concatenate()(branches)
    x = Dense(128, activation="relu")(combined_input)
    x = Dropout(0.2)(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.2)(x)
    outputs = Dense(1)(x)
    ensemble_model = Model(inputs=[convlstm_model.input, transformer_model.input, vae_model.input], outputs=outputs)
    return ensemble_model

# Example usage
ensemble_model = create_ensemble_model(convlstm_model, transformer_model, vae_model)
ensemble_model.compile(optimizer='adam', loss='mse')
ensemble_model.summary()


In [None]:
# Assuming X_train and y_train are preprocessed and ready

# Example training call
ensemble_model.fit([X_train_convlstm, X_train_transformer, X_train_vae], y_train, epochs=50, batch_size=32, validation_split=0.2)


### Arrays

In [None]:
#IN SITU
reframed_alt.shape, type(reframed_alt), reframed_ch4.shape, type(reframed_ch4), reframed_co2.shape, type(reframed_co2)

In [None]:
#SIBBORK
sibbork.shape, type(sibbork)

In [None]:
#TCFM


In [None]:
#AVIRISNG
aviris_arr_list
aviris_arr_list[0].shape, type(aviris_arr_list[0])
#aviris_arr_list[0].shape, type(aviris_arr_list[0])
#aviris_arr_list[1].shape, type(aviris_arr_list[0])
#aviris_arr_list[2].shape, type(aviris_arr_list[0])
#aviris_arr_list[3].shape, type(aviris_arr_list[0])
#aviris_raster_list[0].shape, aviris_arr_list[0].shape #filtered by goodbandlist

In [None]:
#UAVSAR
#plt.imshow(uavsar_arr_list[0][0])
uavsar_arr_list
uavsar_arr_list[0].shape, type(uavsar_arr_list[0])

### Workflow

In [None]:
#X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

In [None]:
# X_train_reframed=X_train_scaled.reshape(1772960,1,94);
# X_valid_reframed=X_valid.values.reshape(453144,1,94);
# X_test_reframed=X_test.values.reshape(215137,1,94)
# #X_train_reframed.shape, X_valid_reframed.shape, X_test_reframed.shape

In [None]:
# y_train_reframed=y_train.values.reshape(1772960,1,1);
# y_valid_reframed=y_valid.values.reshape(453144,1,1);
# y_test_reframed=y_test.values.reshape(215137,1,1)

In [None]:
# X_train_reframed_sup=X_train_reframed_sup.reshape(1772957,1,376);
# X_valid_reframed_sup=X_valid_reframed_sup.reshape(453141,1,376);
# X_test_reframed_sup=X_test_reframed_sup.reshape(215134,1,376)
# #X_train_reframed.shape, X_valid_reframed.shape, X_test_reframed.shape

In [None]:
# y_train_reframed_sup=y_train_reframed_sup.reshape(7091828,1,1);
# y_valid_reframed_sup=y_valid_reframed_sup.reshape(1812564,1,1);
# y_test_reframed_sup=y_test_reframed_sup.reshape(860536,1,1)
# #y_train_reframed.shape, y_valid_reframed.shape, y_test_reframed.shape

In [None]:
# X_train_reframed_sup_tensor=tf.convert_to_tensor(X_train_reframed_sup)#.reshape(1772957, 1, 376))
# y_train_reframed_sup_tensor=tf.convert_to_tensor(y_train_reframed_sup)#.reshape(1772957, 1, 4))
# X_valid_reframed_sup_tensor=tf.convert_to_tensor(X_valid_reframed_sup)#.reshape(453141, 1, 376))
# y_valid_reframed_sup_tensor=tf.convert_to_tensor(y_valid_reframed_sup)#.reshape(453141, 1, 4))
# X_test_reframed_sup_tensor=tf.convert_to_tensor(X_test_reframed_sup)#.reshape(215134, 1, 376))
# y_test_reframed_sup_tensor=tf.convert_to_tensor(y_test_reframed_sup)#.reshape(215134, 1, 4))

### ALT

In [None]:
# Symbolic input sized from axes 1 and 2 of the ALT training tensor.
# `inputs` is a module-level name that geocryoai.build reads for its
# input_shape; the CH4, CO2 and Hyperparameterization cells below rebind it,
# so the cell executed last determines the shape the model is built with.
inputs = keras.Input(shape=(trainXscaltref.shape[1],trainXscaltref.shape[2]))
inputs.shape

In [None]:
trainXscaltref.shape

In [None]:
inputs

### CH4

In [None]:
# Symbolic input sized from axes 1 and 2 of the CH4 training tensor.
# Rebinds the module-level `inputs` that geocryoai.build reads.
inputs = keras.Input(shape=(trainXscch4ref.shape[1],trainXscch4ref.shape[2]))
inputs.shape

In [None]:
trainXscch4ref.shape

In [None]:
inputs

### CO2

In [None]:
# Symbolic input sized from axes 1 and 2 of the CO2 training tensor.
# Rebinds the module-level `inputs` that geocryoai.build reads.
inputs = keras.Input(shape=(trainXscco2ref.shape[1],trainXscco2ref.shape[2]))
inputs.shape

In [None]:
trainXscco2ref.shape

In [None]:
inputs

### Hyperparameterization

In [None]:
# Rebinds the module-level `inputs` with three per-sample axes taken from train_alt.
# NOTE(review): geocryoai.build only uses inputs.shape[1] and inputs.shape[2]
# (a Conv1D-style 2-axis input), and the tuner is later fed trainXscaltref --
# confirm train_alt's axes 1-2 match that tensor, or rebuild `inputs` from it.
inputs = keras.Input(shape=(train_alt.shape[1], train_alt.shape[2], train_alt.shape[3]))

In [None]:
# inputs = keras.Input(shape=(trainXscch4ref.shape[1], trainXscch4ref.shape[2]))
# Shared search space. Every entry is registered by name on `hp`; the same
# object is handed to the tuner (hyperparameters = hp) and the model builder
# looks the values up by name (hp['...']).
hp = HyperParameters()
# Base width; its upper bound is axis 2 of the current `inputs` placeholder.
units=hp.Int("units", min_value=32, max_value = inputs.shape[2], step=16)
#batch_size=hp.Int("batch_size", min_value = 32, max_value = 256, step = 32)
learning_rate=hp.Choice("learning_rate", [1e-1, 1e-2, 1e-3, 1e-4, 1e-5])
#inputs2 = keras.Input(shape=(alt_Xtrainsc.shape[1], alt_Xtrainsc.shape[2]))
#units2=hp.Int("units2", min_value=64, max_value = inputs2.shape[2], step=64)
#padding=hp.Choice("padding", ['valid','same','causal'])
#n_layers=hp.Int("n_layers", min_value = 1, max_value = 9, step=3)
#n_layers=hp.Int("n_layers", min_value = 5, max_value = 15) 
#batch_size=hp.Int("batch_size", min_value = 32, max_value = 256, step = 32)

# hp['units'] is read once, here, at registration time: each range below is
# fixed to [units, 2 * units] using the value `hp` holds for 'units' right now.
# --- Leading Conv1D layer
conv1d_filters=hp.Int("conv1d_filters",min_value=hp['units'],max_value=2*hp['units'])
conv1d_kernel_size=hp.Int("conv1d_kernel_size", min_value = 3, max_value = 6, step=1)
conv1d_activation=hp.Choice("conv1d_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])

# --- Trailing Conv1DTranspose layer
conv1d2_filters=hp.Int("conv1d2_filters", min_value = hp['units'], max_value=2*hp['units'])
conv1d2_kernel_size=hp.Int("conv1d2_kernel_size", min_value = 3, max_value = 6, step=1)
conv1d2_activation=hp.Choice("conv1d2_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])

# conv2d3_filters=hp.Int("conv2d3_filters",min_value=hp['units'],max_value=2*hp['units'])
# conv2d3_kernel_size=hp.Int("conv2d3_kernel_size", min_value = 3, max_value = 6, step=1)
# conv2d3_activation=hp.Choice("conv2d3_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])

# conv2d4_filters=hp.Int("conv2d4_filters", min_value = 64, max_value=2*hp['units'])
# conv2d4_kernel_size=hp.Int("conv2d4_kernel_size", min_value = 3, max_value = 6, step=1)
# conv2d4_activation=hp.Choice("conv2d4_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])

# conv2d5_filters=hp.Int("conv2d5_filters", min_value = 64, max_value=2*hp['units'])
# conv2d5_kernel_size=hp.Int("conv2d5_kernel_size", min_value = 3, max_value = 6, step=1)
# conv2d5_activation=hp.Choice("conv2d5_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])

# --- Bidirectional LSTM layers 1-4
# The "*_rec_activation" entries are registered (and therefore sampled), but the
# model builder has its recurrent_activation arguments commented out, so they
# currently have no effect on the built network.
# "*_use_bias" / "*_forgot_bias" feed the LSTM use_bias / unit_forget_bias arguments.
bilstm_units = hp.Int("bilstm_units", min_value = hp['units'], max_value = 2*hp['units'])
bilstm_activation=hp.Choice("bilstm_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
bilstm_rec_activation=hp.Choice("bilstm_rec_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
bilstm_dropout=hp.Choice("bilstm_dropout", [0.1, 0.2, 0.3, 0.4])
bilstm_rec_dropout=hp.Choice("bilstm_rec_dropout", [0.1, 0.2, 0.3, 0.4])
bilstm_bias=hp.Boolean("bilstm_use_bias")
bilstm_f_bias=hp.Boolean("bilstm_forgot_bias")

bilstm2_units = hp.Int("bilstm2_units", min_value = hp['units'], max_value = 2*hp['units'])
bilstm2_activation=hp.Choice("bilstm2_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
bilstm2_rec_activation=hp.Choice("bilstm2_rec_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
bilstm2_dropout=hp.Choice("bilstm2_dropout", [0.1, 0.2, 0.3, 0.4])
bilstm2_rec_dropout=hp.Choice("bilstm2_rec_dropout", [0.1, 0.2, 0.3, 0.4])
bilstm2_bias=hp.Boolean("bilstm2_use_bias")
bilstm2_f_bias=hp.Boolean("bilstm2_forgot_bias")

bilstm3_units = hp.Int("bilstm3_units", min_value = hp['units'], max_value = 2*hp['units'])
bilstm3_activation=hp.Choice("bilstm3_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
bilstm3_rec_activation=hp.Choice("bilstm3_rec_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
bilstm3_dropout=hp.Choice("bilstm3_dropout", [0.1, 0.2, 0.3, 0.4])
bilstm3_rec_dropout=hp.Choice("bilstm3_rec_dropout", [0.1, 0.2, 0.3, 0.4])
bilstm3_bias=hp.Boolean("bilstm3_use_bias")
bilstm3_f_bias=hp.Boolean("bilstm3_forgot_bias")

bilstm4_units = hp.Int("bilstm4_units", min_value = hp['units'], max_value = 2*hp['units'])
bilstm4_activation=hp.Choice("bilstm4_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
bilstm4_rec_activation=hp.Choice("bilstm4_rec_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
bilstm4_dropout=hp.Choice("bilstm4_dropout", [0.1, 0.2, 0.3, 0.4])
bilstm4_rec_dropout=hp.Choice("bilstm4_rec_dropout", [0.1, 0.2, 0.3, 0.4])
bilstm4_bias=hp.Boolean("bilstm4_use_bias")
bilstm4_f_bias=hp.Boolean("bilstm4_forgot_bias")

# --- Unidirectional LSTM layers on either side of the RepeatVector bottleneck
lstm_units = hp.Int("lstm_units", min_value = hp['units'], max_value = 2*hp['units'])
lstm_activation=hp.Choice("lstm_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
lstm_rec_activation=hp.Choice("lstm_rec_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
lstm_dropout=hp.Choice("lstm_dropout", [0.1, 0.2, 0.3, 0.4])
lstm_rec_dropout=hp.Choice("lstm_rec_dropout", [0.1, 0.2, 0.3, 0.4])

lstm2_units = hp.Int("lstm2_units", min_value = hp['units'], max_value = 2*hp['units'])
lstm2_activation=hp.Choice("lstm2_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
lstm2_rec_activation=hp.Choice("lstm2_rec_activation", ['relu','sigmoid', 'tanh', 'exponential', 'gelu', 'elu', 'linear', 'selu', 'softmax', 'swish'])
lstm2_dropout=hp.Choice("lstm2_dropout", [0.1, 0.2, 0.3, 0.4])
lstm2_rec_dropout=hp.Choice("lstm2_rec_dropout", [0.1, 0.2, 0.3, 0.4])

### Model

In [None]:
class geocryoai(HyperModel):
    """KerasTuner hypermodel: Conv1D -> (Bi)LSTM encoder/decoder -> Conv1DTranspose.

    The network is a sequence-to-sequence stack: a Conv1D front end, two
    bidirectional LSTMs, an LSTM that collapses the sequence, a RepeatVector
    that expands it again, an LSTM plus two bidirectional LSTMs, a
    Conv1DTranspose and two dense projections.

    The class reads several notebook-level names: ``inputs`` (input shape),
    ``trainXscaltref`` / ``trainyscaltref`` (output widths) and the layer
    classes imported elsewhere. To target CH4 or CO2, switch the commented
    alternatives in ``build``.
    """

    def build(self, hp):
        """Assemble and compile the model from the values stored in ``hp``.

        Args:
            hp: KerasTuner ``HyperParameters`` holding every key read below.

        Returns:
            A compiled ``tf.keras.Sequential`` model (legacy RMSprop, MSE loss).
        """
        #backend.clear_session()
        # (time steps, features) taken from the module-level `inputs` placeholder.
        seq_shape = (inputs.shape[1], inputs.shape[2])
        model = tf.keras.Sequential()

        # ----- Encoder
        model.add(Conv1D(
            filters=hp['conv1d_filters'],
            kernel_size=hp['conv1d_kernel_size'],
            activation=hp['conv1d_activation'],
            padding='same',
            input_shape=seq_shape))
        # model.add(MaxPool1D(pool_size=1))
        #for i in range(hp['n_layers']): #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
        model.add(Bidirectional(LSTM(
            input_shape=seq_shape,
            return_sequences=True,
            units=hp['bilstm_units'],
            activation=hp['bilstm_activation'],
            #recurrent_activation = hp['bilstm_rec_activation'],
            use_bias=hp['bilstm_use_bias'],
            unit_forget_bias=hp['bilstm_forgot_bias'],
            dropout=hp['bilstm_dropout'],
            recurrent_dropout=hp['bilstm_rec_dropout'])))
        model.add(Bidirectional(LSTM(
            input_shape=seq_shape,
            return_sequences=True,
            units=hp['bilstm2_units'],
            activation=hp['bilstm2_activation'],
            #recurrent_activation = hp['bilstm2_rec_activation'],
            use_bias=hp['bilstm2_use_bias'],
            unit_forget_bias=hp['bilstm2_forgot_bias'],
            dropout=hp['bilstm2_dropout'],
            recurrent_dropout=hp['bilstm2_rec_dropout'])))
        # Collapse the sequence to a single vector (return_sequences=False) ...
        model.add(LSTM(
            units=hp['lstm_units'],
            activation=hp['lstm_activation'],
            #recurrent_activation=hp['lstm_rec_activation'],
            return_sequences=False,
            dropout=hp['lstm_dropout'],
            recurrent_dropout=hp['lstm_rec_dropout'],
            input_shape=seq_shape))

        # ----- Decoder
        # ... and repeat it once per time step so the decoder sees a sequence again.
        model.add(RepeatVector(inputs.shape[1]))   #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
        model.add(LSTM(
            units=hp['lstm2_units'],
            activation=hp['lstm2_activation'],
            #recurrent_activation= hp['lstm2_rec_activation'],
            return_sequences=True,
            dropout=hp['lstm2_dropout'],
            recurrent_dropout=hp['lstm2_rec_dropout'],
            input_shape=seq_shape))
        model.add(Bidirectional(LSTM(
            input_shape=seq_shape,
            return_sequences=True,
            units=hp['bilstm3_units'],
            activation=hp['bilstm3_activation'],
            #recurrent_activation = hp['bilstm3_rec_activation'],
            use_bias=hp['bilstm3_use_bias'],
            unit_forget_bias=hp['bilstm3_forgot_bias'],
            dropout=hp['bilstm3_dropout'],
            recurrent_dropout=hp['bilstm3_rec_dropout'])))
        # Fix: `return_sequences` used to receive hp['bilstm4_units'] while
        # `units` was pinned to 32, so the tuned width was never applied. The
        # int was truthy, so sequences were already being returned; keeping
        # return_sequences=True preserves that (Conv1DTranspose needs a sequence).
        model.add(Bidirectional(LSTM(
            input_shape=seq_shape,
            return_sequences=True,
            units=hp['bilstm4_units'],
            activation=hp['bilstm4_activation'],
            #recurrent_activation = hp['bilstm4_rec_activation'],
            use_bias=hp['bilstm4_use_bias'],
            unit_forget_bias=hp['bilstm4_forgot_bias'],
            dropout=hp['bilstm4_dropout'],
            recurrent_dropout=hp['bilstm4_rec_dropout'])))
        model.add(Conv1DTranspose(
            filters=hp['conv1d2_filters'],
            kernel_size=hp['conv1d2_kernel_size'],
            activation=hp['conv1d2_activation'],
            padding='same',
            input_shape=seq_shape))

        # ----- Output projections (switch the commented lines to change target)
        model.add(TimeDistributed(Dense(trainyscaltref.shape[1])))
        #model.add(TimeDistributed(Dense(trainyscch4ref.shape[1])))
        #model.add(TimeDistributed(Dense(trainyscco2ref.shape[1])))
        model.add(Dense(trainXscaltref.shape[1]))
        #model.add(Dense(trainXscch4ref.shape[1]))
        #model.add(Dense(trainXscco2ref.shape[1]))

        # NOTE(review): the third entry is the return value of `.build(...)`,
        # not the RSquare object itself; Keras `build` methods normally return
        # None, which would silently drop the R^2 metric -- confirm RSquare's API.
        metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscaltref.shape[1],))]
        #metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscch4ref.shape[1],))]
        #metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscco2ref.shape[1],))]
        loss_function = 'mean_squared_error'
        model.compile(
            optimizer=tf.keras.optimizers.legacy.RMSprop(
                learning_rate=hp.get('learning_rate'), clipvalue=1000),
            loss=loss_function,
            metrics=metrics)
        return model

    def fit(self, hp, model, *args, **kwargs):
        """Train ``model`` with a default batch size of 128.

        Args:
            hp: Hyperparameters of the current trial (unused here).
            model: The compiled model returned by ``build``.
            *args, **kwargs: Forwarded to ``model.fit``. A caller-supplied
                ``batch_size`` now takes precedence instead of raising a
                duplicate-keyword ``TypeError``.

        Returns:
            Whatever ``model.fit`` returns.
        """
        kwargs.setdefault('batch_size', 128)  #hp['batch_size']
        return model.fit(*args, **kwargs)

In [None]:
# model=geocryoai.build(train_alt, hp)

In [None]:
# model.summary()

In [None]:
# # serialize model to JSON
# model_json = model.to_json()
# #with open("model_070923_insituALT.json", "w") as json_file:
# with open("model_070923_insituCH4.json", "w") as json_file:
# #with open("model_070923_insituCO2.json", "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# #model.save_weights("model_070923_insituALT_experimental.h5")
# model.save_weights("model_070923_insituCH4.h5")
# #model.save_weights("model_070923_insituCO2.h5")
# print("Saved model to disk")

In [None]:
# Bayesian search over the shared `hp` space, ranked by validation loss.
# Switch tuner_id / project_name to the commented values for CH4 or CO2 runs.
bayesian_tuner = BayesianOptimization(
    hypermodel=geocryoai(),
    objective="val_loss",
    max_trials=10,
    #num_initial_points = 8, #defaults to 3xdimensionality of hyperparameterization space used
    alpha=0.0001,  # default; expected amount of noise in the observed performances
    beta=2.6,  # default; exploration/exploitation balance -- larger is more explorative
    hyperparameters=hp,
    tuner_id="BayesianOptimization_121924_ALT",
    #tuner_id="BayesianOptimization_071223_CH4",
    #tuner_id="BayesianOptimization_071223_CO2",
    #overwrite=False,
    project_name="Bayesian_optimization_121924_ALT",
    #project_name="bayesian_optimization_071223_CH4",
    #project_name="bayesian_optimization_071223_CO2",
)

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', verbose = 1, patience = 10, min_delta = 1e-4, restore_best_weights = True)

In [None]:
trainXscaltref.shape, trainyscaltref.shape
#trainXscch4ref.shape, trainyscch4ref.shape
#trainXscco2ref.shape, trainyscco2ref.shape

In [None]:
# ALT: run the hyperparameter search on the ALT tensors with an explicit
# validation set (no shuffling between epochs).
bayesian_tuner.search(
    trainXscaltref,
    trainyscaltref,
    steps_per_epoch=None,
    shuffle=False,
    validation_data=(validXscaltref, validyscaltref),
    #validation_split = 0.15,#0.2,
    verbose=1,  # 1 = progress bar, 2 = one line per epoch, 0 = silent
    callbacks=[
        early_stopping,
        History(),
        TerminateOnNaN(),
        keras.callbacks.TensorBoard("/tmp/tb_logs"),
    ],
    use_multiprocessing=True,
)

In [None]:
# #CH4
# bayesian_tuner.search(trainXscch4ref, 
#                      trainyscch4ref, 
#                      steps_per_epoch = None, 
#                      shuffle = False, 
#                      validation_data = (validXscch4ref, validyscch4ref),
#                      #validation_split = 0.15,#0.2,
#                      verbose = 1,
#                      callbacks = [early_stopping, History(), TerminateOnNaN(),keras.callbacks.TensorBoard("/tmp/tb_logs")], 
#                      use_multiprocessing = True)

In [None]:
# #CO2
# bayesian_tuner.search(trainXscco2ref, 
#                      trainyscco2ref, 
#                      steps_per_epoch = None, 
#                      shuffle = False, 
#                      validation_data = (validXscco2ref, validyscco2ref),
#                      #validation_split = 0.15,#0.2,
#                      verbose = 1,
#                      callbacks = [early_stopping, History(), TerminateOnNaN(),keras.callbacks.TensorBoard("/tmp/tb_logs")], 
#                      use_multiprocessing = True)

In [None]:
bayesian_tuner.results_summary()

In [None]:
#HP ALT
# BEST MODEL: TRIAL24
# Trial 27 Complete [01h 42m 02s]
# val_loss: 0.10911799222230911

# Best val_loss So Far: 0.10056757181882858
# Total elapsed time: 16h 43m 38s

# Search: Running Trial #28

# Value             |Best Value So Far |Hyperparameter
# 0.0001            |0.0001            |learning_rate
# 64                |64                |batch_size
# 64                |64                |units
# 91                |96                |conv1d_filters
# 3                 |9                 |conv1d_kernel_size
# relu              |swish             |conv1d_activation
# 107               |97                |bilstm_units
# swish             |relu              |bilstm_activation
# relu              |relu              |bilstm_rec_activation
# 0.1               |0.4               |bilstm_dropout
# 0.1               |0.1               |bilstm_rec_dropout
# False             |False             |bilstm_use_bias
# True              |True              |bilstm_forgot_bias
# 88                |64                |lstm_units
# gelu              |relu              |lstm_activation
# softmax           |relu              |lstm_rec_activation
# 0.1               |0.1               |lstm_dropout
# 0.1               |0.1               |lstm_rec_dropout
# 128               |128               |lstm2_units
# relu              |relu              |lstm2_activation
# relu              |relu              |lstm2_rec_activation
# 0.1               |0.1               |lstm2_dropout
# 0.4               |0.4               |lstm2_rec_dropout
# 128               |128               |bilstm2_units
# relu              |relu              |bilstm2_activation
# swish             |sigmoid           |bilstm2_rec_activation
# 0.4               |0.4               |bilstm2_dropout
# 0.4               |0.4               |bilstm2_rec_dropout
# True              |True              |bilstm2_use_bias
# False             |False             |bilstm2_forgot_bias
# 91                |88                |conv1d2_filters
# 9                 |9                 |conv1d2_kernel_size
# swish             |swish             |conv1d2_activation

In [None]:
# #ALT Results
# Results summary
# Results in ./Bayesian_optimization_071423_ALT
# Showing 10 best trials
# Objective(name="val_loss", direction="min")

# Trial 09 summary
# Hyperparameters:
# units: 224
# learning_rate: 0.01
# conv1d_filters: 38
# conv1d_kernel_size: 6
# conv1d_activation: linear
# conv1d2_filters: 48
# conv1d2_kernel_size: 5
# conv1d2_activation: exponential
# bilstm_units: 63
# bilstm_activation: sigmoid
# bilstm_rec_activation: tanh
# bilstm_dropout: 0.3
# bilstm_rec_dropout: 0.1
# bilstm_use_bias: False
# bilstm_forgot_bias: True
# bilstm2_units: 45
# bilstm2_activation: sigmoid
# bilstm2_rec_activation: exponential
# bilstm2_dropout: 0.1
# bilstm2_rec_dropout: 0.1
# bilstm2_use_bias: True
# bilstm2_forgot_bias: False
# bilstm3_units: 46
# bilstm3_activation: selu
# bilstm3_rec_activation: relu
# bilstm3_dropout: 0.4
# bilstm3_rec_dropout: 0.1
# bilstm3_use_bias: True
# bilstm3_forgot_bias: False
# bilstm4_units: 47
# bilstm4_activation: sigmoid
# bilstm4_rec_activation: gelu
# bilstm4_dropout: 0.3
# bilstm4_rec_dropout: 0.1
# bilstm4_use_bias: False
# bilstm4_forgot_bias: False
# lstm_units: 63
# lstm_activation: sigmoid
# lstm_rec_activation: gelu
# lstm_dropout: 0.3
# lstm_rec_dropout: 0.1
# lstm2_units: 64
# lstm2_activation: exponential
# lstm2_rec_activation: selu
# lstm2_dropout: 0.3
# lstm2_rec_dropout: 0.3
# Score: 2.8679637908935547

In [None]:
# #ALT
# class geocryoai2(HyperModel):
#     def build(self, hp):
#         #backend.clear_session()
#         #inputs = keras.Input(shape=(X_train_reframed.shape[1], X_train_reframed.shape[2]))
#         inputs = keras.Input(shape=(trainXscch4ref.shape[1], trainXscch4ref.shape[2]))        
#         model = tf.keras.Sequential()
#         #for i in range(n_layers):
#         model.add(Conv1D(
#             filters=38, 
#             kernel_size=6, 
#             activation = 'swish',
#             padding='same', 
#             input_shape=(inputs.shape[1], inputs.shape[2])))
#         # model.add(MaxPool1D(pool_size=1))
#         #for i in range(hp['n_layers']): #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
#         model.add(Bidirectional(LSTM(
#             input_shape=(inputs.shape[1], inputs.shape[2]),
#             return_sequences = True, 
#             #kernel_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.00001),
#             #name = f"1BiLSTM_layer_{i+1}",
#             units = 97,
#             activation = 'relu',
#             #recurrent_activation = 'relu',
#             use_bias = False,
#             unit_forget_bias = True,
#             dropout=0.1,
#             recurrent_dropout = 0.1)))
#         model.add(LSTM(
#             units=64,
#             activation = 'relu',
#             #recurrent_activation = 'relu',
#             return_sequences=False, 
#             dropout=0.1,
#             recurrent_dropout=0.1,
#             input_shape=(inputs.shape[1], inputs.shape[2])))
#         #model.add(Bidirectional(LSTM(inputs.shape[-1], activation='relu', return_sequences = False, dropout=0, input_shape=(inputs.shape[1], inputs.shape[2]))))
#         model.add(RepeatVector(inputs.shape[1]))   #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
#         model.add(LSTM(
#             units=128, 
#             activation = 'relu',
#             #recurrent_activation='relu',
#             return_sequences=True,
#             dropout=0.1,
#             recurrent_dropout=0.1,
#             input_shape=(inputs.shape[1], inputs.shape[2])))
#         #model.add(Bidirectional(LSTM(inputs.shape[-1], activation='relu', return_sequences = True, dropout=0, input_shape=(inputs.shape[1], inputs.shape[2])))) 
#         model.add(Bidirectional(LSTM(
#             input_shape=(inputs.shape[1], inputs.shape[2]),
#             return_sequences = True, 
#             #kernel_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.00001),
#             #name = f"2BiLSTM_layer_{i+1}",
#             units = 128,
#             activation = 'relu',
#             #recurrent_activation = 'sigmoid',
#             use_bias = True,
#             unit_forget_bias = False,
#             dropout=0.1,
#             recurrent_dropout = 0.1)))
#         model.add(Conv1DTranspose(
#            filters=88, 
#            kernel_size=9, 
#            activation = 'swish',
#            padding='same', 
#            input_shape=(inputs.shape[1], inputs.shape[2])))
#         model.add(TimeDistributed(Dense(trainyscaltref.shape[1])))
#         #model.add(TimeDistributed(Dense(trainyscch4ref.shape[1])))
#         #model.add(Dense(trainXscaltref.shape[1]))
#         #model.add(Dense(trainXscch4ref.shape[1]))
#         metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscaltref.shape[1],))]
#         #metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscch4ref.shape[1],))]
#         loss_function = 'mean_squared_error'
#         model.compile(optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate = 0.0001, **{"clipvalue" : 1000}),loss = loss_function, metrics = metrics)
#         return model
    
#     def fit(self, hp, model, *args, **kwargs):
#         return model.fit(*args, batch_size = 256, **kwargs)

In [None]:
#ALT
class geocryoai2(HyperModel):
    """Fixed-configuration network for the ALT arrays.

    Layer order: Conv1D -> Bidirectional(LSTM) -> LSTM -> RepeatVector ->
    LSTM -> Bidirectional(LSTM) -> Conv1DTranspose -> TimeDistributed(Dense).
    Every layer setting is hard-coded, so the ``hp`` argument is never read.
    """

    def build(self, hp):
        """Assemble and compile the Sequential model.

        The input window is sized from axes 1 and 2 of the module-level
        ``trainXscaltref`` array and the final Dense width from
        ``trainyscaltref.shape[1]``.

        Args:
            hp: Unused; accepted to satisfy the HyperModel interface.

        Returns:
            The compiled ``tf.keras.Sequential`` model.
        """
        #backend.clear_session()
        inputs = keras.Input(shape=(trainXscaltref.shape[1], trainXscaltref.shape[2]))
        #inputs = keras.Input(shape=(trainXscch4ref.shape[1], trainXscch4ref.shape[2]))
        n_steps = inputs.shape[1]
        window_shape = (n_steps, inputs.shape[2])

        model = tf.keras.Sequential()
        # Layers are instantiated in the order they are stacked.
        layer_stack = (
            Conv1D(
                filters=96,
                kernel_size=9,
                activation='swish',
                padding='same',
                input_shape=window_shape),
            # model.add(MaxPool1D(pool_size=1))
            #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
            Bidirectional(LSTM(
                units=97,
                activation='relu',
                #recurrent_activation = 'relu',
                #kernel_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.00001),
                use_bias=False,
                unit_forget_bias=True,
                dropout=0.1,
                recurrent_dropout=0.1,
                return_sequences=True,
                input_shape=window_shape)),
            # return_sequences=False: only the final LSTM output is passed on.
            LSTM(
                units=64,
                activation='relu',
                #recurrent_activation = 'relu',
                dropout=0.1,
                recurrent_dropout=0.1,
                return_sequences=False,
                input_shape=window_shape),
            # Repeat that output once per input time step.
            RepeatVector(n_steps),   #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
            LSTM(
                units=128,
                activation='relu',
                #recurrent_activation='relu',
                dropout=0.1,
                recurrent_dropout=0.1,
                return_sequences=True,
                input_shape=window_shape),
            Bidirectional(LSTM(
                units=128,
                activation='relu',
                #recurrent_activation = 'sigmoid',
                #kernel_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.00001),
                use_bias=True,
                unit_forget_bias=False,
                dropout=0.1,
                recurrent_dropout=0.1,
                return_sequences=True,
                input_shape=window_shape)),
            Conv1DTranspose(
                filters=88,
                kernel_size=9,
                activation='swish',
                padding='same',
                input_shape=window_shape),
            TimeDistributed(Dense(trainyscaltref.shape[1])),
            #TimeDistributed(Dense(trainyscch4ref.shape[1]))
            #Dense(trainXscaltref.shape[1])
            #Dense(trainXscch4ref.shape[1])
        )
        for layer in layer_stack:
            model.add(layer)

        # NOTE(review): the third entry is whatever RSquare().build(...) returns,
        # not the RSquare object itself - confirm that this is intended.
        metrics = [
            'mean_squared_error',
            'mean_absolute_error',
            RSquare().build(input_shape=(trainyscaltref.shape[1],)),
        ]
        #metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscch4ref.shape[1],))]
        loss_function = 'mean_squared_error'
        optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate=0.0001, clipvalue=1000)
        model.compile(optimizer=optimizer, loss=loss_function, metrics=metrics)
        return model

    def fit(self, hp, model, *args, **kwargs):
        """Train ``model`` with a fixed batch size of 256; ``hp`` is unused."""
        return model.fit(*args, batch_size=256, **kwargs)

In [None]:
#HP CH4
#BEST MODEL: TRIAL22
# Trial 26 Complete [01h 35m 54s]
# val_loss: 0.4272751808166504

# Best val_loss So Far: 0.004293057601898909
# Total elapsed time: 14h 38m 44s

# Search: Running Trial #27

# Value             |Best Value So Far |Hyperparameter
# 0.001             |0.0001            |learning_rate
# 64                |64                |batch_size
# 320               |128               |units
# 64                |71                |conv1d_filters
# 9                 |9                 |conv1d_kernel_size
# relu              |relu              |conv1d_activation
# 116               |92                |bilstm_units
# relu              |relu              |bilstm_activation
# relu              |relu              |bilstm_rec_activation
# 0.4               |0.3               |bilstm_dropout
# 0.4               |0.4               |bilstm_rec_dropout
# False             |False             |bilstm_use_bias
# True              |True              |bilstm_forgot_bias
# 100               |128               |lstm_units
# linear            |tanh              |lstm_activation
# swish             |elu               |lstm_rec_activation
# 0.4               |0.4               |lstm_dropout
# 0.4               |0.4               |lstm_rec_dropout
# 128               |128               |lstm2_units
# relu              |softmax           |lstm2_activation
# swish             |swish             |lstm2_rec_activation
# 0.4               |0.4               |lstm2_dropout
# 0.4               |0.1               |lstm2_rec_dropout
# 123               |78                |bilstm2_units
# relu              |relu              |bilstm2_activation
# swish             |relu              |bilstm2_rec_activation
# 0.1               |0.1               |bilstm2_dropout
# 0.1               |0.1               |bilstm2_rec_dropout
# False             |False             |bilstm2_use_bias
# True              |False             |bilstm2_forgot_bias
# 128               |117               |conv1d2_filters
# 3                 |3                 |conv1d2_kernel_size
# sigmoid           |swish             |conv1d2_activation

In [None]:
#CH4
class geocryoai2(HyperModel):
    """Fixed-configuration network for the CH4 arrays.

    Layer order: Conv1D -> Bidirectional(LSTM) -> LSTM -> RepeatVector ->
    LSTM -> Bidirectional(LSTM) -> Conv1DTranspose -> TimeDistributed(Dense)
    -> Dense. Every layer setting is hard-coded, so ``hp`` is never read.
    """

    def build(self, hp):
        """Assemble and compile the Sequential model.

        The input window is sized from axes 1 and 2 of the module-level
        ``trainXscch4ref`` array; the output layers are sized from
        ``trainyscch4ref.shape[1]`` and ``trainXscch4ref.shape[1]``.

        Args:
            hp: Unused; accepted to satisfy the HyperModel interface.

        Returns:
            The compiled ``tf.keras.Sequential`` model.
        """
        #backend.clear_session()
        #inputs = keras.Input(shape=(X_train_reframed.shape[1], X_train_reframed.shape[2]))
        # Define the input spec locally so the inputs.shape lookups below do not
        # depend on a global ``inputs`` left behind by another cell (NameError on
        # a fresh kernel, or the shape of a different dataset otherwise).
        inputs = keras.Input(shape=(trainXscch4ref.shape[1], trainXscch4ref.shape[2]))
        model = tf.keras.Sequential()
        #for i in range(n_layers):
        model.add(Conv1D(
            filters=71, 
            kernel_size=9, 
            activation = 'swish',
            padding='same', 
            input_shape=(inputs.shape[1], inputs.shape[2])))
        # model.add(MaxPool1D(pool_size=1))
        #for i in range(hp['n_layers']): #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
        model.add(Bidirectional(LSTM(
            input_shape=(inputs.shape[1], inputs.shape[2]),
            return_sequences = True, 
            #kernel_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.00001),
            #name = f"1BiLSTM_layer_{i+1}",
            units = 92,
            activation = 'relu',
            #recurrent_activation = 'linear',
            use_bias = False,
            unit_forget_bias = True,
            dropout=0.1,
            recurrent_dropout = 0.1)))
        #for i in range(hp['n_layers']):
        # return_sequences=False: only the final LSTM output is passed on.
        model.add(LSTM(
            units=128,
            activation = 'tanh',
            #recurrent_activation = 'softmax',
            return_sequences=False, 
            dropout=0.1,
            recurrent_dropout=0.1,
            input_shape=(inputs.shape[1], inputs.shape[2])))
        #model.add(Bidirectional(LSTM(inputs.shape[-1], activation='relu', return_sequences = False, dropout=0, input_shape=(inputs.shape[1], inputs.shape[2]))))
        # Repeat that output once per input time step.
        model.add(RepeatVector(inputs.shape[1]))   #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
        model.add(LSTM(
            units=128, 
            activation = 'softmax',
            #recurrent_activation='elu',
            return_sequences=True,
            dropout=0.1,
            recurrent_dropout=0.1,
            input_shape=(inputs.shape[1], inputs.shape[2])))
        #model.add(Bidirectional(LSTM(inputs.shape[-1], activation='relu', return_sequences = True, dropout=0, input_shape=(inputs.shape[1], inputs.shape[2])))) 
        model.add(Bidirectional(LSTM(
            input_shape=(inputs.shape[1], inputs.shape[2]),
            return_sequences = True, 
            #kernel_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.00001),
            #name = f"2BiLSTM_layer_{i+1}",
            units = 78,
            activation = 'relu',
            #recurrent_activation = 'relu',
            use_bias = False,
            unit_forget_bias = False,
            dropout=0.1,
            recurrent_dropout = 0.1)))
        model.add(Conv1DTranspose(
           filters=117, 
           kernel_size=3, 
           activation = 'swish',
           padding='same', 
           input_shape=(inputs.shape[1], inputs.shape[2])))
        #model.add(TimeDistributed(Dense(trainyscaltref.shape[1])))
        model.add(TimeDistributed(Dense(trainyscch4ref.shape[1])))
        #model.add(TimeDistributed(Dense(trainyscco2ref.shape[1])))
        #model.add(Dense(trainXscaltref.shape[1]))
        # NOTE(review): this extra Dense follows the TimeDistributed head; the
        # #ALT variant of this class keeps the equivalent line commented out.
        # Confirm the additional projection is intended.
        model.add(Dense(trainXscch4ref.shape[1]))
        #model.add(Dense(trainXscco2ref.shape[1]))
        #metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscaltref.shape[1],))]
        # NOTE(review): the third entry is whatever RSquare().build(...) returns,
        # not the RSquare object itself - confirm that this is intended.
        metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscch4ref.shape[1],))]
        #metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscco2ref.shape[1],))]
        loss_function = 'mean_squared_error'
        model.compile(optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate = 0.001, **{"clipvalue" : 1000}),loss = loss_function, metrics = metrics)
        return model
    
    def fit(self, hp, model, *args, **kwargs):
        """Train ``model`` with a fixed batch size of 256; ``hp`` is unused."""
        return model.fit(*args, batch_size = 256, **kwargs)

In [None]:
#HP CO2
#BEST MODEL: TRIAL22
# Trial 31 Complete [01h 26m 33s]
# val_loss: 0.09103024005889893

# Best val_loss So Far: 0.014815938659012318
# Total elapsed time: 13h 56m 11s

# Search: Running Trial #32

# Value             |Best Value So Far |Hyperparameter
# 0.0001            |0.0001            |learning_rate
# 64                |64                |batch_size
# 128               |192               |units
# 128               |128               |conv1d_filters
# 9                 |6                 |conv1d_kernel_size
# relu              |elu               |conv1d_activation
# 111               |100               |bilstm_units
# relu              |relu              |bilstm_activation
# relu              |relu              |bilstm_rec_activation
# 0.1               |0.1               |bilstm_dropout
# 0.3               |0.3               |bilstm_rec_dropout
# False             |True              |bilstm_use_bias
# True              |True              |bilstm_forgot_bias
# 64                |64                |lstm_units
# sigmoid           |tanh              |lstm_activation
# sigmoid           |tanh              |lstm_rec_activation
# 0.1               |0.1               |lstm_dropout
# 0.4               |0.4               |lstm_rec_dropout
# 80                |83                |lstm2_units
# relu              |relu              |lstm2_activation
# relu              |relu              |lstm2_rec_activation
# 0.1               |0.1               |lstm2_dropout
# 0.3               |0.3               |lstm2_rec_dropout
# 64                |64                |bilstm2_units
# relu              |relu              |bilstm2_activation
# relu              |sigmoid           |bilstm2_rec_activation
# 0.4               |0.3               |bilstm2_dropout
# 0.4               |0.3               |bilstm2_rec_dropout
# False             |False             |bilstm2_use_bias
# False             |True              |bilstm2_forgot_bias
# 128               |95                |conv1d2_filters
# 6                 |6                 |conv1d2_kernel_size
# swish             |swish             |conv1d2_activation

In [None]:
#CO2
class geocryoai2(HyperModel):
    """Fixed-configuration network for the CO2 arrays.

    Layer order: Conv1D -> Bidirectional(LSTM) -> LSTM -> RepeatVector ->
    LSTM -> Bidirectional(LSTM) -> Conv1DTranspose -> TimeDistributed(Dense)
    -> Dense. Every layer setting is hard-coded, so ``hp`` is never read.
    """

    def build(self, hp):
        """Assemble and compile the Sequential model.

        The input window is sized from axes 1 and 2 of the module-level
        ``trainXscco2ref`` array; the output layers are sized from
        ``trainyscco2ref.shape[1]`` and ``trainXscco2ref.shape[1]``.

        Args:
            hp: Unused; accepted to satisfy the HyperModel interface.

        Returns:
            The compiled ``tf.keras.Sequential`` model.
        """
        #backend.clear_session()
        #inputs = keras.Input(shape=(X_train_reframed.shape[1], X_train_reframed.shape[2]))
        #inputs = keras.Input(shape=(trainXscch4ref.shape[1], trainXscch4ref.shape[2]))        
        # Define the input spec locally so the inputs.shape lookups below do not
        # depend on a global ``inputs`` left behind by another cell (NameError on
        # a fresh kernel, or the shape of a different dataset otherwise).
        inputs = keras.Input(shape=(trainXscco2ref.shape[1], trainXscco2ref.shape[2]))
        model = tf.keras.Sequential()
        #for i in range(n_layers):
        model.add(Conv1D(
            filters=128, 
            kernel_size=6, 
            activation = 'elu',
            padding='same', 
            input_shape=(inputs.shape[1], inputs.shape[2])))
        # model.add(MaxPool1D(pool_size=1))
        #for i in range(hp['n_layers']): #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
        model.add(Bidirectional(LSTM(
            input_shape=(inputs.shape[1], inputs.shape[2]),
            return_sequences = True, 
            #kernel_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.00001),
            #name = f"1BiLSTM_layer_{i+1}",
            units = 120,
            activation = 'relu',
            #recurrent_activation = 'relu',
            use_bias = True,
            unit_forget_bias = True,
            dropout=0.1,
            recurrent_dropout = 0.1)))
        #for i in range(hp['n_layers']):
        # return_sequences=False: only the final LSTM output is passed on.
        model.add(LSTM(
            units=64,
            activation = 'tanh',
            #recurrent_activation = 'tanh',
            return_sequences=False, 
            dropout=0.1,
            recurrent_dropout=0.1,
            input_shape=(inputs.shape[1], inputs.shape[2])))
        #model.add(Bidirectional(LSTM(inputs.shape[-1], activation='relu', return_sequences = False, dropout=0, input_shape=(inputs.shape[1], inputs.shape[2]))))
        # Repeat that output once per input time step.
        model.add(RepeatVector(inputs.shape[1]))   #TUNE THIS (LAYERS) WHEN ADDING SATELLITE AND MODELING DATA
        model.add(LSTM(
            units=83, 
            activation = 'relu',
            #recurrent_activation='relu',
            return_sequences=True,
            dropout=0.1,
            recurrent_dropout=0.1,
            input_shape=(inputs.shape[1], inputs.shape[2])))
        #model.add(Bidirectional(LSTM(inputs.shape[-1], activation='relu', return_sequences = True, dropout=0, input_shape=(inputs.shape[1], inputs.shape[2])))) 
        model.add(Bidirectional(LSTM(
            input_shape=(inputs.shape[1], inputs.shape[2]),
            return_sequences = True, 
            #kernel_initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.00001),
            #name = f"2BiLSTM_layer_{i+1}",
            units = 64,
            activation = 'relu',
            #recurrent_activation = 'sigmoid',
            use_bias = False,
            unit_forget_bias = True,
            dropout=0.1,
            recurrent_dropout = 0.1)))
        model.add(Conv1DTranspose(
           filters=95, 
           kernel_size=6, 
           activation = 'swish',
           padding='same', 
           input_shape=(inputs.shape[1], inputs.shape[2])))
        #model.add(TimeDistributed(Dense(trainyscaltref.shape[1])))
        #model.add(TimeDistributed(Dense(trainyscch4ref.shape[1])))
        model.add(TimeDistributed(Dense(trainyscco2ref.shape[1])))
        #model.add(Dense(trainXscaltref.shape[1]))
        #model.add(Dense(trainXscch4ref.shape[1]))
        # NOTE(review): this extra Dense follows the TimeDistributed head; the
        # #ALT variant of this class keeps the equivalent line commented out.
        # Confirm the additional projection is intended.
        model.add(Dense(trainXscco2ref.shape[1]))
        #metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscaltref.shape[1],))]
        #metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscch4ref.shape[1],))]
        # NOTE(review): the third entry is whatever RSquare().build(...) returns,
        # not the RSquare object itself - confirm that this is intended.
        metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscco2ref.shape[1],))]
        loss_function = 'mean_squared_error'
        model.compile(optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate = 0.0001, **{"clipvalue" : 1000}),loss = loss_function, metrics = metrics)
        return model
    
    def fit(self, hp, model, *args, **kwargs):
        """Train ``model`` with a fixed batch size of 64; ``hp`` is unused."""
        return model.fit(*args, batch_size = 64, **kwargs)

In [None]:
# Display the shapes of the ALT training inputs and targets before building
# the model; swap in one of the commented pairs for the CH4 / CO2 arrays.
trainXscaltref.shape, trainyscaltref.shape
#trainXscch4ref.shape, trainyscch4ref.shape
#trainXscco2ref.shape, trainyscco2ref.shape

In [None]:
# Build through an instance so that ``self`` is a real geocryoai2 object.
# Calling build on the class itself bound the training array to ``self`` and
# only worked because build never touches ``self``.
# The architecture (and the dataset whose shape sizes it) comes from whichever
# geocryoai2 class cell was executed most recently - run the matching #ALT /
# #CH4 / #CO2 class cell before the corresponding line here.
model2 = geocryoai2().build(hp)
#model3 = geocryoai2().build(hp)   # after running the #CH4 class cell
#model4 = geocryoai2().build(hp)   # after running the #CO2 class cell
#model3b = geocryoai2().build(hp)  # after running the #CH4 class cell

In [None]:
# Print the layer-by-layer summary of the ALT model built above.
model2.summary()

In [None]:
# NOTE(review): in the visible build cell, model3 is only assigned on a
# commented-out line - enable it (CH4) before running this cell.
model3.summary()

In [None]:
# NOTE(review): in the visible build cell, model4 is only assigned on a
# commented-out line - enable it (CO2) before running this cell.
model4.summary()

In [None]:
# NOTE(review): in the visible build cell, model3b is only assigned on a
# commented-out line - enable it before running this cell.
model3b.summary()

In [None]:
#geocryoai(X_train)
# Write an architecture diagram of model2 (with layer names and tensor shapes)
# to img_file. The trailing semicolon suppresses the notebook's cell output.
#img_file = '/Users/bradleygay/Downloads/model_arch.jpeg'
img_file = '/Users/bradleygay/Downloads/GeoCryoAI_Arch_071523_insituALT.png'
tf.keras.utils.plot_model(
    model2,
    to_file=img_file,
    show_layer_names=True,
    show_shapes=True,
    dpi=1000,
);
# Variants for the other models:
#img_file = '/Users/bradleygay/Downloads/GeoCryoAI_Arch_071423_insituCH4.png'
#tf.keras.utils.plot_model(model3, to_file=img_file, show_shapes=True, show_layer_names=True, dpi=1000);
#img_file = '/Users/bradleygay/Downloads/GeoCryoAI_Arch_071423_insituCO2.png'
#tf.keras.utils.plot_model(model4, to_file=img_file, show_shapes=True, show_layer_names=True, dpi=1000);
#img_file = '/Users/bradleygay/Downloads/GeoCryoAI_Arch_071423_insituCH4model3b.png'
#tf.keras.utils.plot_model(model3b, to_file=img_file, show_shapes=True, show_layer_names=True, dpi=1000);

In [None]:
import os

# Parent directory for the per-run log folders: ./logs
root_logdir = os.path.join(os.curdir, 'logs')


def get_run_logdir():
    """Return a timestamped run-directory path under ``root_logdir``.

    The folder name is ``run_`` followed by the current local time formatted
    as ``YYYY_MM_DD-HH_MM_SS``. Only the path is built; nothing is created on
    disk.
    """
    from time import strftime

    stamped_name = strftime('run_%Y_%m_%d-%H_%M_%S')
    return os.path.join(root_logdir, stamped_name)


# Path for the current session, fixed at the moment this cell runs.
run_logdir = get_run_logdir()

# Timestamped TensorBoard log directory, e.g. logs/fit/20230715-134501.
# time.strftime is used instead of datetime.now(): the notebook's opening cell
# does `import datetime` (the module), on which `.now()` does not exist, while
# time.strftime yields the same local-time stamp however datetime was imported.
import time
log_dir = "logs/fit/" + time.strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
#ALT
# Early stopping halts training if the validation score does not improve - cuts down on computational load/speed.
filepath="weights_geocryoai2.alt_071523.hdf5"
#filepath="weights_geocryoai2.best_ch4_071423.hdf5"
#filepath="weights_geocryoai2.best__co2_071423.hdf5"
#filepath="weights_geocryoai2.best_ch4_model3b_071423.hdf5"
tensorboard_cb = keras.callbacks.TensorBoard('/tmp/tb_logs')
# NOTE(review): `checkpoint` is constructed here but is not in the callbacks
# list given to fit below - confirm whether it should be.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-4,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
# fit network (wall-clock timed)
# Variants for the other models:
#history3 = model3.fit(trainXscch4ref, trainyscch4ref, ..., validation_data=(validXscch4ref,validyscch4ref))
#history4 = model4.fit(trainXscco2ref, trainyscco2ref, ..., validation_data=(validXscco2ref,validyscco2ref))
#history5 = model3b.fit(trainXscch4ref, trainyscch4ref, ..., validation_data=(validXscch4ref,validyscch4ref))
start_time = time.time()
history2b = model2.fit(
    trainXscaltref,
    trainyscaltref,
    validation_data=(validXscaltref, validyscaltref),
    epochs=10,
    batch_size=256,  #128, #512
    steps_per_epoch=None,
    shuffle=False,
    callbacks=[early_stopping, TerminateOnNaN(), tensorboard_cb],
    use_multiprocessing=True,
)

elapsed_time = time.time() - start_time
print("\nThe first network took {} s to complete training.".format(round(elapsed_time)))

In [None]:
#CH4
# Early stopping halts training if the validation score does not improve - cuts down on computational load/speed.
#filepath="weights_geocryoai2.best_071123.hdf5"
filepath="weights_geocryoai2.best_071223_ch4.hdf5"
#filepath="weights_geocryoai2.best_071223_co2.hdf5"
tensorboard_cb = keras.callbacks.TensorBoard('/tmp/tb_logs')
# NOTE(review): `checkpoint` is constructed here but is not in the callbacks
# list given to fit below - confirm whether it should be.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-4,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
# fit network (wall-clock timed)
#history2 = model2.fit(trainXscaltref, trainyscaltref, ..., validation_data=(validXscaltref,validyscaltref))
start_time = time.time()
history3 = model3.fit(
    trainXscch4ref,
    trainyscch4ref,
    validation_data=(validXscch4ref, validyscch4ref),
    epochs=10,
    batch_size=256,  #512
    steps_per_epoch=None,
    shuffle=False,
    callbacks=[early_stopping, TerminateOnNaN(), tensorboard_cb],
    use_multiprocessing=True,
)

elapsed_time = time.time() - start_time
print("\nThe first network took {} s to complete training.".format(round(elapsed_time)))

In [None]:
#CO2
# Early stopping halts training if the validation score does not improve - cuts down on computational load/speed.
#filepath="weights_geocryoai2.best_071123.hdf5"
#filepath="weights_geocryoai2.best_071223_ch4.hdf5"
filepath="weights_geocryoai2.best_071223_co2.hdf5"
tensorboard_cb = keras.callbacks.TensorBoard('/tmp/tb_logs')
# NOTE(review): `checkpoint` is constructed here but is not in the callbacks
# list given to fit below - confirm whether it should be.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-4,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
# fit network (wall-clock timed)
#history2 = model2.fit(trainXscaltref, trainyscaltref, ..., validation_data=(validXscaltref,validyscaltref))
#history3 = model3.fit(trainXscch4ref, trainyscch4ref, ..., validation_data=(validXscch4ref,validyscch4ref))
start_time = time.time()
history4 = model4.fit(
    trainXscco2ref,
    trainyscco2ref,
    validation_data=(validXscco2ref, validyscco2ref),
    epochs=10,
    batch_size=256,  #512
    steps_per_epoch=None,
    shuffle=False,
    callbacks=[early_stopping, TerminateOnNaN(), tensorboard_cb],
    use_multiprocessing=True,
)

elapsed_time = time.time() - start_time
print("\nThe first network took {} s to complete training.".format(round(elapsed_time)))

In [None]:
# serialize model to JSON
# The architecture is taken from model2 - the same object that is trained and
# whose weights are saved below. No `model2b` is created in the visible
# workflow; only the history/score variables carry the "2b" suffix.
model_json = model2.to_json()
#model_json = model3.to_json()
#model_json = model4.to_json()
#model_json = model3b.to_json()
with open("model2_071623_insituALT.json", "w") as json_file:
#with open("model3_071523_insituCH4_2.json", "w") as json_file:
#with open("model4_071523_insituCO2.json", "w") as json_file:
#with open("model3_071523_insituCH4_model3b.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model2.save_weights("model2_071623_insituALT.h5")
#model3.save_weights("model3_071523_insituCH4.h5")
#model4.save_weights("model4_071523_insituCO2.h5")
#model3b.save_weights("model3_071523_insituCH4_model3b.h5")
print("Saved model to disk")

In [None]:
# convert the history.history dict to a pandas DataFrame:     
hist_df = pd.DataFrame(history2b.history)
#hist_df = pd.DataFrame(history3.history)
#hist_df = pd.DataFrame(history4.history)
#hist_df = pd.DataFrame(history5.history)

# save to json:  
hist_json_file = 'historyALT-071623.json' 
#hist_json_file = 'historyCH4-071523.json' 
#hist_json_file = 'historyCO2-071523.json' 
#hist_json_file = 'historyCH4-071523_model3b.json' 
with open(hist_json_file, mode='w') as f:
    hist_df.to_json(f)

# or save to csv: 
hist_csv_file = 'historyALT-071623.csv'
#hist_csv_file = 'historyCH4-071523.csv'
#hist_csv_file = 'historyCO2-071523.csv'
#hist_csv_file = 'historyCH4-071523_model3b.csv'
# newline='' disables newline translation on the handle, as pandas asks for
# when to_csv is given an open file object (avoids doubled line endings on
# platforms that translate "\n").
with open(hist_csv_file, mode='w', newline='') as f:
    hist_df.to_csv(f)

In [None]:
# Pickle the raw history dict of the ALT run; pick the matching file name and
# history object below when saving one of the other runs.
with open('trainHistoryALT-071623', 'wb') as history_out:
#with open('trainHistoryCH4-071523', 'wb') as history_out:
#with open('trainHistoryCO2-071523', 'wb') as history_out:
#with open('trainHistoryCH4-071523_model3b', 'wb') as history_out:
    pickle.dump(history2b.history, history_out)
    #pickle.dump(history3.history, history_out)
    #pickle.dump(history4.history, history_out)
    #pickle.dump(history5.history, history_out)

In [None]:
# Display the shape of the ALT test inputs before evaluation; swap in one of
# the commented lines for the CH4 / CO2 test arrays.
testXscaltref.shape
#testXscch4ref.shape
#testXscco2ref.shape

In [None]:
#testXscaltrefres=testXscaltref.reshape(215136,1,456)
#testXscch4refres=testXscch4ref.reshape(161749,1,456)
#testXscco2refres=testXscco2ref.reshape(161749,1,456)

In [None]:
score_2b = model2.evaluate(testXscaltref, testyscaltref, verbose = 1) 
#score_3 = model3.evaluate(testXscch4ref, testyscch4ref, verbose = 1)
#score_4 = model4.evaluate(testXscco2ref, testyscco2ref, verbose = 1)
#score_3b = model3b.evaluate(testXscch4ref, testyscch4ref, verbose = 1)

In [None]:
print('Test MAE:', score_2[1])
print('Test MSE:', score_2[2])
print('Test RMSE:', np.sqrt(score_2[2]))

In [None]:
print('Test MAE:', score_2b[1])
print('Test MSE:', score_2b[2])
print('Test RMSE:', np.sqrt(score_2b[2]))

In [None]:
print('Test MAE:', score_3[1])
print('Test MSE:', score_3[2])
print('Test RMSE:', np.sqrt(score_3[2]))

In [None]:
print('Test MAE:', score_4[1])
print('Test MSE:', score_4[2])
print('Test RMSE:', np.sqrt(score_4[2]))

In [None]:
print('Test MAE:', score_3b[1])
print('Test MSE:', score_3b[2])
print('Test RMSE:', np.sqrt(score_3b[2]))

In [None]:
plt.plot(history2b.history['loss'])
#plt.plot(history3.history['loss'])
#plt.plot(history4.history['loss'])
#plt.plot(history5.history['loss'])

# Load

In [None]:
# Restore the per-target training-history objects (ALT, CH4, CO2) from disk.
# Each file is opened in binary mode and deserialised with pickle.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load files produced by this project.
#import json
#althist_json_file = json.load(open('trainHistoryDictALT_experimental.json', 'r'))
# NOTE(review): the ALT path ends in '.json.json' but is read with pickle, not
# json -- confirm this file really is a pickle and that the name is intended.
with open('/Users/bradleygay/code/historyALT.json.json', 'rb') as file:
    althistory=pickle.load(file)
with open('/Users/bradleygay/code/trainHistoryDictCH4_experimental', 'rb') as file2:
    ch4history=pickle.load(file2)
with open('/Users/bradleygay/code/trainHistoryDictCO2_experimental', 'rb') as file3:
    co2history=pickle.load(file3)

In [None]:
# Load Models
from keras.models import model_from_json
from keras.models import load_model


def _reload_insitu_model(stem):
    """Rebuild a saved model from ``<stem>.json`` + ``<stem>.h5`` and round-trip it.

    The architecture is read from the JSON file, the weights from the HDF5
    weights file; the assembled model is then written to ``<stem>.hdf5`` and
    loaded back from that single file.

    Args:
        stem: Path prefix shared by the ``.json``, ``.h5`` and ``.hdf5`` files.

    Returns:
        The model returned by ``load_model('<stem>.hdf5')``.
    """
    # Context manager guarantees the JSON handle is closed even if parsing fails.
    with open(stem + '.json', 'r') as json_file:
        model = model_from_json(json_file.read())
    # load weights for new model
    model.load_weights(stem + '.h5')
    print("Loaded model from disk")
    # save and reload
    model.save(stem + '.hdf5')
    return load_model(stem + '.hdf5')


#ALT
alt_loaded_model_json = _reload_insitu_model('model2_070923_insituALT')

#CH4
ch4_loaded_model_json = _reload_insitu_model('model2_070923_insituCH4')

#CO2
co2_loaded_model_json = _reload_insitu_model('model2_070923_insituCO2')

In [None]:
#FROM OTHER FILE
########################################################
########################################################
########################################################

In [None]:
# Overlay the per-epoch validation loss of the ALT, CH4 and CO2 models on one axis.
fig, ax = plt.subplots(figsize=(10,6), dpi=1000)
# Training-loss alternative (swap in for the three val_loss lines below and
# adjust the y-label accordingly):
# l1=ax.plot(history2.history['loss'], color='dodgerblue', linestyle='solid', label='ALT Training Loss (cm)')
# l2=ax.plot(history3.history['loss'], color='magenta', linestyle='solid', label='CH4 Flux Training Loss (cm)')
# l3=ax.plot(history4.history['loss'], color='springgreen', linestyle='solid', label='CO2 Flux Training Loss (nmolCH4m-2s-1)')
l1 = ax.plot(history2.history['val_loss'], color='dodgerblue', linestyle='solid', label='ALT Validation Loss (cm)')
l2 = ax.plot(history3.history['val_loss'], color='magenta', linestyle='solid', label='CH4 Flux Validation Loss (nmolCH4m-2s-1)')
l3 = ax.plot(history4.history['val_loss'], color='springgreen', linestyle='solid', label='CO2 Flux Validation Loss (µmolCO2m-2s-1)')
#ax2=ax.twinx();
#ln4=ax2.plot(validPredict, color='coral', linestyle='dotted')

# The line handles are bound to l1/l2/l3 above; the legend previously summed
# the undefined names ln1/ln2/ln3, which raised NameError.
lines = l1 + l2 + l3
labs = [line.get_label() for line in lines]
ax.legend(lines, labs, loc='best', fontsize=8)#'lower left', fontsize=8)

ax.grid(linewidth=0.3)
ax.set_xlabel('Epochs', labelpad=8, fontsize=16)
# The plotted series are history[...]['val_loss'], so label the axis to match.
ax.set_ylabel('Validation Loss', labelpad=8, fontsize=16)
#ax.set(xticklabels=[])  # remove the tick labels
ax.tick_params(left=False)  # remove the ticks
#plt.ylabel('Active Layer Thickness (cm)')
plt.title('GeoCryoAI In Situ Module | Bidirectional Conv1DLSTM Autoencoder Loss Functions \n ALT, CH4 Flux, and CO2 Flux Simulations (1969-2022)', pad=10)

#plt.xlabel('Year')
#plt.axis([0, 6, 0, 60])
#plt.legend(loc='best')
#plt.show()
#plt.savefig('/Users/bradleygay/Downloads/bilstmae_insitu_CO2_loss.png',dpi=1000)

In [None]:
#reframed_alt.iloc[:,-1]

In [None]:
fig,ax=plt.subplots(figsize=(10,6), dpi=1000);
lns1=ax.plot(history2.history['mean_squared_error'], color='dodgerblue', linestyle='solid', label='Training, ALT (cm)');
ax2=ax.twinx();
lns2=ax2.plot(history2.history['val_mean_squared_error'], color='tomato', linestyle='solid', label='Validation, ALT (cm)');

lns = lns1+lns2#+lns3+lns4;
labs = [l.get_label() for l in lns];
ax2.legend(lns, labs, loc='best', fontsize=12);

ax.grid(linewidth=0.3);
ax2.grid(linewidth=0.3);
ax.set_xlabel('Full Iterations (epochs)', labelpad=12, fontsize=10);
ax.set_ylabel('Training MSE, Scaled ALT (cm)', labelpad=12, fontsize=10);
ax.tick_params(axis='y', labelcolor='dodgerblue')
ax2.set_ylabel('Validation MSE, Scaled ALT (cm)', labelpad=12, fontsize=10)
ax2.tick_params(axis='y', labelcolor='tomato')
ax.tick_params(left=False)  # remove the ticks
ax2.tick_params(right=False)  # remove the ticks
plt.title('GeoCryoAI Modeling, Cost Function and Validation Loss of ALT | Alaska [1969-2022] \n Number of Thaw Depth Samples/Replicates: 2.441M', pad=15, fontsize=14);
ax.grid(linewidth=0.3);
plt.tight_layout()
plt.savefig('ALTstats_1969-2022_071323.png', dpi=1000)

In [None]:
fig,ax=plt.subplots(figsize=(10,6), dpi=1000);
lns1=ax.plot(history3.history['mean_squared_error'], color='magenta', linestyle='solid', label='Training, CH4 Flux (nmolCH4m-2s-1)');
ax2=ax.twinx();
lns2=ax2.plot(history3.history['val_mean_squared_error'], color='slateblue', linestyle='solid', label='Validation, CH4 Flux (nmolCH4m-2s-1)');

lns = lns1+lns2#+lns3+lns4;
labs = [l.get_label() for l in lns];
ax2.legend(lns, labs, loc='best', fontsize=12);

ax.grid(linewidth=0.3);
ax2.grid(linewidth=0.3);
ax.set_xlabel('Full Iterations (epochs)', labelpad=12, fontsize=10);
ax.set_ylabel('Training MSE, Scaled Ch4 Flux (nmolCH4m-2s-1)', labelpad=12, fontsize=10);
ax.tick_params(axis='y', labelcolor='magenta')
ax2.set_ylabel('Validation MSE, Scaled CH4 Flux (nmolCH4m-2s-1)', labelpad=12, fontsize=10)
ax2.tick_params(axis='y', labelcolor='slateblue')
ax.tick_params(left=False)  # remove the ticks
ax2.tick_params(right=False)  # remove the ticks
plt.title('GeoCryoAI Modeling, Cost Function and Validation Loss of CH4 Flux | Alaska [2011-2021] \n Number of CH4 Flux Samples/Replicates: 2.083M', pad=15, fontsize=14);
ax.grid(linewidth=0.3);
plt.tight_layout()
#plt.savefig('ALTstats_CNNLSTMSAEmetrics_1969-2022_021323.png', dpi=1000)

In [None]:
fig,ax=plt.subplots(figsize=(10,6), dpi=1000);
lns1=ax.plot(history5.history['mean_squared_error'], color='midnightblue', linestyle='solid', label='Training, CH4 Flux (nmolCH4m-2s-1)');
ax2=ax.twinx();
lns2=ax2.plot(history5.history['val_mean_squared_error'], color='magenta', linestyle='solid', label='Validation, CH4 Flux (nmolCH4m-2s-1)');

lns = lns1+lns2#+lns3+lns4;
labs = [l.get_label() for l in lns];
ax2.legend(lns, labs, loc='best', fontsize=12);

ax.grid(linewidth=0.3);
ax2.grid(linewidth=0.3);
ax.set_xlabel('Full Iterations (epochs)', labelpad=12, fontsize=10);
ax.set_ylabel('Training MSE, Scaled Ch4 Flux (nmolCH4m-2s-1)', labelpad=12, fontsize=10);
ax.tick_params(axis='y', labelcolor='midnightblue')
ax2.set_ylabel('Validation MSE, Scaled CH4 Flux (nmolCH4m-2s-1)', labelpad=12, fontsize=10)
ax2.tick_params(axis='y', labelcolor='magenta')
ax.tick_params(left=False)  # remove the ticks
ax2.tick_params(right=False)  # remove the ticks
plt.title('GeoCryoAI Modeling, Cost Function and Validation Loss of CH4 Flux | Alaska [2011-2021] \n Number of CH4 Flux Samples/Replicates: 2.083M', pad=15, fontsize=14);
ax.grid(linewidth=0.3);
plt.tight_layout()
plt.savefig('CH4stats_1969-2022_071323.png', dpi=1000)

In [None]:
#plt.plot(reframed_co2.iloc[:,-1]['2006':'2019'].values)
reframed_co2.iloc[:,-1]['2006':'2019']

In [None]:
# Training vs. validation MSE per epoch for the CO2 flux model (history4),
# drawn on twin y-axes so each curve keeps its own scale.
fig, ax = plt.subplots(figsize=(10,6), dpi=1000)
# Unit strings corrected from 'µol' to 'µmol' to match the labels used for CO2
# flux elsewhere in this notebook.
lns1 = ax.plot(history4.history['mean_squared_error'], color='indigo', linestyle='solid', label='Training, CO2 Flux (µmolCO2m-2s-1)')
ax2 = ax.twinx()
lns2 = ax2.plot(history4.history['val_mean_squared_error'], color='lime', linestyle='solid', label='Validation, CO2 Flux (µmolCO2m-2s-1)')

# Merge the handles from both axes so a single legend lists both curves.
lns = lns1 + lns2
labs = [l.get_label() for l in lns]
ax2.legend(lns, labs, loc='best', fontsize=12)

ax.grid(linewidth=0.3)
ax2.grid(linewidth=0.3)
ax.set_xlabel('Full Iterations (epochs)', labelpad=12, fontsize=10)
ax.set_ylabel('Training MSE, Scaled CO2 Flux (µmolCO2m-2s-1)', labelpad=12, fontsize=10)
ax.tick_params(axis='y', labelcolor='indigo')
ax2.set_ylabel('Validation MSE, Scaled CO2 Flux (µmolCO2m-2s-1)', labelpad=12, fontsize=10)
ax2.tick_params(axis='y', labelcolor='lime')
ax.tick_params(left=False)  # remove the ticks
ax2.tick_params(right=False)  # remove the ticks
plt.title('GeoCryoAI Modeling, Cost Function and Validation Loss of CO2 Flux | Alaska [2006-2019] \n Number of CO2 Flux Samples/Replicates: 1.966M', pad=15, fontsize=14)
plt.tight_layout()
plt.savefig('CO2stats_1969-2022_071323.png', dpi=1000)

In [None]:
# fig,ax=plt.subplots(figsize=(10,7));
# #lns1=ax.plot(history2.history['loss'], color='dodgerblue', label='Loss, ALT (cm)');
# lns2=ax.plot(history2.history['mean_squared_error'], color='dodgerblue', linestyle='solid', label='RMSE, ALT (cm)');
# ax2=ax.twinx();
# #lns3=ax2.plot(history2.history['val_loss'], color='gold', label='Validation Loss, ALT (cm)');
# lns4=ax2.plot(history2.history['val_mean_squared_error'], color='gold', linestyle='solid', label='Validation RMSE, ALT (cm)');
          
# lns = lns2+lns4; #lns1+lns2+lns3+lns4;
# labs = [l.get_label() for l in lns];
# ax2.legend(lns, labs, loc='best', fontsize=8);

# ax.grid(linewidth=0.3);
# ax.set_xlabel('Full Iterations (epochs)', labelpad=12, fontsize=10);
# ax.set_ylabel('Scaled Depth to Refusal (cm)', labelpad=12, fontsize=10);
# #ax2.set_ylabel('Scaled Depth to Refusal (cm)', labelpad=6, fontsize=9)
# plt.title('Number of Samples/Replicates: 95653', pad=15, fontsize=12, fontweight='ultralight');
# plt.suptitle('Cost Function and Validation Loss from Thaw Depth Modeling, GeoCryoAI Framework in Alaska [1969-2022]', fontsize=14);
# plt.grid(linewidth=0.3);
# #plt.show()
# #plt.savefig('ALTstats_CNNLSTMSAEmetrics_1969-2022_021323.png', dpi=1000)

In [None]:
score = model.evaluate(X_test_reframed, y_test_reframed, verbose = 1) 

In [None]:
print('Test MAE:', score[1])
print('Test MSE:', score[2])
print('Test RMSE:', score[3])

In [None]:
predict = model.predict(X_test_reframed, verbose = 1)

In [None]:
# fig,ax = plt.subplots(figsize=(10,6), dpi=1000)
# ln1=ax.plot(y_test_reframed.reshape(215137,1), color='magenta', linestyle='solid', label='Observation, Test Set')
# ln2=ax.plot(predict.reshape(215137,1), color='dodgerblue', linestyle='solid', label='Prediction, Test Set')
# #ln3=ax.plot(history.history['mean_absolute_error'], color='springgreen', linestyle='dotted', label='MAE')
# #ln4=ax.plot(history.history['mean_squared_error'], color='springgreen', linestyle='dashed', label='Seward Peninsula')
# #ln4=ax.plot(history.history['root_mean_squared_error'], color='springgreen', linestyle='dashed', label='RMSE')
# #ln5=ax.plot(history.history['val_mean_absolute_error'], color='red', linestyle='dotted', label='Val MAE')
# #ln7=ax.plot(history.history['val_mean_squared_error'], color='red', linestyle='dashed', label='Seward Peninsula')
# #ln6=ax.plot(history.history['val_root_mean_squared_error'], color='red', linestyle='dashed', label='Val RMSE')
# #ln2=ax.plot(sib.iloc[2:,7].replace(-9999,np.nan).dropna()color='springgreen', linestyle='dashed', label='Interior')
# #ln3=ax.plot(sib.iloc[2:,16].replace(-9999,np.nan).dropna(), color='magenta', linestyle='dotted', label='Seward Peninsula')
# #ln4=ax.plot(sib.iloc[2:,39].replace(-9999,np.nan).dropna(), color='dodgerblue', linestyle='dotted', label='Yukon-Kuskokwim Delta')
# #ax2=ax.twinx();
# #ln4=ax2.plot(validPredict, color='coral', linestyle='dotted')

# lines = ln1 + ln2 #+ ln3 + ln4 + ln5 + ln6# + ln7 + ln8
# labs = [line.get_label() for line in lines];
# plt.legend(lines, labs, loc='lower left', fontsize=8)

# ax.grid(linewidth=0.3);
# ax.set_xlabel('Samples, 2003-2021', labelpad=10, fontsize=16);
# ax.set_ylabel('Carbon Dioxide Flux (umolCm2s-1)', labelpad=10, fontsize=16)
# #ax.set(xticklabels=[])  # remove the tick labels
# ax.tick_params(left=False)  # remove the ticks
# #plt.ylabel('Active Layer Thickness (cm)')
# plt.title('GeoCryoAI Modeling, Cost Function and Validation Loss of ALT | Alaska [1969-2022]', pad=15, fontsize=14);
# plt.suptitle('Number of Samples/Replicates, ALT: 2.441M', fontsize=12, fontweight='ultralight');
# #plt.title('GeoCryoAI In Situ Module | Bidirectional LSTM Autoencoder Loss Function \n In Situ Carbon Dioxide Flux Simulations (2003-2021)', pad=10)
# #plt.xlabel('Year')
# #plt.axis([0, 6, 0, 60])
# #plt.legend(loc='best')
# #plt.show()
# plt.savefig('/Users/bradleygay/Downloads/bilstmae_insitu_CO2_loss_predictions_experimental.png',dpi=1000);

In [None]:
plt.plot(y_testco2_reframed.reshape(215137,1))
plt.plot(predict.reshape(215137,1))
#plt.axis([0, 130000, -2, 5])
plt.show()

In [None]:
#May need to inverse scale

In [None]:
########################################################
########################################################
########################################################

In [None]:
# fig,ax = plt.subplots(figsize=(10,6), dpi=1000)
# #l1=ax.plot(history2.history['loss'], color='dodgerblue', linestyle='solid', label='ALT Training Loss (cm)')
# #l2=ax.plot(history3.history['loss'], color='magenta', linestyle='solid', label='CH4 Flux Training Loss (cm)')
# #l3=ax.plot(history4.history['loss'], color='springgreen', linestyle='solid', label='CO2 Flux Training Loss (nmolCH4m-2s-1)')
# l1=ax.plot(history2.history['val_loss'], color='dodgerblue', linestyle='solid', label='ALT Validation Loss (cm)')
# ax2=ax.twinx();
# l2=ax2.plot(history3.history['val_loss'], color='magenta', linestyle='solid', label='CH4 Flux Validation Loss (nmolCH4m-2s-1)')
# l3=ax2.plot(history4.history['val_loss'], color='springgreen', linestyle='solid', label='CO2 Flux Validation Loss (µmolCO2m-2s-1)')
# #l3=ax.plot(history4.history['loss'], color='springgreen', linestyle='solid', label='CO2 Flux Loss (µmolCO2m-2s-1)')
# #l1=ax.plot(history2.history['val_mean_squared_error'], color='dodgerblue', linestyle='solid', label='ALT Loss (cm)')
# #l2=ax.plot(history3.history['val_mean_squared_error'], color='magenta', linestyle='solid', label='CH4 Flux Loss (nmolCH4m-2s-1)')
# #l3=ax.plot(history4.history['val_mean_squared_error'], color='springgreen', linestyle='solid', label='CO2 Flux Loss (µmolCO2m-2s-1)')
# #ax2=ax.twinx();
# #l2=ax2.plot(althistory['val_mean_squared_error'], color='dodgerblue', linestyle='solid', label='Validation Loss (MSE)')
# #l2=ax2.plot(ch4history['val_mean_squared_error'], color='coral', linestyle='solid', label='Validation Loss (MSE)')
# #l2=ax2.plot(althistory['val_mean_squared_error'], color='coral', linestyle='solid', label='Validation Loss (MSE)')

# lns = l1+l2 +l3
# labs = [l.get_label() for l in lns];
# #ax2.legend(lns, labs, loc='best', fontsize=8);
# ax.legend(lns, labs, loc='best', fontsize=12);


# ax.set_xlabel('Full Iterations (epochs)', labelpad=15, fontsize=12);
# #ax.set_ylabel('Training Loss (units)', labelpad=15, fontsize=12)
# ax.set_ylabel('Validation Loss (units)', labelpad=15, fontsize=12)
# ax2.set_ylabel('Validation Loss (units)', labelpad=15, fontsize=10)
# #ax2.set_ylabel('Validation Loss, MSE (cm)', labelpad=15, fontsize=10)
# #ax.set_ylabel('Training Loss (units)', labelpad=15, fontsize=12)
# #ax2.set_ylabel('Validation Loss, MSE (nmolCH4m2s-1)', labelpad=15, fontsize=10)
# #ax.set_ylabel('Loss, MSE (µmolCO2m2s-1)', labelpad=15, fontsize=10)
# #ax2.set_ylabel('Validation Loss, MSE (µmolCO2m2s-1)', labelpad=15, fontsize=10)
# #ax.set(xticklabels=[])  # remove the tick labels
# ax.tick_params(left=False)  # remove the ticks
# ax2.tick_params(left=False)  # remove the ticks
# #plt.ylabel('Active Layer Thickness (cm)')
# ax.grid(linewidth=0.3);
# plt.title('GeoCryoAI Modeling, Cost Function and Validation Loss of ALT | Alaska [1969-2022] \n Number of Samples/Replicates, ALT: 2.441M', pad=15, fontsize=14);
# #plt.suptitle('Number of Samples/Replicates, ALT: 2.441M', fontsize=12, fontweight='ultralight');
# #plt.title('GeoCryoAI Training and Validation Loss | In Situ Thaw Depth Simulations [1969-2022]', pad=15, fontsize=14)
# #plt.title('GeoCryoAI Model Simulations | Cost Functions \n ALT, CH4 Flux, and CO2 Flux [1969-2022]', pad=15, fontsize=14)
# #plt.title('GeoCryoAI Training and Validation Loss | In Situ CO2 Flux Simulations [2006-2019]', pad=15, fontsize=14)
# #plt.xlabel('Year')
# #plt.axis([0, 6, 0, 60])
# #plt.grid(linewidth=0.3);
# #plt.show()
# #plt.savefig('/Users/bradleygay/Downloads/bilstmae_insitu_ALT_loss.png',dpi=1000)

In [None]:
alt_model=alt_loaded_model_json
ch4_model=ch4_loaded_model_json
co2_model=co2_loaded_model_json

In [None]:
# Re-compile the three reloaded models (ALT, CH4, CO2) with an MSE loss, an
# RMSprop optimizer with value clipping, and the metric list
# [mean_squared_error, mean_absolute_error, <RSquare entry>].
# The metric order matters: evaluate() reports the loss first, then the metrics
# in the order given here.
# NOTE(review): `RSquare().build(...)` puts the *return value* of build() in the
# metric list; if build() returns None (as Keras layer-style build methods
# usually do) no R^2 metric is actually tracked -- confirm.
# Learning rates: 0.0001 for ALT and CO2, 0.001 for CH4.
loss_function = 'mean_squared_error'
alt_metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscaltref.shape[1],))]
alt_model.compile(optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate = 0.0001, **{"clipvalue" : 1000}),
                  loss = loss_function, metrics = alt_metrics)
ch4_metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscch4ref.shape[1],))]
ch4_model.compile(optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate = 0.001, **{"clipvalue" : 1000}),
                  loss = loss_function, metrics = ch4_metrics)
co2_metrics=['mean_squared_error', 'mean_absolute_error', RSquare().build(input_shape = (trainyscco2ref.shape[1],))]
co2_model.compile(optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate = 0.0001, **{"clipvalue" : 1000}),
                  loss = loss_function, metrics = co2_metrics)

In [None]:
#score1=alt_model.evaluate(testXscaltref,testyscaltref,verbose=1)
#score2=co2_model.evaluate(testXscco2ref,testyscco2ref,verbose=1)
score3=ch4_model.evaluate(testXscch4ref,testyscch4ref,verbose=1)

In [None]:
score_2, score_3b, score_4#score_3

In [None]:
#ALT
# alt_model is compiled with loss=MSE and metrics=[MSE, MAE, ...], so
# evaluate() returns [loss, MSE, MAE, ...]: index 1 is MSE and index 2 is MAE.
# The previous indices printed MSE as "MAE" and took the RMSE from the MAE.
# Assumes score1 comes from alt_model.evaluate(...) -- TODO confirm.
print('Test MAE:', score1[2])
print('Test MSE:', score1[1])
print('Test RMSE:', np.sqrt(score1[1]))

In [None]:
#CO2
# co2_model is compiled with loss=MSE and metrics=[MSE, MAE, ...], so
# evaluate() returns [loss, MSE, MAE, ...]: index 1 is MSE and index 2 is MAE.
# The previous indices printed MSE as "MAE" and took the RMSE from the MAE.
# Assumes score2 comes from co2_model.evaluate(...) -- TODO confirm.
print('Test MAE:', score2[2])
print('Test MSE:', score2[1])
print('Test RMSE:', np.sqrt(score2[1]))

In [None]:
#CH4
# score3 is ch4_model.evaluate(...); ch4_model is compiled with loss=MSE and
# metrics=[MSE, MAE, ...], so the result is [loss, MSE, MAE, ...]: index 1 is
# MSE and index 2 is MAE. The previous indices printed MSE as "MAE" and took
# the RMSE from the MAE.
print('Test MAE:', score3[2])
print('Test MSE:', score3[1])
print('Test RMSE:', np.sqrt(score3[1]))

In [None]:
trainyscaltpred = alt_model.predict(trainXscaltref)
validyscaltpred = alt_model.predict(validXscaltref)
testyscaltpred = alt_model.predict(testXscaltref)

In [None]:
# Persist the ALT train/valid/test prediction arrays with pickle, then read
# them back into the same variable names.
# NOTE(review): the dumps write to the current working directory while the
# loads read from '/Users/bradleygay/code/'; unless those are the same folder
# the reload picks up older files (or fails) -- confirm the intended location.
with open('trainyscaltpred', 'wb') as file_theta:
    pickle.dump(trainyscaltpred, file_theta)
with open('validyscaltpred', 'wb') as file_alpha:
    pickle.dump(validyscaltpred, file_alpha)
with open('testyscaltpred', 'wb') as file_zeta:
    pickle.dump(testyscaltpred, file_zeta)

# Reload from the absolute project path (overwrites the in-memory arrays).
with open('/Users/bradleygay/code/trainyscaltpred', 'rb') as file_theta:
    trainyscaltpred=pickle.load(file_theta)
with open('/Users/bradleygay/code/validyscaltpred', 'rb') as file_alpha:
    validyscaltpred=pickle.load(file_alpha)
with open('/Users/bradleygay/code/testyscaltpred', 'rb') as file_zeta:
    testyscaltpred=pickle.load(file_zeta)

In [None]:
trainyscco2pred = co2_model.predict(trainXscco2ref)
validyscco2pred = co2_model.predict(validXscco2ref)
testyscco2pred = co2_model.predict(testXscco2ref)

In [None]:
with open('trainyscco2pred', 'wb') as file_a:
    pickle.dump(trainyscco2pred, file_a)
with open('validyscco2pred', 'wb') as file_b:
    pickle.dump(validyscco2pred, file_b)
with open('testyscco2pred', 'wb') as file_c:
    pickle.dump(testyscco2pred, file_c)

with open('/Users/bradleygay/code/trainyscco2pred', 'rb') as file_a:
    trainyscco2pred=pickle.load(file_a)
with open('/Users/bradleygay/code/validyscco2pred', 'rb') as file_b:
    validyscco2pred=pickle.load(file_b)
with open('/Users/bradleygay/code/testyscco2pred', 'rb') as file_c:
    testyscco2pred=pickle.load(file_c)

In [None]:
trainyscch4pred = ch4_model.predict(trainXscch4ref)
validyscch4pred = ch4_model.predict(validXscch4ref)
testyscch4pred = ch4_model.predict(testXscch4ref)

In [None]:
with open('trainyscch4pred', 'wb') as file_i:
    pickle.dump(trainyscch4pred, file_i)
with open('validyscch4pred', 'wb') as file_ii:
    pickle.dump(validyscch4pred, file_ii)
with open('testyscch4pred', 'wb') as file_iii:
    pickle.dump(testyscch4pred, file_iii)

with open('/Users/bradleygay/code/trainyscch4pred', 'rb') as file_i:
    trainyscch4pred=pickle.load(file_i)
with open('/Users/bradleygay/code/validyscch4pred', 'rb') as file_ii:
    validyscch4pred=pickle.load(file_ii)
with open('/Users/bradleygay/code/testyscch4pred', 'rb') as file_iii:
    testyscch4pred=pickle.load(file_iii)

In [None]:
#model2 == ALT (history2)
#model3b == CH4 (history5)
#model4 == CO2 (history4)

### Archive

In [None]:
#altlist=[]
#altlist=np.append(altlist,p)
#altlist=np.append(altlist,pp)
altlist=np.append(altlist,ppp)

In [None]:
plt.plot(yscaleralt.inverse_transform(altlist.reshape(-1,1)))

In [None]:
plt.plot(yscaleralt.inverse_transform(p.reshape(1432318,1)))

In [None]:
# Scale the last 48 rows of df, reshape them into a single
# (1, rows, features) batch, predict with lstm_model, and map the prediction
# back through Y_scaler.
# Fixed: the statements carried stray, inconsistent leading whitespace, which
# is an IndentationError at top level.
# NOTE(review): fit_transform re-fits X_scaler on these 48 rows only; confirm
# that is intended rather than transform() with the training-time fit.
data_val = X_scaler.fit_transform(df.tail(48))
val_rescaled = data_val.reshape(1, data_val.shape[0], data_val.shape[1])
pred = lstm_model.predict(val_rescaled)
pred_Inverse = Y_scaler.inverse_transform(pred)
pred_Inverse

In [None]:
type(p.reshape(1432318, 1))

In [None]:
type(testXscaltref.reshape((testXscaltref.shape[0], testXscaltref.shape[2])))

In [None]:
# Predict on the scaled ALT test features, undo the scaling on both the
# forecast and the observed target, and report the RMSE in original units.
# make a prediction
yhat = model2.predict(testXscalt)
# Flatten to (samples, features). The original line read test_X.shape inside
# the expression that defines test_X (NameError on a fresh kernel); deriving
# the row count from testXscalt and letting -1 infer the rest handles both a
# 2-D array and a 3-D array with a singleton time axis.
test_X = testXscalt.reshape((testXscalt.shape[0], -1))
# invert scaling for forecast: the scaler expects the full column layout, so
# the prediction is placed in column 0 ahead of the remaining feature columns
inv_yhat = np.concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
# NOTE(review): test_y and scaler are not defined in this cell -- confirm they
# refer to the scaled ALT test target and the matching fitted scaler.
test_y = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = np.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

In [None]:
trainXaltdf=pd.DataFrame(trainXalt).to_numpy().reshape(1432318, 1, 273)

In [None]:
#y_pred = model2.predict(trainXaltdfres)
y_pred_scaled = model2.predict(testXscaltref)

In [None]:
y_pred.shape

In [None]:
y_pred_scaled.shape

In [None]:
testXalt.shape

In [None]:
testyalt.iloc[:,0]

In [None]:
testyscaltpd=pd.DataFrame(testyscalt)
testyscaltpd.index=testyalt.iloc[:,0].index
testyscaltpd.index=pd.to_datetime(testyscaltpd.index, format='%Y')
testyscaltpd.index.name = None

In [None]:
#testyscaltpd.to_numpy().reshape(215137,1,1)

In [None]:
testyscaltpd.columns=[testyalt.iloc[:,-1].name]

In [None]:
testyscaltpd

In [None]:
validyscaltpd=pd.DataFrame(validyscalt)
validyscaltpd.index=validyalt.iloc[:,0].index
validyscaltpd.index=pd.to_datetime(validyscaltpd.index, format='%Y')
validyscaltpd.index.name = None

In [None]:
#validyscaltpd.to_numpy().reshape(793782,1,1)

In [None]:
validyscaltpd.columns=[validyalt.iloc[:,-1].name]

In [None]:
validyscaltpd

In [None]:
trainyscaltpd=pd.DataFrame(trainyscalt)
trainyscaltpd.index=trainyalt.iloc[:,0].index
trainyscaltpd.index=pd.to_datetime(trainyscaltpd.index, format='%Y')
trainyscaltpd.index.name = None

In [None]:
trainyscaltpd.columns=[trainyalt.iloc[:,-1].name]

In [None]:
trainyscaltpd

In [None]:
plt.plot(trainyscaltpd)
plt.plot(validyscaltpd)
plt.plot(testyscaltpd)

In [None]:
plt.plot(trainyalt)
plt.plot(validyalt)
plt.plot(testyalt)

In [None]:
#trainXalt

In [None]:
#testyscaltpd.to_numpy().reshape(215137,1,1)
#testyscalt.reshape(215137,1,1)
t=trainXalt.iloc[:,-1].resample('Y').mean()
v=validXalt.iloc[:,-1].resample('Y').mean()
r=testXalt.iloc[:,-1].resample('Y').mean()#.reshape(215137,1, 273)

In [None]:
trainXalt.iloc[:,-183]

In [None]:
t2=trainXalt.iloc[:,-92].resample('Y').mean()
v2=validXalt.iloc[:,-92].resample('Y').mean()
r2=testXalt.iloc[:,-92].resample('Y').mean()#.reshape(215137,1, 273)

In [None]:
t3=trainXalt.iloc[:,-183].resample('Y').mean()
v3=validXalt.iloc[:,-183].resample('Y').mean()
r3=testXalt.iloc[:,-183].resample('Y').mean()#.reshape(215137,1, 273)

In [None]:
# Join the yearly means of the train, validation and test splits (t, v, r)
# into one flat series, in that order. The leading empty list reproduces the
# float seed that np.append([], ...) starts from.
yup = np.concatenate([np.ravel(chunk) for chunk in ([], t.values, v.values, r.values)])

In [None]:
# Join the yearly means of the second column selection (t2, v2, r2) into one
# flat series in train, validation, test order. The leading empty list
# reproduces the float seed that np.append([], ...) starts from.
yup2 = np.concatenate([np.ravel(chunk) for chunk in ([], t2.values, v2.values, r2.values)])

In [None]:
# Join the yearly means of the third column selection (t3, v3, r3) into one
# flat series in train, validation, test order. The leading empty list
# reproduces the float seed that np.append([], ...) starts from.
yup3 = np.concatenate([np.ravel(chunk) for chunk in ([], t3.values, v3.values, r3.values)])

In [None]:
plt.plot(reframed_alt.iloc[:,-1].resample('Y').mean().values)

In [None]:
plt.plot(df['ALT'].resample('Y').mean().values)
#plt.plot(reframed_alt.iloc[:,-1].resample('Y').mean().values)
plt.plot(yup, linestyle='dotted')
plt.plot(yup2, linestyle='dotted')
plt.plot(yup3, linestyle='dotted')
plt.show()

In [None]:
plt.plot(df['ALT'].resample('Y').mean().values)

In [None]:
plt.plot(yup, linestyle='dotted')
plt.plot(yup2, linestyle='dotted')
plt.plot(yup3, linestyle='dotted')

In [None]:
#trainXaltdfres=pd.DataFrame(trainXalt).to_numpy().reshape(1432318, 1, 273)

### ALT

In [None]:
trainXscalt.shape

In [None]:
trainXscaltdfres=pd.DataFrame(trainXscalt).to_numpy().reshape(1432318, 1, 273)

In [None]:
trainXscaltdfres.shape

In [None]:
#p=model2.predict(trainXaltdfres)

In [None]:
p=model2.predict(trainXscaltdfres)

In [None]:
p.shape

In [None]:
# plt.plot(trainXalt.to_numpy().reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXalt.to_numpy().reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXalt.to_numpy().reshape(215137, 1, 273)[:,-1,-1])
# plt.plot(trainXscalt.reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXscalt.reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXscalt.reshape(215137, 1, 273)[:,-1,-1])

In [None]:
plt.plot(p.reshape(1432318,1))
plt.show()

In [None]:
validXscalt.shape

In [None]:
validXscaltdfres=pd.DataFrame(validXscalt).to_numpy().reshape(793782, 1, 273)

In [None]:
validXscaltdfres.shape

In [None]:
pp=model2.predict(validXscaltdfres)

In [None]:
pp.shape

In [None]:
# plt.plot(trainXalt.to_numpy().reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXalt.to_numpy().reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXalt.to_numpy().reshape(215137, 1, 273)[:,-1,-1])
# plt.plot(trainXscalt.reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXscalt.reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXscalt.reshape(215137, 1, 273)[:,-1,-1])

In [None]:
plt.plot(pp.reshape(793782,1))
plt.show()

In [None]:
testXscalt.shape

In [None]:
testXscaltdfres=pd.DataFrame(testXscalt).to_numpy().reshape(215137, 1, 273)

In [None]:
testXscaltdfres.shape

In [None]:
ppp=model2.predict(testXscaltdfres)

In [None]:
ppp.shape

In [None]:
# plt.plot(trainXalt.to_numpy().reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXalt.to_numpy().reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXalt.to_numpy().reshape(215137, 1, 273)[:,-1,-1])
# plt.plot(trainXscalt.reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXscalt.reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXscalt.reshape(215137, 1, 273)[:,-1,-1])

In [None]:
plt.plot(ppp.reshape(215137,1))
plt.show()

In [None]:
# Stack the ALT predictions for the train (p), validation (pp) and test (ppp)
# splits into one flat array, in that order.
# np.ravel replaces the reshapes that hard-coded each split's sample count, so
# the cell keeps working when the split sizes change. The leading empty list
# keeps the float64 result that np.append([], ...) produced.
arr = np.concatenate([np.ravel(part) for part in ([], p, pp, ppp)])

In [None]:
plt.plot(arr)

In [None]:
# Stack the ALT targets of the train, validation and test splits into one flat
# array, in that order.
# np.ravel replaces the reshapes that hard-coded each split's sample count, so
# the cell keeps working when the split sizes change. The leading empty list
# keeps the float64 result that np.append([], ...) produced.
arr2 = np.concatenate(
    [np.ravel(part) for part in ([], trainyalt.values, validyalt.values, testyalt.values)]
)

In [None]:
plt.plot(arr2)

In [None]:
plt.plot(yscaleralt.inverse_transform(arr.reshape(2441237,1)))

In [None]:
2441237-215137
#testXscalt.shape

In [None]:
arr2.shape

In [None]:
# Plot the ALT targets (arr2) against the inverse-scaled model predictions
# (arr) on a shared axis, one point per sample in train/valid/test order.
n_samples = arr2.size  # derived from the data instead of a hard-coded 2441237

fig, ax = plt.subplots(figsize=(10,6), dpi=1000)
lns1 = ax.plot(arr2.reshape(-1,1), color='dodgerblue', linestyle='solid', label='Thaw Depth Observations, ALT (cm)')
#ax2=ax.twinx();
# Predictions are in scaled units; map them back through the target scaler.
lns2 = ax.plot(yscaleralt.inverse_transform(arr.reshape(-1,1)), color='tomato', alpha=0.5, linestyle='solid', label='Thaw Depth Predictions, ALT (cm)')

lns = lns1 + lns2
labs = [l.get_label() for l in lns]
ax.legend(lns, labs, loc='best', fontsize=12)

ax.grid(linewidth=0.3)
#ax2.grid(linewidth=0.3);
# The x-axis indexes samples, not training epochs; the earlier
# 'Full Iterations (epochs)' label was carried over from the loss plots.
ax.set_xlabel('Sample Index', labelpad=12, fontsize=10)
ax.set_ylabel('Active Layer Thickness (cm)', labelpad=12, fontsize=10)
#ax.tick_params(axis='y', labelcolor='springgreen')
#ax2.set_ylabel('Validation MSE, Scaled CO2 Flux (µolCO2m-2s-1)', labelpad=12, fontsize=10)
#ax2.tick_params(axis='y', labelcolor='yellowgreen')
ax.tick_params(left=False)  # remove the ticks
#ax2.tick_params(right=False, labelright=False)  # remove the ticks
plt.title('GeoCryoAI Modeling, ALT Observations v. Predictions | Alaska [1969-2022] \n Number of ALT Samples/Replicates: 2.441M', pad=15, fontsize=14)
# Show every sample on x and clamp y to 0-300.
plt.axis([0, n_samples, 0, 300])
#plt.axis([2226100, 2441237, 0, 200])
plt.tight_layout()
plt.savefig('ALT_ObsVPred_1969-2022_071323.svg', dpi=1000)
plt.savefig('ALT_ObsVPred_1969-2022_071323.png', dpi=1000)

### CO2

In [None]:
trainXscco2.shape

In [None]:
trainXscco2dfres=pd.DataFrame(trainXscco2).to_numpy().reshape(1432318, 1, 273)

In [None]:
trainXscco2dfres.shape

In [None]:
#p=model2.predict(trainXaltdfres)

In [None]:
q=model4.predict(trainXscco2dfres)

In [None]:
q.shape

In [None]:
# plt.plot(trainXalt.to_numpy().reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXalt.to_numpy().reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXalt.to_numpy().reshape(215137, 1, 273)[:,-1,-1])
# plt.plot(trainXscalt.reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXscalt.reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXscalt.reshape(215137, 1, 273)[:,-1,-1])

In [None]:
plt.plot(q.reshape(1432318,1))
plt.show()

In [None]:
validXscco2.shape

In [None]:
validXscco2dfres=pd.DataFrame(validXscco2).to_numpy().reshape(793782, 1, 273)

In [None]:
validXscco2dfres.shape

In [None]:
qq=model4.predict(validXscco2dfres)

In [None]:
qq.shape

In [None]:
# plt.plot(trainXalt.to_numpy().reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXalt.to_numpy().reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXalt.to_numpy().reshape(215137, 1, 273)[:,-1,-1])
# plt.plot(trainXscalt.reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXscalt.reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXscalt.reshape(215137, 1, 273)[:,-1,-1])

In [None]:
plt.plot(qq.reshape(793782,1))
plt.show()

In [None]:
testXscco2.shape

In [None]:
testXscco2dfres=pd.DataFrame(testXscco2).to_numpy().reshape(215137, 1, 273)

In [None]:
testXscco2dfres.shape

In [None]:
qqq=model4.predict(testXscco2dfres)

In [None]:
qqq.shape

In [None]:
# plt.plot(trainXalt.to_numpy().reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXalt.to_numpy().reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXalt.to_numpy().reshape(215137, 1, 273)[:,-1,-1])
# plt.plot(trainXscalt.reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXscalt.reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXscalt.reshape(215137, 1, 273)[:,-1,-1])

In [None]:
plt.plot(qqq.reshape(215137,1))
plt.show()

In [None]:
# Stack the CO2 predictions for the train (q), validation (qq) and test (qqq)
# splits into one flat array, in that order.
# np.ravel replaces the reshapes that hard-coded each split's sample count, so
# the cell keeps working when the split sizes change. The leading empty list
# keeps the float64 result that np.append([], ...) produced.
arr3 = np.concatenate([np.ravel(part) for part in ([], q, qq, qqq)])

In [None]:
plt.plot(arr3)

In [None]:
# Stack the CO2 targets of the train, validation and test splits into one flat
# array, in that order.
# np.ravel replaces the reshapes that hard-coded each split's sample count, so
# the cell keeps working when the split sizes change. The leading empty list
# keeps the float64 result that np.append([], ...) produced.
arr4 = np.concatenate(
    [np.ravel(part) for part in ([], trainyco2.values, validyco2.values, testyco2.values)]
)

In [None]:
plt.plot(arr4)

In [None]:
plt.plot(yscalerco2.inverse_transform(arr3.reshape(2441237,1)))

In [None]:
#reframed_co2.iloc[:,-1].name
#('CO2_1_2_1', 't')
#reframed_co2.iloc[:,-1]['2006':'2019']
#reframed_co2.shape
#2441237-1965628
#78115:2043743
plt.plot(reframed_co2.iloc[78115:2043743,-1].values)#['2005':'2019']
#reframed_co2.iloc[:,-1][:'2019']

In [None]:
# Overlay the stitched CO2 observations (arr4) and the inverse-scaled model
# predictions (arr3) in one figure and save it as SVG and PNG.
fig,ax=plt.subplots(figsize=(10,6), dpi=1000);
# Observations go on the left axis ...
lns1=ax.plot(arr4.reshape(2441237,1), color='indigo', linestyle='solid', label='Flux Observations, CO2 (µmolCO2m-2s-1)');
# ... predictions go on a twinned right axis, so the two series do NOT share
# a y-scale. yscalerco2 maps the scaled predictions back before plotting.
ax2=ax.twinx();
lns2=ax2.plot(yscalerco2.inverse_transform(arr3.reshape(2441237,1)), alpha=0.5, color='lime', linestyle='solid', label='Flux Predictions, CO2 (µmolCO2m-2s-1)');

# Merge the line handles from both axes so a single legend lists both series.
lns = lns1+lns2#+lns3+lns4;
labs = [l.get_label() for l in lns];
ax.legend(lns, labs, loc='upper right', fontsize=12);

ax.grid(linewidth=0.3);
#ax2.grid(linewidth=0.3);
# NOTE(review): the x-values are the positional index of the 2,441,237 plotted
# samples, not training epochs — confirm this axis label.
ax.set_xlabel('Full Iterations (epochs)', labelpad=12, fontsize=10);
ax.set_ylabel('Carbon Dioxide Flux (µmolCO2m-2s-1)', labelpad=12, fontsize=10);
#ax.tick_params(axis='y', labelcolor='springgreen')
#ax2.set_ylabel('Validation MSE, Scaled CO2 Flux (µolCO2m-2s-1)', labelpad=12, fontsize=10)
#ax2.tick_params(axis='y', labelcolor='yellowgreen')
ax.tick_params(left=False, labelright=False)  # remove the ticks
ax2.tick_params(right=False, labelright=False)  # remove the ticks
plt.title('GeoCryoAI Modeling, CO2 Flux Observations v. Predictions | Alaska [2006-2019] \n Number of CO2 Flux \
Samples/Replicates: 1.966M', pad=15, fontsize=14);
# Same grid call as above, repeated with identical arguments.
ax.grid(linewidth=0.3);
# Optional zoom windows kept for reference.
#plt.axis([78115, 2043743, 0, 800])
#plt.axis([2226100, 2441237, 0, 200])
plt.tight_layout()
# Written to the current working directory in both vector and raster form.
plt.savefig('CO2_ObsVPred_2006-2019_071323.svg', dpi=1000)
plt.savefig('CO2_ObsVPred_2006-2019_071323.png', dpi=1000)

### CH4

In [None]:
trainXscch4.shape

In [None]:
trainXscch4dfres=pd.DataFrame(trainXscch4).to_numpy().reshape(1432318, 1, 273)

In [None]:
trainXscch4dfres.shape

In [None]:
#p=model2.predict(trainXaltdfres)

In [None]:
o=model3b.predict(trainXscch4dfres)

In [None]:
o.shape

In [None]:
# plt.plot(trainXalt.to_numpy().reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXalt.to_numpy().reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXalt.to_numpy().reshape(215137, 1, 273)[:,-1,-1])
# plt.plot(trainXscalt.reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXscalt.reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXscalt.reshape(215137, 1, 273)[:,-1,-1])

In [None]:
plt.plot(o.reshape(1432318,1))
plt.show()

In [None]:
validXscch4.shape

In [None]:
validXscch4dfres=pd.DataFrame(validXscch4).to_numpy().reshape(793782, 1, 273)

In [None]:
validXscch4dfres.shape

In [None]:
oo=model3b.predict(validXscch4dfres)

In [None]:
oo.shape

In [None]:
# plt.plot(trainXalt.to_numpy().reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXalt.to_numpy().reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXalt.to_numpy().reshape(215137, 1, 273)[:,-1,-1])
# plt.plot(trainXscalt.reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXscalt.reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXscalt.reshape(215137, 1, 273)[:,-1,-1])

In [None]:
plt.plot(oo.reshape(793782,1))
plt.show()

In [None]:
testXscch4.shape

In [None]:
testXscch4dfres=pd.DataFrame(testXscch4).to_numpy().reshape(215137, 1, 273)

In [None]:
testXscch4dfres.shape

In [None]:
ooo=model3b.predict(testXscch4dfres)

In [None]:
ooo.shape

In [None]:
# plt.plot(trainXalt.to_numpy().reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXalt.to_numpy().reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXalt.to_numpy().reshape(215137, 1, 273)[:,-1,-1])
# plt.plot(trainXscalt.reshape(1432318, 1, 273)[:,-1,-1])
# plt.plot(validXscalt.reshape(793782, 1, 273)[:,-1,-1])
# plt.plot(testXscalt.reshape(215137, 1, 273)[:,-1,-1])

In [None]:
plt.plot(ooo.reshape(215137,1))
plt.show()

In [None]:
# Stitch the CH4 predictions for the train (o), validation (oo) and test (ooo)
# splits into one 1-D vector, in that order. The leading empty array
# reproduces the float64 result that appending onto an empty list gives.
arr5 = np.concatenate(
    (
        np.array([]),
        o.reshape(1432318,),
        oo.reshape(793782,),
        ooo.reshape(215137,),
    )
)

In [None]:
arr5.shape

In [None]:
plt.plot(yscalerch4.inverse_transform(arr5.reshape(2441237,1)))

In [None]:
# Stitch the observed CH4 targets (train, validation, test) into one 1-D
# vector in the same order as the stitched predictions. The leading empty
# array reproduces the float64 result that appending onto an empty list gives.
arr6 = np.concatenate(
    (
        np.array([]),
        trainych4.values.reshape(1432318,),
        validych4.values.reshape(793782,),
        testych4.values.reshape(215137,),
    )
)

In [None]:
plt.plot(arr6)

In [None]:
plt.plot(yscalerch4.inverse_transform(arr5.reshape(2441237,1)))

In [None]:
#reframed_ch4.iloc[:,-1].name
#('CH4_1_1_2', 't')
#reframed_ch4.iloc[:,-1]['2011':'2021']#2.083M
#reframed_ch4.iloc[:,-1][:'2021']
#reframed_ch4.iloc[:,-1][:'2021']
#reframed_ch4.iloc[:304966,-1]
reframed_ch4.iloc[304966:2387849,-1]

In [None]:
#arr6.reshape(2441237,1)

In [None]:
# Overlay the stitched CH4 observations (arr6) and the inverse-scaled model
# predictions (arr5) in one figure and save it as SVG and PNG.
fig, ax = plt.subplots(figsize=(10, 6), dpi=1000)

# Observations on the left axis. The unit strings name CH4: they previously
# read "nmolCO2...", carried over from the CO2 figure. reshape(-1, 1) derives
# the column length from the array instead of hard-coding it.
lns1 = ax.plot(arr6.reshape(-1, 1), color='midnightblue', linestyle='solid',
               label='Flux Observations, CH4 (nmolCH4m-2s-1)')

# Predictions on a twinned right axis, so the two series do NOT share a
# y-scale. yscalerch4 maps the scaled predictions back before plotting.
ax2 = ax.twinx()
lns2 = ax2.plot(yscalerch4.inverse_transform(arr5.reshape(-1, 1)), color='magenta', alpha=0.5,
                linestyle='solid', label='Flux Predictions, CH4 (nmolCH4m-2s-1)')

# Merge the line handles from both axes so a single legend lists both series.
lns = lns1 + lns2
labs = [line.get_label() for line in lns]
ax2.legend(lns, labs, loc='best', fontsize=12)

ax.grid(linewidth=0.3)
# NOTE(review): the x-values are the positional index of the plotted samples,
# not training epochs — confirm this axis label.
ax.set_xlabel('Full Iterations (epochs)', labelpad=12, fontsize=10)
ax.set_ylabel('Methane Flux (nmolCH4m-2s-1)', labelpad=12, fontsize=10)
ax.tick_params(left=False)  # remove the ticks
ax2.tick_params(right=False, labelright=False)  # remove the ticks
plt.title('GeoCryoAI Modeling, CH4 Flux Observations v. Predictions | Alaska [2011-2021] \n Number of CH4 Flux Samples/Replicates: 2.083M', pad=15, fontsize=14)
# Optional zoom windows kept for reference.
#plt.axis([304966, 2387849, 0, 2060])
#plt.axis([0, 2441237, 0, 2060])
plt.tight_layout()
# NOTE(review): the file names say 2006-2019 while the title says 2011-2021;
# left unchanged so existing references to these outputs keep working.
plt.savefig('CH4_ObsVPred_2006-2019_071323.svg', dpi=1000)
plt.savefig('CH4_ObsVPred_2006-2019_071323.png', dpi=1000)

### Archive

In [None]:
validXaltdfres=pd.DataFrame(validXalt).to_numpy().reshape(793782, 1, 273)

In [None]:
pp=model2.predict(validXaltdfres)

In [None]:
plt.plot(pp.reshape(793782,1))
plt.show()

In [None]:
testXaltdfres=pd.DataFrame(testXalt).to_numpy().reshape(215137, 1, 273)

In [None]:
ppp=model2.predict(testXaltdfres)

In [None]:
plt.plot(ppp.reshape(215137,1))
plt.show()

In [None]:
plt.plot(yscaleralt.inverse_transform(p.reshape(1432318,1)))

In [None]:
plt.plot(yscaleralt.inverse_transform(pp.reshape(793782,1)))

In [None]:
plt.plot(yscaleralt.inverse_transform(ppp.reshape(215137,1)))

In [None]:
#pd.DataFrame(yscaleralt.inverse_transform(p.reshape(1432318,1)))
plt.plot(trainyalt.values)
plt.plot(yscaleralt.inverse_transform(p.reshape(1432318,1)))

In [None]:
#pd.DataFrame(yscaleralt.inverse_transform(p.reshape(1432318,1)))
plt.plot(validyalt.values)
plt.plot(yscaleralt.inverse_transform(pp.reshape(793782,1)))

In [None]:
plt.plot(p.reshape(1432318,1))
plt.plot(pp.reshape(793782,1))
plt.plot(ppp.reshape(215137,1))
plt.show()

In [None]:
newp=p.reshape(1432318,1)
newpp=pp.reshape(793782,1)
newppp=ppp.reshape(215137,1)

In [None]:
# invert predictions
# Map the scaled model outputs back with yscaleralt, the same scaler the
# neighbouring cells use to un-scale these predictions for plotting.
# Fitting a new StandardScaler on the predictions themselves and calling
# inverse_transform on them (the previous approach) returns x * std(x) + mean(x),
# which does not undo the scaling that was applied to the target.
newTrain = yscaleralt.inverse_transform(newp)
newValid = yscaleralt.inverse_transform(newpp)
newTest = yscaleralt.inverse_transform(newppp)

In [None]:
newTrain.shape == newp.shape, newValid.shape == newpp.shape, newTest.shape == newppp.shape

In [None]:
trainXaltdfres.shape, newp.shape, newTrain.shape

In [None]:
testyalt.to_numpy().reshape(215137,)

In [None]:
# calculate root mean squared error
trainScore = np.sqrt(keras.losses.mean_squared_error(trainyalt.to_numpy().reshape(1432318,), newp[:,0]))
print('Train Score: %.6f RMSE' % (trainScore))

In [None]:
validScore = np.sqrt(keras.losses.mean_squared_error(validyalt.to_numpy().reshape(793782,), newpp[:,0]))
print('Valid Score: %.6f RMSE' % (validScore))

In [None]:
testScore = np.sqrt(keras.losses.mean_squared_error(testyalt.to_numpy().reshape(215137,), newppp[:,0]))
print('Test Score: %.6f RMSE' % (testScore))

In [None]:
testScore2 = np.sqrt(keras.losses.mean_squared_error(newppp[0], newTest[:,0]))
print('Test Score: %.6f RMSE' % (testScore2))

In [None]:
fig, ax = plt.subplots(figsize=(10,5))
ln1=ax.plot(y_train, color='royalblue', linestyle='solid', label='Observed ALT')
ln2=ax.plot(y_test, color='springgreen', linestyle='dashed', label='Tested ALT')
ln3=ax.plot(y_valid, color='magenta', linestyle='dotted', label='Validated ALT')
ax2=ax.twinx();
ln4=ax2.plot(testPredict, color='yellow', linestyle='dotted', label='Test_Predicted ALT')

lines = ln1 + ln2 + ln3 + ln4
labs = [line.get_label() for line in lines];
ax2.legend(lines, labs, loc='best')

ax.grid(linewidth=0.3);
ax.set_xlabel('Epochs', labelpad=6, fontsize=9);
ax.set_ylabel('Loss', labelpad=6, fontsize=9)
ax2.set_ylabel('Forecasted ALT')
plt.title('LSTM+CNN+VAE Model (GeoCryoAI): \n Observed v. Forecasted Active Layer Thickness (cm)')
plt.ylabel('Scaled Active Layer Thickness (cm)')
plt.xlabel('Epoch')
plt.axis([0, 12000, -0.1, 1])
#plt.legend(loc='best')
plt.show()



# fig,ax=plt.subplots(figsize=(10,5));
# lns1=ax.plot(history.history['loss'], color='dodgerblue', label='Loss, Active Layer Thickness (cm)');
# lns2=ax.plot(history.history['root_mean_squared_error'], color='dodgerblue', linestyle='dotted', label='RMSE, Active Layer Thickness (cm)');
# ax2=ax.twinx();
# lns3=ax2.plot(history.history['val_loss'], color='gold', label='Validation Loss, Active Layer Thickness (cm)');
# lns4=ax2.plot(history.history['val_root_mean_squared_error'], color='gold', linestyle='dotted', label='Validation RMSE, Active Layer Thickness (cm)');
          
# lns = lns1+lns2+lns3+lns4;
# labs = [l.get_label() for l in lns];
# ax2.legend(lns, labs, loc='best', fontsize=8);

# ax.grid(linewidth=0.3);
# ax.set_xlabel('Epochs', labelpad=6, fontsize=9);
# #ax.set_ylabel('Loss', labelpad=6, fontsize=9)
# #ax2.set_ylabel('Scaled Depth to Refusal (cm)', labelpad=6, fontsize=9)
# plt.title('Number of Samples/Replicates: 95653', pad=12, fontsize=8, fontweight='ultralight');
# plt.suptitle('Cost Function and Validation Loss from Thaw Depth Modeling in LSTM-AE Framework, Alaska [1969-2022]', fontsize=11);
# plt.grid(linewidth=0.3);
# #plt.show()
# #plt.savefig('ALTstats_LSTM-AEmetrics_1969-2022.png', dpi=1000)


In [None]:
# Get the predicted values
yup_pred_scaled = model2.predict(testXscaltref)

# Unscale the predicted values
yup_pred = yscaleralt.inverse_transform(yup_pred_scaled.reshape(215137,1))
yup_test_unscaled = yscaleralt.inverse_transform(testyscaltref.reshape(-1, 1))


In [None]:
plt.plot(yup_pred_scaled.reshape(215137,1))

In [None]:
plt.plot(yup_pred)

In [None]:
plt.plot(yup_test_unscaled)

In [None]:
plt.plot(trainyalt.values)

In [None]:
plt.plot(y_testco2_reframed.reshape(215137,1))
plt.plot(predict.reshape(215137,1))
#plt.axis([0, 130000, -2, 5])
plt.show()

In [None]:
train, test = X[0:-144], X[-144:]
# walk-forward validation
history = [x for x in train]

In [None]:
from sklearn.metrics import accuracy_score
print("Test accuracy for the unscaled ALT data")
print(f"{accuracy_score(testyaltdfres, y_pred):.2%}\n")

In [None]:
testyaltdfres=pd.DataFrame(testyalt).to_numpy().reshape(215137, 1, 1)

In [None]:
from sklearn.metrics import accuracy_score
print("Test accuracy for the unscaled ALT data")
print(f"{accuracy_score(testyaltdfres, y_pred):.2%}\n")
print("Test accuracy for the standardized ALT data")
print(f"{accuracy_score(testyscalt, y_pred_scaled):.2%}\n")

In [None]:
trainXaltdfres=pd.DataFrame(trainXalt).to_numpy().reshape(1432318, 1, 273)

In [None]:
y_pred = model2.predict(trainXaltdfres)
y_pred_scaled = model2.predict(testXscalt)

In [None]:
testyaltdfres=pd.DataFrame(testyalt).to_numpy().reshape(215137, 1, 1)

In [None]:
from sklearn.metrics import accuracy_score
print("Test accuracy for the unscaled ALT data")
print(f"{accuracy_score(testyaltdfres, y_pred):.2%}\n")
print("Test accuracy for the standardized ALT data")
print(f"{accuracy_score(testyscalt, y_pred_scaled):.2%}\n")

In [None]:
Y_predicted_reframed = model2.predict(testXaltdfres, verbose = 1, use_multiprocessing = True)

In [None]:
plt.plot(testyscalt)
plt.plot(Y_predicted_reframed.reshape(215137,1))

In [None]:
Y_predicted_reframed = bayesian_best_model.predict(X_test_reframed,  batch_size = 384, verbose = 1, use_multiprocessing = True)

Y_predicted_scaled = Y_predicted_reframed.reshape(Y_predicted_reframed.shape[0], Y_predicted_reframed.shape[1])

Y_predicted = scaler_Y.inverse_transform(Y_predicted_scaled)

In [None]:
def plot_results(Y_test, Y_predicted, title = "Test Data and Predictions", index = None):
    """Step-plot observed targets against predictions, one subplot row per target.

    Parameters
    ----------
    Y_test : 2-D array, shape (n_test, n_targets)
        Observed values; column ``k`` is drawn in subplot row ``k``.
    Y_predicted : 2-D array, shape (n_pred, n_targets)
        Predicted values. When ``n_pred`` is smaller than ``n_test`` the
        predictions are aligned with the LAST ``n_pred`` x-positions.
    title : str
        Figure-level title.
    index : sequence of length n_test, optional
        x-axis values; defaults to ``0 .. n_test - 1``.

    Returns
    -------
    None. The figure is created as a side effect.
    """
    n_test = Y_test.shape[0]
    if index is None:
        index = range(0, n_test)
    # Positional 0..n-1 frame index so that `.loc[shift:]` below slices by
    # position regardless of what the caller's index values are.
    df_index = pd.DataFrame(data = index, index = range(0, n_test))
    df_index.columns = ["user_index"]

    # Number of leading test rows that have no matching prediction.
    shift = n_test - Y_predicted.shape[0]

    fig, axes = plt.subplots(figsize = (9, 6), sharex = True, nrows = Y_test.shape[1], squeeze = False)

    for target, ax in enumerate(axes.flat):
        ax.step(df_index.values, Y_test[:,target], where = "post", label = "Testing Set", color = "blue")
        ax.step(df_index.loc[shift:, "user_index"].values, Y_predicted[:,target], where = "post",
                label = "Predictions", color = "red")
        # Legend on every subplot; a single trailing plt.legend() only
        # decorated the last one.
        ax.legend()

    plt.suptitle(title)

In [None]:
# "\n" (not "/n") so the title is actually broken onto separate lines.
plot_results(scaler_Y.inverse_transform(Y_test.values), Y_predicted, index = Y_test.index,
             title = "Active Layer Thickness \n Predictions v. Test Data \n via 8-layer C1DLSTMSAE Network")

In [None]:
Y_predicted=Y_predicted.reshape(397492,)#;Y_test=Y_test.reshape(668168,)

In [None]:
def qq_plot(Y_test, Y_predicted, title = "Test Data and Predictions", index = None):
    """Scatter predictions against observations, one subplot row per target.

    Red points are (observed, predicted) pairs; blue points plot the
    observations against themselves and therefore trace the 1:1 line.

    Parameters
    ----------
    Y_test : 2-D array, shape (n_test, n_targets)
        Observed values.
    Y_predicted : 2-D array, shape (n_pred, n_targets)
        Predicted values. When ``n_pred`` is smaller than ``n_test`` they are
        paired with the LAST ``n_pred`` observations.
    title : str
        Figure-level title.
    index : optional
        Not used by the scatter plot; accepted so existing callers that pass
        it keep working.

    Returns
    -------
    None. The figure is created as a side effect.
    """
    # Number of leading test rows that have no matching prediction.
    shift = Y_test.shape[0] - Y_predicted.shape[0]

    fig, axes = plt.subplots(figsize = (9, 6), sharex = True, nrows = Y_test.shape[1], squeeze = False)

    for target, ax in enumerate(axes.flat):
        ax.scatter(Y_test[shift:,target], Y_predicted[:,target], label = "Predictions", color = "red", s = 5,
                   alpha = 0.5)
        ax.scatter(Y_test[:,target], Y_test[:,target], label = "Testing Set", color = "blue", s = 5)
        # Legend on every subplot; a single trailing plt.legend() only
        # decorated the last one.
        ax.legend()

    plt.suptitle(title)

In [None]:
# "\n" (not "/n") so the title is actually broken onto separate lines.
qq_plot(Y_test.values, Y_predicted_scaled, index = Y_test.index,
        title = "Active Layer Thickness \n Predictions v. Test Data via \n 7-layer Sequential Time-Distributed C1DLSTMSAE Network")

In [None]:
Y_predicted_reframed = model.predict(X_test_reframed)
Y_predicted = Y_predicted_reframed.reshape(Y_predicted_reframed.shape[0], Y_predicted_reframed.shape[1])

In [None]:
import pandas as pd
testim=pd.read_csv(r'/Users/bradleygay/test_store_ALT_2022.csv')

In [None]:
testim.index=testim.iloc[:,0]

In [None]:
testim=testim.drop(testim.columns[0],axis=1)

In [None]:
testim.index.name = None

In [None]:
#testim.index=pd.to_datetime(testim.index, format='%Y')

In [None]:
testim=testim.sort_index()

In [None]:
testim.index = pd.to_datetime(testim.index)

In [None]:
testim

In [None]:
plt.plot(scaler_Y.inverse_transform(testim))

In [None]:
# Both series are mapped back through scaler_Y before plotting.
plot_results(scaler_Y.inverse_transform(Y_test.values), scaler_Y.inverse_transform(Y_predicted),
             index = Y_test.index, title = "Active Layer Thickness - Predicted v. Test Data")

In [None]:
test_scores = model.evaluate(X_test_reframed, Y_test[backward_steps:],
                                           batch_size=hp["batch_size"], use_multiprocessing=True,)

In [None]:
def metrics_print(test_data,test_predict):
    """Print RMSE, R^2 and MAPE of ``test_predict`` against ``test_data``.

    R^2 and MAPE are multiplied by 100 and printed with a trailing percent
    sign; every value is rounded to two decimals. Nothing is returned.
    """
    # Each metric is computed and printed before the next one is evaluated.
    rmse = np.sqrt(sklearn.metrics.mean_squared_error(test_data, test_predict))
    print('Test RMSE: ', round(rmse, 2))

    r2_percent = sklearn.metrics.r2_score(test_data, test_predict)*100
    print('Test R^2 : ', round(r2_percent, 2), "%")

    mape_percent = sklearn.metrics.mean_absolute_percentage_error(test_data, test_predict)*100
    print('Test MAPE: ', round(mape_percent, 2), '%')

In [None]:
print("##************** Linear Regression Results **************##")
metrics_print(prediction_df['Observed'], prediction_df['LR'])
print(" ")
print(" ")

print("##************** Deep Learning Results **************##")
metrics_print(prediction_df['Observed'], prediction_df['DNN'])
print(" ")
print(" ")

In [None]:
fa = plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
plt.scatter(prediction_df['Observed'],prediction_df['LR'])
plt.xlabel('True Values [snow_depth]', fontsize=15)
plt.ylabel('Predictions [snow_depth]', fontsize=15)
plt.title("Linear Regression")


plt.subplot(1,2,2)
plt.scatter(prediction_df['Observed'],prediction_df['DNN'])
plt.xlabel('True Values [snow_depth]', fontsize=15)
plt.ylabel('Predictions [snow_depth]', fontsize=15)
plt.title("Deep Neural Network")

In [None]:
LR_error = prediction_df['Observed'] - prediction_df['LR']
DNN_error = prediction_df['Observed'] - prediction_df['DNN']

fa = plt.figure(figsize=(16,5))

plt.subplot(1,2,1)
LR_error.hist()
plt.xlabel('Error', fontsize=15)
plt.ylabel('Frequency', fontsize=15)
plt.title("Linear Regression")

plt.subplot(1,2,2)
DNN_error.hist()
plt.xlabel('Error', fontsize=15)
plt.ylabel('Frequency', fontsize=15)
plt.title("Deep Neural Network")

In [None]:
trainXaltdfres=pd.DataFrame(trainXalt).to_numpy().reshape(1432318, 1, 273)

In [None]:
y_pred = model2.predict(trainXaltdfres)
y_pred_scaled = model2.predict(testXscalt)

In [None]:
testyaltdfres=pd.DataFrame(testyalt).to_numpy().reshape(215137, 1, 1)

In [None]:
# Print accuracy_score for the unscaled and the standardized ALT predictions.
# NOTE(review): accuracy_score is a classification metric (exact label match);
# ALT is a continuous target, so this is expected to fail or be meaningless —
# confirm, and consider r2_score / RMSE instead.
# NOTE(review): y_pred is produced from trainXaltdfres in the preceding cell,
# while it is compared here against the test targets (testyaltdfres, reshaped
# to (215137, 1, 1)) — the sample counts do not match; verify the intent.
from sklearn.metrics import accuracy_score
print("Test accuracy for the unscaled ALT data")
print(f"{accuracy_score(testyaltdfres, y_pred):.2%}\n")
print("Test accuracy for the standardized ALT data")
print(f"{accuracy_score(testyscalt, y_pred_scaled):.2%}\n")

In [None]:
#ALT
plt.plot(yscaleralt.inverse_transform(testyscalt))

In [None]:
#CH4

In [None]:
#CO2

In [None]:
#alt_model.predict(testXscaltref, testyscaltref, verbose=1)

In [None]:
one=np.concatenate((trainyalt.resample('Y').mean(), validyalt.resample('Y').mean()), axis=0)
two=np.concatenate((one, testyalt.resample('Y').mean()), axis=0)

In [None]:
abc=pd.DataFrame(yscaler.inverse_transform(testyscaltpredres))
#yscaler.inverse_transform(testyscaltpredres)

In [None]:
plt.plot(two)

In [None]:
trainyalt.index

In [None]:
three=pd.DataFrame(trainyscaltpredinv)
#np.concatenate(trainyscaltpredinv, validyscaltpredinv)

In [None]:
three.index=trainyalt.index

In [None]:
plt.plot(three)

In [None]:
# np.concatenate takes ONE sequence of arrays; the original line had
# unbalanced parentheses and could not be parsed.
# NOTE(review): `one`, as built in the neighbouring cells, already ends with
# the yearly validation means, and an earlier cell appends testyalt at this
# step — confirm whether validyalt or testyalt is intended here.
two=np.concatenate((one, validyalt.resample('Y').mean()), axis=0)

In [None]:
one=np.concatenate((trainyalt.resample('Y').mean(), validyalt.resample('Y').mean()), axis=0)

In [None]:
plt.plot(one)

In [None]:
plt.plot(trainyalt.resample('Y').mean())
plt.plot(validyalt.resample('Y').mean())
plt.plot(testyalt.resample('Y').mean())

In [None]:
# Predict on the train/test inputs, map predictions and targets back through
# `scaler`, report RMSE, and plot the predictions aligned against the dataset.
# NumPy is referenced as `np`, the alias bound by the notebook's import cell
# (the bare name `numpy` used previously is not bound by that import).

# make predictions
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
# Targets are wrapped in a list so they are passed as a single 2-D row; the
# result is indexed with [0] below to get that row back.
# NOTE: trainY / testY are overwritten, so this cell is not safe to re-run.
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))
#-----Visualize----------
# shift train predictions for plotting: NaN everywhere except the rows that
# have a prediction, which start `look_back` rows into the dataset
trainPredictPlot = np.empty_like(dataset)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
# shift test predictions for plotting: placed after the train predictions
# plus both look-back windows
testPredictPlot = np.empty_like(dataset)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
# plot baseline and predictions
plt.plot(scaler.inverse_transform(dataset))
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()

In [None]:
#df['ALT']['2021':].shape
#1432321/2441240#0.5867186347921548
#793782/2441240#0.325155248971834
#215137/2441240#0.0881261162360112

In [None]:
# plt.plot(df.ALT['1970':'2017'][1:].to_numpy().reshape(1432317,1), trainyalt)
# plt.plot(df.ALT['2018':'2020'].values, validyalt)
# plt.plot(df.ALT['2021':][1:].to_numpy().reshape(215136,1), testyalt)
# plt.show()

In [None]:
# plt.figure(figsize=(10,6))
# plt.plot(trainyscalt, label='train')
# plt.plot(trainyscaltpredres, label='trainpred')
# plt.plot(validyscalt, label='valid')
# plt.plot(validyscaltpredres, label='validpred')
# plt.plot(testyscalt, label='test')
# plt.plot(testyscaltpredres, label='testpred')
# plt.legend()
# #plt.axis(xmin=0, xmax=250000)

In [None]:
testXscalt

In [None]:
from sklearn import metrics
metrics.r2_score(testyscalt, testyscaltpred)

In [None]:
#invyhat=np.concatenate((testyscaltpredres,testXscalt[:,-1:]),axis=1)

In [None]:
#invyhat=yscaler.inverse_transform(invyhat)[:,0]

In [None]:
#invy=np.concatenate((testyscalt.reshape((len(testyscalt), 1)),testXscalt[:,1:]),axis=1)

In [None]:
#invy=yscaler.inverse_transform(invy)[:,0]

In [None]:
#rmse = np.sqrt(mean_squared_error(invy, invyhat))
#print('Test RMSE: %.3f' % rmse)

In [None]:
# yhat = model3.predict(test_X)
# test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
# # invert scaling for forecast
# inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
# inv_yhat = scaler.inverse_transform(inv_yhat)
# inv_yhat = inv_yhat[:,0]
# # invert scaling for actual
# test_y = test_y.reshape((len(test_y), 1))
# inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
# inv_y = scaler.inverse_transform(inv_y)
# inv_y = inv_y[:,0]
# # calculate RMSE
# rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
# print('Test RMSE: %.3f' % rmse)

In [None]:
trainXalt.shape, trainXscalt.shape, trainXscaltref.shape

In [None]:
trainyscaltpredres=trainyscaltpred.reshape(1432317,1)
validyscaltpredres=validyscaltpred.reshape(793782,1)
testyscaltpredres=testyscaltpred.reshape(215136,1)

In [None]:
#print(list(trainXalt.columns))

In [None]:
#ind=testyalt.index.values
#ind=trainyalt.index.values
ind=validyalt.index.values

In [None]:
plt.figure(figsize=(10,6))
plt.plot(ind,yscaler.inverse_transform(trainyscalt))
plt.plot(ind,yscaler.inverse_transform(trainyscaltpredres))
plt.show()

In [None]:
plt.figure(figsize=(10,6))
plt.plot(ind,yscaler.inverse_transform(validyscalt))
plt.plot(ind,yscaler.inverse_transform(validyscaltpredres))
plt.show()

In [None]:
plt.figure(figsize=(10,6))
plt.plot(ind,yscaler.inverse_transform(testyscalt))
plt.plot(ind,yscaler.inverse_transform(testyscaltpredres))
plt.show()

In [None]:
trainyscaltpredinv=yscaler.inverse_transform(trainyscaltpredres)
validyscaltpredinv=yscaler.inverse_transform(validyscaltpredres)

In [None]:
trainyscaltpredinv.shape, validyscaltpredinv.shape

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
print(mean_squared_error(trainyalt, trainyscaltpredinv))

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
print(mean_squared_error(train, y_train_pred))

## Exploratory Plots (Active Layer Thickness, Carbon Dioxide, Methane)

In [None]:
# fig,ax=plt.subplots(figsize=(10,7));
# #lns1=ax.plot(history2.history['loss'], color='dodgerblue', label='Loss, ALT (cm)');
# lns2=ax.plot(history2.history['mean_squared_error'], color='dodgerblue', linestyle='solid', label='RMSE, ALT (cm)');
# ax2=ax.twinx();
# #lns3=ax2.plot(history2.history['val_loss'], color='gold', label='Validation Loss, ALT (cm)');
# lns4=ax2.plot(history2.history['val_mean_squared_error'], color='gold', linestyle='solid', label='Validation RMSE, ALT (cm)');
          
# lns = lns2+lns4; #lns1+lns2+lns3+lns4;
# labs = [l.get_label() for l in lns];
# ax2.legend(lns, labs, loc='best', fontsize=8);

# ax.grid(linewidth=0.3);
# ax.set_xlabel('Epochs', labelpad=12, fontsize=10);
# ax.set_ylabel('Scaled Depth to Refusal (cm)', labelpad=12, fontsize=10);
# #ax2.set_ylabel('Scaled Depth to Refusal (cm)', labelpad=6, fontsize=9)
# plt.title('Number of Samples/Replicates: 95653', pad=15, fontsize=12, fontweight='ultralight');
# plt.suptitle('Cost Function and Validation Loss from Thaw Depth Modeling, GeoCryoAI Framework in Alaska [1969-2022]', fontsize=14);
# plt.grid(linewidth=0.3);
# #plt.show()
# #plt.savefig('ALTstats_CNNLSTMSAEmetrics_1969-2022_021323.png', dpi=1000)


In [None]:
# fig,ax=plt.subplots(figsize=(10,7));
# lns1=ax.plot(history.history['loss'], color='dodgerblue', label='Loss, Carbon Dioxide Mole Fraction (µmolCO2mol-1ms-1)');
# lns2=ax.plot(history.history['root_mean_squared_error'], color='dodgerblue', linestyle='dotted', label='RMSE, Carbon Dioxide Mole Fraction (µmolCO2mol-1m)');
# ax2=ax.twinx();
# lns3=ax2.plot(history.history['val_loss'], color='gold', label='Validation Loss, Active Layer Thickness (cm)');
# lns4=ax2.plot(history.history['val_root_mean_squared_error'], color='gold', linestyle='dotted', label='Validation RMSE, Carbon Dioxide Mole Fraction (µmolCO2mol-1m)');
          
# lns = lns1+lns2+lns3+lns4;
# labs = [l.get_label() for l in lns];
# ax2.legend(lns, labs, loc='best', fontsize=8);

# ax.grid(linewidth=0.3);
# ax.set_xlabel('Epochs', labelpad=6, fontsize=9);
# #ax.set_ylabel('Loss', labelpad=6, fon#tsize=9)
# #ax2.set_ylabel('Scaled Depth to Refusal (cm)', labelpad=6, fontsize=9)
# plt.title('Number of Samples/Replicates: 95653', pad=12, fontsize=8, fontweight='ultralight');
# plt.suptitle('Cost Function and Validation Loss from Carbon Dioxide Mole Fraction Modeling in LSTM-AE Framework, Alaska [1969-2022]', fontsize=11);
# plt.grid(linewidth=0.3);
# #plt.show()
# #plt.savefig('ALTstats_LSTM-AEmetrics_1969-2022-x2.png', dpi=1000)


In [None]:
# fig,ax=plt.subplots(figsize=(10,5));
# lns1=ax.plot(history.history['loss'], color='dodgerblue', label='Loss, Methane Flux (nmolCO2m-2s)');
# lns2=ax.plot(history.history['root_mean_squared_error'], color='dodgerblue', linestyle='dotted', label='RMSE, Methane Flux (nmolCO2m-2s)');
# ax2=ax.twinx();
# lns3=ax2.plot(history.history['val_loss'], color='gold', label='Validation Loss, Active Layer Thickness (cm)');
# lns4=ax2.plot(history.history['val_root_mean_squared_error'], color='gold', linestyle='dotted', label='Validation RMSE, Methane Flux (nmolCO2m-2s)');
          
# lns = lns1+lns2+lns3+lns4;
# labs = [l.get_label() for l in lns];
# ax2.legend(lns, labs, loc='best', fontsize=8);

# ax.grid(linewidth=0.3);
# ax.set_xlabel('Epochs', labelpad=6, fontsize=9);
# #ax.set_ylabel('Loss', labelpad=6, fon#tsize=9)
# #ax2.set_ylabel('Scaled Depth to Refusal (cm)', labelpad=6, fontsize=9)
# plt.title('Number of Samples/Replicates: 120539', pad=12, fontsize=8, fontweight='ultralight');
# plt.suptitle('Cost Function and Validation Loss from Methane Flux Modeling in LSTM-AE Framework, Alaska [2015-2018]', fontsize=11);
# plt.grid(linewidth=0.3);
# #plt.show()
# #plt.savefig('ALTstats_LSTM-AEmetrics_1969-2022-x2.png', dpi=1000)


In [None]:
test=alt.loc["2022"].replace(-9999,np.nan).dropna().values

In [None]:
test=np.reshape(test, (test.shape[0], test.shape[1], 1))

In [None]:
testX=test

In [None]:
testy=np.reshape(test, (test.shape[0],))

In [None]:
#X_test.shape, y_test.shape

In [None]:
testX.shape, testy.shape

In [None]:
model.evaluate(X_testsc,y_testsc)
#Not an accurate depiction due to scaling; must invert prior to quantifying error

In [None]:
# train_Xt=np.array(trainX).reshape(1772962,1)
# train_yt=np.array(trainY).reshape(1772962,)
# valid_Xt=np.array(validX).reshape(453143,1)
# valid_yt=np.array(validY).reshape(453143,)
# test_Xt=np.array(testX).reshape(215135,1)
# test_yt=np.array(testY).reshape(215135,)

In [None]:
# lin_reg = linear_model.LinearRegression()
# # train model
# lin_reg.fit(train_Xt,train_yt)
# # predict
# y_train_pred = lin_reg.predict(train_Xt)
# y_valid_pred = lin_reg.predict(valid_Xt)
# y_test_pred = lin_reg.predict(test_Xt)
# # Plot predictions
# fig=plt.figure()
# plt.scatter(y_train_pred, train_yt, c = "blue", marker = "s", label = "Training data")
# plt.scatter(y_valid_pred, valid_yt, c = "magenta", marker = "s", label = "Validation data")
# plt.scatter(y_test_pred, test_yt, c = "lightgreen", marker = "s", label = "Testing data")
# plt.legend()
# plt.show()

In [None]:
# from sklearn.metrics import mean_squared_error
# error =np.sqrt(mean_squared_error(valid_yt, y_valid_pred))
# print(error)

In [None]:
# prediction=lin_reg.predict(valid_Xt)

In [None]:
# plt.plot(valid_yt, linestyle='dotted');
# plt.plot(prediction, linestyle='dotted');

In [None]:
plt.figure()
plt.ylabel('loss'); plt.xlabel('epoch')
plt.semilogy(history.history['loss'])

In [None]:
axes=plt.axes()
#axes.plot(pd.DataFrame(history.history)['loss'], label='Loss')
axes.plot(pd.DataFrame(history.history)['val_loss'], label='Validation Loss')
axes.legend(loc=0)
axes.set_title('Model fitting performance')

In [None]:
#1,#3
axes=plt.axes()
axes.plot(pd.DataFrame(history.history)['loss'], label='Loss')
axes.plot(pd.DataFrame(history.history)['val_loss'], label='Validation Loss')
axes.legend(loc=0)
axes.set_title('Model fitting performance')

In [None]:
# Standardise the 2020-2022 rows of `alt` with statistics fitted on all of
# `alt`, then add a trailing axis so the array is (samples, 1, 1).
sc=StandardScaler()
# Only the fitted statistics are needed; the transformed copy that
# fit_transform returned was being discarded.
sc.fit(alt)
# Flatten the slice to a single column before scaling. (The raw-values
# assignment that used to precede this line was immediately overwritten.)
newtest=sc.transform(np.reshape(alt.loc["2020":"2022",:].values, (-1, 1)))
newtest=np.reshape(newtest, (newtest.shape[0],newtest.shape[1],1))

In [None]:
newtest.shape

In [None]:
preds=model.predict(newtest)

In [None]:
preds=preds.reshape(11050,1)
unspreds=sc.inverse_transform(preds)

In [None]:
plt.plot(unspreds, color = '#135485', linestyle='solid', label = "Predictions")
plt.plot(alt.loc["2020":"2022":,:].values, color = 'pink', linestyle='dotted', label = "Real Data")

In [None]:
# RMSE between the observed 2020-2022 values and the un-scaled predictions.
newtest=alt.loc["2020":"2022",:].values
# Compare the full observation vector with the prediction column.
# `newtest[0]` selected only the first row, which was then broadcast against
# every prediction. reshape(-1) mirrors the (-1, 1) flattening that was
# applied to these values before they were fed to the model.
testScore = np.sqrt(keras.losses.mean_squared_error(newtest.reshape(-1), unspreds[:,0]))
print('Test Score: %.6f RMSE' % (testScore))

In [None]:
sc=StandardScaler()
sc.fit_transform(alt)
#make predictions
trainPredict = model.predict(trainX)
validPredict = model.predict(validX)
#invert predictions
trainPredict=trainPredict.reshape(68801,1)
validPredict=validPredict.reshape(18902,1)
trainy=trainy.reshape(68801,)
validy=validy.reshape(18902,)
trainPredict = sc.inverse_transform(trainPredict)
trainy = sc.inverse_transform([trainy])
validPredict = sc.inverse_transform(validPredict)
validy = sc.inverse_transform([validy])

In [None]:
# calculate root mean squared error
trainScore = np.sqrt(keras.losses.mean_squared_error(trainy[0], trainPredict[:,0]))
print('Train Score: %.6f RMSE' % (trainScore))
validScore = np.sqrt(keras.losses.mean_squared_error(validy[0], validPredict[:,0]))
print('Valid Score: %.6f RMSE' % (validScore))

In [None]:
test=altsc.loc["2020":"2022"]

In [None]:
test.values

In [None]:
testPredict=model.predict(test.values.reshape(3363,1,1))
testPredict=testPredict.reshape(3363,1)
testy=test.values.reshape(3363,)
testPredict=sc.inverse_transform(testPredict)
testy=sc.inverse_transform([testy])
testScore = np.sqrt(keras.losses.mean_squared_error(testy[0], testPredict[:,0]))
print('Test Score: %.6f RMSE' % (testScore))

In [None]:
trainPredict.shape, validPredict.shape, testPredict.shape

In [None]:
#plt.plot(trainX.reshape(68801,1))
#plt.plot(validX.reshape(18902,1))
plt.plot(trainPredict.reshape(68801,1))
plt.plot(validPredict.reshape(18902,1))
plt.plot(testPredict.reshape(3363,1))

In [None]:
testPredict=testPredict.reshape(11050,1)

In [None]:
testScore = np.sqrt(keras.losses.mean_squared_error(testX[0], testPredict[:,0]))
print('Test Score: %.6f RMSE' % (testScore))

In [None]:
# make predictions
trainPredict = model.predict(trainX)
validPredict = model.predict(validX)
testPredict = model.predict(testX)

In [None]:
trainPredict.shape, validPredict.shape, testPredict.shape

In [None]:
# # make predictions
# #test_Xt=np.array(test_X)
# #test_yt=np.array(test_y)
# trainPredict.shape, testPredict.shape
# trainPredict=trainPredict.reshape(68801,1)
# validPredict=validPredict.reshape(18902,1)
# testPredict=testPredict.reshape(11050,1)
# trainPredict.shape, testPredict.shape, validPredict.shape

In [None]:
# # make predictions
# #test_Xt=np.array(test_X)
# #test_yt=np.array(test_y)
# trainPredict.shape, testPredict.shape
# trainPredict=trainPredict.reshape(298498,1)
# validPredict=validPredict.reshape(144987,1)
# testPredict=testPredict.reshape(103616,1)
# trainPredict.shape, testPredict.shape, validPredict.shape

In [None]:
# make predictions
#test_Xt=np.array(test_X)
#test_yt=np.array(test_y)
trainPredict=trainPredict.reshape(68801,1)
validPredict=validPredict.reshape(18902,1)
testPredict=testPredict.reshape(11050,1)
trainPredict.shape, validPredict.shape, testPredict.shape

In [None]:
# invert prediction
# NOTE(review): both scalers below are fitted on the predictions themselves,
# not on the data the model's target was scaled with, so inverse_transform
# here does not recover the original units — confirm which scaler is intended.
sc=StandardScaler()
# This result is overwritten by the next line; only the fit matters.
newTrain=sc.fit_transform(trainPredict)
newTrain=sc.inverse_transform(trainPredict)
sc=StandardScaler()
# The value assigned here is overwritten by the next line; only the fit matters.
newValid = sc.fit(validPredict)
newValid = sc.inverse_transform(validPredict)
#sc=StandardScaler().fit_transform(testPredict)
# Test predictions are passed through unchanged.
newTest = testPredict

In [None]:
newTrain.shape == trainPredict.shape, newValid.shape == validPredict.shape, newTest.shape == testPredict.shape

In [None]:
trainX.shape, trainPredict.shape, newTrain.shape

In [None]:
#plt.plot(X_train)
#plt.plot(trainPredict)
#plt.plot(newTrain)

In [None]:
# Element counts per split (rows x 91). The label and the value are separate
# print arguments; the original `"label": value` form is a syntax error.
print("Train:", 1772964*91)
print("Valid:", 614894*91)
print("Test:", 53388*91)

In [None]:
# calculate root mean squared error
# Both operands are (n, 1) column arrays, so take column 0 of each. Indexing
# the first operand with [0] selected a single row, which was then broadcast
# against every prediction.
trainScore = np.sqrt(keras.losses.mean_squared_error(newTrain[:,0], trainPredict[:,0]))
print('Train Score: %.6f RMSE' % (trainScore))
validScore = np.sqrt(keras.losses.mean_squared_error(newValid[:,0], validPredict[:,0]))
print('Valid Score: %.6f RMSE' % (validScore))
# NOTE(review): where newTest is simply testPredict (as assigned in the
# preceding "invert prediction" cell) this score is 0 by construction.
testScore = np.sqrt(keras.losses.mean_squared_error(newTest[:,0], testPredict[:,0]))
print('Test Score: %.6f RMSE' % (testScore))

# 