In [None]:
# !pip install uv
# !uv pip install  -r requirements.txt

In [1]:
# import snowflake
# from snowflake.snowpark.context import get_active_session
#
# session = get_active_session()

In [2]:
# import warnings
# from benchmark_model_notebook_snowflake import landsat_val_features
#
# warnings.filterwarnings("ignore")

In [None]:
# # Stage or location to check
# stage_location = """snow://workspace/USER$.PUBLIC."EY-AI-and-Data-Challenge"/versions/live/terraclimate_training_soil_data/"""
#
# # Run LIST command to get files in the stage
# df_files = session.sql(f"LIST '{stage_location}'")
#
# # Count the number of files
# file_count = df_files.count()
#
# df_files.show()
#
# print(f"Number of files in {stage_location}: {file_count}")

In [None]:
# from pathlib import Path
# import zipfile
#
# folder_name = "terraclimate_training_vpd_data"
#
# source_dir = Path(f"./{folder_name}/")      # folder containing 60 CSVs
# zip_path   = Path(f"/tmp/{folder_name}.zip")
#
# csv_files = sorted(source_dir.glob("*.csv"))  # or "**/*.csv" for recursive
#
# if not csv_files:
#     raise FileNotFoundError(f"No CSV files found in {source_dir}")
#
# with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
#     for file in csv_files:
#         zf.write(file, arcname=file.name)
#
# print(f"Created: {zip_path} ({len(csv_files)} files)")
#
# session.sql(f"""
#     PUT file:///tmp/{folder_name}.zip
#     'snow://workspace/USER$.PUBLIC."EY-AI-and-Data-Challenge"/versions/live/'
#     AUTO_COMPRESS=FALSE
#     OVERWRITE=TRUE
# """).collect()
#
# print("File saved! Refresh the browser to see the files in the sidebar")

In [1]:
from pathlib import Path
import pandas as pd
from attr import dataclass

In [2]:
# --- User-entered parameters: edit these before running the notebook ---

# When True, the raw band columns listed in `bands_of_interest` are kept
# in addition to the derived index columns.
INCLUDE_RAW_BANDS: bool = True

# Directory (relative to the notebook) that holds the input CSV files.
DATA_DIR: str = "data"

# Sub-directories of DATA_DIR whose "*.csv" feature files are merged in.
SOURCE_DATA_DIRS: list = ["landsat"]

In [57]:
# Feature names contributed by each author, spelled as in their source CSV headers
# (they are lower-cased below before being matched against lower-cased columns).
landsat_variables_mvdb = [
    'EVI', 'OSAVI', 'GNDVI', 'GCVI', 'MSI', 'NBR', 'Green/Red Ratio', 'NDGI',
    'UI (Urban Index)', 'NBR2', 'Red/NIR Ratio', 'Green/NIR Ratio', 'NDWI',
]
# NOTE(review): 'NVDI' may be a typo for 'NDVI' -- confirm against the column
# names in the source CSV before changing it, since matching is by exact name.
landsat_variables_ross = ['NVDI', 'SAVI', 'BSI', 'NDBI', 'TCWI']

# Raw band columns; already lower-case, so they are appended without conversion.
# NOTE(review): 'lwir' is listed twice; harmless for membership tests, but
# possibly one entry was meant to be a different band -- confirm.
bands_of_interest = [
    'qa', 'red', 'blue', 'drad', 'emis', 'emsd', 'lwir', 'trad', 'urad', 'atran',
    'cdist', 'green', 'nir08', 'lwir', 'swir16', 'swir22', 'cloud_qa', 'qa_pixel',
    'qa_radsat', 'atmos_opacity',
]

# Full list of wanted feature columns: baseline indices first, then each
# author's additions, all lower-cased.
landsat_variables = [
    name.lower()
    for group in (['NDMI', 'MNDWI'], landsat_variables_mvdb, landsat_variables_ross)
    for name in group
]

if INCLUDE_RAW_BANDS:
    landsat_variables += bands_of_interest

In [76]:
def make_dataframe_cols_lowercase(df):
    """Lower-case every column label of ``df`` in place.

    Args:
        df: DataFrame whose column labels are strings. It is modified in
            place; nothing is returned.

    Raises:
        AttributeError: if a column label has no ``lower`` method
            (for example an integer label).
    """
    # A single rename with a callable mapper replaces the previous
    # one-rename-per-column loop, which rebuilt the column index N times.
    df.rename(columns=lambda col: col.lower(), inplace=True)

def add_formatted_join_column(df, join_col_name, drop_cols=None):
    """Add a string key column built from latitude, longitude and sample date.

    The frame is modified in place and nothing is returned:
      * 'sample date' is parsed with ``format='mixed'`` and rewritten as a
        'dd-mm-YYYY' string,
      * 'latitude' and 'longitude' are rounded to 6 decimal places,
      * ``join_col_name`` is set to "<latitude>~<longitude>~<sample date>".

    Args:
        df: DataFrame with 'latitude', 'longitude' and 'sample date' columns.
        join_col_name: name of the key column to create (or overwrite).
        drop_cols: optional list of columns to drop once the key is built.
    """
    parsed_dates = pd.to_datetime(df['sample date'], format='mixed')
    df['sample date'] = parsed_dates.dt.strftime('%d-%m-%Y')

    # Round coordinates so both sides of a later join agree on the key text.
    for coord in ('latitude', 'longitude'):
        df[coord] = df[coord].round(6)

    lat_text = df['latitude'].astype(str)
    lon_text = df['longitude'].astype(str)
    df[join_col_name] = lat_text + "~" + lon_text + "~" + df['sample date']

    if not drop_cols:
        return
    df.drop(columns=drop_cols, inplace=True)


In [97]:
# Results per source directory: name -> (training_df, submission_df).
training_dfs = {}
# NOTE(review): never populated in this cell; kept because other cells may reference it.
submission_dfs = {}

# Columns that identify a sample. Each feature file carries them, and they are
# collapsed into a single "join_column" key before merging.
join_columns = ['latitude', 'longitude', 'sample date']

for current_dir in SOURCE_DATA_DIRS:
    source_dir = Path(f"./{DATA_DIR}/{current_dir}/")
    print(f"Processing data in: {source_dir}...")

    csv_files = sorted(source_dir.glob("*.csv"))
    training_df = pd.read_csv(f'./{DATA_DIR}/water_quality_training_dataset.csv')
    submission_df = pd.read_csv(f'./{DATA_DIR}/submission_template.csv')
    training_df_row_num = training_df.shape[0]
    submission_df_row_num = submission_df.shape[0]

    # Ensure columns are all lowercase
    make_dataframe_cols_lowercase(training_df)
    make_dataframe_cols_lowercase(submission_df)

    add_formatted_join_column(training_df, "join_column")
    add_formatted_join_column(submission_df, "join_column")

    for csv_path in csv_files:
        print(f"\tProcessing {csv_path}...")
        data_df = pd.read_csv(csv_path)
        make_dataframe_cols_lowercase(data_df)

        # Classify on the file name only, so a parent directory that happens to
        # contain "training" or "validation" cannot misroute every file.
        file_name = csv_path.name
        if 'training' in file_name:
            is_training_file = True
        elif 'validation' in file_name:
            is_training_file = False
        else:
            print(f"\t\tSkipping {csv_path}: name contains neither 'training' nor 'validation'")
            continue

        target_df = training_df if is_training_file else submission_df

        # Only keep wanted feature columns that the target frame does not have yet
        # (avoids duplicated/suffixed columns after the merge), plus the key columns.
        keep_columns = sorted(
            col for col in data_df.columns
            if col in landsat_variables and col not in target_df.columns
        )
        # .copy() so the helper's in-place edits act on an independent frame
        # rather than on a slice of the frame that was just read.
        data_df = data_df[keep_columns + join_columns].copy()
        add_formatted_join_column(data_df, "join_column", drop_cols=join_columns)

        merged_df = target_df.merge(data_df, on="join_column", how='inner')

        # Check after every file so a key mismatch (rows lost) or duplicate keys
        # (rows multiplied) is reported together with the file that caused it.
        assert merged_df.shape[0] == target_df.shape[0], (
            f"Row count changed from {target_df.shape[0]} to {merged_df.shape[0]} "
            f"after merging {csv_path}!"
        )

        if is_training_file:
            training_df = merged_df
        else:
            submission_df = merged_df

    training_df.drop(columns=['join_column'], inplace=True)
    submission_df.drop(columns=['join_column'], inplace=True)

    assert training_df_row_num == training_df.shape[0], f"{training_df_row_num - training_df.shape[0]} rows dropped from training_df!"
    assert submission_df_row_num == submission_df.shape[0], f"{submission_df_row_num - submission_df.shape[0]} rows dropped from submission_df!"

    training_dfs[current_dir] = (training_df, submission_df)


Processing data in: data\landsat...
	Processing data\landsat\landsat_features_training_all_bands.csv...
	Processing data\landsat\landsat_features_training_baseline.csv...
	Processing data\landsat\landsat_features_training_mvdb.csv...
	Processing data\landsat\landsat_features_training_ross.csv...
	Processing data\landsat\landsat_features_validation_baseline.csv...
	Processing data\landsat\landsat_features_validation_ross.csv...


In [98]:
# Each entry of training_dfs is a (training frame, submission/validation frame) pair.
training_df, validation_df = training_dfs['landsat'][0], training_dfs['landsat'][1]

In [99]:
# Landsat Training data
# Written next to the inputs: the path follows DATA_DIR instead of a hard-coded
# "./data", so changing the configured data directory does not split inputs and outputs.
training_df.to_csv(Path(DATA_DIR) / "landsat_features_training_combined.csv", index=False)

In [100]:
# Landsat Validation data
# Written next to the inputs: the path follows DATA_DIR instead of a hard-coded
# "./data", so changing the configured data directory does not split inputs and outputs.
validation_df.to_csv(Path(DATA_DIR) / "landsat_features_validation_combined.csv", index=False)