# Static joining

In [8]:
import pandas as pd
import numpy as np

In [9]:
# Load the eight CAMELS-IND catchment attribute tables, one CSV per attribute
# group (the group is the suffix of each file name).
# Forward slashes are used because they resolve on Windows, Linux and macOS alike;
# the previous backslash-separated raw strings only resolved on Windows and were
# inconsistent with the forward-slash paths used by the later cells.
ATTRIBUTES_DIR = "data/CAMELS_IND_Catchments_Streamflow_Sufficient/attributes_csv"

df1 = pd.read_csv(f"{ATTRIBUTES_DIR}/camels_ind_anth.csv")
df2 = pd.read_csv(f"{ATTRIBUTES_DIR}/camels_ind_clim.csv")
df3 = pd.read_csv(f"{ATTRIBUTES_DIR}/camels_ind_geol.csv")
df4 = pd.read_csv(f"{ATTRIBUTES_DIR}/camels_ind_hydro.csv")
df5 = pd.read_csv(f"{ATTRIBUTES_DIR}/camels_ind_land.csv")
df6 = pd.read_csv(f"{ATTRIBUTES_DIR}/camels_ind_name.csv")
df7 = pd.read_csv(f"{ATTRIBUTES_DIR}/camels_ind_soil.csv")
df8 = pd.read_csv(f"{ATTRIBUTES_DIR}/camels_ind_topo.csv")

In [10]:
# Fold every attribute table into one wide frame keyed on gauge_id.
# how='outer' keeps a gauge even when it is absent from some of the tables;
# the attributes it lacks are left as NaN.
df_merged = df1
for attribute_table in (df2, df3, df4, df5, df6, df7, df8):
    df_merged = df_merged.merge(attribute_table, on='gauge_id', how='outer')

In [11]:
# Sanity-check the merge: when the row count equals the number of distinct
# gauge ids, no gauge was duplicated by the joins.
merged_shape = df_merged.shape
unique_gauge_count = df_merged['gauge_id'].nunique()
column_count = len(df_merged.columns)

print(f"Merged DataFrame shape: {merged_shape}")
print(f"Number of unique gauge IDs: {unique_gauge_count}")
print(f"Number of columns: {column_count}")

Merged DataFrame shape: (242, 211)
Number of unique gauge IDs: 242
Number of columns: 211


In [12]:
# List column labels that occur more than once in the merged frame.
# Index.duplicated() flags every occurrence after the first one.
repeated_label_mask = df_merged.columns.duplicated()
duplicate_cols = df_merged.columns[repeated_label_mask].tolist()
print(f"Duplicate columns: {duplicate_cols}")

Duplicate columns: []


In [13]:
# Keep only the first occurrence of each column label; this leaves the frame
# unchanged when no label is repeated.
first_occurrence_mask = ~df_merged.columns.duplicated()
df_merged = df_merged.loc[:, first_occurrence_mask]

In [14]:
# Save the complete merged dataset.
# The dimensionality-reduction stage reads "gauge_data/static_all_gauges.csv",
# so the file is written to that exact location (it used to be written to the
# working directory, which left the next stage reading a stale or missing file).
import os  # local import so this cell runs on its own

static_output_path = "gauge_data/static_all_gauges.csv"
os.makedirs(os.path.dirname(static_output_path), exist_ok=True)
df_merged.to_csv(static_output_path, index=False)
print(f"Merged dataset saved as '{static_output_path}'")

Merged dataset saved as 'static_all_gauges.csv'


# Static Data Dimensionality Reduction

In [1]:
import pandas as pd
import numpy as np

In [7]:
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
import warnings
# Silence library warnings for the rest of the session (this is global, not
# limited to this cell).
warnings.filterwarnings('ignore')

print("=== UMAP DIMENSIONALITY REDUCTION FOR STATIC DATA ===\n")

# Load the merged static attribute table.
df_static = pd.read_csv("gauge_data/static_all_gauges.csv")
print(f"Original static dataset shape: {df_static.shape}")

# Numeric feature columns. The count is printed before the identifier is
# removed, so it includes gauge_id whenever that column is numeric.
numerical_cols = df_static.select_dtypes(include=[np.number]).columns.tolist()
print(f"Number of numerical columns: {len(numerical_cols)}")

# gauge_id identifies the row; it is not a feature.
if 'gauge_id' in numerical_cols:
    numerical_cols.remove('gauge_id')

df_numeric = df_static[numerical_cols].copy()

# Text feature columns. gauge_id is excluded here as well: previously it was
# only skipped by the encoding loop, so a text-typed gauge_id would have been
# copied un-encoded into the feature matrix and broken median()/StandardScaler.
categorical_cols = [
    col
    for col in df_static.select_dtypes(include=['object', 'string']).columns
    if col != 'gauge_id'
]
print(f"Number of categorical columns: {len(categorical_cols)}")

# Fitted encoder per categorical column, kept so the mapping can be inspected
# or reused later. Empty when there are no categorical columns.
encoders = {}

# Encode categorical variables if they exist
if categorical_cols:
    print("Encoding categorical variables...")
    df_categorical_encoded = df_static[categorical_cols].copy()

    # NOTE(review): ordinal codes impose an arbitrary order on the categories,
    # which the distance-based embedding will treat as meaningful — confirm
    # this is acceptable for these attributes.
    for col in categorical_cols:
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        df_categorical_encoded[col] = encoder.fit_transform(df_categorical_encoded[[col]])
        encoders[col] = encoder

    # Combine numerical and encoded categorical data (same row index, side by side)
    df_for_umap = pd.concat([df_numeric, df_categorical_encoded], axis=1)
else:
    df_for_umap = df_numeric.copy()

print(f"Data for UMAP shape: {df_for_umap.shape}")

# Handle missing values: each gap is replaced by its column's median.
# A column that is entirely missing has no median and would remain NaN; the
# "after cleaning" count below makes that visible.
print(f"Missing values before cleaning: {df_for_umap.isnull().sum().sum()}")
df_for_umap = df_for_umap.fillna(df_for_umap.median())
print(f"Missing values after cleaning: {df_for_umap.isnull().sum().sum()}")

# Standardize the features so every column has zero mean and unit variance
# and none dominates the distance computation because of its scale.
print("Standardizing features...")
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_for_umap)
print(f"Scaled dataset shape: {df_scaled.shape}")


=== UMAP DIMENSIONALITY REDUCTION FOR STATIC DATA ===

Original static dataset shape: (242, 211)
Number of numerical columns: 198
Number of categorical columns: 13
Encoding categorical variables...
Data for UMAP shape: (242, 210)
Missing values before cleaning: 3463
Missing values after cleaning: 0
Standardizing features...
Scaled dataset shape: (242, 210)


In [12]:
import os  # local import so this cell runs on its own

# Number of output dimensions of the embedding.
n_dim = 40

# Create UMAP with current dimension; random_state is fixed so the embedding
# is reproducible between runs.
umap_model = umap.UMAP(
    n_components=n_dim,
    random_state=42,
    n_neighbors=15,
    min_dist=0.1,
    n_epochs=1000,
    verbose=False
)

# df_scaled is the standardized feature matrix built in the previous cell.
embedding_nd = umap_model.fit_transform(df_scaled)

column_names = [f"umap_{n_dim}d_dim_{i+1}" for i in range(n_dim)]
embedding_df = pd.DataFrame(embedding_nd, columns=column_names)

# Create base dataframe with gauge_id for joining. The index is reset so the
# positional concat below lines each embedding row up with its gauge even if
# df_static ever carries a non-default index.
base_df = df_static[['gauge_id']].reset_index(drop=True)

# Add gauge_id
embedding_with_id = pd.concat([base_df, embedding_df], axis=1)

# Build the path once so the file written and the message printed cannot drift
# apart (the old message named 'static_umap_40d.csv', not the file actually
# written), and make sure the target directory exists before writing.
embedding_path = f"output/static_all_umap_{n_dim}d.csv"
os.makedirs(os.path.dirname(embedding_path), exist_ok=True)
embedding_with_id.to_csv(embedding_path, index=False)
print(f"Saved UMAP embedding to '{embedding_path}'")

Saved  UMAP embedding to 'static_umap_40d.csv'


# Static and Dynamic joining

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Attach each gauge's static UMAP embedding to every row of that gauge's
# dynamic file and write one merged CSV per gauge.
dynamic_dir = "gauge_data/dynamic"
output_dir = "output/dynamic"
os.makedirs(output_dir, exist_ok=True)

df_static = pd.read_csv("output/static_all_umap_40d.csv")

# sorted() makes the processing order reproducible; os.listdir order is arbitrary.
for fname in sorted(os.listdir(dynamic_dir)):
    if not fname.endswith(".csv"):
        continue

    # The file stem is taken to be the gauge id, possibly zero-padded
    # (presumably e.g. "03001.csv" — TODO confirm naming scheme).
    # int() already ignores leading zeros; stripping them first turned an
    # all-zero stem into "" and crashed.
    stem = fname.split(".")[0]
    try:
        gauge_id = int(stem)
    except ValueError:
        print(f"Skipping '{fname}': file name is not a numeric gauge id")
        continue

    static_match = df_static.loc[df_static['gauge_id'] == gauge_id]
    if static_match.empty:
        # Previously this surfaced as an opaque "KeyError: 0".
        print(f"Skipping '{fname}': gauge_id {gauge_id} has no static embedding")
        continue

    df_dyn = pd.read_csv(os.path.join(dynamic_dir, fname))
    if df_dyn.empty:
        print(f"Skipping '{fname}': no dynamic rows")
        continue

    # 'date' is not wanted in the output; 'index' and 'gauge_id' are bookkeeping
    # columns that must not end up in the merged file either.
    df_dyn = df_dyn.drop(columns=['date', 'index', 'gauge_id'], errors='ignore')

    # Static values of the first matching row, without the identifier.
    static_values = static_match.drop(columns=['gauge_id', 'index'], errors='ignore').iloc[0]

    # Broadcast the single static row over the dynamic index. The DataFrame
    # constructor repeats scalar values for every index entry, which avoids
    # materialising one Series object per dynamic row.
    static_block = pd.DataFrame(static_values.to_dict(), index=df_dyn.index)
    merged_df = pd.concat([df_dyn, static_block], axis=1)

    # Ordinal-encode any remaining text columns.
    # NOTE(review): the encoder is fitted per file, so the same category may
    # receive different codes in different gauges' outputs — confirm this is
    # acceptable downstream.
    cat = merged_df.select_dtypes(include=['object', 'string']).columns
    for column in cat:
        merged_df[column] = OrdinalEncoder().fit_transform(merged_df[[column]])

    merged_df.to_csv(os.path.join(output_dir, f"merged_df_{gauge_id}.csv"), index=False)
