In [None]:
import zipfile
import os

with zipfile.ZipFile("GLOFdata.zip", 'r') as zip_ref:
    zip_ref.extractall("data")

In [None]:
import pandas as pd
mydf = pd.read_csv("C:\Users\aadee\OneDrive\Desktop\GLOF Final\prepared_dataset.csv", usecols=["lat", "lon", "mid_date", "v [m/yr]"])

In [35]:
import pandas as pd

# First ensure mid_date is a datetime type (if it isn't already)
mydf['mid_date'] = pd.to_datetime(mydf['mid_date'])

# Filter for dates between 2000-01-01 and 2021-12-31
filtered_df = mydf[(mydf['mid_date'] >= '2022-04-01') & (mydf['mid_date'] <= '2022-8-31')]

# Display the filtered data
print(f"Original data points: {len(mydf)}")
print(f"Filtered data points: {len(filtered_df)}")
filtered_df.head()

Original data points: 73590934
Filtered data points: 4899187


Unnamed: 0,mid_date,lon,lat,v [m/yr]
15,2022-06-29 05:47:01.220321024,74.353,36.25,4.0
27,2022-07-04 05:46:39.220404992,74.353,36.25,6.0
67,2022-07-20 13:05:53.109451008,74.353,36.25,44.0
68,2022-07-31 17:48:45.220420096,74.353,36.25,2.0
87,2022-05-20 05:48:59.220124928,74.353,36.25,2.0


In [36]:
df = filtered_df
df[(df["v [m/yr]"] == "2021-11-15")]

Unnamed: 0,mid_date,lon,lat,v [m/yr]


In [6]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt
import logging

# Set up logging to track training metrics
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# Check device availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f"Using device: {device}")

In [38]:
# # Cell # 2
# def load_and_preprocess_data(df):
#     """
#     Load and preprocess the dataset for a Transformer model.

#     Parameters:
#     - file_path (str): Path to the CSV file containing the dataset.

#     Returns:
#     - data (np.array): Preprocessed data with scaled features and target.
#     - scaler_X (StandardScaler): Scaler for the features.
#     - scaler_y (StandardScaler): Scaler for the target.
#     """
#     # Load the dataset from CSV
#     df = df

#     # Drop rows with missing values to ensure data quality
#     df = df.dropna()

#     # Convert mid_date to datetime
#     df['mid_date'] = pd.to_datetime(df['mid_date'])

#     # Convert datetime to an ordinal timestamp including time of day
#     # Ordinal date + fractional day (hours, minutes, seconds as a fraction of 86400 seconds)
#     df['ordinal'] = df['mid_date'].apply(
#         lambda x: x.toordinal() + (x.hour * 3600 + x.minute * 60 + x.second) / 86400.0
#     )

#     # Define features and target
#     features = ['ordinal', 'lat', 'lon']
#     target = 'v [m/yr]'

#     # Extract feature and target arrays
#     X = df[features].values
#     y = df[target].values

#     # Scale features
#     scaler_X = StandardScaler()
#     X_scaled = scaler_X.fit_transform(X)
#     print(X_scaled[:10])


#     # Scale target
#     scaler_y = StandardScaler()
#     y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))
#     print(y_scaled[:10])
#     # Combine scaled features and target into a single array
#     data = np.hstack((X_scaled, y_scaled))

#     print(f"Preprocessed data shape: {data.shape}")
#     print(data[:2])
#     return data, scaler_X, scaler_y

In [50]:
import pandas as pd
import numpy as np

def investigate_and_summarize_velocity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Processes a DataFrame to create a daily summary for EACH unique lat/lon pair,
    calculating the max and average velocity. Includes detailed investigation steps.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with 'mid_date', 'lon', 'lat', 'v [m/yr]'.

    Returns:
    - pd.DataFrame: Summary DataFrame.
    """
    # Input validation
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    required_cols = ['mid_date', 'lon', 'lat', 'v [m/yr]']
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"Input DataFrame missing required columns: {required_cols}")

    # Data Preparation
    proc_df = df.copy()
    proc_df = proc_df.dropna(subset=required_cols)
    try:
        proc_df['mid_date'] = pd.to_datetime(proc_df['mid_date'])
    except Exception as e:
        raise ValueError(f"Error converting 'mid_date' to datetime: {e}")

    proc_df = proc_df.sort_values(by=['lon', 'lat', 'mid_date'])
    proc_df['year'] = proc_df['mid_date'].dt.year
    proc_df['month'] = proc_df['mid_date'].dt.month
    proc_df['day'] = proc_df['mid_date'].dt.day

    # --- Investigation Step 1: Check Data Type ---
    print(f"\n--- Data type of 'v [m/yr]' before grouping: {proc_df['v [m/yr]'].dtype} ---")
    # Ensure it's numeric, attempt conversion if not (and if sensible)
    if not pd.api.types.is_numeric_dtype(proc_df['v [m/yr]']):
         print("Warning: 'v [m/yr]' is not a numeric type. Attempting conversion.")
         try:
             proc_df['v [m/yr]'] = pd.to_numeric(proc_df['v [m/yr]'])
             print(f"--- Data type after conversion: {proc_df['v [m/yr]'].dtype} ---")
         except Exception as e:
             raise ValueError(f"Could not convert 'v [m/yr]' to numeric: {e}")


    # --- Investigation Step 2: Inspect Raw Data for a Specific Group ---
    # Using the group from row 2 of your previous output head
    inspect_lon, inspect_lat, inspect_year, inspect_month, inspect_day = 74.353, 36.182, 1990, 4, 4
    print(f"\n--- Inspecting raw data for group (lon={inspect_lon}, lat={inspect_lat}, {inspect_year}-{inspect_month:02d}-{inspect_day:02d}) ---")
    specific_group_filter = (
        (proc_df['lon'] == inspect_lon) &
        (proc_df['lat'] == inspect_lat) &
        (proc_df['year'] == inspect_year) &
        (proc_df['month'] == inspect_month) &
        (proc_df['day'] == inspect_day)
    )
    specific_group_velocities = proc_df.loc[specific_group_filter, 'v [m/yr]']

    if not specific_group_velocities.empty:
        print(f"Number of raw velocity values in this group: {len(specific_group_velocities)}")
        print(f"Raw velocity values:\n{specific_group_velocities.tolist()}") # Show the actual values
        print(f"Manual Max calculation: {specific_group_velocities.max()}")
        print(f"Manual Mean calculation: {specific_group_velocities.mean()}")
    else:
        print("No raw data found for this specific group (check coordinates/date).")
    print("--------------------------------------------------------------------------")


    # --- Aggregation (same as before) ---
    print("\n--- Performing Aggregation ---")
    loc_daily_summary = proc_df.groupby(['lon', 'lat', 'year', 'month', 'day']).agg(
        max_velocity=('v [m/yr]', 'max'),
        avg_velocity=('v [m/yr]', 'mean')
    ).reset_index()
    print("Aggregation complete.")


    # --- Investigation Step 3: Check Final Result for Differences ---
    print("\n--- Checking summary rows where max_velocity != avg_velocity ---")
    # Use numpy.isclose() for safer float comparison, check if NOT close
    diff_rows = loc_daily_summary[~np.isclose(loc_daily_summary['max_velocity'], loc_daily_summary['avg_velocity'])]
    num_diff_rows = len(diff_rows)
    print(f"Number of summary rows where max_velocity significantly differs from avg_velocity: {num_diff_rows}")

    if num_diff_rows > 0:
        print(f"Percentage of rows with differing max/avg: {100 * num_diff_rows / len(loc_daily_summary):.2f}%")
        print("Example rows where they differ:")
        print(diff_rows.head()) # Show examples where they DO differ
    else:
        print("Still no rows found where max_velocity differs significantly from avg_velocity.")
    print("---------------------------------------------------------------")


    # --- Optional: Cyclical Feature Encoding (same as before) ---
    # ... (code remains the same) ...
    print("\n--- Adding Cyclical Features ---")
    loc_daily_summary['month_sin'] = np.sin(2 * np.pi * loc_daily_summary['month'] / 12)
    loc_daily_summary['month_cos'] = np.cos(2 * np.pi * loc_daily_summary['month'] / 12)
    try:
        temp_date_for_daysinmonth = pd.to_datetime(loc_daily_summary[['year', 'month', 'day']])
        days_in_month = temp_date_for_daysinmonth.dt.days_in_month
        loc_daily_summary['day_sin'] = np.sin(2 * np.pi * loc_daily_summary['day'] / days_in_month)
        loc_daily_summary['day_cos'] = np.cos(2 * np.pi * loc_daily_summary['day'] / days_in_month)
    except Exception as e:
        print(f"Warning: Could not perform daily cyclical encoding. Error: {e}")
        loc_daily_summary['day_sin'] = np.nan
        loc_daily_summary['day_cos'] = np.nan


    # --- Final Touches (same as before) ---
    # ... (code remains the same) ...
    final_columns_order = [
        'lon', 'lat', 'year', 'month', 'day',
        'max_velocity', 'avg_velocity',
        'month_sin', 'month_cos', 'day_sin', 'day_cos'
    ]
    final_columns_order = [col for col in final_columns_order if col in loc_daily_summary.columns]
    loc_daily_summary = loc_daily_summary[final_columns_order]
    print("--- Function Complete ---")
    return loc_daily_summary

# Example Usage:
# Run this function with your raw DataFrame
summary_df = investigate_and_summarize_velocity(mydf)
print("\n--- Final Resulting Summary DataFrame (Head) ---")
print(summary_df.head())
print(f"\nShape of the final DataFrame: {summary_df.shape}")


--- Data type of 'v [m/yr]' before grouping: float64 ---

--- Inspecting raw data for group (lon=74.353, lat=36.182, 1990-04-04) ---
Number of raw velocity values in this group: 1
Raw velocity values:
[53.0]
Manual Max calculation: 53.0
Manual Mean calculation: 53.0
--------------------------------------------------------------------------

--- Performing Aggregation ---
Aggregation complete.

--- Checking summary rows where max_velocity != avg_velocity ---
Number of summary rows where max_velocity significantly differs from avg_velocity: 9187342
Percentage of rows with differing max/avg: 76.79%
Example rows where they differ:
       lon     lat  year  month  day  max_velocity  avg_velocity
14  74.353  36.182  1991      2    6          53.0          39.5
23  74.353  36.182  1991      7    8           8.0           6.5
24  74.353  36.182  1991      7   16           8.0           5.5
25  74.353  36.182  1991      7   24          16.0          12.5
33  74.353  36.182  1991     10   28   

# **OLD CODE**

In [55]:
# mydf1 = summary_df[["lon", 'lat', 'year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'max_velocity', 'avg_velocity']]
# mydf1

Unnamed: 0,lon,lat,year,month_sin,month_cos,day_sin,day_cos,max_velocity,avg_velocity
0,74.353,36.182,1989,5.000000e-01,-0.866025,0.101168,-0.994869,1.0,1.0
1,74.353,36.182,1989,-5.000000e-01,-0.866025,0.394356,0.918958,6.0,6.0
2,74.353,36.182,1990,8.660254e-01,-0.500000,0.743145,0.669131,53.0,53.0
3,74.353,36.182,1990,1.224647e-16,-1.000000,0.743145,-0.669131,6.0,6.0
4,74.353,36.182,1990,1.224647e-16,-1.000000,-0.866025,-0.500000,14.0,14.0
...,...,...,...,...,...,...,...,...,...
11964151,74.540,36.323,2024,-8.660254e-01,0.500000,0.571268,0.820763,20.0,13.5
11964152,74.540,36.323,2024,-8.660254e-01,0.500000,0.988468,0.151428,43.0,25.0
11964153,74.540,36.323,2024,-8.660254e-01,0.500000,0.790776,-0.612106,9.0,9.0
11964154,74.540,36.323,2024,-8.660254e-01,0.500000,0.101168,-0.994869,12.0,12.0


In [56]:
# Cell 3: Define the TimeSeriesDataset class
class TimeSeriesDataset(Dataset):
    """Custom Dataset for time series data."""
    def __init__(self, data, seq_length):
        self.data = data.astype(np.float32)
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        """Return sequence of features and target."""
        X = self.data[idx:idx + self.seq_length, :-1]  # All features except target
        y = self.data[idx + self.seq_length - 1, -1]   # Last velocity value in sequence
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


In [57]:
# Cell 4: Define the Transformer model
class TransformerRegressor(nn.Module):
    """Transformer model for velocity prediction."""
    def __init__(self, input_dim, d_model, nhead, num_layers, dropout=0.1):
        super(TransformerRegressor, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.pos_encoder = nn.Parameter(torch.zeros(1, 1000, d_model))  # Positional encoding
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, 1)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Initialize model weights."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x):
        """Forward pass through the model."""
        batch_size, seq_len, _ = x.size()
        x = self.embedding(x)  # [batch_size, seq_len, d_model]
        x = x + self.pos_encoder[:, :seq_len, :]  # Add positional encoding
        x = self.transformer_encoder(x)  # [batch_size, seq_len, d_model]
        x = self.fc(x[:, -1, :])  # Predict from last time step
        return x

In [41]:
# Cell 5: Training function with metric tracking
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt

def count_trainable_parameters(model):
    """
    Count and return the number of trainable parameters in the model.
    
    Parameters:
    - model (nn.Module): The PyTorch model.
    
    Returns:
    - int: Number of trainable parameters.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, scaler, num_epochs):
    """Train the model and track metrics."""
    # Print the number of trainable parameters before starting training
    trainable_params = count_trainable_parameters(model)
    print(f"Number of trainable parameters: {trainable_params}")
    
    # Lists to store metrics for plotting later
    train_losses, val_losses, val_maes = [], [], []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        pbar_train = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for X, y in pbar_train:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            # Updated autocast syntax to kill that warning
            with torch.amp.autocast('cuda'):
                output = model(X)
                loss = criterion(output.squeeze(), y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_loss += loss.item()
            # Show the batch loss in the progress bar
            pbar_train.set_postfix({'loss': loss.item()})

        # Average train loss for the epoch
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        # Validation phase
        model.eval()
        val_loss, val_mae = 0, 0
        pbar_val = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
        with torch.no_grad():
            for X, y in pbar_val:
                X, y = X.to(device), y.to(device)
                with torch.amp.autocast('cuda'):
                    output = model(X)
                loss = criterion(output.squeeze(), y)
                val_loss += loss.item()
                val_mae += torch.mean(torch.abs(output.squeeze() - y)).item()
                # Show batch loss and MAE in the progress bar
                pbar_val.set_postfix({'loss': loss.item(), 'MAE': val_mae / len(val_loader)})

        # Average val loss and MAE for the epoch
        val_loss /= len(val_loader)
        val_mae /= len(val_loader)
        val_losses.append(val_loss)
        val_maes.append(val_mae)

        # Print all the metrics after each epoch
        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val MAE: {val_mae:.4f}")

        # Step the scheduler based on val loss
        scheduler.step(val_loss)

    # Return the metrics for plotting or analysis
    return train_losses, val_losses, val_maes

In [58]:
torch.cuda.is_available()

True

In [None]:
# Cell 6: Main execution
if __name__ == "__main__":
    # Hyperparameters (adjustable for experimentation)
    SEQ_LENGTH = 32       # Sequence length
    BATCH_SIZE = 1024         # Batch size
    NUM_EPOCHS = 10          # Number of epochs
    LEARNING_RATE = 0.001    # Learning rate
    D_MODEL = 256            # Model dimension
    NHEAD = 8                # Number of attention heads
    NUM_LAYERS = 4           # Number of transformer layers
    DROPOUT = 0.2           # Dropout rate

    # Load and preprocess data
    # file_path = '/kaggle/input/df-raw-fydp/df_raw.csv'
    # file_path = '/home/ubuntu/data/df_raw.csv'

    data, scaler_X, scaler_y = load_and_preprocess_data(mydf1)

    # Split data into train, validation, and test sets
    train_size = int(0.7 * len(data))
    val_size = int(0.2 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:train_size + val_size]
    test_data = data

    # Create datasets and dataloaders
    train_dataset = TimeSeriesDataset(train_data, SEQ_LENGTH)
    val_dataset = TimeSeriesDataset(val_data, SEQ_LENGTH)
    test_dataset = TimeSeriesDataset(test_data, SEQ_LENGTH)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    # Initialize model, loss, optimizer, and scheduler
    model = TransformerRegressor(
        input_dim=3, d_model=D_MODEL, nhead=NHEAD, num_layers=NUM_LAYERS, dropout=DROPOUT
    ).to(device)
    criterion = nn.L1Loss()
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)
    scaler = GradScaler()


In [None]:
# Train the model
if(True):
    train_losses, val_losses, val_maes = train_model(
        model, train_loader, val_loader, criterion, optimizer, scheduler, scaler, NUM_EPOCHS
    )

Number of trainable parameters: 2500737


Epoch 1/10 [Train]: 100%|██████████| 30124/30124 [1:09:01<00:00,  7.27it/s, loss=0.941]
Epoch 1/10 [Val]: 100%|██████████| 8607/8607 [05:56<00:00, 24.11it/s, loss=0.292, MAE=0.417] 


Epoch 1/10 - Train Loss: 1.0262, Val Loss: 0.8748, Val MAE: 0.4167


Epoch 2/10 [Train]:  51%|█████▏    | 15497/30124 [35:41<33:35,  7.26it/s, loss=0.694] 

# *NEW CODE***

In [61]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.cuda.amp import GradScaler
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Assuming 'device' is already defined (e.g., device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def investigate_and_summarize_velocity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Processes a DataFrame to create a daily summary for EACH unique lat/lon pair,
    calculating the max and average velocity. Includes detailed investigation steps.
    """
    # ... (rest of the investigate_and_summarize_velocity function remains the same) ...
    proc_df = df.copy()
    proc_df = proc_df.dropna(subset=['mid_date', 'lon', 'lat', 'v [m/yr]'])
    try:
        proc_df['mid_date'] = pd.to_datetime(proc_df['mid_date'])
    except Exception as e:
        raise ValueError(f"Error converting 'mid_date' to datetime: {e}")

    proc_df = proc_df.sort_values(by=['lon', 'lat', 'mid_date'])
    proc_df['year'] = proc_df['mid_date'].dt.year
    proc_df['month'] = proc_df['mid_date'].dt.month
    proc_df['day'] = proc_df['mid_date'].dt.day

    if not pd.api.types.is_numeric_dtype(proc_df['v [m/yr]']):
        print("Warning: 'v [m/yr]' is not a numeric type. Attempting conversion.")
        try:
            proc_df['v [m/yr]'] = pd.to_numeric(proc_df['v [m/yr]'])
            print(f"--- Data type after conversion: {proc_df['v [m/yr]'].dtype} ---")
        except Exception as e:
            raise ValueError(f"Could not convert 'v [m/yr]' to numeric: {e}")

    loc_daily_summary = proc_df.groupby(['lon', 'lat', 'year', 'month', 'day']).agg(
        max_velocity=('v [m/yr]', 'max'),
        avg_velocity=('v [m/yr]', 'mean')
    ).reset_index()

    loc_daily_summary['month_sin'] = np.sin(2 * np.pi * loc_daily_summary['month'] / 12)
    loc_daily_summary['month_cos'] = np.cos(2 * np.pi * loc_daily_summary['month'] / 12)
    try:
        temp_date_for_daysinmonth = pd.to_datetime(loc_daily_summary[['year', 'month', 'day']])
        days_in_month = temp_date_for_daysinmonth.dt.days_in_month
        loc_daily_summary['day_sin'] = np.sin(2 * np.pi * loc_daily_summary['day'] / days_in_month)
        loc_daily_summary['day_cos'] = np.cos(2 * np.pi * loc_daily_summary['day'] / days_in_month)
    except Exception as e:
        print(f"Warning: Could not perform daily cyclical encoding. Error: {e}")
        loc_daily_summary['day_sin'] = np.nan
        loc_daily_summary['day_cos'] = np.nan

    final_columns_order = [
        'lon', 'lat', 'year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'max_velocity'
    ]
    final_columns_order = [col for col in final_columns_order if col in loc_daily_summary.columns]
    loc_daily_summary = loc_daily_summary[final_columns_order]
    return loc_daily_summary

class TimeSeriesDataset(Dataset):
    """Custom Dataset for time series data."""
    def __init__(self, data, seq_length):
        self.data = data.astype(np.float32)
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        """Return sequence of features and target."""
        X = self.data[idx:idx + self.seq_length, :-1]  # All features except target
        y = self.data[idx + self.seq_length - 1, -1]   # Last max_velocity value in sequence
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

class TransformerRegressor(nn.Module):
    """Transformer model for velocity prediction."""
    def __init__(self, input_dim, d_model, nhead, num_layers, dropout=0.1):
        super(TransformerRegressor, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.pos_encoder = nn.Parameter(torch.zeros(1, 1000, d_model))  # Positional encoding
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, 1)

        self._init_weights()

    def _init_weights(self):
        """Initialize model weights."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x):
        """Forward pass through the model."""
        batch_size, seq_len, _ = x.size()
        x = self.embedding(x)  # [batch_size, seq_len, d_model]
        x = x + self.pos_encoder[:, :seq_len, :]  # Add positional encoding
        x = self.transformer_encoder(x)  # [batch_size, seq_len, d_model]
        x = self.fc(x[:, -1, :])  # Predict from last time step
        return x

def count_trainable_parameters(model):
    """Count and return the number of trainable parameters in the model."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, scaler, num_epochs):
    """Train the model and track metrics."""
    trainable_params = count_trainable_parameters(model)
    print(f"Number of trainable parameters: {trainable_params}")

    train_losses, val_losses, val_maes = [], [], []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        pbar_train = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for X, y in pbar_train:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'):
                output = model(X)
                loss = criterion(output.squeeze(), y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_loss += loss.item()
            pbar_train.set_postfix({'loss': loss.item()})

        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        model.eval()
        val_loss, val_mae = 0, 0
        pbar_val = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
        with torch.no_grad():
            for X, y in pbar_val:
                X, y = X.to(device), y.to(device)
                with torch.amp.autocast('cuda'):
                    output = model(X)
                loss = criterion(output.squeeze(), y)
                val_loss += loss.item()
                val_mae += torch.mean(torch.abs(output.squeeze() - y)).item()
                pbar_val.set_postfix({'loss': loss.item(), 'MAE': val_mae / len(val_loader)})

        val_loss /= len(val_loader)
        val_mae /= len(val_loader)
        val_losses.append(val_loss)
        val_maes.append(val_mae)

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val MAE: {val_mae:.4f}")

        scheduler.step(val_loss)

    return train_losses, val_losses, val_maes

# Example Usage (replace 'mydf' with your DataFrame):
# mydf = pd.read_csv("your_data.csv") #replace with your data loading.
summary_df = investigate_and_summarize_velocity(mydf)

data = summary_df[[ 'lon', 'lat', 'year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'max_velocity']].values


In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.cuda.amp import GradScaler

# Assuming 'device' is already defined (e.g., device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def investigate_and_summarize_velocity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Processes a DataFrame to create a daily summary for EACH unique lat/lon pair,
    calculating the max and average velocity. Includes detailed investigation steps.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    required_cols = ['mid_date', 'lon', 'lat', 'v [m/yr]']
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"Input DataFrame missing required columns: {required_cols}")

    proc_df = df.copy()
    proc_df = proc_df.dropna(subset=required_cols)
    try:
        proc_df['mid_date'] = pd.to_datetime(proc_df['mid_date'])
    except Exception as e:
        raise ValueError(f"Error converting 'mid_date' to datetime: {e}")

    proc_df = proc_df.sort_values(by=['lon', 'lat', 'mid_date'])
    proc_df['year'] = proc_df['mid_date'].dt.year
    proc_df['month'] = proc_df['mid_date'].dt.month
    proc_df['day'] = proc_df['mid_date'].dt.day

    print(f"\n--- Data type of 'v [m/yr]' before grouping: {proc_df['v [m/yr]'].dtype} ---")
    if not pd.api.types.is_numeric_dtype(proc_df['v [m/yr]']):
        print("Warning: 'v [m/yr]' is not a numeric type. Attempting conversion.")
        try:
            proc_df['v [m/yr]'] = pd.to_numeric(proc_df['v [m/yr]'])
            print(f"--- Data type after conversion: {proc_df['v [m/yr]'].dtype} ---")
        except Exception as e:
            raise ValueError(f"Could not convert 'v [m/yr]' to numeric: {e}")

    inspect_lon, inspect_lat, inspect_year, inspect_month, inspect_day = 74.353, 36.182, 1990, 4, 4
    print(f"\n--- Inspecting raw data for group (lon={inspect_lon}, lat={inspect_lat}, {inspect_year}-{inspect_month:02d}-{inspect_day:02d}) ---")
    specific_group_filter = (
        (proc_df['lon'] == inspect_lon) &
        (proc_df['lat'] == inspect_lat) &
        (proc_df['year'] == inspect_year) &
        (proc_df['month'] == inspect_month) &
        (proc_df['day'] == inspect_day)
    )
    specific_group_velocities = proc_df.loc[specific_group_filter, 'v [m/yr]']

    if not specific_group_velocities.empty:
        print(f"Number of raw velocity values in this group: {len(specific_group_velocities)}")
        print(f"Raw velocity values:\n{specific_group_velocities.tolist()}")
        print(f"Manual Max calculation: {specific_group_velocities.max()}")
        print(f"Manual Mean calculation: {specific_group_velocities.mean()}")
    else:
        print("No raw data found for this specific group (check coordinates/date).")
    print("--------------------------------------------------------------------------")

    loc_daily_summary = proc_df.groupby(['lon', 'lat', 'year', 'month', 'day']).agg(
        max_velocity=('v [m/yr]', 'max'),
        avg_velocity=('v [m/yr]', 'mean')
    ).reset_index()

    print("\n--- Checking summary rows where max_velocity != avg_velocity ---")
    diff_rows = loc_daily_summary[~np.isclose(loc_daily_summary['max_velocity'], loc_daily_summary['avg_velocity'])]
    num_diff_rows = len(diff_rows)
    print(f"Number of summary rows where max_velocity significantly differs from avg_velocity: {num_diff_rows}")

    if num_diff_rows > 0:
        print(f"Percentage of rows with differing max/avg: {100 * num_diff_rows / len(loc_daily_summary):.2f}%")
        print("Example rows where they differ:")
        print(diff_rows.head())
    else:
        print("Still no rows found where max_velocity differs significantly from avg_velocity.")
    print("---------------------------------------------------------------")

    print("\n--- Adding Cyclical Features ---")
    loc_daily_summary['month_sin'] = np.sin(2 * np.pi * loc_daily_summary['month'] / 12)
    loc_daily_summary['month_cos'] = np.cos(2 * np.pi * loc_daily_summary['month'] / 12)
    try:
        temp_date_for_daysinmonth = pd.to_datetime(loc_daily_summary[['year', 'month', 'day']])
        days_in_month = temp_date_for_daysinmonth.dt.days_in_month
        loc_daily_summary['day_sin'] = np.sin(2 * np.pi * loc_daily_summary['day'] / days_in_month)
        loc_daily_summary['day_cos'] = np.cos(2 * np.pi * loc_daily_summary['day'] / days_in_month)
    except Exception as e:
        print(f"Warning: Could not perform daily cyclical encoding. Error: {e}")
        loc_daily_summary['day_sin'] = np.nan
        loc_daily_summary['day_cos'] = np.nan

    final_columns_order = [
        'lon', 'lat', 'year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'avg_velocity', 'max_velocity'
    ]
    final_columns_order = [col for col in final_columns_order if col in loc_daily_summary.columns]
    loc_daily_summary = loc_daily_summary[final_columns_order]
    print("--- Function Complete ---")
    return loc_daily_summary

class TimeSeriesDataset(Dataset):
    """Custom Dataset for time series data."""
    def __init__(self, data, seq_length):
        self.data = data.astype(np.float32)
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        """Return sequence of features and target."""
        X = self.data[idx:idx + self.seq_length, :-1]
        y = self.data[idx + self.seq_length - 1, -1]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

class TransformerRegressor(nn.Module):
    """Transformer model for velocity prediction."""
    def __init__(self, input_dim, d_model, nhead, num_layers, dropout=0.1):
        super(TransformerRegressor, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.pos_encoder = nn.Parameter(torch.zeros(1, 1000, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, 1)

        self._init_weights()

    def _init_weights(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        x = self.embedding(x)
        x = x + self.pos_encoder[:, :seq_len, :]
        x = self.transformer_encoder(x)
        x = self.fc(x[:, -1, :])
        return x

def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, scaler, num_epochs):
    trainable_params = count_trainable_parameters(model)
    print(f"Number of trainable parameters: {trainable_params}")

    train_losses, val_losses, val_maes = [], [], []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        pbar_train = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for X, y in pbar_train:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'):
                output = model(X)
                loss = criterion(output.squeeze(), y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_loss += loss.item()
            pbar_train.set_postfix({'loss': loss.item()})

        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        model.eval()
        val_loss, val_mae = 0, 0
        pbar_val = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
        with torch.no_grad():
            for X, y in pbar_val:
                X, y = X.to(device), y.to(device)
                with torch.amp.autocast('cuda'):
                    output = model(X)
                loss = criterion(output.squeeze(), y)
                val_loss += loss.item()
                val_mae += torch.mean(torch.abs(output.squeeze() - y)).item()
                pbar_val.set_postfix({'loss': loss.item(), 'MAE': val_mae / len(val_loader)})

        val_loss /= len(val_loader)
        val_mae /= len(val_loader)
        val_losses.append(val_loss)
        val_maes.append(val_mae)

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val MAE: {val_mae:.4f}")

        scheduler.step(val_loss)

    return train_losses, val_losses, val_maes

# Example Usage:
# mydf = pd.read_csv("your_data.csv") # Replace with your data loading
summary_df = investigate_and_summarize_velocity(mydf)

data = summary_df[[ 'lon', 'lat', 'year', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'avg_velocity', 'max_velocity']].values




--- Data type of 'v [m/yr]' before grouping: float64 ---

--- Inspecting raw data for group (lon=74.353, lat=36.182, 1990-04-04) ---
Number of raw velocity values in this group: 1
Raw velocity values:
[53.0]
Manual Max calculation: 53.0
Manual Mean calculation: 53.0
--------------------------------------------------------------------------

--- Checking summary rows where max_velocity != avg_velocity ---
Number of summary rows where max_velocity significantly differs from avg_velocity: 9187342
Percentage of rows with differing max/avg: 76.79%
Example rows where they differ:
       lon     lat  year  month  day  max_velocity  avg_velocity
14  74.353  36.182  1991      2    6          53.0          39.5
23  74.353  36.182  1991      7    8           8.0           6.5
24  74.353  36.182  1991      7   16           8.0           5.5
25  74.353  36.182  1991      7   24          16.0          12.5
33  74.353  36.182  1991     10   28           8.0           6.5
----------------------------

In [8]:
SEQ_LENGTH = 32
BATCH_SIZE = 2048
NUM_EPOCHS = 30
LEARNING_RATE = 0.00001
D_MODEL = 256
NHEAD = 8
NUM_LAYERS = 4
DROPOUT = 0.2

train_size = int(0.7 * len(data))
val_size = int(0.2 * len(data))
train_data = data[:train_size]
val_data = data[train_size:train_size + val_size]
test_data = data[train_size + val_size:]


train_dataset = TimeSeriesDataset(train_data, SEQ_LENGTH)
val_dataset = TimeSeriesDataset(val_data, SEQ_LENGTH)
test_dataset = TimeSeriesDataset(test_data, SEQ_LENGTH)
                               

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

model = TransformerRegressor(input_dim=8, d_model=D_MODEL, nhead=NHEAD, num_layers=NUM_LAYERS, dropout=DROPOUT).to(device) # input_dim is 8
criterion = nn.L1Loss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)
scaler = GradScaler()


  scaler = GradScaler()


In [12]:
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)


In [10]:
train_losses, val_losses, val_maes = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, scaler, NUM_EPOCHS)

Number of trainable parameters: 5518849


Epoch 1/30 [Train]: 100%|██████████| 4090/4090 [09:10<00:00,  7.42it/s, loss=40.6]
Epoch 1/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.98it/s, loss=25.2, MAE=37.8]


Epoch 1/30 - Train Loss: 40.4021, Val Loss: 37.8494, Val MAE: 37.8494


Epoch 2/30 [Train]: 100%|██████████| 4090/4090 [09:12<00:00,  7.41it/s, loss=32.4]
Epoch 2/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.67it/s, loss=18.2, MAE=29.6]


Epoch 2/30 - Train Loss: 36.9007, Val Loss: 29.6484, Val MAE: 29.6484


Epoch 3/30 [Train]: 100%|██████████| 4090/4090 [09:11<00:00,  7.41it/s, loss=26.3]
Epoch 3/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.70it/s, loss=16.8, MAE=27.5]


Epoch 3/30 - Train Loss: 31.8588, Val Loss: 27.5255, Val MAE: 27.5255


Epoch 4/30 [Train]: 100%|██████████| 4090/4090 [09:11<00:00,  7.42it/s, loss=29.7]
Epoch 4/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.70it/s, loss=16.5, MAE=26]  


Epoch 4/30 - Train Loss: 29.4904, Val Loss: 25.9766, Val MAE: 25.9766


Epoch 5/30 [Train]: 100%|██████████| 4090/4090 [09:11<00:00,  7.42it/s, loss=23.5]
Epoch 5/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.69it/s, loss=16.4, MAE=24.8]


Epoch 5/30 - Train Loss: 28.0398, Val Loss: 24.8303, Val MAE: 24.8303


Epoch 6/30 [Train]: 100%|██████████| 4090/4090 [09:10<00:00,  7.42it/s, loss=27.7]
Epoch 6/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.66it/s, loss=17, MAE=24.4]  


Epoch 6/30 - Train Loss: 26.9616, Val Loss: 24.4316, Val MAE: 24.4316


Epoch 7/30 [Train]: 100%|██████████| 4090/4090 [09:10<00:00,  7.42it/s, loss=24.4]
Epoch 7/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.67it/s, loss=15.5, MAE=23.1]


Epoch 7/30 - Train Loss: 26.0754, Val Loss: 23.0516, Val MAE: 23.0516


Epoch 8/30 [Train]: 100%|██████████| 4090/4090 [09:10<00:00,  7.43it/s, loss=26.2]
Epoch 8/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.69it/s, loss=15.6, MAE=22.6]


Epoch 8/30 - Train Loss: 25.3647, Val Loss: 22.6392, Val MAE: 22.6392


Epoch 9/30 [Train]: 100%|██████████| 4090/4090 [09:10<00:00,  7.42it/s, loss=24.7]
Epoch 9/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.64it/s, loss=15.6, MAE=22.6]


Epoch 9/30 - Train Loss: 24.8631, Val Loss: 22.5656, Val MAE: 22.5656


Epoch 10/30 [Train]: 100%|██████████| 4090/4090 [09:10<00:00,  7.43it/s, loss=25.9]
Epoch 10/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.72it/s, loss=14.7, MAE=21.7]


Epoch 10/30 - Train Loss: 24.4160, Val Loss: 21.7032, Val MAE: 21.7032


Epoch 11/30 [Train]: 100%|██████████| 4090/4090 [09:10<00:00,  7.44it/s, loss=23.3]
Epoch 11/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.71it/s, loss=15.9, MAE=22.5]


Epoch 11/30 - Train Loss: 24.0532, Val Loss: 22.5000, Val MAE: 22.5000


Epoch 12/30 [Train]: 100%|██████████| 4090/4090 [09:10<00:00,  7.43it/s, loss=21]  
Epoch 12/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.69it/s, loss=15.8, MAE=22.3]


Epoch 12/30 - Train Loss: 23.7443, Val Loss: 22.3252, Val MAE: 22.3252


Epoch 13/30 [Train]: 100%|██████████| 4090/4090 [09:10<00:00,  7.44it/s, loss=24.1]
Epoch 13/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.66it/s, loss=15.3, MAE=21.4]


Epoch 13/30 - Train Loss: 23.5285, Val Loss: 21.4232, Val MAE: 21.4232


Epoch 14/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=19.8]
Epoch 14/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.72it/s, loss=14.3, MAE=20.9]


Epoch 14/30 - Train Loss: 23.2585, Val Loss: 20.9307, Val MAE: 20.9307


Epoch 15/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=20.3]
Epoch 15/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.68it/s, loss=16.4, MAE=22.1]


Epoch 15/30 - Train Loss: 23.0872, Val Loss: 22.0885, Val MAE: 22.0885


Epoch 16/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=22.6]
Epoch 16/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.73it/s, loss=14.6, MAE=20.4]


Epoch 16/30 - Train Loss: 22.8290, Val Loss: 20.3625, Val MAE: 20.3625


Epoch 17/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=22.1]
Epoch 17/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.70it/s, loss=15.4, MAE=21]  


Epoch 17/30 - Train Loss: 22.6950, Val Loss: 20.9751, Val MAE: 20.9751


Epoch 18/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=24.6]
Epoch 18/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.71it/s, loss=14.6, MAE=20.4]


Epoch 18/30 - Train Loss: 22.5158, Val Loss: 20.4091, Val MAE: 20.4091


Epoch 19/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=25.3]
Epoch 19/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.73it/s, loss=15, MAE=20.4]  


Epoch 19/30 - Train Loss: 22.3762, Val Loss: 20.4209, Val MAE: 20.4209


Epoch 20/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=21.2]
Epoch 20/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.73it/s, loss=14.4, MAE=20.1]


Epoch 20/30 - Train Loss: 21.7884, Val Loss: 20.1331, Val MAE: 20.1331


Epoch 21/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.45it/s, loss=22.7]
Epoch 21/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.72it/s, loss=15.1, MAE=20.7]


Epoch 21/30 - Train Loss: 21.7393, Val Loss: 20.7208, Val MAE: 20.7208


Epoch 22/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.45it/s, loss=20.1]
Epoch 22/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.73it/s, loss=14.4, MAE=20.2]


Epoch 22/30 - Train Loss: 21.6972, Val Loss: 20.1861, Val MAE: 20.1861


Epoch 23/30 [Train]: 100%|██████████| 4090/4090 [09:08<00:00,  7.46it/s, loss=19.9]
Epoch 23/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.76it/s, loss=14.4, MAE=20]  


Epoch 23/30 - Train Loss: 21.6646, Val Loss: 20.0386, Val MAE: 20.0386


Epoch 24/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.45it/s, loss=22.1]
Epoch 24/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.77it/s, loss=14.4, MAE=20.1]


Epoch 24/30 - Train Loss: 21.6151, Val Loss: 20.0733, Val MAE: 20.0733


Epoch 25/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.45it/s, loss=22.5]
Epoch 25/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.76it/s, loss=14.8, MAE=20.6]


Epoch 25/30 - Train Loss: 21.5809, Val Loss: 20.5544, Val MAE: 20.5544


Epoch 26/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.45it/s, loss=20.5]
Epoch 26/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.72it/s, loss=14.9, MAE=20.3]


Epoch 26/30 - Train Loss: 21.5470, Val Loss: 20.3067, Val MAE: 20.3067


Epoch 27/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=21.9]
Epoch 27/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.76it/s, loss=14.6, MAE=20]  


Epoch 27/30 - Train Loss: 21.4531, Val Loss: 20.0394, Val MAE: 20.0394


Epoch 28/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.45it/s, loss=20.6]
Epoch 28/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.72it/s, loss=14.6, MAE=20.1]


Epoch 28/30 - Train Loss: 21.4501, Val Loss: 20.0521, Val MAE: 20.0521


Epoch 29/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=21.1]
Epoch 29/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.73it/s, loss=14.7, MAE=20.1]


Epoch 29/30 - Train Loss: 21.4411, Val Loss: 20.0843, Val MAE: 20.0843


Epoch 30/30 [Train]: 100%|██████████| 4090/4090 [09:09<00:00,  7.44it/s, loss=20.5]
Epoch 30/30 [Val]: 100%|██████████| 1169/1169 [00:40<00:00, 28.73it/s, loss=14.6, MAE=20]  

Epoch 30/30 - Train Loss: 21.4270, Val Loss: 20.0454, Val MAE: 20.0454





In [13]:
if(True):
    # # Cell 7: Plot training metrics
    # plt.figure(figsize=(12, 4))
    # plt.subplot(1, 2, 1)
    # plt.plot(train_losses, label='Train Loss')
    # plt.plot(val_losses, label='Val Loss')
    # plt.xlabel('Epoch')
    # plt.ylabel('Loss')
    # plt.legend()
    # plt.title('Training and Validation Loss')

    # plt.subplot(1, 2, 2)
    # plt.plot(val_maes, label='Val MAE')
    # plt.xlabel('Epoch')
    # plt.ylabel('MAE')
    # plt.legend()
    # plt.title('Validation MAE')
    # plt.tight_layout()
    # plt.show()

    # Cell 8: Evaluate on test set
    model.eval()
    test_loss, test_mae = 0, 0
    with torch.no_grad():
        for X, y in tqdm(test_loader, desc="Testing"):
            X, y = X.to(device), y.to(device)
            output = model(X)
            test_loss += criterion(output.squeeze(), y).item()
            test_mae += torch.mean(torch.abs(output.squeeze() - y)).item()

    test_loss /= len(test_loader)
    test_mae /= len(test_loader)
    logger.info(f"Test Loss: {test_loss:.4f}, Test MAE: {test_mae:.4f}")

    # Save the model
    torch.save(model.state_dict(), 'transformer_model.pth')
    logger.info("Model saved as 'transformer_model.pth'")

Testing: 100%|██████████| 585/585 [02:19<00:00,  4.20it/s]


In [16]:
test_mae

20.53790953220465

In [17]:
# Load model state dict
model.load_state_dict(torch.load("/kaggle/input/terraflow-5.5m/pytorch/default/1/terraflow_transformer_model.pth", map_location=device))
model.eval()  # Set to evaluation mode

  model.load_state_dict(torch.load("/kaggle/input/terraflow-5.5m/pytorch/default/1/terraflow_transformer_model.pth", map_location=device))


TransformerRegressor(
  (embedding): Linear(in_features=8, out_features=256, bias=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [21]:
import numpy as np
import torch

predictions = []
actuals = []
n = 50  # Number of predictions to make
i = 0

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)

        # Convert to numpy and store
        for j in range(len(outputs)):
            if i >= n:  # Stop after 10 predictions
                break
            predictions.append(outputs[j].cpu().numpy())
            actuals.append(targets[j].cpu().numpy())
            i += 1
        if i >= n:
            break  # Exit the outer loop once we have 10 samples

# Convert to numpy arrays
predictions = np.array(predictions)
actuals = np.array(actuals)


In [23]:
print("Unscaled Predictions:", predictions[:])
print("Unscaled Actuals:", actuals[:])

Unscaled Predictions: [[11.595757 ]
 [12.968755 ]
 [21.906458 ]
 [27.22831  ]
 [27.846622 ]
 [68.2304   ]
 [ 9.040334 ]
 [18.85722  ]
 [ 8.690692 ]
 [25.8592   ]
 [24.048616 ]
 [ 8.062716 ]
 [26.703175 ]
 [ 7.4819183]
 [37.59923  ]
 [10.675871 ]
 [34.841415 ]
 [18.002289 ]
 [ 9.848359 ]
 [62.418068 ]
 [28.0903   ]
 [ 7.867091 ]
 [ 9.150924 ]
 [21.424036 ]
 [10.582641 ]
 [22.73966  ]
 [22.346846 ]
 [18.705713 ]
 [71.562195 ]
 [47.122406 ]
 [19.106821 ]
 [16.405764 ]
 [34.697266 ]
 [21.49151  ]
 [19.906448 ]
 [38.107635 ]
 [35.52249  ]
 [44.588303 ]
 [37.209545 ]
 [21.709034 ]
 [31.761469 ]
 [32.700367 ]
 [32.852226 ]
 [42.487335 ]
 [22.93914  ]
 [53.386208 ]
 [35.226692 ]
 [18.724316 ]
 [23.551628 ]
 [30.846554 ]]
Unscaled Actuals: [  9.  14.  23.  51.  17.  17.  13.  15.  11.  62.  20.  12.  30.  12.
   9.  20.  32.  19.   9.  15.  80.   9.  23.  16.   9.  48.  22.  28.
  21.  60.  24.  15.  22.  10.  27.  96.  27. 119.  33.  20.  56.  29.
  42.  39.  21. 109.  32.  19.  19.  27.]


In [52]:
actuals[:5]

array([-0.31663486,  0.00221978, -0.21034998, -0.1040651 , -0.05092266],
      dtype=float32)

In [65]:
df[(df['v [m/yr]'] == df['v [m/yr]'].max())]

Unnamed: 0,mid_date,lon,lat,v [m/yr]
40101287,2022-04-04 01:07:06.856773120,74.4637,36.317,3867.0


In [68]:
import torch
import numpy as np
import pandas as pd
from datetime import datetime

def to_ordinal(x):
    """Converts datetime string to ordinal format with fractional day."""
    x = pd.to_datetime(x)  # Convert string to datetime
    return x.toordinal() + (x.hour * 3600 + x.minute * 60 + x.second) / 86400.0

# Define your input values
time = '2022-04-04 01:07:06.856773120'
lat = 36.317
lon = 74.4637

# Convert time to ordinal
ordinal_time = to_ordinal(time)

# Convert to a NumPy array
new_input = np.array([[ordinal_time, lat, lon]])

# Scale using the same scaler used for training
new_input_scaled = scaler_X.transform(new_input)

# Convert to PyTorch tensor
new_input_tensor = torch.tensor(new_input_scaled, dtype=torch.float32).to(device)

print("Processed Input Tensor:", new_input_tensor)


Processed Input Tensor: tensor([[-1.6271,  1.5344,  0.3538]], device='cuda:0')


In [71]:
# Reshape input to (batch_size=1, seq_length=1, feature_dim=3)
new_input_tensor = new_input_tensor.unsqueeze(0)  

with torch.no_grad():
    vel_predicted = model(new_input_tensor).cpu().numpy()  # Convert tensor to NumPy

# Inverse transform to get the original velocity value
vel_predicted_original = scaler_y.inverse_transform(vel_predicted.reshape(-1, 1))

print(f"Predicted Velocity: {vel_predicted_original[0][0]:.4f} m/yr")


Predicted Velocity: 7.9246 m/yr
