In [None]:
pip install imbalanced-learn

In [None]:
%pip install torch

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
!pip install rtdl

In [2]:
import pandas as pd
import glob
import os

path = r'/kaggle/input/firesett' 
all_files = glob.glob(os.path.join(path, "*.csv"))
try:
    df_from_each_file = (pd.read_csv(f) for f in all_files)
    concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)
    
    print("Successfully concatenated all CSV files.")
    print(f"Total rows in the combined DataFrame: {len(concatenated_df)}")
    print("First 5 rows of the combined data:")
    print(concatenated_df.head())

    concatenated_df.to_csv("combined_data.csv", index=False)
    print("\nCombined data has been saved to 'combined_data.csv'")

except ValueError:
    print("No CSV files found in the specified directory or the files are empty.")
except Exception as e:
    print(f"An error occurred: {e}")


Successfully concatenated all CSV files.
Total rows in the combined DataFrame: 446165
First 5 rows of the combined data:
   latitude  longitude  bright_ti4  scan  track    acq_date  acq_time  \
0  19.51895   42.53688      336.56  0.39   0.36  2020-01-01      1018   
1  18.70315   45.20741      338.43  0.40   0.37  2020-01-01      1018   
2  22.52510   54.04734      301.22  0.70   0.75  2020-01-01      2106   
3  22.52309   54.04637      301.26  0.70   0.75  2020-01-01      2106   
4  22.60682   54.06105      296.30  0.70   0.75  2020-01-01      2106   

  satellite instrument confidence  version  bright_ti5   frp daynight  type  
0         N      VIIRS          n        2      310.35  2.15        D     2  
1         N      VIIRS          n        2      306.76  2.25        D     0  
2         N      VIIRS          n        2      284.61  1.54        N     2  
3         N      VIIRS          n        2      284.52  1.04        N     0  
4         N      VIIRS          n        2      28

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

def preprocess_flameguard_data(df: pd.DataFrame):
    print("Starting preprocessing...")
    df_clean = df.copy()
    print(f"Initial shape: {df_clean.shape}")

    columns_to_drop = ['scan', 'track', 'satellite', 'instrument', 'version']
    df_clean.drop(columns=columns_to_drop, inplace=True, errors='ignore')
    print(f"Shape after pruning columns: {df_clean.shape}")

    df_clean.dropna(inplace=True)
    print(f"Shape after dropping missing values: {df_clean.shape}")

    df_clean.drop_duplicates(inplace=True)
    print(f"Shape after dropping duplicates: {df_clean.shape}")

    print("\nStarting feature engineering...")

    df_clean['acq_date'] = pd.to_datetime(df_clean['acq_date'], errors='coerce')
    df_clean.dropna(subset=['acq_date'], inplace=True)

    # Convert acq_time to string, pad with zeros, then extract hour as integer
    df_clean['hour'] = df_clean['acq_time'].astype(str).str.zfill(4).str[:2].astype(int)
    df_clean['month'] = df_clean['acq_date'].dt.month
    df_clean['year'] = df_clean['acq_date'].dt.year

    df_clean.drop(columns=['acq_date', 'acq_time'], inplace=True)
    print("Created 'hour', 'month', 'year' features and dropped originals.")
    print(f"Shape after feature engineering: {df_clean.shape}")

    print("\nStarting feature transformation and encoding...")

    numerical_features = ['latitude', 'longitude', 'bright_ti4', 'bright_ti5', 'frp']
    categorical_features = ['type', 'daynight', 'hour', 'month', 'year']
    
    existing_numerical = [f for f in numerical_features if f in df_clean.columns]
    existing_categorical = [f for f in categorical_features if f in df_clean.columns]
    
    print(f"Numerical features to scale: {existing_numerical}")
    print(f"Categorical features to encode: {existing_categorical}")

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), existing_numerical),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), existing_categorical)
        ],
        remainder='passthrough'
    )
     
    print("Fitting and transforming data with the preprocessor...")
    processed_data = preprocessor.fit_transform(df_clean)

    new_cat_names = preprocessor.named_transformers_['cat'].get_feature_names_out(existing_categorical)
    
    feature_columns_in_transformer = existing_numerical + existing_categorical
    remainder_cols = [col for col in df_clean.columns if col not in feature_columns_in_transformer]

    final_columns = existing_numerical + list(new_cat_names) + remainder_cols

    processed_df = pd.DataFrame(processed_data, columns=final_columns, index=df_clean.index)
    
    print(f"\nPreprocessing complete. Final processed shape: {processed_df.shape}")
    confidence_mapping = {
        'n': 0,
        'h': 1,
        'l': 2
    }

    processed_df['confidence'] = processed_df['confidence'].map(confidence_mapping)

    return processed_df, preprocessor

if __name__ == '__main__':
    raw_df = pd.read_csv('/kaggle/working/combined_data.csv')

    print("--- Raw Sample DataFrame ---")
    print(raw_df)
    print("\n" + "="*50 + "\n")

    processed_df, preprocessor_obj = preprocess_flameguard_data(raw_df)

    print("\n--- Processed DataFrame (Head) ---")
    print(processed_df.head())
    
    print("\n--- Columns in Processed DataFrame ---")
    print(processed_df.columns)


--- Raw Sample DataFrame ---
        latitude  longitude  bright_ti4  scan  track    acq_date  acq_time  \
0       19.51895   42.53688      336.56  0.39   0.36  2020-01-01      1018   
1       18.70315   45.20741      338.43  0.40   0.37  2020-01-01      1018   
2       22.52510   54.04734      301.22  0.70   0.75  2020-01-01      2106   
3       22.52309   54.04637      301.26  0.70   0.75  2020-01-01      2106   
4       22.60682   54.06105      296.30  0.70   0.75  2020-01-01      2106   
...          ...        ...         ...   ...    ...         ...       ...   
446160  23.95027   38.28195      299.08  0.51   0.66  2022-12-31      2357   
446161  23.95134   38.28138      303.62  0.51   0.66  2022-12-31      2357   
446162  22.55081   39.43581      301.06  0.65   0.73  2022-12-31      2358   
446163  22.55534   39.43776      301.32  0.65   0.73  2022-12-31      2358   
446164  20.71781   39.82619      299.05  0.75   0.77  2022-12-31      2358   

       satellite instrument confid

In [4]:
processed_df['confidence'].value_counts()

confidence
0    411085
2     27024
1      8056
Name: count, dtype: int64

In [None]:
# === Spatio-Temporal Feature Engineering + FT-Transformer (V11 - Official Constructor) ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import BallTree
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score, f1_score
from collections import Counter
import time
import gc
import rtdl
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# --- Configuration ---
K_NEIGHBORS = 32
BATCH_SIZE = 128
EPOCHS = 30
LEARNING_RATE = 0.0001

# --- Spatio-Temporal Feature Engineering Function ---
def create_neighborhood_features(X_coords, X_features, k):
    print(f"Starting neighborhood feature engineering for {len(X_coords)} points...")
    coords_rad = np.deg2rad(X_coords[['latitude', 'longitude']].to_numpy(dtype=float))
    if k >= len(coords_rad): k = max(1, len(coords_rad) - 1)
    if k == 0: return pd.DataFrame()
    tree = BallTree(coords_rad, metric="haversine")
    _, indices = tree.query(coords_rad, k=k+1)
    neighbor_indices = indices[:, 1:]
    features_np = X_features.to_numpy(dtype=float)
    mean_features = np.zeros_like(features_np); std_features = np.zeros_like(features_np)
    for i in range(len(X_coords)):
        neighbor_feature_matrix = features_np[neighbor_indices[i]]
        mean_features[i, :] = neighbor_feature_matrix.mean(axis=0)
        std_features[i, :] = neighbor_feature_matrix.std(axis=0)
    mean_df = pd.DataFrame(mean_features, index=X_features.index, columns=[f'mean_{col}_k{k}' for col in X_features.columns])
    std_df = pd.DataFrame(std_features, index=X_features.index, columns=[f'std_{col}_k{k}' for col in X_features.columns])
    X_rich = pd.concat([X_features, mean_df, std_df], axis=1)
    print("Neighborhood feature engineering complete.")
    return X_rich

# --- Main Execution Block ---
print(f"Initial dataset loaded. Shape: {processed_df.shape}")
X = processed_df.drop('confidence', axis=1)
y = processed_df['confidence']

print("Verifying and casting data types to numeric...")
for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')
X.fillna(0, inplace=True)
print(f"CONFIRMED: Found {len(X.select_dtypes(include=np.number).columns)} numeric columns.")

coord_cols = ['latitude', 'longitude']
feature_cols = [col for col in X.columns if col not in coord_cols]
X_coords = X[coord_cols]; X_features = X[feature_cols]
X_rich = create_neighborhood_features(X_coords, X_features, k=K_NEIGHBORS)
X_final = pd.concat([X_coords, X_rich], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42, stratify=y)

print(f"Original train distribution: {Counter(y_train)}")
over = SMOTE(sampling_strategy={1: 75000, 2: 75000}, random_state=42)
under = RandomUnderSampler(sampling_strategy={0: 150000}, random_state=42)
pipeline = Pipeline(steps=[('o', over), ('u', under)])
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)
print(f"Resampled train distribution: {Counter(y_train_resampled)}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

train_dataset = TensorDataset(torch.tensor(X_train_scaled, dtype=torch.float32), torch.tensor(y_train_resampled.values, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test_scaled, dtype=torch.float32), torch.tensor(y_test.values, dtype=torch.long))
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# --- Model Initialization ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = rtdl.FTTransformer.make_baseline(
    n_num_features=X_train_scaled.shape[1],
    cat_cardinalities=None,
    d_token=192,
    n_blocks=4,
    attention_dropout=0.2,
    ffn_d_hidden=256,
    ffn_dropout=0.1,
    residual_dropout=0.0,
    d_out=len(y.unique())
).to(device)

class_counts = y.value_counts().sort_index()
dampened_weights_tensor = torch.sqrt(torch.tensor( (len(y) / (len(class_counts) * class_counts)).values, dtype=torch.float32)).to(device)
criterion = nn.CrossEntropyLoss(weight=dampened_weights_tensor)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

print("\n--- Training FT-Transformer ---")
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(x_num=features, x_cat=None)
        loss = criterion(outputs, labels)
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{EPOCHS}] - Loss: {avg_loss:.4f} - LR: {scheduler.get_last_lr()[0]:.6f}")

print("\n--- Final Evaluation ---")
model.eval()
preds, labels_all = [], []
with torch.no_grad():
    for features, labels in test_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(x_num=features, x_cat=None)
        _, predicted = torch.max(outputs, 1)
        preds.extend(predicted.cpu().numpy())
        labels_all.extend(labels.cpu().numpy())

print(f"Accuracy: {accuracy_score(labels_all, preds):.4f}")
print(f"Macro F1-Score: {f1_score(labels_all, preds, average='macro', zero_division=0):.4f}")
print(classification_report(labels_all, preds, zero_division=0))

Initial dataset loaded. Shape: (446165, 43)
Verifying and casting data types to numeric...
CONFIRMED: Found 42 numeric columns.
Starting neighborhood feature engineering for 446165 points...
Neighborhood feature engineering complete.
Original train distribution: Counter({0: 328868, 2: 21619, 1: 6445})
Resampled train distribution: Counter({0: 150000, 1: 75000, 2: 75000})
Using device: cuda

--- Training FT-Transformer ---
Epoch [1/30] - Loss: 0.1101 - LR: 0.000100
Epoch [2/30] - Loss: 0.0662 - LR: 0.000099
Epoch [3/30] - Loss: 0.0602 - LR: 0.000098
Epoch [4/30] - Loss: 0.0572 - LR: 0.000096
Epoch [5/30] - Loss: 0.0541 - LR: 0.000093
Epoch [6/30] - Loss: 0.0528 - LR: 0.000090
Epoch [7/30] - Loss: 0.0511 - LR: 0.000087
Epoch [8/30] - Loss: 0.0497 - LR: 0.000083
Epoch [9/30] - Loss: 0.0486 - LR: 0.000079
Epoch [10/30] - Loss: 0.0465 - LR: 0.000075
Epoch [11/30] - Loss: 0.0454 - LR: 0.000070
Epoch [12/30] - Loss: 0.0451 - LR: 0.000065
Epoch [13/30] - Loss: 0.0431 - LR: 0.000060
Epoch [14/3