In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the input CSV from the Kaggle input mount.
df = pd.read_csv(
    "/kaggle/input/noaa-powout-prism-0-1-is-storm-lag/noaapowoutprism_01_Is_Storm_Lag (1).csv"
)

# One seed shared by both splits so the partition is reproducible.
SPLIT_SEED = 42

# Stage 1: set aside 20% of all rows as the holdout set; the remaining 80%
# is what gets divided into train and test below.
train_test, holdout_data = train_test_split(
    df,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Stage 2: the test set should be 10% of the *full* dataset (leaving 70% for
# training). Because `train_test` only holds 80% of the rows, the fraction
# passed to the splitter is 0.1 / 0.8 = 0.125.
relative_test_size = 0.1 / 0.8
train_data, test_data = train_test_split(
    train_test,
    test_size=relative_test_size,
    random_state=SPLIT_SEED,
)

In [None]:
# Mean-encode the categorical columns.
#
# Each category value is replaced by the mean of `ppt` observed for that
# category in the TRAINING split only, so test/holdout rows never influence
# the mapping. Categories not seen in training (and missing values) fall back
# to the overall training mean of `ppt`.
#
# NOTE(review): the previous comments said the encoding was based on
# DAMAGE_PROPERTY, but the code reads `ppt` — confirm which column is intended.
categorical_cols = ['EVENT_TYPE', 'stability', 'WFO']
encoding_value_col = 'ppt'

# Capture the split sizes before anything is reassigned, so the split-back at
# the end does not depend on the order in which the names are rebound.
n_train, n_test = len(train_data), len(test_data)

# Fallback for unseen categories; computed once instead of once per column.
global_mean = train_data[encoding_value_col].mean()

# Build the per-category mappings from the training split only.
encoding_maps = {}
for col in categorical_cols:
    if col in train_data.columns:
        encoding_maps[col] = train_data.groupby(col)[encoding_value_col].mean()

# Combine the splits so every row is encoded with the same mappings.
data = pd.concat([train_data, test_data, holdout_data], ignore_index=True)

# Apply only the mappings that were actually built; iterating over
# `encoding_maps` avoids a KeyError for a column that has no mapping.
for col, mapping in encoding_maps.items():
    data[col + '_encoded'] = data[col].map(mapping).fillna(global_mean)

# Split back into train, test, holdout (row order was preserved by concat).
# `.copy()` makes each split an independent frame, so later column
# assignments on a split do not raise SettingWithCopyWarning.
train_data = data.iloc[:n_train].copy()
test_data = data.iloc[n_train:n_train + n_test].copy()
holdout_data = data.iloc[n_train + n_test:].copy()

In [None]:
# Write each split to the Kaggle working directory as CSV, without the
# DataFrame index column.
for split_frame, output_path in (
    (train_data, "/kaggle/working/train.csv"),
    (test_data, "/kaggle/working/test.csv"),
    (holdout_data, "/kaggle/working/holdout.csv"),
):
    split_frame.to_csv(output_path, index=False)