### 0. Load Data

In [34]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [4]:
data_path = '/Users/park/Desktop/data/'

csv_files = [f for f in os.listdir(data_path) if f.endswith('.csv')]

data_df = []
for csv_file in csv_files:
    file_path = os.path.join(data_path, csv_file)
    df = pd.read_csv(file_path, sep=';') 
    file_location = csv_file.split('_')[-1].split('.')[0]
    df['Location'] = int(file_location)
    data_df.append(df)
    
data = pd.concat(data_df, ignore_index=True)
display(data)

Unnamed: 0,YYYY,MM,DD,DOY,2m_temp_max,2m_temp_mean,2m_temp_min,2m_dp_temp_max,2m_dp_temp_mean,2m_dp_temp_min,...,surf_net_solar_rad_max,surf_net_solar_rad_mean,surf_net_therm_rad_max,surf_net_therm_rad_mean,surf_press,total_et,prec,volsw_123,volsw_4,Location
0,1981,1,1,1,-8.1,-10.6,-14.6,-11.5,-15.3,-18.9,...,76,15,32,12,77187,-0.05,3.88,0.39,0.45,45
1,1981,1,2,2,-11.5,-13.3,-15.1,-13.4,-15.5,-16.9,...,109,22,58,20,77098,0.00,8.19,0.39,0.45,45
2,1981,1,3,3,-6.0,-8.6,-12.9,-7.0,-10.7,-14.3,...,89,19,32,5,76521,-0.05,10.02,0.39,0.45,45
3,1981,1,4,4,-5.3,-8.2,-13.8,-6.8,-9.6,-16.5,...,57,11,65,16,75728,-0.01,18.61,0.39,0.45,45
4,1981,1,5,5,-13.8,-15.5,-16.6,-16.7,-18.2,-19.6,...,73,14,63,23,76093,0.12,11.69,0.39,0.45,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1424395,2019,12,27,361,2.4,1.3,0.0,1.4,0.4,-2.2,...,43,9,18,7,92082,0.15,9.23,0.40,0.40,620
1424396,2019,12,28,362,0.5,-2.1,-5.5,-0.5,-3.7,-7.2,...,171,36,95,55,92973,0.31,0.03,0.40,0.40,620
1424397,2019,12,29,363,1.8,-3.9,-6.6,-4.5,-7.6,-9.6,...,217,46,103,72,92940,0.15,0.00,0.40,0.40,620
1424398,2019,12,30,364,4.4,-2.6,-6.1,-2.7,-7.6,-12.4,...,214,44,88,54,92708,0.05,0.00,0.40,0.41,620


### 1. Preprocessing

2-1. Train/Test Split

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [47]:
X = data.drop(columns=['YYYY', 'MM', 'DD', 'DOY', 'prec']).values
y = data['prec'].values 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=False)

scaler_X = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

scaler_y = MinMaxScaler()
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)
y_train_scaled = scaler_y.fit_transform(y_train_log.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test_log.reshape(-1, 1))

def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X) - seq_length):
        X_seq.append(X[i:i + seq_length])
        y_seq.append(y[i + seq_length])
    return np.array(X_seq), np.array(y_seq)

seq_length = 10
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, seq_length)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, seq_length)

model = Sequential([
    LSTM(50, activation='relu', input_shape=(seq_length, X_train_seq.shape[2])),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')

history = model.fit(
    X_train_seq, y_train_seq,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_seq, y_test_seq),
    verbose=1
)

predictions_scaled = model.predict(X_test_seq)
y_test_rescaled = np.expm1(scaler_y.inverse_transform(y_test_seq))
predictions_rescaled = np.expm1(scaler_y.inverse_transform(predictions_scaled))

mse = mean_squared_error(y_test_rescaled, predictions_rescaled)
rmse = np.sqrt(mse)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Epoch 1/20
 2679/31159 [=>............................] - ETA: 55s - loss: 0.0349

KeyboardInterrupt: 