In [2]:
import pandas as pd
import numpy as np

import os

In [None]:
# Path to the input CSV, written relative to the current working directory
file_path = '../data_with_state.csv'

# Load the CSV into a pandas DataFrame
data = pd.read_csv(file_path)

# Preview the first rows of the DataFrame (head() defaults to five rows)
data.head()


In [None]:
# Show the dtype pandas assigned to each column of `data`
data.dtypes

In [None]:
# Display the (number of rows, number of columns) tuple of `data`
data.shape

In [None]:
# List the distinct values present in the 'STATE' column
data['STATE'].unique()

In [None]:
import matplotlib.pyplot as plt

# Distribution of the 'gws_inst' column: 50 bins on a 10x6 figure.
hist_ax = data['gws_inst'].plot.hist(bins=50, figsize=(10, 6))
hist_ax.set_title("Histogram of gws_inst")
hist_ax.set_xlabel('gws_inst Values')
plt.show()


In [None]:
# Flag 'gws_inst' values that fall outside the 1.5 x IQR fences.
gws_column = data['gws_inst']

# Quartiles and the inter-quartile range
Q1, Q3 = gws_column.quantile(0.25), gws_column.quantile(0.75)
IQR = Q3 - Q1

# Fences: anything strictly below/above these counts as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Strict comparisons are kept so that missing values are never flagged
is_outlier = gws_column.lt(lower_bound) | gws_column.gt(upper_bound)
outliers = data.loc[is_outlier]

# Report how many rows were flagged, then show them
print(f"Number of outliers detected: {len(outliers)}")
outliers


In [None]:
from scipy import stats
import numpy as np

# Absolute Z-scores of 'gws_inst', computed on the non-missing values only.
gws_non_null = data['gws_inst'].dropna()

# scipy documents the result of zscore as array-like, so it is not guaranteed
# to be a pandas Series; a bare ndarray has no `.plot` accessor. Rebuilding the
# Series explicitly (with the original row labels) keeps the plot call valid
# whichever type scipy returns.
z_scores = pd.Series(
    np.abs(np.asarray(stats.zscore(gws_non_null))),
    index=gws_non_null.index,
    name='gws_inst',
)

# Histogram of the absolute Z-scores, labelled like the gws_inst histogram.
z_ax = z_scores.plot(kind='hist', bins=50, figsize=(10, 6))
z_ax.set_title("Histogram of absolute gws_inst Z-scores")
z_ax.set_xlabel('|Z-score|');

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix of the three *_inst columns taken from `data`.
corr_columns = ['gws_inst', 'rtzsm_inst', 'sfsm_inst']
correlation_matrix = data.loc[:, corr_columns].corr()

# Annotated heatmap of that matrix with square cells.
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', square=True)
heatmap_ax.set_title('Feature Correlation Matrix')
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Re-read the CSV, parse 'time' as day-month-year dates and order the rows by time.
data = (
    pd.read_csv('../data_with_state.csv')
    .assign(time=lambda frame: pd.to_datetime(frame['time'], format='%d-%m-%Y'))
    .sort_values('time')
)

# Column to predict and the input columns used to predict it.
target = 'gws_inst'
features = ['lat', 'lon', 'rtzsm_inst', 'sfsm_inst', 'week_no']

# Min-max scale the inputs together with the target; the target is kept as the
# last column because the windowing code reads it from position -1.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed further down, so test-period statistics influence the scaling.
scaler = MinMaxScaler()
model_columns = [*features, target]
data_scaled = scaler.fit_transform(data[model_columns])


# Create sequences for LSTM (let's use 30 timesteps)
def create_sequences(data, seq_length):
    """Cut a 2-D table into overlapping look-back windows and their labels.

    Window ``i`` contains rows ``i`` to ``i + seq_length - 1`` of every column
    except the last one. Its label is the value of the last column in row
    ``i + seq_length`` (the row right after the window).

    Parameters
    ----------
    data : array-like of shape (n_rows, n_columns)
        Input table; the last column is treated as the target. Anything
        ``np.asarray`` can convert is accepted.
    seq_length : int
        Number of consecutive rows per window. Must not be negative.

    Returns
    -------
    xs : numpy.ndarray of shape (n_windows, seq_length, n_columns - 1)
    ys : numpy.ndarray of shape (n_windows,)
        ``n_windows`` is ``max(n_rows - seq_length, 0)``. When the input is too
        short for a single window both arrays are empty but keep the shapes
        above, so code that inspects ``xs.shape[2]`` keeps working.

    Raises
    ------
    ValueError
        If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    data = np.asarray(data)
    n_windows = max(data.shape[0] - seq_length, 0)

    # row_idx[i, j] == i + j: the j-th row belonging to window i.
    row_idx = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]

    # Advanced indexing gathers all windows at once (and returns a fresh array).
    xs = data[row_idx, :-1]
    # Label of window i sits in row i + seq_length; copy so the result does not
    # alias the caller's array.
    ys = data[seq_length:seq_length + n_windows, -1].copy()
    return xs, ys
SEQ_LENGTH = 30  # Number of past time steps to consider
X, y = create_sequences(data_scaled, SEQ_LENGTH)

# Chronological hold-out: the rows were sorted by time and neighbouring windows
# share SEQ_LENGTH - 1 rows, so a shuffled split would scatter near-identical
# windows across both sets and leak test information into training.
# shuffle=False keeps the first 80% of the windows for training and the last
# 20% for testing, which also keeps the test samples in time order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Network: LSTM(128, returning the full sequence) -> Dropout(0.3)
#          -> LSTM(64, returning only the last step) -> Dropout(0.3)
#          -> Dense(1) producing the regression output.
n_input_features = X_train.shape[2]
model = Sequential([
    LSTM(units=128, return_sequences=True, input_shape=(SEQ_LENGTH, n_input_features)),
    Dropout(0.3),
    LSTM(units=64, return_sequences=False),
    Dropout(0.3),
    Dense(1),
])

# Adam optimiser minimising mean squared error
model.compile(optimizer='adam', loss='mean_squared_error')

# Stop when val_loss has not improved for 5 consecutive epochs and roll the
# weights back to the best epoch seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Fit for at most 10 epochs in mini-batches of 32.
# NOTE(review): the test split doubles as validation data, so early stopping
# selects weights using the same samples that are later used for evaluation.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)




In [None]:
# Per-epoch loss curves recorded by fit(): training loss and validation loss.
for history_key, curve_label in (('loss', 'Train'), ('val_loss', 'Test')):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Predict the (scaled) target for every test window
predictions = model.predict(X_test)

# The scaler was fitted on the feature columns plus the target as last column,
# so to undo the scaling each target vector is padded on the left with zero
# placeholders for the features; only the last column of the result is kept.
n_feature_cols = X_test.shape[2]

padded_actual = np.hstack([np.zeros((len(y_test), n_feature_cols)), y_test.reshape(-1, 1)])
y_test_rescaled = scaler.inverse_transform(padded_actual)[:, -1]

padded_predicted = np.hstack([np.zeros((len(predictions), n_feature_cols)), predictions])
predictions_rescaled = scaler.inverse_transform(padded_predicted)[:, -1]



In [None]:
# Overlay the rescaled true values and the rescaled predictions, one point per test sample.
plt.figure(figsize=(10, 6))
for series_values, series_label in (
    (y_test_rescaled, 'Actual Groundwater Storage'),
    (predictions_rescaled, 'Predicted Groundwater Storage'),
):
    plt.plot(series_values, label=series_label)
plt.title('Actual vs Predicted Groundwater Storage')
plt.xlabel('Samples')
plt.ylabel('Groundwater Storage')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Error metrics on the rescaled (original-unit) test values.
mae = mean_absolute_error(y_test_rescaled, predictions_rescaled)
mse = mean_squared_error(y_test_rescaled, predictions_rescaled)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# Print the three metrics in MAE, MSE, RMSE order.
for metric_label, metric_value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
):
    print(metric_label, metric_value)


In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the rescaled predictions against the rescaled truth.
r2 = r2_score(y_true=y_test_rescaled, y_pred=predictions_rescaled)
print("R² Score:", r2)


In [None]:
# Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1)

# n: number of evaluated test samples
n = len(y_test_rescaled)

# p: number of predictors. Read it from the feature axis of the model input
# instead of hard-coding 1, so the penalty matches the number of input columns
# the model was actually trained on.
p = X_test.shape[2]

# The formula is only defined while n - p - 1 is positive.
if n - p - 1 <= 0:
    raise ValueError(
        f"Adjusted R² needs more samples than predictors + 1 (n={n}, p={p})"
    )

adjusted_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
adjusted_r2

In [None]:

import pickle

# File name the trained network is written to
model_path = "lstm_model.h5"
model.save(model_path)

# Additionally pickle the file name itself into lstm_path.pkl so the location
# of the saved model can be read back later.
with open('lstm_path.pkl', 'wb') as path_file:
    pickle.dump(model_path, path_file)