In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [9]:
np.random.seed(42)

no_of_samples = 1000

# generating random variables with mean 0 and std dev 1, 1.5, and 2
a = np.random.normal(0,1, no_of_samples)
b = np.random.normal(0,1.25, no_of_samples)
c = np.random.normal(0,1.5, no_of_samples)
d = np.random.normal(0,1.75, no_of_samples)

# output is linear combination of x, y, z + some noise (randomness)
output = (0.4 * a) + (0.2 * b) + (0.1 * c) + (0.3 * d) + np.random.normal(0, 0.1, no_of_samples)

In [10]:
# printing the random data

print("Data:\n")
print(f"a range: {a.min():.2f} to {a.max():.2f}")
print(f"b range: {b.min():.2f} to {b.max():.2f}")
print(f"c range: {c.min():.2f} to {c.max():.2f}")
print(f"d range: {d.min():.2f} to {d.max():.2f}")
print(f"output range: {output.min():.2f} to {output.max():.2f}")

Data:

a range: -3.24 to 3.85
b range: -3.68 to 3.99
c range: -4.53 to 5.89
d range: -5.13 to 5.68
output range: -1.95 to 2.20


In [11]:
myDF = pd.DataFrame({
    'a' : a,
    'b' : b,
    'c' : c,
    'd' : d,
    'output' : output
})

print("\nDataFrame:\n")
print(myDF.shape)
print(myDF.head())


DataFrame:

(1000, 5)
          a         b         c         d    output
0  0.496714  1.749194 -1.012767 -3.338663 -0.640701
1 -0.138264  1.155792 -0.216778 -1.505674 -0.300648
2  0.647689  0.074538 -1.188630 -0.723810 -0.060221
3  1.523030 -0.808671 -0.461942  3.303453  1.439583
4 -0.234153  0.872779 -2.840422  0.973968 -0.047643


In [12]:
missing_data = np.random.choice(no_of_samples, size = int(0.1 * no_of_samples), replace = False)
myDF.loc[missing_data, 'a'] = np.nan

In [13]:
missing_data = np.random.choice(no_of_samples, size = int(0.05 * no_of_samples), replace = False)
myDF.loc[missing_data, 'c'] = np.nan

In [14]:
print("Number of missing values:\n")
print(myDF.isnull().sum())

Number of missing values:

a         100
b           0
c          50
d           0
output      0
dtype: int64


In [15]:
print(f"Original DataFrame shape: {myDF.shape}")

myDF_clean_dropped = myDF.dropna()
print(f"myDF_clean_dropped shape: {myDF_clean_dropped.shape}")

myDF_clean_filled = myDF.fillna(0)
print(f"myDF_clean_filled shape: {myDF_clean_filled.shape}")

Original DataFrame shape: (1000, 5)
myDF_clean_dropped shape: (856, 5)
myDF_clean_filled shape: (1000, 5)


In [16]:
class Autoencoder(nn.Module):
    def __init__(self, input_size = 4, output_size = 1):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 8),
            nn.ReLU(),
            nn.Linear(8, 6),
            nn.ReLU(),
            nn.Linear(6, 3)
        )

        self.decoder = nn.Sequential(
            nn.Linear(3, 6),
            nn.ReLU(),
            nn.Linear(6, 8),
            nn.ReLU(),
            nn.Linear(8, output_size)
        )

    def forward(self, input):
        encoded = self.encoder(input)
        decoded = self.decoder(encoded)
        return decoded

In [17]:
model = Autoencoder(input_size = 4, output_size = 1)

In [18]:
# training

input_data = myDF_clean_filled[['a', 'b', 'c', 'd']].values
output_data = myDF_clean_filled['output'].values

print(f"input_data shape: {input_data.shape}")
print(f"output_data shape: {output_data.reshape(-1,1).shape}")

# normalizing data
scaled_input_data = StandardScaler().fit_transform(input_data)
scaled_output_data = StandardScaler().fit_transform(output_data.reshape(-1, 1)).flatten()

print("\nNormalized data:")
print(f"input range: {scaled_input_data.min():.2f} to {scaled_input_data.max():.2f}")
print(f"output range: {scaled_output_data.min():.2f} to {scaled_output_data.max():.2f}")

# splitting data into training and testing sets
input_train, input_test, output_train, output_test = train_test_split(
    scaled_input_data, scaled_output_data, test_size=0.2, random_state=42
)

print(f"\nTrain set: {input_train.shape[0]} samples")
print(f"Test set: {input_test.shape[0]} samples")


input_data shape: (1000, 4)
output_data shape: (1000, 1)

Normalized data:
input range: -3.21 to 4.14
output range: -2.73 to 3.05

Train set: 800 samples
Test set: 200 samples


In [19]:
# coverting to tensors

input_train_tensor = torch.FloatTensor(input_train)
input_test_tensor = torch.FloatTensor(input_test)
output_train_tensor = torch.FloatTensor(output_train).unsqueeze(1)
output_test_tensor = torch.FloatTensor(output_test).unsqueeze(1)

print(f"Training tensors:")
print(f"Input: {input_train_tensor.shape}")
print(f"Output: {output_train_tensor.shape}")


criterion = nn.MSELoss()  # Mean Squared Error loss
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

print(f"Loss function: {criterion}")
print(f"Optimizer: {optimizer}")

Training tensors:
Input: torch.Size([800, 4])
Output: torch.Size([800, 1])
Loss function: MSELoss()
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)
Loss function: MSELoss()
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)


In [20]:
no_of_epochs = 100
train_losses = []

for epoch in range(no_of_epochs):
    model.train()
    optimizer.zero_grad()

    predictions = model(input_train_tensor)
    
    loss = criterion(predictions, output_train_tensor)
    
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())

    if ((epoch + 1) % 10) == 0:
        print(f'Epoch [{epoch + 1}/{no_of_epochs}], Loss: {loss.item():.4f}')

    print(f"Final loss: {train_losses[-1]:.4f}")

Final loss: 1.0530
Final loss: 1.0509
Final loss: 1.0488
Final loss: 1.0469
Final loss: 1.0450
Final loss: 1.0432
Final loss: 1.0414
Final loss: 1.0398
Final loss: 1.0382
Epoch [10/100], Loss: 1.0366
Final loss: 1.0366
Final loss: 1.0352
Final loss: 1.0338
Final loss: 1.0324
Final loss: 1.0311
Final loss: 1.0299
Final loss: 1.0288
Final loss: 1.0276
Final loss: 1.0266
Final loss: 1.0255
Epoch [20/100], Loss: 1.0246
Final loss: 1.0246
Final loss: 1.0236
Final loss: 1.0227
Final loss: 1.0219
Final loss: 1.0211
Final loss: 1.0203
Final loss: 1.0195
Final loss: 1.0188
Final loss: 1.0181
Final loss: 1.0175
Epoch [30/100], Loss: 1.0168
Final loss: 1.0168
Final loss: 1.0162
Final loss: 1.0156
Final loss: 1.0151
Final loss: 1.0145
Final loss: 1.0140
Final loss: 1.0135
Final loss: 1.0129
Final loss: 1.0124
Final loss: 1.0119
Epoch [40/100], Loss: 1.0115
Final loss: 1.0115
Final loss: 1.0111
Final loss: 1.0106
Final loss: 1.0102
Final loss: 1.0098
Final loss: 1.0094
Final loss: 1.0091
Final loss

In [21]:
model.eval()

with torch.no_grad():
    test_predictions = model(input_test_tensor)
    test_loss = criterion(test_predictions, output_test_tensor)

print(f"Test Loss: {test_loss.item():.4f}")

test_predictions_numpy = test_predictions.numpy().flatten()
output_test_numpy = output_test_tensor.numpy().flatten()

mse = np.mean((test_predictions_numpy - output_test_numpy) ** 2)
mae = np.mean(np.abs(test_predictions_numpy - output_test_numpy))
rmse = np.sqrt(mse)

print(f"\nTest Metrics:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

print(f"\nSample Predictions vs Actual:")
for i in range(5):
    print(f"Sample {i+1}: Predicted={test_predictions_numpy[i]:.3f}, Actual={output_test_numpy[i]:.3f}")

Test Loss: 0.9112

Test Metrics:
Mean Squared Error (MSE): 0.9112
Mean Absolute Error (MAE): 0.7431
Root Mean Squared Error (RMSE): 0.9546

Sample Predictions vs Actual:
Sample 1: Predicted=0.036, Actual=-0.561
Sample 2: Predicted=0.009, Actual=0.119
Sample 3: Predicted=-0.001, Actual=-1.444
Sample 4: Predicted=0.032, Actual=-0.180
Sample 5: Predicted=0.029, Actual=-0.431


In [37]:
actualDF = pd.read_excel("Copy of Folds5x2_pp.xlsx")
actualDF_clean = actualDF.fillna(0)

In [38]:
print(f"Actual Data Shape: {actualDF_clean.shape}")
print(f"Actual Data Columns: {actualDF_clean.columns.tolist()}")

Actual Data Shape: (9568, 5)
Actual Data Columns: ['AT', 'V', 'AP', 'RH', 'PE']


In [None]:
output_column = actualDF_clean.columns[-1]
input_columns = [col for col in actualDF_clean.columns if col != output_column]

In [40]:
input_data = actualDF_clean[input_columns].values
output_data = actualDF_clean[output_column].values

print(f"Input shape: {input_data.shape}, Output shape: {output_data.shape}")
print(f"\nFirst 5 rows of real data:")
print(actualDF_clean[input_columns + [output_column]].head())

Input shape: (9568, 4), Output shape: (9568,)

First 5 rows of real data:
      AT      V       AP     RH      PE
0  14.96  41.76  1024.07  73.17  463.26
1  25.18  62.96  1020.04  59.08  444.37
2   5.11  39.40  1012.16  92.14  488.56
3  20.86  57.32  1010.24  76.64  446.48
4  10.82  37.50  1009.23  96.62  473.90


In [41]:
scalar_input = StandardScaler().fit_transform(input_data)
scalar_output = StandardScaler().fit_transform(output_data.reshape(-1, 1)).flatten()

print(f"Original data:")
for i, col in enumerate(input_columns):
    print(f"{col}: {input_data[:, i].min():.2f} to {input_data[:, i].max():.2f}")
print(f"{output_column}: {output_data.min():.2f} to {output_data.max():.2f}")

print(f"\nNormalized data:")
for i, col in enumerate(input_columns):
    print(f"{col}: {scalar_input[:, i].min():.2f} to {scalar_input[:, i].max():.2f}")
print(f"{output_column}: {scalar_output.min():.2f} to {scalar_output.max():.2f}")

print(f"\nnormalized input shape: {scalar_input.shape}, normalized output shape: {scalar_output.shape}")

Original data:
AT: 1.81 to 37.11
V: 25.36 to 81.56
AP: 992.89 to 1033.30
RH: 25.56 to 100.16
PE: 420.26 to 495.76

Normalized data:
AT: -2.39 to 2.34
V: -2.28 to 2.14
AP: -3.43 to 3.37
RH: -3.27 to 1.84
PE: -2.00 to 2.43

normalized input shape: (9568, 4), normalized output shape: (9568,)


In [42]:
input_train_real, input_test_real, output_train_real, output_test_real = train_test_split(
    scalar_input, scalar_output, test_size=0.2, random_state=42
)

input_train_tensor_real = torch.FloatTensor(input_train_real)
input_test_tensor_real = torch.FloatTensor(input_test_real)
output_train_tensor_real = torch.FloatTensor(output_train_real).unsqueeze(1)
output_test_tensor_real = torch.FloatTensor(output_test_real).unsqueeze(1)

real_model = Autoencoder(input_size=4, output_size=1)
criterion_real = nn.MSELoss()
optimizer_real = optim.Adam(real_model.parameters(), lr=0.001)

In [None]:
#training

no_of_epochs_real = 500
train_losses_real = []

for epoch in range(no_of_epochs_real):
    real_model.train()
    optimizer_real.zero_grad()
    
    predictions_real = real_model(input_train_tensor_real)
    
    loss_real = criterion_real(predictions_real, output_train_tensor_real)
    
    loss_real.backward()
    optimizer_real.step()
    
    train_losses_real.append(loss_real.item())
    
    if ((epoch + 1) % 50) == 0:
        print(f'Epoch [{epoch + 1}/{no_of_epochs_real}], Loss: {loss_real.item():.4f}')

print(f"\nTraining completed!")
print(f"Final training loss: {train_losses_real[-1]:.4f}")

Epoch [50/500], Loss: 0.9934
Epoch [100/500], Loss: 0.7977
Epoch [150/500], Loss: 0.1669
Epoch [200/500], Loss: 0.0984
Epoch [250/500], Loss: 0.0737
Epoch [300/500], Loss: 0.0641
Epoch [350/500], Loss: 0.0615
Epoch [400/500], Loss: 0.0606
Epoch [450/500], Loss: 0.0601
Epoch [500/500], Loss: 0.0598

Training completed!
Final training loss: 0.0598


In [None]:
# testing

real_model.eval()
with torch.no_grad():
    test_predictions_real = real_model(input_test_tensor_real)
    test_loss_real = criterion_real(test_predictions_real, output_test_tensor_real)

print(f"Test Loss on Real Data: {test_loss_real.item():.4f}")

test_predictions_real_numpy = test_predictions_real.numpy().flatten()
output_test_real_numpy = output_test_tensor_real.numpy().flatten()

mse_real = np.mean((test_predictions_real_numpy - output_test_real_numpy) ** 2)
mae_real = np.mean(np.abs(test_predictions_real_numpy - output_test_real_numpy))
rmse_real = np.sqrt(mse_real)

print(f"\nTest Metrics on Real Data:")
print(f"Mean Squared Error (MSE): {mse_real:.4f}")
print(f"Mean Absolute Error (MAE): {mae_real:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_real:.4f}")

print(f"\nSample Predictions vs Actual on Real Data:")
for i in range(10):
    print(f"Sample {i+1}: Predicted={test_predictions_real_numpy[i]:.3f}, Actual={output_test_real_numpy[i]:.3f}")

Test Loss on Real Data: 0.0584

Test Metrics on Real Data:
Mean Squared Error (MSE): 0.0584
Mean Absolute Error (MAE): 0.1881
Root Mean Squared Error (RMSE): 0.2417

Sample Predictions vs Actual on Real Data:
Sample 1: Predicted=0.049, Actual=0.053
Sample 2: Predicted=-0.982, Actual=-1.058
Sample 3: Predicted=-1.157, Actual=-0.802
Sample 4: Predicted=-0.980, Actual=-1.170
Sample 5: Predicted=1.465, Actual=1.623
Sample 6: Predicted=-0.925, Actual=-1.072
Sample 7: Predicted=-0.305, Actual=-0.110
Sample 8: Predicted=-1.162, Actual=-1.122
Sample 9: Predicted=-1.228, Actual=-1.256
Sample 10: Predicted=1.077, Actual=0.709
