In [139]:
# Adapted from:
# https://medium.com/@ms_somanna/guide-to-adding-noise-to-your-data-using-python-and-numpy-c8be815df524

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import numexpr
import bottleneck

path = "Data/pickles/"

# Use the non-normalized data for this:
test_df = pd.read_pickle(path + "data_test.pkl")

test_df.head()

Unnamed: 0,energy(kWh/hh),isNA,year,month,day,hour,minute,dayNumber,weekend,CustomerCount,dipDay,ToU,AG__ACORN-,AG__ACORN-U,AG__Adversity,AG__Affluent,AG__Comfortable
151035318,1.044,0,2012,11,29,20,30,3,0,5530.0,0,0,0,0,0,1,0
151035319,0.932,0,2012,11,29,21,0,3,0,5530.0,0,0,0,0,0,1,0
151035320,0.7,0,2012,11,29,21,30,3,0,5530.0,0,0,0,0,0,1,0
151035321,0.828,0,2012,11,29,22,0,3,0,5530.0,0,0,0,0,0,1,0
151035322,1.048,0,2012,11,29,22,30,3,0,5530.0,0,0,0,0,0,1,0


In [140]:
test_df.shape

(16781703, 17)

In [141]:
y = test_df["energy(kWh/hh)"]
y[0:5]

151035318    1.044
151035319    0.932
151035320    0.700
151035321    0.828
151035322    1.048
Name: energy(kWh/hh), dtype: float64

# Amount of Noise

In [142]:
dataSize = y.size

# Set the percentage of data that should contain noise
noise_percentage = 0.1  # Setting to 10% noise

# Determine the size of the noise based on the noise precentage
noise_size = int(noise_percentage*dataSize)

# Randomly select indices for adding noise.
random_indices = np.random.choice(dataSize, noise_size)

# Gaussian Noise

In [143]:
std_deviation = 0.25
mean = 2

y_gaussian_noise = np.random.normal(mean, std_deviation, noise_size)

In [144]:
# Apply Gaussian noise only to the selected indices in gaussian_noise
gaussian_noise = y.copy()
gaussian_noise.to_numpy().flat[random_indices] += y_gaussian_noise

In [145]:
gaussianData = test_df.copy()

In [146]:
gaussianData["energy(kWh/hh)"] = gaussian_noise

In [147]:
# Find rows where values in df2 differ from df1
mask = (gaussianData != test_df).any(axis=1)

# Select 5 rows from df2 where values are different from df1
different_rows_df2 = gaussianData[mask].head(5)

# Print the selected rows
print("Rows in df2 with different values from df1:")
print(different_rows_df2)

Rows in df2 with different values from df1:
           energy(kWh/hh)  isNA  year  month  day  hour  minute  dayNumber  \
151035337        2.449307     0  2012     11   30     6       0          4   
151035342        2.567269     0  2012     11   30     8      30          4   
151035344        2.496551     0  2012     11   30     9      30          4   
151035346        2.246264     0  2012     11   30    10      30          4   
151035358        2.321848     0  2012     11   30    16      30          4   

           weekend  CustomerCount  dipDay  ToU  AG__ACORN-  AG__ACORN-U  \
151035337        0         5529.0       0    0           0            0   
151035342        0         5529.0       0    0           0            0   
151035344        0         5529.0       0    0           0            0   
151035346        0         5529.0       0    0           0            0   
151035358        0         5529.0       0    0           0            0   

           AG__Adversity  AG__Afflue

In [148]:
gaussianData["energy(kWh/hh)"].describe().transpose()

count    1.678170e+07
mean     3.476376e-01
std      6.360670e-01
min      0.000000e+00
25%      4.800000e-02
50%      1.020000e-01
75%      2.360000e-01
max      7.397589e+00
Name: energy(kWh/hh), dtype: float64

In [149]:
test_df["energy(kWh/hh)"].describe().transpose()

count    1.678170e+07
mean     1.573662e-01
std      2.331511e-01
min      0.000000e+00
25%      4.500000e-02
50%      8.900000e-02
75%      1.770000e-01
max      5.629000e+00
Name: energy(kWh/hh), dtype: float64

In [150]:
# Save
path = "Data/pickles/"
gaussianData.to_pickle(path + "data_noisy_gaussian_test.pkl")

# Salt and Pepper Noise

In [151]:
# Create a copy of the test data
y_peppered = y.copy()

# Create a noise list with random placements of min and max values
noise = np.random.choice([y.min(), y.max()], noise_size)

# Replace the values of the templated noised at random indices
y_peppered.to_numpy().flat[random_indices] = noise

pepperedData = test_df.copy()
pepperedData["energy(kWh/hh)"] = y_peppered

In [152]:
# Find rows where values in df2 differ from df1
mask = (pepperedData != test_df).any(axis=1)

# Select 5 rows from df2 where values are different from df1
different_rows_df2 = pepperedData[mask].head(5)

# Print the selected rows
print("Rows in df2 with different values from df1:")
print(different_rows_df2)

Rows in df2 with different values from df1:
           energy(kWh/hh)  isNA  year  month  day  hour  minute  dayNumber  \
151035337           5.629     0  2012     11   30     6       0          4   
151035342           0.000     0  2012     11   30     8      30          4   
151035344           0.000     0  2012     11   30     9      30          4   
151035346           5.629     0  2012     11   30    10      30          4   
151035358           5.629     0  2012     11   30    16      30          4   

           weekend  CustomerCount  dipDay  ToU  AG__ACORN-  AG__ACORN-U  \
151035337        0         5529.0       0    0           0            0   
151035342        0         5529.0       0    0           0            0   
151035344        0         5529.0       0    0           0            0   
151035346        0         5529.0       0    0           0            0   
151035358        0         5529.0       0    0           0            0   

           AG__Adversity  AG__Afflue

In [153]:
pepperedData["energy(kWh/hh)"].describe().transpose()

count    1.678170e+07
mean     4.101343e-01
std      1.187641e+00
min      0.000000e+00
25%      4.100000e-02
50%      8.900000e-02
75%      1.940000e-01
max      5.629000e+00
Name: energy(kWh/hh), dtype: float64

In [154]:
test_df["energy(kWh/hh)"].describe().transpose()

count    1.678170e+07
mean     1.573662e-01
std      2.331511e-01
min      0.000000e+00
25%      4.500000e-02
50%      8.900000e-02
75%      1.770000e-01
max      5.629000e+00
Name: energy(kWh/hh), dtype: float64

In [155]:
# Save
path = "Data/pickles/"
pepperedData.to_pickle(path + "data_noisy_pepper_test.pkl")

# Targeted Noise

In [156]:
y_targeted = y.copy()

noise = 2.4

# Replace the values of the templated noised at random indices
y_targeted.to_numpy().flat[random_indices < 1] = noise

targetedData = test_df.copy()
targetedData["energy(kWh/hh)"] = y_targeted

In [157]:
# Find rows where values in df2 differ from df1
mask = (targetedData != test_df).any(axis=1)

# Select 5 rows from df2 where values are different from df1
different_rows_df2 = targetedData[mask].head(5)

# Print the selected rows
print("Rows in df2 with different values from df1:")
print(different_rows_df2)

Rows in df2 with different values from df1:
Empty DataFrame
Columns: [energy(kWh/hh), isNA, year, month, day, hour, minute, dayNumber, weekend, CustomerCount, dipDay, ToU, AG__ACORN-, AG__ACORN-U, AG__Adversity, AG__Affluent, AG__Comfortable]
Index: []


In [158]:
targetedData["energy(kWh/hh)"].describe().transpose()

count    1.678170e+07
mean     1.573662e-01
std      2.331511e-01
min      0.000000e+00
25%      4.500000e-02
50%      8.900000e-02
75%      1.770000e-01
max      5.629000e+00
Name: energy(kWh/hh), dtype: float64

In [159]:
test_df["energy(kWh/hh)"].describe().transpose()

count    1.678170e+07
mean     1.573662e-01
std      2.331511e-01
min      0.000000e+00
25%      4.500000e-02
50%      8.900000e-02
75%      1.770000e-01
max      5.629000e+00
Name: energy(kWh/hh), dtype: float64

In [160]:
# Save
path = "Data/pickles/"
pepperedData.to_pickle(path + "data_noisy_targeted_test.pkl")