In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd.variable import Variable

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, Normalizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('data/data.csv')
df

Unnamed: 0,month,temperature,RH,wind_speed,day_night,daily_cdd,daily_hdd,burned_area
0,1,9.284265,0.689222,6.951015,1,0.000000,0.715735,1
1,1,10.481195,0.716552,15.878100,1,0.481195,0.000000,1
2,1,0.069574,0.751346,7.876296,1,0.000000,9.930426,1
3,1,1.479761,0.769021,10.005970,1,0.000000,8.520239,1
4,1,6.572412,0.608134,2.239136,1,0.000000,3.427588,1
...,...,...,...,...,...,...,...,...
5274,6,21.583490,0.557554,4.279598,1,11.583490,0.000000,1
5275,6,21.583490,0.557554,4.279598,1,11.583490,0.000000,1
5276,6,21.583490,0.557554,4.279598,1,11.583490,0.000000,1
5277,6,22.098138,0.870264,3.978109,1,12.098138,0.000000,1


In [3]:
#scaler = StandardScaler()
scaler = MinMaxScaler()
df.iloc[:, 1:] = scaler.fit_transform(df.iloc[:, 1:])
df

Unnamed: 0,month,temperature,RH,wind_speed,day_night,daily_cdd,daily_hdd,burned_area
0,1,0.287486,0.679306,0.309304,1.0,0.000000,0.054281,1.0
1,1,0.315080,0.709918,0.707834,1.0,0.015939,0.000000,1.0
2,1,0.075047,0.748891,0.350611,1.0,0.000000,0.753124,1.0
3,1,0.107558,0.768689,0.445686,1.0,0.000000,0.646175,1.0
4,1,0.224966,0.588479,0.098952,1.0,0.000000,0.259948,1.0
...,...,...,...,...,...,...,...,...
5274,6,0.571037,0.531824,0.190044,1.0,0.383685,0.000000,1.0
5275,6,0.571037,0.531824,0.190044,1.0,0.383685,0.000000,1.0
5276,6,0.571037,0.531824,0.190044,1.0,0.383685,0.000000,1.0
5277,6,0.582902,0.882091,0.176585,1.0,0.400732,0.000000,1.0


In [4]:
class Generator(nn.Module):
    def __init__(self, embedding_dim, noise_size, hidden_size, output_size):
        super(Generator, self).__init__()
        self.embed = nn.Embedding(num_embeddings=len(df["burned_area"].unique()), embedding_dim=embedding_dim)
        self.model = nn.Sequential(
            nn.Linear(embedding_dim + noise_size, 100),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(100, 80),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(80, 40),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(40, output_size)
        )

    def forward(self, noise, labels):
        month_embedding = self.embed(labels)

        x = torch.cat([noise, month_embedding], 1)

        return self.model(x)


In [5]:
class Discriminator(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, output_size):
        super(Discriminator, self).__init__()
        self.embed = nn.Embedding(num_embeddings=len(df["burned_area"].unique()), embedding_dim=embedding_dim)
        self.model = nn.Sequential(
            nn.Linear(embedding_dim + input_size, 150),
            nn.ReLU(),
            nn.Linear(150, 50),
            nn.ReLU(),
            nn.Linear(50, 25),
            nn.ReLU(),
            nn.Linear(25, output_size),
            nn.Sigmoid()
        )
    
    def forward(self, inputs, labels):
        month_embedding = self.embed(labels).squeeze()
        x = torch.cat([inputs, month_embedding], 1)
        return self.model(x)

In [6]:
# Hyperparameters
input_size = 5    # (temp, RH, wind, rain, area)
noise_size = 5   # Size of the input noise vector
embedding_dim = 10 # Size of month embedding vector
hidden_size = 256
output_size = 5   # Output size should match the number of features

In [7]:
generator = Generator(embedding_dim, noise_size, hidden_size, output_size)
discriminator = Discriminator(input_size, embedding_dim, hidden_size, 1)

In [8]:
criterion = nn.BCELoss()
optimizer_g = optim.Adam(generator.parameters(), lr = 0.0002)
optimizer_d = optim.Adam(discriminator.parameters(), lr=0.0002)

In [9]:
num_epochs = 80
batch_size = 64

In [10]:
for epoch in range(num_epochs):
    generator.train()
    discriminator.train()
    for i in range(0, len(df), batch_size):
        # Prepare real data
        real_data = torch.FloatTensor(df.iloc[i:i+batch_size, 1:].values)
        real_labels = torch.LongTensor(df.iloc[i:i+batch_size, :1].values)

        # Train the generator
        optimizer_g.zero_grad()
        z = torch.rand(batch_size, noise_size)
        fake_labels = torch.randint(0, len(df["burned_area"].unique()), (batch_size,))
        fake_data = generator(z, fake_labels)
        validity = discriminator(fake_data, fake_labels)
        g_loss = criterion(validity, torch.ones_like(validity))
        g_loss.backward()
        optimizer_g.step()

        # Train the discriminator
        optimizer_d.zero_grad()
        real_outputs = discriminator(real_data, real_labels)
        real_loss = criterion(real_outputs, torch.ones_like(real_outputs))

        optimizer_d.zero_grad()
        #z = Variable(torch.randn(batch_size, noise_size))
        fake_labels = Variable(torch.randint(0, len(df["burned_area"].unique()), (batch_size,)))
        fake_outputs = generator(z, fake_labels)
        fake_validity = discriminator(fake_outputs, fake_labels)
        fake_loss = criterion(fake_validity, torch.zeros_like(fake_validity))
        d_loss = (real_loss + fake_loss) / 2
        d_loss.backward()
        optimizer_d.step()

    if epoch % 20 == 0:
        print(f'Epoch [{epoch}/{num_epochs}], D Loss: {d_loss.mean().float()}, G Loss: {g_loss.mean().float()}')

IndexError: index out of range in self

In [1211]:
def inverse_normalize_data(data, norm_model):
    inverse_norm_data = np.array([norm_model.inverse_transform([row]) for row in data])
    return inverse_norm_data[:, 0, :]

def generate_instance(num_instance):
    generator.eval()
    noise = torch.rand(num_instance, noise_size)
    fake_labels = torch.randint(0, len(df["burned_area"].unique()), (num_instance,))

    generated_instance = generator(noise, fake_labels)
    generated_instance = generated_instance.detach().numpy()

    # Inverse normalization
    generated_instance = inverse_normalize_data(generated_instance, scaler)

    month_column = pd.DataFrame({'month': label_encoder.inverse_transform(fake_labels)}, columns=['month'])
    data_column = pd.DataFrame(generated_instance, columns=['temp', 'RH', 'wind', 'rain', 'area'])

    return pd.concat([month_column, data_column], axis=1)


In [1215]:
generate_instance(5)

Unnamed: 0,month,temp,RH,wind,rain,area
0,nov,16.507787,30.662388,14.895498,5.676175,1.159041
1,mar,12.426145,23.977853,3.386213,-0.209549,0.305411
2,nov,14.650657,27.180402,13.468269,5.130814,1.006604
3,jun,29.516298,39.010709,13.023327,2.046567,0.903709
4,may,29.74934,36.679473,10.569768,1.970961,1.224725


In [1172]:
df = pd.read_csv('data/data.csv')
df.describe()

Unnamed: 0,temp,RH,wind,rain,area
count,883.0,883.0,883.0,883.0,883.0
mean,24.531937,50.770102,8.710193,0.316648,0.353341
std,8.386661,17.705077,6.017583,1.256802,0.478278
min,2.2,15.0,0.4,0.0,0.0
25%,18.35,37.0,3.6,0.0,0.0
50%,24.2,49.0,5.8,0.0,0.0
75%,32.0,64.0,15.0,0.0,1.0
max,42.0,100.0,29.0,16.8,1.0


## Example Kaggle Code
<code>https://www.kaggle.com/code/darrenljw/intro-to-synthetic-data-and-gans-from-scratch</code>

In [11]:
import pandas as pd
from ctgan import CTGAN

In [12]:
ctgan = CTGAN(epochs=500)

In [13]:
real_data = pd.read_csv("data/data.csv")
real_data = real_data[real_data["burned_area"] == 0]
real_data

Unnamed: 0,month,temperature,RH,wind_speed,day_night,daily_cdd,daily_hdd,burned_area
11,2,9.507349,0.836865,6.633478,1,0.000000,0.492651,0
12,2,15.967310,0.414372,19.525938,0,5.967310,0.000000,0
13,2,15.967310,0.414372,19.525938,0,5.967310,0.000000,0
14,2,15.967310,0.414372,19.525938,0,5.967310,0.000000,0
15,2,13.523401,0.489311,22.422637,0,3.523401,0.000000,0
...,...,...,...,...,...,...,...,...
4988,4,24.786096,0.364852,3.982695,1,14.786096,0.000000,0
4989,4,15.852502,0.631576,9.036540,1,5.852502,0.000000,0
4990,4,23.520471,0.481307,2.106200,1,13.520471,0.000000,0
4991,4,13.762659,0.775863,9.239899,1,3.762659,0.000000,0


In [14]:
discrete_column = [
    'month',
    'burned_area',
    'day_night'
]

In [15]:
ctgan.fit(real_data, discrete_column)

In [16]:
synthetic_data = ctgan.sample(1500)
synthetic_data.describe()

Unnamed: 0,month,temperature,RH,wind_speed,day_night,daily_cdd,daily_hdd,burned_area
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,3.468667,8.447726,0.305886,3.096148,0.834667,3.555679,1.509603,0.0
std,1.638742,7.632704,0.155122,4.500462,0.371605,5.604677,3.303286,0.0
min,1.0,-8.626074,0.05572,-6.371952,0.0,-5.268528,-1.265766,0.0
25%,2.0,3.888184,0.188051,0.031973,1.0,-0.035671,0.029199,0.0
50%,3.0,8.407665,0.277421,2.128818,1.0,1.431887,0.295077,0.0
75%,4.0,13.051489,0.383679,4.746955,1.0,5.732584,0.552952,0.0
max,8.0,39.340999,1.028921,30.285121,1.0,28.378456,17.557641,0.0


In [18]:
real_data.describe()

Unnamed: 0,month,temperature,RH,wind_speed,day_night,daily_cdd,daily_hdd,burned_area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,3.355072,13.072474,0.474006,6.001407,0.927536,4.323034,1.25056,0.0
std,1.351893,6.324552,0.157372,4.375819,0.259568,4.764245,2.542306,0.0
min,1.0,-3.185645,0.142141,0.193362,0.0,0.0,0.0,0.0
25%,2.0,9.201868,0.358795,2.689507,1.0,0.0,0.0,0.0
50%,3.0,13.149942,0.436581,4.767321,1.0,3.149942,0.0,0.0
75%,4.0,16.641107,0.57702,8.415886,1.0,6.641107,0.798132,0.0
max,8.0,33.99679,0.97553,22.422637,1.0,23.99679,13.185645,0.0


In [8]:
synthetic_data.to_csv("synthetic_zero.csv", index=False)

In [9]:
ctgan.save("generator.pkl")