In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import pandas as pd
import numpy as np

In [2]:
input_dim = 13040
latent_dim = 16

train_val = pd.read_csv("train_values.csv")
train_labels = pd.read_csv("train_labels.csv")

geo_data_raw = train_val[["geo_level_1_id","geo_level_2_id","geo_level_3_id"]]
geo_data_raw = geo_data_raw.astype("category")
geo_data = pd.get_dummies(geo_data_raw)
geo_data.shape

(260601, 13040)

In [11]:
geo_np = geo_data.to_numpy()

rand_ind = np.random.choice(geo_np.shape[0], size = 60000, replace=False)
#geo_np = geo_np[np.random.randint(geo_np.shape[0], size=60000,), :]
geo_np = geo_np[rand_ind,:]

geo_np.shape

(60000, 13040)

In [12]:
geo_np_test = geo_np[-10000:,:]
geo_np = geo_np[0:50000,:]
print(geo_np.shape)
print(geo_np_test.shape)

(50000, 13040)
(10000, 13040)


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dummy_data = torch.from_numpy(geo_np).float().to(device)
dummy_data_test = torch.from_numpy(geo_np_test).float().to(device)

In [17]:
# Check if a GPU is available
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
            nn.Sigmoid()  # Sigmoid activation to scale values between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

torch.manual_seed(42)

model = Autoencoder(input_dim, latent_dim).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

batch_size = 1024
data_loader = DataLoader(TensorDataset(dummy_data), batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(TensorDataset(dummy_data_test), batch_size=batch_size, shuffle=True)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for batch in data_loader:
        inputs, = batch
        inputs = inputs.to(device)

        outputs = model(inputs)

        loss = criterion(outputs, inputs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    total_test_loss = 0.0
    num_batches_test = 0

    with torch.no_grad():
        for test_batch in test_data_loader:
            test_inputs, = test_batch
            test_inputs = test_inputs.to(device)

            test_outputs = model(test_inputs)

            test_loss = criterion(test_outputs, test_inputs)
            total_test_loss += test_loss.item()
            num_batches_test += 1

    average_test_loss = total_test_loss / num_batches_test

    #print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.6f}')
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.6f}, Test Loss: {average_test_loss:.6f}')


torch.save(model.state_dict(), 'autoencoder_model.pth')


Epoch [1/10], Train Loss: 0.000245, Test Loss: 0.000241
Epoch [2/10], Train Loss: 0.000230, Test Loss: 0.000230
Epoch [3/10], Train Loss: 0.000228, Test Loss: 0.000228
Epoch [4/10], Train Loss: 0.000227, Test Loss: 0.000226
Epoch [5/10], Train Loss: 0.000226, Test Loss: 0.000226
Epoch [6/10], Train Loss: 0.000226, Test Loss: 0.000226
Epoch [7/10], Train Loss: 0.000226, Test Loss: 0.000226
Epoch [8/10], Train Loss: 0.000226, Test Loss: 0.000226
Epoch [9/10], Train Loss: 0.000226, Test Loss: 0.000226
Epoch [10/10], Train Loss: 0.000225, Test Loss: 0.000226


In [10]:
geo_np_all = geo_data.to_numpy()
geo_np_all.shape

(260601, 13040)

In [11]:
print(model)

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=13040, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=16, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=16, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=13040, bias=True)
    (3): Sigmoid()
  )
)


In [12]:
weight_1 = model.encoder[0].weight.detach().numpy()
weight_2 = model.encoder[2].weight.detach().numpy()
# x * (x > 0)

In [13]:
geo_new = np.empty((1,16))
for i in range(0,26):
    # print(10000*i)
    # print(10000*i+9999+1)

    start = 10000*i
    end = 10000*i+10000

    temp = geo_np_all[start:end,:]
    #print(temp.shape[0])

    out = weight_1 @ temp.T
    out = out * (out > 0)
    out = weight_2 @ out
    out = out.T

    geo_new = np.vstack([geo_new, out])

geo_new = np.delete(geo_new,0,axis=0)
geo_new.shape

(260000, 16)

In [31]:
temp = geo_np_all[-601:,:]
print(temp.shape)

(601, 13040)


In [32]:
out = weight_1 @ temp.T
out = out * (out > 0)
out = weight_2 @ out
out = out.T

geo_new = np.vstack([geo_new, out])
geo_new.shape

(260601, 16)

In [33]:
np.savetxt("Geo_Data_Train_Sigmoid.csv", geo_new, delimiter=",")