In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

# Shared HDFS client; every parquet read and write in this notebook passes it
# as the filesystem argument.
# NOTE(review): `pa.hdfs.connect` is pyarrow's legacy HDFS entry point; newer
# pyarrow releases favour `pyarrow.fs.HadoopFileSystem` — confirm against the
# installed pyarrow version before upgrading.
fs = pa.hdfs.connect()

  fs = pa.hdfs.connect()


In [2]:
from sklearn.metrics import mean_squared_error as mse, mean_absolute_percentage_error as mape, r2_score
def metrics(actual, predicted):
    """Print RMSE, MAPE and R^2 of ``predicted`` measured against ``actual``.

    Args:
        actual: ground-truth target values.
        predicted: model outputs, aligned element-wise with ``actual``.

    Returns:
        None; the three scores are written to stdout, one per line.
    """
    # RMSE is the square root of the mean squared error.
    root_mse = mse(actual, predicted) ** 0.5
    print(f"rmse: {root_mse}")

    abs_pct_error = mape(actual, predicted)
    print(f"mape: {abs_pct_error}")

    determination = r2_score(actual, predicted)
    print(f"r2_score: {determination}")

In [3]:
class NeuralNetwork(nn.Module):
  """Feed-forward network: two (Linear -> ReLU -> Dropout) stages, then Linear -> Sigmoid.

  The sigmoid head keeps every output strictly inside (0, 1).
  """

  def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_rate = 0.5):
    """Create the layers.

    Args:
      input_size: number of input features.
      hidden_size1: width of the first hidden layer.
      hidden_size2: width of the second hidden layer.
      output_size: number of outputs.
      dropout_rate: drop probability shared by both dropout layers.
    """
    super().__init__()
    self.input_size, self.output_size = input_size, output_size

    # First hidden stage.
    self.fc1 = nn.Linear(input_size, hidden_size1)
    self.relu1 = nn.ReLU()
    self.dropout1 = nn.Dropout(dropout_rate)

    # Second hidden stage.
    self.fc2 = nn.Linear(hidden_size1, hidden_size2)
    self.relu2 = nn.ReLU()
    self.dropout2 = nn.Dropout(dropout_rate)

    # Output head.
    self.fc3 = nn.Linear(hidden_size2, output_size)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Run ``x`` through both hidden stages and the sigmoid head."""
    hidden = self.dropout1(self.relu1(self.fc1(x)))
    hidden = self.dropout2(self.relu2(self.fc2(hidden)))
    return self.sigmoid(self.fc3(hidden))


class Model:
  """Small compile/fit/predict training wrapper around a ``torch.nn.Module`` class.

  Typical use: ``Model(NetClass, kwargs)`` -> ``compile(...)`` -> ``fit(...)`` ->
  ``predict(...)``. When validation data is supplied to ``fit`` the weights with
  the lowest validation loss are restored once training ends.
  """

  class _Dataset(Dataset):
    """In-memory dataset holding features and targets as float tensors on the active device."""

    def __init__(self, x, y):
      self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
      self.x = torch.tensor(x).float().to(self.device)
      # Targets become a column vector so they line up with an (n, 1) model output.
      self.y = torch.tensor(y).float().reshape(-1, 1).to(self.device)

    def __len__(self, ):
      return len(self.x)

    def __getitem__(self, idx):
      return self.x[idx], self.y[idx]

  def __init__(self, neural_network, neural_network_layers_info):
    """Instantiate the network on the active device.

    Args:
      neural_network: ``nn.Module`` subclass (the class, not an instance).
      neural_network_layers_info: keyword arguments passed to its constructor.
    """
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self._neural_network = neural_network
    self._neural_network_layers_info = neural_network_layers_info
    self.model = self._build_model()
    self._training_done = False
    self._compile_done = False
    self._validation_loss_history = []

  def _build_model(self):
    """Create a freshly initialised network and move it to ``self.device``."""
    return self._neural_network(**self._neural_network_layers_info).to(self.device)

  def __clear(self, ):
    """Discard the current network and reset the compile/training flags."""
    # The replacement must live on the same device as the data tensors, otherwise
    # re-training on a GPU fails with a device mismatch.
    self.model = self._build_model()
    self._training_done = False
    self._compile_done = False
    self._validation_loss_history = []

  def compile(self, criterion, optimizer, learning_rate):
    """Attach the loss and build the optimizer for the current network.

    Args:
      criterion: loss callable invoked as ``criterion(prediction, target)``.
      optimizer: optimizer factory invoked as ``optimizer(params, lr=...)``.
      learning_rate: learning rate handed to the optimizer factory.

    Raises:
      Exception: if the model has already been trained.
    """
    if self._training_done:
      raise Exception("Cannot compile an already trained model")

    if self._compile_done:
      print("Warn: Previously trained Model will be replaced.")
      self.__clear()

    self.criterion = criterion
    self._optimizer = optimizer
    self._lr = learning_rate
    self.optimizer = self._optimizer(self.model.parameters(), lr = self._lr)
    self._compile_done = True

  def _get_dataset_loader(self, x, y, batch_size):
    """Wrap ``x``/``y`` in a shuffling DataLoader that drops the last partial batch."""
    dataset = self._Dataset(x, y)
    return DataLoader(dataset, batch_size = batch_size, shuffle=True, drop_last=True)

  def fit(self, x_train, y_train, num_epochs, batch_size = 32, x_val = None, y_val = None, early_stopping = None):
    """Train the network, optionally tracking a validation set.

    Args:
      x_train, y_train: training features and targets (array-likes).
      num_epochs: maximum number of epochs.
      batch_size: mini-batch size; the trailing partial batch is dropped.
      x_val, y_val: optional validation data; both must be given to enable
        validation and best-weights restoration.
      early_stopping: number of epochs without validation improvement that is
        tolerated before stopping; ``None`` (or any falsy value) disables it.

    Raises:
      Exception: if ``compile`` has not been called.
      ValueError: if the training data yields no full batch.
    """
    if not self._compile_done:
      raise Exception("compile before training")

    train_data_loader = self._get_dataset_loader(x_train, y_train, batch_size)
    if len(train_data_loader) == 0:
      # drop_last=True discards partial batches, so fewer rows than batch_size
      # means nothing would be trained (and the mean loss would divide by zero).
      raise ValueError("training data has fewer rows than batch_size; no full batch to train on")

    if self._training_done:
      print("Warn: Previously trained Model will be replaced.")
      self.__clear()
      self.compile(self.criterion, self._optimizer, self._lr)

    do_validation = (x_val is not None) and (y_val is not None)

    if do_validation:
      validation_data = self._Dataset(x_val, y_val)
      if not early_stopping:
        early_stopping = np.inf
      self._fit_with_validation(train_data_loader, validation_data, epochs = num_epochs, early_stopping = early_stopping)
    else:
      self._fit_without_validation(train_data_loader, epochs = num_epochs)

    self._training_done = True

  def predict(self, x_test):
    """Return the network output for ``x_test`` as a NumPy array (eval mode, no gradients)."""
    self.model.eval()
    with torch.no_grad():
      x_test = torch.tensor(x_test).float().to(self.device)
      y_test = self.model(x_test)

      return y_test.cpu().detach().numpy()

  def _train_one_epoch(self, train_data_loader):
    """Run one optimisation pass over the loader and return the mean batch loss."""
    self.model.train()
    train_loss = []

    for x, y in train_data_loader:
      y_pred = self.model(x)
      loss = self.criterion(y_pred, y)
      train_loss.append(float(loss.item()))

      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()

    return sum(train_loss)/len(train_loss)

  def _fit_with_validation(self, train_data_loader, validation_data, epochs, early_stopping):
    """Train with per-epoch validation, early stopping and best-weights restoration."""
    best_val_loss = np.inf
    best_epoch = 0
    val_loss_history = []
    best_state = None
    for epoch in range(epochs):
      # `epoch` counts completed epochs here and `best_epoch` is 1-based, so the
      # difference is the number of epochs since the last improvement.
      if epoch - best_epoch > early_stopping:
        print("Early stopping at epoch:", epoch+1)
        break

      mean_train_loss = self._train_one_epoch(train_data_loader)

      self.model.eval()
      with torch.no_grad():
        y_pred = self.model(validation_data.x)
        val_loss = float(self.criterion(y_pred, validation_data.y))
      val_loss_history.append(val_loss)
      if val_loss < best_val_loss:
        print(f"Best validation loss at epoch={epoch+1}, saving model")
        best_val_loss = val_loss
        best_epoch = epoch + 1
        # state_dict() returns references to the live parameter tensors, so each
        # tensor is cloned; a shallow dict copy would keep tracking later updates.
        best_state = {name: tensor.detach().clone() for name, tensor in self.model.state_dict().items()}

      print(f"Epoch #{epoch + 1} - training loss: {mean_train_loss} \t validation loss: {val_loss}")

    # No snapshot exists when zero epochs ran or the validation loss was never
    # below infinity (e.g. NaN); keep the current weights in that case.
    if best_state is not None:
      self.model.load_state_dict(best_state)
    self._validation_loss_history = val_loss_history

  def _fit_without_validation(self, train_data_loader, epochs):
    """Train for a fixed number of epochs, printing the mean training loss of each."""
    for epoch in range(epochs):
      mean_train_loss = self._train_one_epoch(train_data_loader)
      print(f"Epoch #{epoch + 1} - training loss: {mean_train_loss}")


In [4]:
def train_val_split(train, val_size = 0.2, bucket_size = 50, target_col = "yQuantity", random_state = None):
    """Split ``train`` into train/validation parts, stratified on the sorted target.

    Rows are sorted by ``target_col`` and cut into consecutive buckets of
    ``bucket_size`` rows; from every bucket ``int(len(bucket) * val_size)`` rows
    are drawn for validation, so both parts cover the whole target range.

    Args:
        train: DataFrame containing ``target_col``.
        val_size: fraction of each bucket sent to validation (0..1).
        bucket_size: number of consecutive sorted rows per bucket.
        target_col: column used for sorting.
        random_state: optional seed for a private ``RandomState``; when ``None``
            NumPy's global random state is used.

    Returns:
        ``(train_data, validation_data)`` — disjoint frames indexed by position
        in the sorted input.

    Raises:
        ValueError: if ``val_size`` is outside [0, 1] or ``bucket_size`` < 1.
    """
    if not 0 <= val_size <= 1:
        raise ValueError("val_size must be between 0 and 1")
    if bucket_size < 1:
        raise ValueError("bucket_size must be a positive integer")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    train = train.sort_values(by=[target_col]).reset_index(drop=True)
    n_rows = train.shape[0]

    picked = []
    for left in range(0, n_rows, bucket_size):
        right = min(left + bucket_size, n_rows)
        cnt = int((right - left) * val_size)
        if cnt:
            # Sample WITHOUT replacement: drawing with replacement yields duplicate
            # validation rows and a smaller-than-requested validation set.
            picked.append(rng.choice(np.arange(left, right), size=cnt, replace=False))

    val_indexes = np.concatenate(picked).astype(int) if picked else np.array([], dtype=int)

    validation_data = train.iloc[val_indexes]
    train_data = train[~train.index.isin(val_indexes)]
    return train_data, validation_data

## Model with baseline model's features

### Data

In [9]:
# HDFS locations of the baseline-feature datasets for one pipeline run.
base_dir = "/data/Archive/bhavesh/InventoryPrediction"
run_date = "2023-04-30"
run_dir = "/".join((base_dir, run_date))

# One train and one test directory per product category.
train_path = {
    category: f"{run_dir}/TransformedDataset/baseline/{category}/data/train"
    for category in ("menShirts", "womenKurtas")
}
test_path = {
    category: f"{run_dir}/TransformedDataset/baseline/{category}/data/test"
    for category in ("menShirts", "womenKurtas")
}

In [6]:
def fetch_data(of):
    """Read the train and test parquet datasets for one category into DataFrames.

    Args:
        of: category key used to look up ``train_path`` and ``test_path``.

    Returns:
        ``(train, test)`` DataFrames. Both shapes and the train frame's
        ``info()`` summary are printed as a side effect.
    """
    frames = []
    for locations in (train_path, test_path):
        dataset = pq.ParquetDataset(locations[of], fs)
        frames.append(dataset.read().to_pandas())
    train, test = frames

    for frame in frames:
        print(frame.shape)
    train.info()

    return train, test

### Experiment

In [7]:
def experiment(of, epochs = 200, lr = 0.001):
    """Train the baseline-feature network for one category and write its predictions.

    Steps: load data, min-max scale features and target on the training set,
    split off a validation set, train with validation, then predict the test
    set and store ``productid`` / ``predictedyQuantity`` as snappy parquet under
    ``run_dir``.

    Args:
        of: category key understood by ``fetch_data`` (e.g. ``"menShirts"``).
        epochs: maximum number of training epochs.
        lr: learning rate for Adam.
    """
    train, test = fetch_data(of)

    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    # Columns whose name ends in "0" or "1" are left unscaled — presumably
    # already-binary indicator columns; TODO confirm against the dataset schema.
    cols = list(filter(lambda x: not (x.endswith("0") or x.endswith("1")), train.drop(columns=["productid", "similargrouplevel", "yQuantity"]).columns))
    print(cols)
    train.loc[:, cols] = scaler_x.fit_transform(train[cols])
    train["yQuantity"] = scaler_y.fit_transform(train[["yQuantity"]])
    print(scaler_y.data_min_, scaler_y.data_max_)

    train, val = train_val_split(train)
    print(train.shape, val.shape)

    x_train, y_train = train.drop(columns=["productid", "similargrouplevel", "yQuantity"]), train["yQuantity"]
    x_val, y_val = val.drop(columns=["productid", "similargrouplevel", "yQuantity"]), val["yQuantity"]

    # Input width follows the feature matrix instead of a hard-coded 33, so a
    # schema change cannot silently desynchronise the network and the data.
    neural_network = Model(NeuralNetwork, {"input_size": x_train.shape[1], "output_size": 1, "hidden_size1": 66, "hidden_size2": 33})
    print(neural_network.device)
    neural_network.compile(nn.BCELoss(), optim.Adam, lr)
    neural_network.fit(x_train.values, y_train.values, epochs, x_val=x_val.values, y_val=y_val.values)

    # Select test features in the training column order; `.values` drops the
    # column names, so the order is what maps columns onto network inputs.
    x_test = test[list(x_train.columns)].copy()
    # Re-use the scaler fitted on the training data. Re-fitting on the test set
    # would scale test features differently from what the network was trained on.
    x_test.loc[:, cols] = scaler_x.transform(x_test[cols])
    y_pred = neural_network.predict(x_test.values).reshape(-1)
    y_pred = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)

    predictions = pd.DataFrame({"productid": test["productid"], "predictedyQuantity": y_pred})
    print(predictions.head())

    pq.write_to_dataset(table=pa.Table.from_pandas(predictions), root_path=f"{run_dir}/experiments/exp_neural_network/predictions/ModelForEachBrickProductLevel/{of}/predictions", filesystem=fs, compression="snappy", use_legacy_dataset=True)

In [10]:
# Train and write predictions for the menShirts category (up to 500 epochs, lr 5e-4).
experiment("menShirts", epochs = 500, lr = 0.0005)

(37698, 36)
(97077, 35)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37698 entries, 0 to 37697
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   similargrouplevel       37698 non-null  object 
 1   productid               37698 non-null  object 
 2   0_monthSales            37698 non-null  float64
 3   1_monthSales            37698 non-null  float64
 4   2_monthSales            37698 non-null  float64
 5   3_monthSales            37698 non-null  float64
 6   4_monthSales            37698 non-null  float64
 7   5_monthSales            37698 non-null  float64
 8   6_monthSales            37698 non-null  float64
 9   7_monthSales            37698 non-null  float64
 10  8_monthSales            37698 non-null  float64
 11  9_monthSales            37698 non-null  float64
 12  10_monthSales           37698 non-null  float64
 13  11_monthSales           37698 non-null  float64
 14  avgSales      

  pq.write_to_dataset(table=pa.Table.from_pandas(predictions), root_path=f"{run_dir}/experiments/exp_neural_network/predictions/ModelForEachBrickProductLevel/{of}/predictions", filesystem=fs, compression="snappy", use_legacy_dataset=True)


In [11]:
# Train and write predictions for the womenKurtas category (up to 500 epochs, lr 5e-4).
experiment("womenKurtas", epochs = 500, lr = 0.0005)

(22001, 36)
(55630, 35)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22001 entries, 0 to 22000
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   similargrouplevel       22001 non-null  object 
 1   productid               22001 non-null  object 
 2   0_monthSales            22001 non-null  float64
 3   1_monthSales            22001 non-null  float64
 4   2_monthSales            22001 non-null  float64
 5   3_monthSales            22001 non-null  float64
 6   4_monthSales            22001 non-null  float64
 7   5_monthSales            22001 non-null  float64
 8   6_monthSales            22001 non-null  float64
 9   7_monthSales            22001 non-null  float64
 10  8_monthSales            22001 non-null  float64
 11  9_monthSales            22001 non-null  float64
 12  10_monthSales           22001 non-null  float64
 13  11_monthSales           22001 non-null  float64
 14  avgSales      

  pq.write_to_dataset(table=pa.Table.from_pandas(predictions), root_path=f"{run_dir}/experiments/exp_neural_network/predictions/ModelForEachBrickProductLevel/{of}/predictions", filesystem=fs, compression="snappy", use_legacy_dataset=True)


## Model with months 0–3 sales, wishlist, and availableQuantity features

### Data

In [None]:
# Dataset locations for the second experiment.
# NOTE: this rebinds `base_dir`, `train_path` and `test_path`, which the
# baseline cells earlier in the notebook also read; re-run the baseline path
# cell before calling `fetch_data` / `experiment` again.
base_dir = "/data/Archive/bhavesh/inventoryPrediction/temp/local"
train_path = {
    category: f"{base_dir}/{category}/data/train"
    for category in ("menShirts", "womenKurtas")
}
test_path = {
    category: f"{base_dir}/{category}/data/test"
    for category in ("menShirts", "womenKurtas")
}

def fetch_data2(of):
    """Read the reduced-feature train and test datasets for one category.

    Only the id columns plus the month 0-3 ``sales``, ``wishlist`` and
    ``availableQuantity`` columns are loaded; the train frame additionally
    carries ``yQuantity``.

    Args:
        of: category key used to look up ``train_path`` and ``test_path``.

    Returns:
        ``(train, test)`` DataFrames. Both shapes and the train frame's
        ``info()`` summary are printed as a side effect.
    """
    id_cols = ["productid", "similargrouplevel"]
    feature_cols = [
        *(f"{i}_sales" for i in range(4)),
        *(f"{i}_wishlist" for i in range(4)),
        *(f"{i}_availableQuantity" for i in range(4)),
    ]
    cols_to_read = id_cols + feature_cols

    train_dataset = pq.ParquetDataset(train_path[of], fs)
    train = train_dataset.read(columns=cols_to_read + ["yQuantity"]).to_pandas()
    test_dataset = pq.ParquetDataset(test_path[of], fs)
    test = test_dataset.read(columns=cols_to_read).to_pandas()

    for frame in (train, test):
        print(frame.shape)
    train.info()

    return train, test

### Experiment

In [None]:
def experiment2(of, epochs = 200, lr = 0.001):
    """Train the reduced-feature network for one category and write its predictions.

    Steps: load data via ``fetch_data2``, min-max scale all features and the
    target on the training set, split off a validation set, train with
    validation, then predict the test set and store ``productid`` /
    ``predictedyQuantity`` as snappy parquet under ``base_dir``.

    Args:
        of: category key understood by ``fetch_data2`` (e.g. ``"menShirts"``).
        epochs: maximum number of training epochs.
        lr: learning rate for Adam.
    """
    train, test = fetch_data2(of)

    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    cols = train.drop(columns=["productid", "similargrouplevel", "yQuantity"]).columns
    print(cols)
    train.loc[:, cols] = scaler_x.fit_transform(train[cols])
    train["yQuantity"] = scaler_y.fit_transform(train[["yQuantity"]])
    print(scaler_y.data_min_, scaler_y.data_max_)

    train, val = train_val_split(train)
    print(train.shape, val.shape)

    x_train, y_train = train.drop(columns=["productid", "similargrouplevel", "yQuantity"]), train["yQuantity"]
    x_val, y_val = val.drop(columns=["productid", "similargrouplevel", "yQuantity"]), val["yQuantity"]

    # Input width follows the feature matrix instead of a hard-coded 12, so a
    # change to the column list cannot silently desynchronise network and data.
    neural_network = Model(NeuralNetwork, {"input_size": x_train.shape[1], "output_size": 1, "hidden_size1": 24, "hidden_size2": 6})
    print(neural_network.device)
    neural_network.compile(nn.BCELoss(), optim.Adam, lr)
    neural_network.fit(x_train.values, y_train.values, epochs, x_val=x_val.values, y_val=y_val.values)

    # Select test features in the training column order; `.values` drops the
    # column names, so the order is what maps columns onto network inputs.
    x_test = test[list(x_train.columns)].copy()
    # Re-use the scaler fitted on the training data. Re-fitting on the test set
    # would scale test features differently from what the network was trained on.
    x_test.loc[:, cols] = scaler_x.transform(x_test[cols])
    y_pred = neural_network.predict(x_test.values).reshape(-1)
    y_pred = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)

    predictions = pd.DataFrame({"productid": test["productid"], "predictedyQuantity": y_pred})
    print(predictions.head())

    pq.write_to_dataset(table=pa.Table.from_pandas(predictions), root_path=f"{base_dir}/experiments/exp_neural_network_new/predictions/ModelForEachBrickProductLevel/{of}/predictions", filesystem=fs, compression="snappy", use_legacy_dataset=True)

In [None]:
# Train and write reduced-feature predictions for menShirts (up to 200 epochs, lr 1e-3).
experiment2("menShirts", epochs = 200, lr = 0.001)

In [None]:
# Train and write reduced-feature predictions for womenKurtas (up to 200 epochs, lr 1e-3).
experiment2("womenKurtas", epochs = 200, lr = 0.001)