In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split



In [2]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
# from sklearn.linear_model import LinearRegression
# import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
# Open the HDFS connection; `fs` is handed to pq.ParquetDataset when loading the data
# and is the upload target in the export loops further down.
# NOTE(review): the source line echoed in this cell's output looks like a warning raised
# by this call - presumably pyarrow's legacy-HDFS deprecation; confirm before upgrading pyarrow.
fs = pa.hdfs.connect()

  fs = pa.hdfs.connect()


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

class MyModel(nn.Module):
    """Feed-forward network stacked on a learned embedding table.

    Layer sequence: embedding lookup -> flatten -> Linear(embedding_size, 50)
    -> ReLU -> Linear(50, 15) -> ReLU -> Linear(15, 1) -> sigmoid.

    Args:
        input_dim: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector (also the input width
            of the first linear layer).
    """

    def __init__(self, input_dim, embedding_size):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_size)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(embedding_size, 50)
        self.fc2 = nn.Linear(50, 15)
        self.fc3 = nn.Linear(15, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``(x_embed, prediction)`` for a tensor of integer ids.

        ``x_embed`` is the raw embedding lookup for ``x``; ``prediction`` is the
        sigmoid output of the dense head applied to the flattened embeddings.
        """
        x_embed = self.embedding(x)
        hidden = self.flatten(x_embed)
        # Both hidden layers share the same Linear -> ReLU shape.
        for dense in (self.fc1, self.fc2):
            hidden = torch.relu(dense(hidden))
        return x_embed, self.sigmoid(self.fc3(hidden))


def calculate_accuracy(predictions, labels):
    """Return the fraction of rows whose arg-max column (dim 1) equals the label."""
    top_class = predictions.max(dim=1).indices
    hits = (top_class == labels).sum().item()
    return hits / labels.size(0)

def calculate_f1_score(predictions, labels):
    """Return the macro-averaged F1 of the arg-max class (dim 1) against ``labels``.

    Both tensors are converted with ``.numpy()`` before being passed to
    sklearn's ``f1_score``.
    """
    predicted = predictions.max(dim=1).indices.numpy()
    actual = labels.numpy()
    return f1_score(actual, predicted, average='macro')

def calculate_mae(predictions, labels):
    """Return the mean absolute error between the two tensors as a Python float."""
    return nn.functional.l1_loss(predictions, labels).item()

def calculate_error(predictions, labels):
    """Return the cross-entropy loss of ``predictions`` against ``labels`` as a Python float."""
    return nn.functional.cross_entropy(predictions, labels).item()
    
def train_model_for_embedding(epochs, input_dimension, embedding_size, x_data, y_data, x_val, y_val):
    """Train ``MyModel`` full-batch and return the best checkpoint with its embeddings.

    Each epoch performs one Adam step on the whole training set with a BCE
    loss, then evaluates on the validation set. The weights with the lowest
    validation loss are restored before returning.

    Args:
        epochs: Number of full-batch optimisation steps.
        input_dimension: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector.
        x_data: Training ids (converted to a ``long`` tensor).
        y_data: Training targets (converted to a ``float`` tensor and given a
            trailing dimension of size 1).
        x_val: Validation ids, same convention as ``x_data``.
        y_val: Validation targets, same convention as ``y_data``.

    Returns:
        ``(model, best_embeds)`` where ``model`` carries the best-validation
        weights (left in eval mode) and ``best_embeds`` holds the embedding of
        every row of ``x_data`` computed with those weights.
    """
    model = MyModel(input_dim=input_dimension, embedding_size=embedding_size)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters())

    x_data = torch.tensor(x_data, dtype=torch.long)
    y_data = torch.tensor(y_data, dtype=torch.float).unsqueeze(1)
    x_val = torch.tensor(x_val, dtype=torch.long)
    y_val = torch.tensor(y_val, dtype=torch.float).unsqueeze(1)

    best_state = None
    best_val_loss = np.inf

    for epoch in range(epochs):
        model.train()  # Set model to training mode
        optimizer.zero_grad()
        _, outputs = model(x_data)
        loss = criterion(outputs, y_data)
        loss.backward()
        optimizer.step()

        train_mae = calculate_mae(outputs, y_data)
        train_error = loss.item()

        model.eval()  # Set model to evaluation mode
        with torch.no_grad():
            _, val_outputs = model(x_val)
            val_error = criterion(val_outputs, y_val).item()
            val_mae = calculate_mae(val_outputs, y_val)
            if val_error < best_val_loss:
                best_val_loss = val_error
                # state_dict() tensors share storage with the live parameters, and
                # dict.copy() is shallow; clone each tensor so later optimizer steps
                # cannot overwrite the snapshot.
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }

        print('Epoch [{}/{}], Train Loss: {:.4f}, Train mae: {:.4f}, Val Loss: {:.4f}, Val mae: {:.4f}'
              .format(epoch+1, epochs, train_error, train_mae, val_error, val_mae))

    # best_state stays None when no epoch ran or the validation loss was never
    # finite; in that case keep the current weights instead of crashing.
    if best_state is not None:
        model.load_state_dict(best_state)

    # Recompute the embeddings from the restored weights so they match the
    # returned model rather than a pre-step forward pass of some earlier epoch.
    model.eval()
    with torch.no_grad():
        best_embeds, _ = model(x_data)

    return model, best_embeds


In [4]:
# Read the full parquet dataset from HDFS (through the `fs` connection) into pandas
# and preview the first rows.
df = (
    pq.ParquetDataset(
        "/data/Archive/bhavesh/InventoryPrediction/2023-04-30/Embeddings/data",
        fs,
    )
    .read()
    .to_pandas()
)
df.head()

Unnamed: 0,productid,similargrouplevel,colorfamily,brandname,sleeve,pattern,styletype,yQuantity
0,469215738_blue,830216013,blue,high-star,full-length sleeve,check,classic,2.0
1,441122912_maroon,830303011,maroon,avaasa,3-4th sleeve,floral,flared,12.0
2,464962116_cream,830303011,cream,juniper,,embellished,straight,1.0
3,465767578_purple,830216013,purple,vertusy,full-length sleeve,solid,indian,18.0
4,441029617_blue,830303011,blue,siyahi,3-4th sleeve,floral,regular-kurtas,2.0


In [5]:
# List the distinct similargrouplevel codes; the next cell splits the frame on them.
df["similargrouplevel"].unique()

array(['830216013', '830303011'], dtype=object)

In [6]:
# One frame per similargrouplevel code. The grouping column is dropped once it has
# served as the filter, and missing values are replaced by the literal string "Null".
menShirts = (
    df.loc[df["similargrouplevel"] == "830216013"]
    .drop(columns=["similargrouplevel"])
    .fillna("Null")
)
womenKurtas = (
    df.loc[df["similargrouplevel"] == "830303011"]
    .drop(columns=["similargrouplevel"])
    .fillna("Null")
)

In [7]:
def generate_embeddings(df, attribute, proxy_name, embedding_size = 2, epochs=500):
    """Learn an embedding per distinct value of ``attribute`` from ``yQuantity``.

    ``yQuantity`` is min-max scaled to [0, 1], ``attribute`` is label-encoded,
    the rows are split 80/20 (``random_state=42``) and the model is trained via
    ``train_model_for_embedding``. Only values present in the training split
    appear in the result.

    Args:
        df: Frame containing ``attribute`` and a numeric ``yQuantity`` column.
        attribute: Name of the categorical column to embed.
        proxy_name: Prefix for the embedding columns (``{proxy_name}_{i}``).
        embedding_size: Number of embedding dimensions.
        epochs: Number of training epochs.

    Returns:
        DataFrame with one row per distinct training value and the columns
        ``{proxy_name}_0..``, ``attribute``, ``magnitude`` (L2 norm of the
        embedding) and ``normalized_{proxy_name}_0..`` (embedding / magnitude).

    Raises:
        ValueError: If ``yQuantity`` holds a single distinct value, because
            min-max scaling would divide by zero and yield NaN targets.
    """
    y_data = df["yQuantity"].values
    y_min, y_max = y_data.min(), y_data.max()
    if y_max == y_min:
        raise ValueError(
            "yQuantity is constant; cannot min-max scale the target for attribute "
            f"{attribute!r}"
        )
    y_data = (y_data - y_min) / (y_max - y_min)

    label_encoder = LabelEncoder()
    x_data = label_encoder.fit_transform(df[attribute])

    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=42)
    input_dimension = len(label_encoder.classes_)

    model, embed_tensor = train_model_for_embedding(epochs, input_dimension, embedding_size, x_train, y_train, x_val, y_val)
    embed = embed_tensor.detach().numpy()

    # Row i of `embed` is the embedding of x_train[i], so the columns can be
    # placed side by side directly; no index merge is needed.
    embed_columns = [proxy_name + "_" + str(i) for i in range(embedding_size)]
    attr_df = pd.DataFrame(embed, columns=embed_columns)
    attr_df.insert(0, "value", x_train)
    attr_df[attribute] = label_encoder.inverse_transform(attr_df["value"])

    # Magnitude, i.e. sqrt(x_1^2 + x_2^2 + ... + x_embedding_size^2)
    attr_df["magnitude"] = np.sqrt(np.square(attr_df[embed_columns]).sum(axis=1))
    for column in embed_columns:
        attr_df[f"normalized_{column}"] = attr_df[column] / attr_df["magnitude"]

    # Every training row with the same encoded value carries the same embedding,
    # so de-duplicating leaves one row per distinct value.
    return attr_df.drop_duplicates().reset_index(drop=True).drop(columns=["value"])

In [8]:
# (source column, proxy name) pairs iterated by the export loops: the proxy name
# prefixes the embedding columns and names the local CSV file.
attr_proxy = dict(
    colorfamily="color",
    brandname="brand",
    styletype="style",
    pattern="pattern",
    sleeve="sleeve",
).items()
attr_proxy

dict_items([('colorfamily', 'color'), ('brandname', 'brand'), ('styletype', 'style'), ('pattern', 'pattern'), ('sleeve', 'sleeve')])

### Men shirts

In [10]:
# Train one embedding table per attribute for men's shirts, stage it as a local CSV
# named after the proxy, and upload it to HDFS under the attribute name.
ms_local_dir = "/app/data/bhavesh/inventoryPrediction/embeddings/menShirts"
ms_hdfs_dir = "/data/Archive/bhavesh/InventoryPrediction/2023-04-30/Embeddings/menShirts"
# Loop-invariant; exist_ok avoids the check-then-create race.
os.makedirs(ms_local_dir, exist_ok=True)

for attr, proxy in attr_proxy:
    attr_ms = generate_embeddings(menShirts, attr, proxy, epochs=500)
    ms_local_csv = f"{ms_local_dir}/{proxy}.csv"
    try:
        attr_ms.to_csv(ms_local_csv, index=False)
        with open(ms_local_csv, "rb") as f:
            fs.upload(f"{ms_hdfs_dir}/{attr}.csv", f)
    finally:
        # Always remove the staging file, even when writing or uploading fails.
        if os.path.exists(ms_local_csv):
            os.remove(ms_local_csv)

Epoch [1/500], Train Loss: 0.7369, Train mae: 0.5176, Val Loss: 0.7284, Val mae: 0.5139
Epoch [2/500], Train Loss: 0.7282, Train mae: 0.5134, Val Loss: 0.7200, Val mae: 0.5098
Epoch [3/500], Train Loss: 0.7198, Train mae: 0.5094, Val Loss: 0.7120, Val mae: 0.5059
Epoch [4/500], Train Loss: 0.7118, Train mae: 0.5055, Val Loss: 0.7043, Val mae: 0.5022
Epoch [5/500], Train Loss: 0.7042, Train mae: 0.5018, Val Loss: 0.6971, Val mae: 0.4986
Epoch [6/500], Train Loss: 0.6970, Train mae: 0.4982, Val Loss: 0.6900, Val mae: 0.4951
Epoch [7/500], Train Loss: 0.6899, Train mae: 0.4947, Val Loss: 0.6831, Val mae: 0.4917
Epoch [8/500], Train Loss: 0.6831, Train mae: 0.4912, Val Loss: 0.6764, Val mae: 0.4883
Epoch [9/500], Train Loss: 0.6764, Train mae: 0.4879, Val Loss: 0.6699, Val mae: 0.4850
Epoch [10/500], Train Loss: 0.6699, Train mae: 0.4846, Val Loss: 0.6636, Val mae: 0.4817
Epoch [11/500], Train Loss: 0.6637, Train mae: 0.4814, Val Loss: 0.6576, Val mae: 0.4786
Epoch [12/500], Train Loss: 0.

### Women Kurtas

In [11]:
# Train one embedding table per attribute for women's kurtas, stage it as a local CSV
# named after the proxy, and upload it to HDFS under the attribute name.
wk_local_dir = "/app/data/bhavesh/inventoryPrediction/embeddings/womenKurtas"
wk_hdfs_dir = "/data/Archive/bhavesh/InventoryPrediction/2023-04-30/Embeddings/womenKurtas"
# Loop-invariant; exist_ok avoids the check-then-create race.
os.makedirs(wk_local_dir, exist_ok=True)

for attr, proxy in attr_proxy:
    attr_wk = generate_embeddings(womenKurtas, attr, proxy, epochs=500)
    wk_local_csv = f"{wk_local_dir}/{proxy}.csv"
    try:
        attr_wk.to_csv(wk_local_csv, index=False)
        with open(wk_local_csv, "rb") as f:
            fs.upload(f"{wk_hdfs_dir}/{attr}.csv", f)
    finally:
        # Always remove the staging file, even when writing or uploading fails.
        if os.path.exists(wk_local_csv):
            os.remove(wk_local_csv)

Epoch [1/500], Train Loss: 0.6271, Train mae: 0.4592, Val Loss: 0.6188, Val mae: 0.4544
Epoch [2/500], Train Loss: 0.6192, Train mae: 0.4550, Val Loss: 0.6112, Val mae: 0.4502
Epoch [3/500], Train Loss: 0.6115, Train mae: 0.4507, Val Loss: 0.6036, Val mae: 0.4460
Epoch [4/500], Train Loss: 0.6039, Train mae: 0.4465, Val Loss: 0.5961, Val mae: 0.4418
Epoch [5/500], Train Loss: 0.5962, Train mae: 0.4423, Val Loss: 0.5886, Val mae: 0.4376
Epoch [6/500], Train Loss: 0.5887, Train mae: 0.4381, Val Loss: 0.5814, Val mae: 0.4335
Epoch [7/500], Train Loss: 0.5814, Train mae: 0.4339, Val Loss: 0.5742, Val mae: 0.4294
Epoch [8/500], Train Loss: 0.5742, Train mae: 0.4298, Val Loss: 0.5672, Val mae: 0.4253
Epoch [9/500], Train Loss: 0.5670, Train mae: 0.4256, Val Loss: 0.5602, Val mae: 0.4212
Epoch [10/500], Train Loss: 0.5600, Train mae: 0.4215, Val Loss: 0.5533, Val mae: 0.4171
Epoch [11/500], Train Loss: 0.5530, Train mae: 0.4174, Val Loss: 0.5463, Val mae: 0.4130
Epoch [12/500], Train Loss: 0.