In [None]:
import pandas as pd
import keras
from keras import models
from keras.layers import Embedding, Dense, Flatten
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [None]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
# from sklearn.linear_model import LinearRegression
# import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
# Open an HDFS connection using pyarrow's default connection settings. Later cells
# pass `fs` when reading the Parquet dataset and when uploading the generated CSVs.
# NOTE(review): `pyarrow.hdfs` is the legacy filesystem API and appears to be
# deprecated in newer pyarrow releases in favour of `pyarrow.fs.HadoopFileSystem`
# -- confirm against the installed pyarrow version.
fs = pa.hdfs.connect()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class MyModel(nn.Module):
    """Embedding lookup followed by a small MLP with a single output unit.

    Args:
        input_dim: Number of rows in the embedding table (distinct category codes).
        embedding_size: Width of each embedding vector.

    ``forward`` returns a pair ``(x_embed, prediction)``: the raw embedding-layer
    output for ``x`` and the output of the final linear layer.
    """

    def __init__(self, input_dim, embedding_size):
        super().__init__()
        # Layer creation order determines how the RNG is consumed, so it is kept
        # fixed to give reproducible weights under a seeded generator.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_size)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=embedding_size, out_features=50)
        self.fc2 = nn.Linear(in_features=50, out_features=15)
        self.fc3 = nn.Linear(in_features=15, out_features=1)

    def forward(self, x):
        x_embed = self.embedding(x)
        hidden = self.flatten(x_embed)
        # Two ReLU-activated hidden layers, then a linear read-out.
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return x_embed, self.fc3(hidden)


def calculate_accuracy(predictions, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        predictions: 2-D tensor of per-class scores; the class is the arg-max over dim 1.
        labels: 1-D tensor of true class indices, one per row of ``predictions``.

    Returns:
        Accuracy as a Python float (correct count divided by ``labels.size(0)``).
    """
    predicted = predictions.argmax(dim=1)
    hits = predicted.eq(labels).sum().item()
    return hits / labels.size(0)

def calculate_f1_score(predictions, labels):
    """Return the macro-averaged F1 score of arg-max predictions against labels.

    Args:
        predictions: 2-D tensor of per-class scores; the class is the arg-max over dim 1.
        labels: 1-D tensor of true class indices.

    Returns:
        The value produced by ``f1_score(..., average='macro')``.
    """
    # `.numpy()` alone raises for tensors that live on an accelerator or are still
    # attached to the autograd graph, so detach and move to host memory first.
    predicted_classes = predictions.argmax(dim=1).detach().cpu().numpy()
    true_classes = labels.detach().cpu().numpy()

    return f1_score(true_classes, predicted_classes, average='macro')

def calculate_mae(predictions, labels):
    """Return the mean absolute error between ``predictions`` and ``labels`` as a float."""
    return nn.functional.l1_loss(predictions, labels).item()

def calculate_error(predictions, labels):
    """Return the cross-entropy loss of ``predictions`` against ``labels`` as a float."""
    return nn.functional.cross_entropy(predictions, labels).item()
    
def train_model_for_embedding(epochs, input_dimension, embedding_size, x_data, y_data, x_val, y_val):
    """Train a ``MyModel`` regressor and return it with the training-set embeddings.

    The model is fitted with Adam on an MSE objective, using the whole training
    set as a single batch per epoch. After every epoch the training and
    validation MSE and MAE are printed.

    Args:
        epochs: Number of optimisation steps (one full-batch step per epoch).
        input_dimension: Number of distinct category codes (embedding table rows).
        embedding_size: Width of each embedding vector.
        x_data: Training category codes; converted to a ``torch.long`` tensor.
        y_data: Training targets; converted to a float tensor with a trailing unit dim.
        x_val: Validation category codes.
        y_val: Validation targets.

    Returns:
        ``(model, embed)`` where ``embed`` is the embedding-layer output for
        ``x_data`` evaluated with the final weights (one row per training sample
        when ``x_data`` is 1-D). It carries no autograd history.
    """
    model = MyModel(input_dim=input_dimension, embedding_size=embedding_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    x_data = torch.tensor(x_data, dtype=torch.long)
    y_data = torch.tensor(y_data, dtype=torch.float).unsqueeze(1)
    x_val = torch.tensor(x_val, dtype=torch.long)
    y_val = torch.tensor(y_val, dtype=torch.float).unsqueeze(1)

    for epoch in range(epochs):
        model.train()  # Set model to training mode
        optimizer.zero_grad()
        _, outputs = model(x_data)
        loss = criterion(outputs, y_data)
        loss.backward()
        optimizer.step()

        train_error = loss.item()  # Training MSE (measured before this epoch's update)
        train_mae = calculate_mae(outputs.detach(), y_data)

        model.eval()  # Set model to evaluation mode
        with torch.no_grad():
            _, val_outputs = model(x_val)
            val_error = criterion(val_outputs, y_val).item()  # Validation MSE
            val_mae = calculate_mae(val_outputs, y_val)

        print('Epoch [{}/{}], Train Loss: {:.4f}, Train mae: {:.4f}, Val Loss: {:.4f}, Val mae: {:.4f}'
              .format(epoch+1, epochs, train_error, train_mae, val_error, val_mae))

    # Look the embeddings up once more after training. The activations captured
    # inside the loop pre-date the last optimizer step (so they would not match
    # the returned model), and they do not exist at all when `epochs` is 0.
    model.eval()
    with torch.no_grad():
        embed, _ = model(x_data)

    return model, embed


In [None]:
# Load the Parquet dataset stored at this HDFS path through the `fs` connection
# and materialise it as a pandas DataFrame, then preview the first rows.
df = (
    pq.ParquetDataset(
        "/data/Archive/bhavesh/inventoryPrediction/embeddings/data",
        fs,
    )
    .read()
    .to_pandas()
)
df.head()

In [None]:
# List the distinct values of the `similargrouplevel` column; two of these
# identifiers are used below to slice out the per-category frames.
df["similargrouplevel"].unique()

In [None]:
# Slice one frame per product group: keep the rows whose `similargrouplevel`
# matches the group id, drop that (now constant) column, and replace missing
# values with the literal string "Null".
menShirts, womenKurtas = (
    df[df["similargrouplevel"] == group_id]
    .copy()
    .drop(columns=["similargrouplevel"])
    .fillna("Null")
    for group_id in ("830216013", "830303011")
)

In [None]:
def generate_embeddings(df, attribute, proxy_name, embedding_size = 2, epochs=500):
    """Learn an embedding vector for each category of ``attribute``.

    A small regression model is trained to predict the min-max scaled
    ``yQuantity`` column from the label-encoded ``attribute`` column; the rows of
    its embedding table are then returned as a lookup table.

    Args:
        df: Frame containing ``attribute`` and a numeric ``yQuantity`` column.
        attribute: Name of the categorical column to embed.
        proxy_name: Prefix for the embedding component columns.
        embedding_size: Number of embedding components per category.
        epochs: Number of training epochs.

    Returns:
        DataFrame with one row per category present in the training split (in
        order of first appearance there) and the columns
        ``{proxy_name}_0 .. {proxy_name}_{embedding_size-1}``, ``attribute``,
        ``magnitude`` (Euclidean norm of the components) and
        ``normalized_{proxy_name}_i`` (component divided by the magnitude).
    """
    y_data = df["yQuantity"].values
    y_min, y_max = y_data.min(), y_data.max()
    y_range = y_max - y_min
    if y_range != 0:
        y_data = (y_data - y_min) / y_range
    else:
        # A constant target would make the scaling divide by zero and feed NaNs
        # into training; map it to all zeros instead.
        y_data = np.zeros(len(y_data), dtype=float)

    label_encoder = LabelEncoder()
    x_data = label_encoder.fit_transform(df[attribute])

    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=42)
    input_dimension = len(label_encoder.classes_)

    model, _ = train_model_for_embedding(epochs, input_dimension, embedding_size, x_train, y_train, x_val, y_val)

    # Read the vectors straight from the trained embedding table instead of
    # rebuilding them from per-sample activations. Only categories that occur in
    # the training split are exported: rows for validation-only categories are
    # never updated by training and would just be random initial values.
    seen_codes = pd.unique(x_train)
    embedding_table = model.embedding.weight.detach().cpu().numpy()

    component_columns = [proxy_name + "_" + str(i) for i in range(embedding_size)]
    attr_df = pd.DataFrame(embedding_table[seen_codes], columns=component_columns)
    attr_df[attribute] = label_encoder.inverse_transform(seen_codes)

    # Magnitude, i.e. sqrt(x_1^2 + x_2^2 + ... + x_embedding_size^2)
    attr_df["magnitude"] = np.sqrt(np.square(attr_df[component_columns]).sum(axis=1))
    for column in component_columns:
        attr_df[f"normalized_{column}"] = attr_df[column] / attr_df["magnitude"]

    return attr_df

In [None]:
# (source column, short output prefix) pairs for every attribute to embed.
attr_proxy = dict(
    colorfamily="color",
    brandname="brand",
    styletype="style",
    pattern="pattern",
    sleeve="sleeve",
).items()
attr_proxy

### Men shirts

In [None]:
# Train embeddings for men's shirts, stage each table as a local CSV, upload it
# to HDFS and remove the staging file.
ms_local_dir = "/app/data/bhavesh/inventoryPrediction/embeddings/menShirts"
ms_hdfs_dir = "/data/Archive/bhavesh/inventoryPrediction/embeddings/menShirts"
os.makedirs(ms_local_dir, exist_ok=True)  # loop-invariant, so done once up front

for attr, proxy in attr_proxy:
    attr_ms = generate_embeddings(menShirts, attr, proxy, epochs=10)
    # The staging file is named after the proxy, the HDFS object after the attribute.
    ms_local_csv = f"{ms_local_dir}/{proxy}.csv"
    try:
        attr_ms.to_csv(ms_local_csv, index=False)
        with open(ms_local_csv, "rb") as f:
            fs.upload(f"{ms_hdfs_dir}/{attr}.csv", f)
    finally:
        # Always clean up the staging file, even when the upload fails.
        if os.path.exists(ms_local_csv):
            os.remove(ms_local_csv)
    # NOTE(review): this `break` stops after the first attribute and, together with
    # epochs=10, looks like a leftover smoke-test configuration -- confirm whether
    # all attributes should be exported with the full epoch count.
    break

### Women Kurtas

In [None]:
# Train embeddings for women's kurtas, stage each table as a local CSV, upload it
# to HDFS and remove the staging file.
wk_local_dir = "/app/data/bhavesh/inventoryPrediction/embeddings/womenKurtas"
wk_hdfs_dir = "/data/Archive/bhavesh/inventoryPrediction/embeddings/womenKurtas"
os.makedirs(wk_local_dir, exist_ok=True)  # loop-invariant, so done once up front

for attr, proxy in attr_proxy:
    attr_wk = generate_embeddings(womenKurtas, attr, proxy, epochs=500)
    # The staging file is named after the proxy, the HDFS object after the attribute.
    wk_local_csv = f"{wk_local_dir}/{proxy}.csv"
    try:
        attr_wk.to_csv(wk_local_csv, index=False)
        with open(wk_local_csv, "rb") as f:
            fs.upload(f"{wk_hdfs_dir}/{attr}.csv", f)
    finally:
        # Always clean up the staging file, even when the upload fails.
        if os.path.exists(wk_local_csv):
            os.remove(wk_local_csv)