In [None]:
%%init_spark
# Session configuration consumed by the %%init_spark magic before the Spark session is created.
# Extra jar put on the session classpath (presumably provides the ai.couture.obelisk.commons
# classes imported in the next cell — confirm).
launcher.jars = ["/app/setup/commons-ip.jar"]
launcher.conf.spark.app.name = "bhavesh_notebook"
# NOTE(review): "spark.queue" is not a key I recognise from the Spark configuration reference
# (the YARN queue key is "spark.yarn.queue"); confirm it has any effect, especially with a local master.
launcher.conf.spark.queue = "default"
# Scratch directory for shuffle/spill files.
launcher.conf.spark.local.dir = "/app/tmp"
launcher.conf.spark.sql.shuffle.partitions = 210
# NOTE(review): "spark.sql.shuffle.minPartitions" is likewise not a standard Spark SQL setting — verify.
launcher.conf.spark.sql.shuffle.minPartitions = 20
launcher.conf.spark.driver.memory = "50g"
launcher.conf.spark.ui.showConsoleProgress = "true"
# Local mode with 15 worker threads: the whole job runs inside the driver JVM.
launcher.master = "local[15]"

In [None]:
import ai.couture.obelisk.commons.Constants._
import ai.couture.obelisk.commons.Constants.STANDARD_COL_NAMES._
import ai.couture.obelisk.commons.Constants._
import ai.couture.obelisk.commons.io._
import ai.couture.obelisk.commons.utils.DateTimeUtil._
import ai.couture.obelisk.commons.utils.DataFrameUtil.minMaxScaler
import org.apache.spark.sql._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import spark.implicits._

### Generate product-level train/test datasets for per-brick model training and prediction

In [None]:
/**
  * Builds the model-ready product-level dataset for one split and writes it under `transformedDatasetPath`.
  *
  * Pipeline (every join is an inner join, so products missing from a lookup are dropped):
  *   1. read `$datasetPath/$of/XData`; for "train" also join `$datasetPath/$of/YData` on PRODUCTID;
  *   2. join the product attributes and replace nulls in string columns with the literal "Null";
  *   3. for each categorical attribute, join the per-brick embedding CSV on (brick, attribute value)
  *      and drop the raw attribute column;
  *   4. join the product -> price-bucket map;
  *   5. cast every `*monthSales` column and `yQuantity` to double;
  *   6. write the combined frame and one frame per brick to `<brick>/data/$of`;
  *   7. if `performNormalization`, write scaled copies (project `minMaxScaler`) to `<brick>/normalizedData/...`.
  *
  * @param of                     split to build: "train" or "test"
  * @param datasetPath            root holding `<of>/XData` (and `<of>/YData` for train)
  * @param transformedDatasetPath root under which all outputs are written
  * @param performNormalization   also write the min-max scaled variants (default true)
  * @throws IllegalArgumentException if `of` is neither "train" nor "test"
  */
def generateTransformedTrainForProductLevelDataset(of: String, datasetPath: String, transformedDatasetPath: String, performNormalization: Boolean = true): Unit = {
    // Only these two splits are defined; reject anything else up front with a clear message.
    require(of == "train" || of == "test", s"""`of` must be "train" or "test", got: $of""")

    // similargrouplevel ids of the two bricks handled by this notebook.
    val menShirtsBrickId = "830216013"
    val womenKurtasBrickId = "830303011"
    val embeddingsDir = "/data/Archive/bhavesh/inventoryPrediction/embeddings"

    val xData: DataFrame = ParquetToDF.getDF(s"$datasetPath/$of/XData")
    // NOTE(review): the casting and scaling steps below look for columns such as "0_monthSales" / "avgSales",
    // but the rename that produced them is disabled (kept here for reference) — confirm that XData already
    // carries these names.
    // .select("productid", "0_sales", "1_sales", "2_sales", "3_sales", "4_sales", "5_sales", "6_sales", "7_sales", "8_sales", "9_sales", "10_sales", "11_sales",
    //        "sales_avg_12_months", "sales_std_12_months", "0_returns", "0_users", "0_PLPViewsPerDay", "0_PLPClicksPerDay", "0_PDPCountPerDay",
    //        "0_TotalAddToCartPerDay", "0_wishlist", "0_availableQuantity")
    // .toDF("productid", "0_monthSales", "1_monthSales", "2_monthSales", "3_monthSales", "4_monthSales", "5_monthSales", "6_monthSales", "7_monthSales", "8_monthSales",
    //       "9_monthSales", "10_monthSales", "11_monthSales", "avgSales", "stddevSales", "totalReturn", "totalUsers", "totalPLPViews", "totalPLPClicks",
    //       "totalPDPCount", "totalATC", "totalWishList", "totalAvailableQuantity")

    // Only the training split has a target to attach.
    val labelled: DataFrame =
        if (of == "train") xData.join(ParquetToDF.getDF(s"$datasetPath/$of/YData"), Seq(PRODUCTID))
        else xData

    val productAttributesLegosFNL: DataFrame = ParquetToDF.getDF("/data/Archive/inventory/productAttributesLegosFNL")
        .select(PRODUCTID, SIMILAR_GROUP_LEVEL, "colorfamily", "brandname", "styletype", "pattern", "sleeve")

    val withAttributes: DataFrame = labelled.join(productAttributesLegosFNL, Seq(PRODUCTID)).na.fill("Null")

    // Replace each categorical attribute by its embedding columns, looked up per brick.
    // NOTE(review): assumes the "normalized*" column names differ between attribute CSVs; identical names
    // would leave duplicate columns after the successive joins — verify against the CSV headers.
    val attributes = Array("colorfamily", "pattern", "brandname", "styletype", "sleeve")
    val withEmbeddings: DataFrame = attributes.foldLeft(withAttributes) { (acc, attribute) =>
        val menShirtsRaw = CSVToDF.getDF(s"$embeddingsDir/menShirts/${attribute}.csv", inferSchema=true)
        // The men's-shirts header decides which embedding columns are used for BOTH bricks, so the two
        // frames always share one schema.
        val embeddingCols = menShirtsRaw.columns.filter(_.startsWith("normalized"))

        val menShirtsEmbeddings = menShirtsRaw
            .select(attribute, embeddingCols: _*)
            .withColumn(SIMILAR_GROUP_LEVEL, lit(menShirtsBrickId))

        val womenKurtasEmbeddings = CSVToDF.getDF(s"$embeddingsDir/womenKurtas/${attribute}.csv", inferSchema=true)
            .select(attribute, embeddingCols: _*)
            .withColumn(SIMILAR_GROUP_LEVEL, lit(womenKurtasBrickId))

        // Both frames were built from the same column list in the same order, so a positional union is safe.
        val embeddings = menShirtsEmbeddings.union(womenKurtasEmbeddings)

        acc.join(embeddings, Seq(SIMILAR_GROUP_LEVEL, attribute)).drop(attribute)
    }
    // (The earlier hand-written color/brand/pattern/style embedding joins that read from
    // /data/ayush/AttributeLevel/DenseEmbeddings were commented out and are superseded by the fold above.)

    val productPriceBucketMap = ParquetToDF.getDF("/data/Archive/bhavesh/inventoryPrediction/productPriceBucketMap")
    val withPriceBucket: DataFrame = withEmbeddings.join(productPriceBucketMap, Seq(PRODUCTID))

    // Monthly sales and the target are cast to double before writing.
    val data: DataFrame = withPriceBucket.columns
        .filter(column => column.endsWith("monthSales") || column.equals("yQuantity"))
        .foldLeft(withPriceBucket)((acc, column) => acc.withColumn(column, col(column).cast(DoubleType)))

    val menShirtsData: DataFrame = data.filter(col(SIMILAR_GROUP_LEVEL) === menShirtsBrickId)
    val womenKurtasData: DataFrame = data.filter(col(SIMILAR_GROUP_LEVEL) === womenKurtasBrickId)

    DFToParquet.putDF(s"$transformedDatasetPath/combined/data/$of", data)
    DFToParquet.putDF(s"$transformedDatasetPath/menShirts/data/$of", menShirtsData)
    DFToParquet.putDF(s"$transformedDatasetPath/womenKurtas/data/$of", womenKurtasData)

    if (performNormalization) {
        val featureColsToScale: Array[String] =
            (0 until 12).map(month => s"${month}_monthSales").toArray ++ Array(
                "avgSales", "stddevSales", "totalReturn", "totalUsers",
                "totalPLPViews", "totalPLPClicks", "totalPDPCount", "totalATC", "totalWishList", "totalAvailableQuantity")

        // The target column only exists for the training split.
        val colsToScale: Array[String] =
            if (of == "train") featureColsToScale ++ Array("yQuantity") else featureColsToScale

        // "global": each frame is scaled over all of its own rows.
        DFToParquet.putDF(s"$transformedDatasetPath/combined/normalizedData/global/$of", data.transform(minMaxScaler(colsToScale)))
        DFToParquet.putDF(s"$transformedDatasetPath/menShirts/normalizedData/global/$of", menShirtsData.transform(minMaxScaler(colsToScale)))
        DFToParquet.putDF(s"$transformedDatasetPath/womenKurtas/normalizedData/global/$of", womenKurtasData.transform(minMaxScaler(colsToScale)))

        // Scaling per similar-group level: each per-brick frame holds exactly one similar-group level, so its
        // "global" scaling already is the per-group scaling. The combined per-group output is therefore just
        // the union of the two per-brick results written above.
        DFToParquet.putDF(
            s"$transformedDatasetPath/combined/normalizedData/$SIMILAR_GROUP_LEVEL/$of",
            ParquetToDF.getDF(s"$transformedDatasetPath/menShirts/normalizedData/global/$of").union(
                ParquetToDF.getDF(s"$transformedDatasetPath/womenKurtas/normalizedData/global/$of")
            )
        )
    }
}

In [None]:
// Input (XData/YData) and output roots for the 2023-04-16 prediction date.
val originalDatasetPath: String = "/data/Archive/bhavesh/inventoryPrediction/OriginalDataset/date_when_prediction_is_made=2023-04-16"
val transformedDatasetPath: String = "/data/Archive/bhavesh/inventoryPrediction/TransformedDataset/date_when_prediction_is_made=2023-04-16"

// Build the transformed dataset for both splits, train first; normalization stays at its default (enabled).
Seq("train", "test").foreach { split =>
    generateTransformedTrainForProductLevelDataset(split, originalDatasetPath, transformedDatasetPath)
}

In [None]:
// Sanity check: schema of the globally normalized men's-shirts training set.
var df = ParquetToDF.getDF(
    "/data/Archive/bhavesh/inventoryPrediction/TransformedDataset/date_when_prediction_is_made=2023-04-16/menShirts/normalizedData/global/train"
)
df.printSchema()

### Generate Query sigma map

In [None]:
// Labels for predicted and actual values (not referenced in the cells visible here).
var PREDICTED = "predicted"
var ACTUAL = "actual"

// Run parameters; every date is a yyyy-MM-dd string.
var rundate = "2023-04-15"
// var predictionend = getFutureMonthDateFromHere(1,rundate,"yyyy-MM-dd")
var predictionend = "2023-05-15"
var dateForWhichPredictionsMade = getFutureDateFromHere(1, rundate, "yyyy-MM-dd")
var trainendDate = getOldDateFromHere(1, rundate, "yyyy-MM-dd")
var trainStartDate = "2022-03-13"
// Attribute combination that identifies the query set on disk.
var suffix = "colorfamily_pricebucket_styletype_pattern_sleeve_brandname"

var baseDir = "/data/Archive/bhavesh/inventoryPrediction"
var date = getFutureDateFromHere(1, rundate, "yyyy-MM-dd")
// Query -> product mapping for that date and suffix; rows with any null are dropped, then de-duplicated.
var queriesToProductMap = ParquetToDF
    .getDF(s"/data/Archive/bhavesh/inventoryPrediction/queryToProductMap/date_when_prediction_is_made=$date/suffix=$suffix")
    .na.drop()
    .distinct()

// Column-level wrappers around the project date helpers, fixed to the yyyy-MM-dd format.
val getNumOfMonthsBetweenTwoDatesUDF = udf { (from: String, to: String) =>
    getNumOfMonthsBetweenTwoDates(from, to, "yyyy-MM-dd", "yyyy-MM-dd")
}

val getOldMonthDateFromHereUDF = udf { (monthsBack: Int, anchorDate: String) =>
    getOldMonthDateFromHere(monthsBack, anchorDate, "yyyy-MM-dd", "yyyy-MM-dd")
}

In [None]:
// var productLevelDailyQuantity = spark.read.parquet("/data/ecomm/ajio/processed/interactionsDB").filter(col("event") === "Checkout" && col("purchase") === "New" && col("productid").isNotNull && col("booked_rev") > 0 && col("userid").isNotNull && col("quantity") > 0 && col("date")<=predictionend && col("date")>=trainStartDate).groupBy("productid", "date").agg(sum("quantity").cast(DoubleType).as("quantity"))
// productLevelDailyQuantity.write.mode("overwrite").parquet(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/productLevel/dayWise")

// Daily quantity per product, read back from the dayWise output.
var productLevelDailyQuantity = spark.read.parquet(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/productLevel/dayWise")

// Roll the daily rows up into month buckets ("pastmonth"):
//   - rows in [trainStartDate, predictionend] are kept, except the run date itself;
//   - the prediction window [dateForWhichPredictionsMade, predictionend] is bucket 0;
//   - every other row gets (months between its date and trainendDate) + 1.
var productLevelMonthlyQuantity = productLevelDailyQuantity
    .filter(col("date") >= trainStartDate && col("date") <= predictionend && col("date") =!= rundate)
    .withColumn(
        "pastmonth",
        when(col("date") >= dateForWhichPredictionsMade && col("date") <= predictionend, 0)
            .otherwise(getNumOfMonthsBetweenTwoDatesUDF(col(DATE), lit(trainendDate)) + 1)
    )
    .groupBy("productid", "pastmonth")
    .agg(sum("quantity").as("quantity"))
    .withColumn("tempDateForTimeSeries", getOldMonthDateFromHereUDF(col("pastmonth"), lit(rundate)))

productLevelMonthlyQuantity.write
    .mode("overwrite")
    .parquet(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/productLevel/monthWise")

In [None]:
// var queryLevelDailyQuantity = spark.read.parquet(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/productLevel/dayWise")
// .join(queriesToProductMap, Seq(PRODUCTID))
// .groupBy("similargrouplevel","query", "date").agg(sum(QUANTITY).as(QUANTITY))
// queryLevelDailyQuantity.write.mode("overwrite").parquet(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/queryLevel/suffix=$suffix/dayWise")

// Daily quantity per (similargrouplevel, query), read back from the dayWise output.
var queryLevelDailyQuantity = ParquetToDF.getDF(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/queryLevel/suffix=$suffix/dayWise")

// Same month bucketing as the product-level roll-up, grouped per (similargrouplevel, query):
// bucket 0 is the prediction window, other rows get (months between date and trainendDate) + 1.
var queryLevelMonthlyQuantity = spark.read
    .parquet(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/queryLevel/suffix=$suffix/dayWise")
    .filter(col("date") >= trainStartDate && col("date") <= predictionend && col("date") =!= rundate)
    .withColumn(
        "pastmonth",
        when(col("date") >= dateForWhichPredictionsMade && col("date") <= predictionend, 0)
            .otherwise(getNumOfMonthsBetweenTwoDatesUDF(col(DATE), lit(trainendDate)) + 1)
    )
    .groupBy("similargrouplevel", "query", "pastmonth")
    .agg(sum("quantity").as("quantity"))
    .withColumn("tempDateForTimeSeries", getOldMonthDateFromHereUDF(col("pastmonth"), lit(rundate)))

queryLevelMonthlyQuantity.write
    .mode("overwrite")
    .parquet(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/queryLevel/suffix=$suffix/monthWise")

In [None]:
// Print the schema of each quantity frame: product daily, product monthly, query daily, query monthly.
Seq(
    productLevelDailyQuantity,
    productLevelMonthlyQuantity,
    queryLevelDailyQuantity,
    queryLevelMonthlyQuantity
).foreach(_.printSchema())

In [None]:
// Monthly product quantities restricted to history buckets 1..12 (bucket 0, the prediction window, is excluded).
var productLevelMonthlyQuantity = ParquetToDF
    .getDF(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/productLevel/monthWise")
    .drop("tempDateForTimeSeries")
    .filter(col("pastmonth").between(1, 12))

// Per-product standard deviation and mean of the monthly quantity; a missing stddev is stored as 0.0.
var statsDF = productLevelMonthlyQuantity
    .groupBy(PRODUCTID)
    .agg(stddev("quantity").as("stddev"), avg("quantity").as("avg"))
    .na.fill(0.0, Seq("stddev"))

statsDF.write
    .mode("overwrite")
    .parquet(s"$baseDir/quantityStatsAcrossMonths/date_when_prediction_is_made=${dateForWhichPredictionsMade}/productLevel/past12MonthsStats")

In [None]:
// Monthly query quantities restricted to history buckets 1..12 (bucket 0, the prediction window, is excluded).
var queryLevelMonthlyQuantity = ParquetToDF
    .getDF(s"$baseDir/historyQuantitySold/date_when_prediction_is_made=${dateForWhichPredictionsMade}/queryLevel/suffix=$suffix/monthWise")
    .drop("tempDateForTimeSeries")
    .filter(col("pastmonth").between(1, 12))

// Per-(similargrouplevel, query) standard deviation and mean of the monthly quantity.
// NOTE(review): unlike the product-level stats, "stddev" is NOT null-filled here, so a query with a
// single month of history keeps whatever stddev() yields for one row — confirm this is intended.
var statsDF = queryLevelMonthlyQuantity
    .groupBy("similargrouplevel", "query")
    .agg(stddev("quantity").as("stddev"), avg("quantity").as("avg"))

statsDF.write
    .mode("overwrite")
    .parquet(s"$baseDir/quantityStatsAcrossMonths/date_when_prediction_is_made=${dateForWhichPredictionsMade}/queryLevel/suffix=$suffix/past12MonthsStats")

In [None]:
// Reload the query-level 12-month statistics and show their schema.
var statsDF = ParquetToDF.getDF(
    s"$baseDir/quantityStatsAcrossMonths/date_when_prediction_is_made=${dateForWhichPredictionsMade}/queryLevel/suffix=$suffix/past12MonthsStats"
)
statsDF.printSchema()

### Data Generation

In [None]:
import java.time.format.DateTimeFormatter.ofPattern
import java.time.temporal.ChronoUnit
import java.time.LocalDate
/**
  * Returns a UDF giving the number of whole months from a start date to an end date.
  *
  * UDF arguments, in order: (startDate, endDate, inputDateFormat, outputDateFormat).
  * `startDate` is parsed with `inputDateFormat` and `endDate` with `outputDateFormat`; the result is
  * `ChronoUnit.MONTHS.between(start, end)`, so it is negative when the start lies after the end.
  * A null or unparsable date makes the UDF throw.
  */
def getMonths: UserDefinedFunction = udf { (startDate: String, endDate: String, inputDateFormat: String, outputDateFormat: String) =>
    val from: LocalDate = LocalDate.parse(startDate, ofPattern(inputDateFormat))
    val to: LocalDate = LocalDate.parse(endDate, ofPattern(outputDateFormat))
    ChronoUnit.MONTHS.between(from, to)
}

In [None]:
// One row per product (product id + brick), limited to the two bricks of interest and cached,
// because every generator below joins against it.
val productAttrs = ParquetToDF.getDF("/data/Archive/inventory/productAttributesLegosFNL")
    .select(PRODUCTID, SIMILAR_GROUP_LEVEL)
    .dropDuplicates(PRODUCTID)
    .filter(col(SIMILAR_GROUP_LEVEL).isin("830216013", "830303011"))
    .persist()

// Feature (X) date windows per split.
val dateMapX: Map[String, Map[String, String]] = Map(
    "train" -> Map("start" -> "2022-03-13", "end" -> "2023-03-13"),
    "test" -> Map("start" -> "2022-04-14", "end" -> "2023-04-14")
)
// Target (Y) date windows per split.
val dateMapY: Map[String, Map[String, String]] = Map(
    "train" -> Map("start" -> "2023-03-15", "end" -> "2023-04-14"),
    "test" -> Map("start" -> "2023-04-16", "end" -> "2023-05-15")
)

// The prediction date is the first day of the test target window.
val dateForWhichPredictionsAreMade: String = dateMapY("test")("start")
val baseDir = s"/data/Archive/bhavesh/inventoryPrediction/dataset/date_when_prediction_is_made=$dateForWhichPredictionsAreMade"

In [None]:
/**
  * Writes `df` to `path` as parquet and returns a frame read back from that location.
  * Curried so it can be used as `df.transform(saveNLoadDF(path))` in the middle of a chain.
  *
  * @param path location to write to and read back from
  * @param df   frame to write
  * @return the frame reloaded from `path`
  */
def saveNLoadDF(path: String)(df: DataFrame): DataFrame = {
    DFToParquet.putDF(path, df)
    val reloaded: DataFrame = ParquetToDF.getDF(path)
    reloaded
}

/**
  * Adds, for every statistic name, the mean (and optionally the sample standard deviation) over its
  * first `monthsLimit` monthly columns.
  *
  * For a statistic `s` the inputs are the columns `0_s` .. `${monthsLimit - 1}_s`, which must exist in `df`.
  * Added columns:
  *   - `${s}_avg_${monthsLimit}_months` = sum of the inputs / monthsLimit
  *   - `${s}_std_${monthsLimit}_months` = sqrt(sum((input - avg)^2) / (monthsLimit - 1)), only when
  *     `calculateStd` is set. With `monthsLimit == 1` the divisor is 0, so no finite value is produced.
  *
  * Statistics are processed in the order given; duplicates are handled once.
  *
  * @param monthsLimit       number of monthly columns to use; must be positive when there is anything to compute
  * @param colsForStatistics statistic names (the part after `<month>_` in the column names)
  * @param calculateStd      also add the standard-deviation column
  * @param df                frame holding the monthly columns
  * @return `df` with the statistic columns appended
  * @throws IllegalArgumentException if `monthsLimit` is not positive while `colsForStatistics` is non-empty
  */
def getMonthlyMeanStd(monthsLimit: Int, colsForStatistics: Array[String], calculateStd: Boolean)(df: DataFrame): DataFrame = {
    require(colsForStatistics.isEmpty || monthsLimit > 0, s"monthsLimit must be positive, got: $monthsLimit")

    colsForStatistics.distinct.foldLeft(df) { (acc, attr) =>
        val monthlyCols = (0 until monthsLimit).map(month => col(s"${month}_${attr}"))
        val avgColName = s"${attr}_avg_${monthsLimit}_months"

        val withAvg = acc.withColumn(avgColName, monthlyCols.reduce(_ + _) / lit(monthsLimit).cast(DoubleType))

        if (calculateStd) {
            // Sample standard deviation (n - 1 in the denominator), computed against the mean column just added.
            val sumOfSquaredDeviations = monthlyCols
                .map(monthly => pow(monthly - col(avgColName), lit(2)))
                .reduce(_ + _)
            withAvg.withColumn(s"${attr}_std_${monthsLimit}_months", sqrt(sumOfSquaredDeviations / lit(monthsLimit - 1)))
        } else {
            withAvg
        }
    }
}

In [None]:
/**
  * Builds the monthly sales features for one split and writes them to `$baseDir/$of/sales`.
  *
  * New-purchase checkout rows inside the split's [start, end] window are summed per product and day,
  * joined to `productAttrs`, bucketed by whole months before the end date (0 = most recent) and pivoted
  * into `0_sales` .. `11_sales`. The 3/6/12-month averages and the 12-month standard deviation are appended.
  * Intermediate results are checkpointed under a temp directory that is deleted at the end.
  *
  * @param of           split key into `dateMap` ("train" / "test")
  * @param dateMap      split -> Map("start" -> date, "end" -> date), dates as yyyy-MM-dd
  * @param productAttrs products to keep (inner join on the product id)
  * @param baseDir      output root
  */
def generateSalesXData(of: String, dateMap: Map[String, Map[String, String]], productAttrs: DataFrame, baseDir: String): Unit = {
    val startDate = dateMap(of)("start")
    val endDate = dateMap(of)("end")
    val features = Array("sales")
    val tempDir = "/data/Archive/bhavesh/inventoryPrediction/temp/generateSalesXData"
    // Pin the pivot values: every bucket 0..11 becomes a column even when no row falls into it (the
    // statistics below reference all of them by name), and Spark skips the job that discovers the values.
    val monthBuckets: Seq[Any] = (0L until 12L).toList

    val pivoted = ParquetToDF.getDF("/data/ecomm/ajio/processed/interactionsDB")
        .filter(col("event") === "Checkout" && col("purchase") === "New"
            && col("productid").isNotNull && col("booked_rev") > 0
            && col("userid").isNotNull && col("quantity") > 0
            && col(DATE).between(startDate, endDate)
        )
        .groupBy(PRODUCTID, DATE)
        .agg(sum(QUANTITY) as "sales")
        .join(productAttrs, Seq(PRODUCTID))
        .transform(saveNLoadDF(s"$tempDir/interactions_1"))
        .withColumn("months", getMonths(col(DATE), lit(endDate), lit("yyyy-MM-dd"), lit("yyyy-MM-dd")))
        .filter(col("months") < 12)
        .groupBy(PRODUCTID)
        .pivot("months", monthBuckets)
        // With two aggregates Spark names the pivot columns "<month>_<alias>"; "temp" exists only to get
        // that naming and is dropped right below.
        .agg(sum("sales").cast(DoubleType) as "sales", first("sales") as "temp")

    val tempCols = pivoted.columns.filter(_.endsWith("temp"))
    val salesFeatures = pivoted.drop(tempCols: _*).na.fill(0)
        .transform(saveNLoadDF(s"$tempDir/interactions_2"))
        .transform(getMonthlyMeanStd(3, features, calculateStd = false))
        .transform(getMonthlyMeanStd(6, features, calculateStd = false))
        .transform(getMonthlyMeanStd(12, features, calculateStd = true))

    DFToParquet.putDF(s"$baseDir/$of/sales", salesFeatures)
    HdfsUtils.delete(tempDir)
}


/**
  * Builds the monthly analytics features (PLP views, PLP clicks, PDP count, add-to-cart) for one split
  * and writes them to `$baseDir/$of/ga`.
  *
  * Rows of ProcessedGAData inside the split's [start, end] window are summed per product and day, joined
  * to `productAttrs`, bucketed by whole months before the end date (0 = most recent) and pivoted into
  * `<month>_<feature>` columns for months 0..11. The 3/6/12-month averages and the 12-month standard
  * deviation are appended per feature. Temp checkpoints are deleted at the end.
  *
  * @param of           split key into `dateMap` ("train" / "test")
  * @param dateMap      split -> Map("start" -> date, "end" -> date), dates as yyyy-MM-dd
  * @param productAttrs products to keep (inner join on the product id)
  * @param baseDir      output root
  */
def generateGAXData(of: String, dateMap: Map[String, Map[String, String]], productAttrs: DataFrame, baseDir: String): Unit = {
    val startDate = dateMap(of)("start")
    val endDate = dateMap(of)("end")
    val features = Array("PLPViewsPerDay", "PLPClicksPerDay", "PDPCountPerDay", "TotalAddToCartPerDay")
    // The same sum-as-double expressions serve both the daily and the monthly aggregation.
    val aggExprs = features.map(feature => sum(feature).cast(DoubleType).alias(feature))
    val tempDir = "/data/Archive/bhavesh/inventoryPrediction/temp/generateGAXData"
    // Pin the pivot values: every bucket 0..11 becomes a column even when no row falls into it (the
    // statistics below reference all of them by name), and Spark skips the job that discovers the values.
    val monthBuckets: Seq[Any] = (0L until 12L).toList

    val ga = ParquetToDF.getDF("/data/ecomm/ajio/processed/ProcessedGAData")
        .filter(col(DATE).between(startDate, endDate))
        .groupBy(PRODUCTID, DATE)
        .agg(aggExprs.head, aggExprs.tail: _*)
        .join(productAttrs, Seq(PRODUCTID))
        .transform(saveNLoadDF(s"$tempDir/ga_1"))
        .withColumn("months", getMonths(col(DATE), lit(endDate), lit("yyyy-MM-dd"), lit("yyyy-MM-dd")))
        .filter(col("months") < 12)
        .groupBy(PRODUCTID)
        .pivot("months", monthBuckets)
        .agg(aggExprs.head, aggExprs.tail: _*)
        .na.fill(0)
        .transform(saveNLoadDF(s"$tempDir/ga_2"))
        .transform(getMonthlyMeanStd(3, features, calculateStd = false))
        .transform(getMonthlyMeanStd(6, features, calculateStd = false))
        .transform(getMonthlyMeanStd(12, features, calculateStd = true))

    DFToParquet.putDF(s"$baseDir/$of/ga", ga)
    HdfsUtils.delete(tempDir)
}

/**
  * Builds the monthly wishlist features for one split and writes them to `$baseDir/$of/wishlist`.
  *
  * Distinct wishlist ids are counted per product and day inside the split's [start, end] window, joined to
  * `productAttrs`, bucketed by whole months before the end date (0 = most recent) and summed into
  * `0_wishlist` .. `11_wishlist`. Only the 3- and 6-month averages are appended (the 12-month statistics
  * are disabled). Temp checkpoints are deleted at the end.
  *
  * @param of           split key into `dateMap` ("train" / "test")
  * @param dateMap      split -> Map("start" -> date, "end" -> date), dates as yyyy-MM-dd
  * @param productAttrs products to keep (inner join on the product id)
  * @param baseDir      output root
  */
def generateWishlistData(of: String, dateMap: Map[String, Map[String, String]], productAttrs: DataFrame, baseDir: String): Unit = {
    val startDate = dateMap(of)("start")
    val endDate = dateMap(of)("end")
    val features = Array("wishlist")
    val tempDir = "/data/Archive/bhavesh/inventoryPrediction/temp/generateWishlistData"
    // Pin the pivot values: every bucket 0..11 becomes a column even when no row falls into it, so the
    // output schema is stable, and Spark skips the job that discovers the values.
    val monthBuckets: Seq[Any] = (0L until 12L).toList

    val pivoted = ParquetToDF.getDF("/data/ecomm/ajio/processed/processedWishlist")
        .filter(col(DATE).between(startDate, endDate))
        .groupBy(PRODUCTID, DATE)
        .agg(countDistinct("wishlistid").cast(DoubleType) as "wishlist")
        .join(productAttrs, Seq(PRODUCTID))
        .transform(saveNLoadDF(s"$tempDir/wishlist_1"))
        .withColumn("months", getMonths(col(DATE), lit(endDate), lit("yyyy-MM-dd"), lit("yyyy-MM-dd")))
        .filter(col("months") < 12)
        .groupBy(PRODUCTID)
        .pivot("months", monthBuckets)
        // With two aggregates Spark names the pivot columns "<month>_<alias>"; "temp" exists only to get
        // that naming and is dropped right below.
        .agg(sum("wishlist") as "wishlist", first("wishlist") as "temp")

    val tempCols = pivoted.columns.filter(_.endsWith("temp"))
    val wishlist = pivoted.drop(tempCols: _*).na.fill(0)
        .transform(saveNLoadDF(s"$tempDir/wishlist_2"))
        .transform(getMonthlyMeanStd(3, features, calculateStd = false))
        .transform(getMonthlyMeanStd(6, features, calculateStd = false))
        // .transform(getMonthlyMeanStd(12, features, true))

    DFToParquet.putDF(s"$baseDir/$of/wishlist", wishlist)
    HdfsUtils.delete(tempDir)
}

/**
  * Builds the monthly returns features for one split and writes them to `$baseDir/$of/returns`.
  *
  * "Return" events with a non-negative quantity and non-null user/product ids inside the split's
  * [start, end] window are summed per product and day, joined to `productAttrs`, bucketed by whole months
  * before the end date (0 = most recent) and pivoted into `0_returns` .. `11_returns`. The 3/6/12-month
  * averages and the 12-month standard deviation are appended. Temp checkpoints are deleted at the end.
  *
  * @param of           split key into `dateMap` ("train" / "test")
  * @param dateMap      split -> Map("start" -> date, "end" -> date), dates as yyyy-MM-dd
  * @param productAttrs products to keep (inner join on the product id)
  * @param baseDir      output root
  */
def generateReturnsData(of: String, dateMap: Map[String, Map[String, String]], productAttrs: DataFrame, baseDir: String): Unit = {
    val startDate = dateMap(of)("start")
    val endDate = dateMap(of)("end")
    val features = Array("returns")
    val tempDir = "/data/Archive/bhavesh/inventoryPrediction/temp/generateReturns"
    // Pin the pivot values: every bucket 0..11 becomes a column even when no row falls into it (the
    // statistics below reference all of them by name), and Spark skips the job that discovers the values.
    val monthBuckets: Seq[Any] = (0L until 12L).toList

    val pivoted = ParquetToDF.getDF("/data/ecomm/ajio/processed/interactionsDB")
        .filter(col("event") === "Return" && col(QUANTITY) >= 0 &&
            col(USERID).isNotNull && col(PRODUCTID).isNotNull)
        .filter(col(DATE).between(startDate, endDate))
        .groupBy(PRODUCTID, DATE)
        .agg(sum(QUANTITY).cast(DoubleType) as "returns")
        .join(productAttrs, Seq(PRODUCTID))
        .transform(saveNLoadDF(s"$tempDir/returns_1"))
        .withColumn("months", getMonths(col(DATE), lit(endDate), lit("yyyy-MM-dd"), lit("yyyy-MM-dd")))
        .filter(col("months") < 12)
        .groupBy(PRODUCTID)
        .pivot("months", monthBuckets)
        // With two aggregates Spark names the pivot columns "<month>_<alias>"; "temp" exists only to get
        // that naming and is dropped right below.
        .agg(sum("returns") as "returns", first("returns") as "temp")

    val tempCols = pivoted.columns.filter(_.endsWith("temp"))
    val returns = pivoted.drop(tempCols: _*).na.fill(0)
        .transform(saveNLoadDF(s"$tempDir/returns_2"))
        .transform(getMonthlyMeanStd(3, features, calculateStd = false))
        .transform(getMonthlyMeanStd(6, features, calculateStd = false))
        .transform(getMonthlyMeanStd(12, features, calculateStd = true))

    DFToParquet.putDF(s"$baseDir/$of/returns", returns)
    HdfsUtils.delete(tempDir)
}

/**
  * Builds the monthly distinct-buyer features for one split and writes them to `$baseDir/$of/users`.
  *
  * New-purchase checkout rows inside the split's [start, end] window are joined to `productAttrs`,
  * bucketed by whole months before the end date (0 = most recent), and the distinct user ids per product
  * and bucket are counted into `0_users` .. `11_users`. The 3/6/12-month averages and the 12-month
  * standard deviation are appended. Temp checkpoints are deleted at the end.
  *
  * @param of           split key into `dateMap` ("train" / "test")
  * @param dateMap      split -> Map("start" -> date, "end" -> date), dates as yyyy-MM-dd
  * @param productAttrs products to keep (inner join on the product id)
  * @param baseDir      output root
  */
def generateUsersXData(of: String, dateMap: Map[String, Map[String, String]], productAttrs: DataFrame, baseDir: String): Unit = {
    val startDate = dateMap(of)("start")
    val endDate = dateMap(of)("end")
    val features = Array("users")
    val tempDir = "/data/Archive/bhavesh/inventoryPrediction/temp/generateUsersXData"
    // Pin the pivot values: every bucket 0..11 becomes a column even when no row falls into it (the
    // statistics below reference all of them by name), and Spark skips the job that discovers the values.
    val monthBuckets: Seq[Any] = (0L until 12L).toList

    val pivoted = ParquetToDF.getDF("/data/ecomm/ajio/processed/interactionsDB")
        .filter(col("event") === "Checkout" && col("purchase") === "New"
            && col("productid").isNotNull && col("booked_rev") > 0
            && col("userid").isNotNull && col("quantity") > 0
            && col(DATE).between(startDate, endDate)
        )
        .join(productAttrs, Seq(PRODUCTID))
        .transform(saveNLoadDF(s"$tempDir/interactions_1"))
        .withColumn("months", getMonths(col(DATE), lit(endDate), lit("yyyy-MM-dd"), lit("yyyy-MM-dd")))
        .filter(col("months") < 12)
        .groupBy(PRODUCTID)
        .pivot("months", monthBuckets)
        // With two aggregates Spark names the pivot columns "<month>_<alias>"; "temp" exists only to get
        // that naming and is dropped right below.
        .agg(countDistinct(USERID).cast(DoubleType) as "users", first(USERID) as "temp")

    val tempCols = pivoted.columns.filter(_.endsWith("temp"))
    val users = pivoted.drop(tempCols: _*).na.fill(0)
        .transform(saveNLoadDF(s"$tempDir/interactions_2"))
        .transform(getMonthlyMeanStd(3, features, calculateStd = false))
        .transform(getMonthlyMeanStd(6, features, calculateStd = false))
        .transform(getMonthlyMeanStd(12, features, calculateStd = true))

    DFToParquet.putDF(s"$baseDir/$of/users", users)
    HdfsUtils.delete(tempDir)
}

/**
  * Builds the monthly available-quantity features for one split and writes them to
  * `$baseDir/$of/availableQuantity`.
  *
  * Inventory history rows inside the split's [start, end] window are summed per product and day, joined to
  * `productAttrs`, bucketed by whole months before the end date (0 = most recent) and pivoted into
  * `0_availableQuantity` .. `11_availableQuantity`. The 3/6/12-month averages and the 12-month standard
  * deviation are appended. Temp checkpoints are deleted at the end.
  *
  * @param of           split key into `dateMap` ("train" / "test")
  * @param dateMap      split -> Map("start" -> date, "end" -> date), dates as yyyy-MM-dd
  * @param productAttrs products to keep (inner join on the product id)
  * @param baseDir      output root
  */
def generateAvailabilityQuantityXData(of: String, dateMap: Map[String, Map[String, String]], productAttrs: DataFrame, baseDir: String): Unit = {
    val startDate = dateMap(of)("start")
    val endDate = dateMap(of)("end")
    val features = Array("availableQuantity")
    val tempDir = "/data/Archive/bhavesh/inventoryPrediction/temp/generateAvailabilityQuantityXData"
    // Pin the pivot values: every bucket 0..11 becomes a column even when no row falls into it (the
    // statistics below reference all of them by name), and Spark skips the job that discovers the values.
    val monthBuckets: Seq[Any] = (0L until 12L).toList

    val pivoted = ParquetToDF.getDF("/data/ecomm/ajio/processed/LiveProductsInventoryHistoryLegos")
        // The inventory history identifies products by ITEM_ID; align it with the other sources.
        .withColumnRenamed(ITEM_ID, PRODUCTID)
        .filter(col(DATE).between(startDate, endDate))
        .groupBy(PRODUCTID, DATE)
        .agg(sum("availablequantity").as("availableQuantity"))
        .join(productAttrs, Seq(PRODUCTID))
        .transform(saveNLoadDF(s"$tempDir/inventory_1"))
        .withColumn("months", getMonths(col(DATE), lit(endDate), lit("yyyy-MM-dd"), lit("yyyy-MM-dd")))
        .filter(col("months") < 12)
        .groupBy(PRODUCTID)
        .pivot("months", monthBuckets)
        // With two aggregates Spark names the pivot columns "<month>_<alias>"; "temp" exists only to get
        // that naming and is dropped right below.
        .agg(sum("availableQuantity").cast(DoubleType) as "availableQuantity", first("availableQuantity") as "temp")

    val tempCols = pivoted.columns.filter(_.endsWith("temp"))
    val inventory = pivoted.drop(tempCols: _*).na.fill(0)
        .transform(saveNLoadDF(s"$tempDir/inventory_2"))
        .transform(getMonthlyMeanStd(3, features, calculateStd = false))
        .transform(getMonthlyMeanStd(6, features, calculateStd = false))
        .transform(getMonthlyMeanStd(12, features, calculateStd = true))

    DFToParquet.putDF(s"$baseDir/$of/availableQuantity", inventory)
    HdfsUtils.delete(tempDir)
}

// Run every feature generator for the train split and then the test split, one generator at a time
// (sales, GA, wishlist, returns, users, available quantity), all on the X date windows.
for {
    generate <- Seq[(String, Map[String, Map[String, String]], DataFrame, String) => Unit](
        generateSalesXData,
        generateGAXData,
        generateWishlistData,
        generateReturnsData,
        generateUsersXData,
        generateAvailabilityQuantityXData
    )
    split <- Seq("train", "test")
} generate(split, dateMapX, productAttrs, baseDir)

In [None]:
/**
  * Computes the target for one split and writes it to `$baseDir/$of/target`.
  *
  * The target is the total quantity (column "sales") of new-purchase checkout rows per product inside
  * the split's [start, end] window, restricted to the products in `productAttrs` (inner join).
  *
  * @param of           split key into `dateMap` ("train" / "test")
  * @param dateMap      split -> Map("start" -> date, "end" -> date), dates as yyyy-MM-dd
  * @param productAttrs products to keep; its columns are carried into the output
  * @param baseDir      output root
  */
def computeTarget(of: String, dateMap: Map[String, Map[String, String]], productAttrs: DataFrame, baseDir: String): Unit = {
    val window = dateMap(of)
    val startDate = window("start")
    val endDate = window("end")

    val isNewPurchase = col("event") === "Checkout" && col("purchase") === "New" && col("productid").isNotNull &&
        col("booked_rev") > 0 && col("userid").isNotNull && col("quantity") > 0

    val target = ParquetToDF.getDF("/data/ecomm/ajio/processed/interactionsDB")
        .filter(col(DATE).between(startDate, endDate))
        .filter(isNewPurchase)
        .groupBy(PRODUCTID)
        .agg(sum(QUANTITY) as "sales")
        .join(productAttrs, Seq(PRODUCTID))

    DFToParquet.putDF(s"$baseDir/$of/target", target)
}
// Targets for both splits, computed on the Y date windows.
Seq("train", "test").foreach(split => computeTarget(split, dateMapY, productAttrs, baseDir))

In [None]:
/**
  * Assembles the X and Y datasets of one split from the per-feature outputs.
  *
  * X: full outer join, on the product id, of sales, ga, wishlist, returns, users and availableQuantity
  * (each read from `$xPath/$of/<feature>` without its brick column); numeric nulls are filled with 0 and
  * the result is written to `$basePath/$of/XData`. The join is checkpointed twice under a temp directory,
  * which is deleted at the end.
  * Y: `$yPath/$of/target` without its brick column, written to `$basePath/$of/YData`.
  *
  * @param of       split name ("train" / "test")
  * @param xPath    root of the per-feature outputs
  * @param yPath    root of the target output
  * @param basePath root under which XData / YData are written
  */
def CreateDataset(of: String, xPath: String, yPath: String, basePath: String): Unit = {
    val tempDir = "/data/Archive/bhavesh/inventoryPrediction/temp/createDataset"
    val joinKey = Seq(PRODUCTID)

    // Loads one feature set of this split, minus the brick column.
    def feature(name: String): DataFrame = ParquetToDF.getDF(s"$xPath/$of/$name").drop(SIMILAR_GROUP_LEVEL)

    val features = feature("sales")
        .join(feature("ga"), joinKey, "outer")
        .transform(saveNLoadDF(s"$tempDir/dataset_1"))
        .join(feature("wishlist"), joinKey, "outer")
        .join(feature("returns"), joinKey, "outer")
        .transform(saveNLoadDF(s"$tempDir/dataset_2"))
        .join(feature("users"), joinKey, "outer")
        .join(feature("availableQuantity"), joinKey, "outer")
        .na.fill(0)
    DFToParquet.putDF(s"$basePath/$of/XData", features)

    val target = ParquetToDF.getDF(s"$yPath/$of/target").drop(SIMILAR_GROUP_LEVEL)
    DFToParquet.putDF(s"$basePath/$of/YData", target)

    HdfsUtils.delete(tempDir)
}
// Assemble XData / YData for both splits; features and targets share the same root (baseDir).
Seq("train", "test").foreach { split =>
    CreateDataset(split, baseDir, baseDir, s"/data/Archive/bhavesh/inventoryPrediction/OriginalDataset/date_when_prediction_is_made=$dateForWhichPredictionsAreMade")
}

### Convert local predictions to required format

In [None]:
/**
 * Limits the values of the given columns to the range between two thresholds.
 *
 * Each listed column is rewritten in place: values greater than `maxThreshold` are replaced by
 * `maxThreshold` first, then values smaller than `minThreshold` are replaced by `minThreshold`.
 * Values for which a comparison does not evaluate to true are kept as they are.
 *
 * @param columns      names of the columns to limit
 * @param minThreshold lower limit, 0 by default
 * @param maxThreshold upper limit, positive infinity by default (i.e. no upper limit)
 * @param df           frame to transform; curried so the call fits `Dataset.transform`
 * @return the frame with every listed column limited
 */
def thresholdColumns(columns: Array[String], minThreshold: Double = 0, maxThreshold: Double = Double.PositiveInfinity)(df: DataFrame): DataFrame = {
    columns.foldLeft(df) { (acc, name) =>
        val capped = acc.withColumn(name,
          when(col(name) > maxThreshold, lit(maxThreshold)).otherwise(col(name)))
        capped.withColumn(name, when(col(name) < minThreshold, lit(minThreshold)).otherwise(col(name)))
    }
}

// Inputs shared by convertLocalToRequiredFormat. NOTE(review): every path below is pinned to the
// 2023-04-16 snapshot and does not follow the date passed to convertLocalToRequiredFormat.

// Name of the attribute combination; interpolated into the query-level paths below.
var suffix: String = "colorfamily_pricebucket_styletype_pattern_sleeve_brandname"
// PRODUCTID and "stddev" columns of the product-level past12MonthsStats table.
var prodLevelSigma = ParquetToDF.getDF("/data/Archive/bhavesh/inventoryPrediction/quantityStatsAcrossMonths/date_when_prediction_is_made=2023-04-16/productLevel/past12MonthsStats").select(PRODUCTID, "stddev")
// "query", "stddev" and SIMILAR_GROUP_LEVEL columns of the query-level past12MonthsStats table for this suffix.
var queryLevelSigma = ParquetToDF.getDF(s"/data/Archive/bhavesh/inventoryPrediction/quantityStatsAcrossMonths/date_when_prediction_is_made=2023-04-16/queryLevel/suffix=$suffix/past12MonthsStats").select("query", "stddev", SIMILAR_GROUP_LEVEL)
// Table read from the queryToProductMap directory for this suffix; joined on SIMILAR_GROUP_LEVEL and PRODUCTID.
var productQueryMap = ParquetToDF.getDF(s"/data/Archive/bhavesh/inventoryPrediction/queryToProductMap/date_when_prediction_is_made=2023-04-16/suffix=$suffix")

/**
 * Converts locally produced prediction files for the menShirts and womenKurtas bricks into the
 * stored prediction layout and writes each brick as parquet partitioned by `model`.
 *
 * For each brick the file `$basePath/<brick>/$localFile` is read and its `predictedyQuantity`
 * column is exposed as `predicted`; the brick id is attached as SIMILAR_GROUP_LEVEL and the
 * model name is set to the literal "lr". Predictions are limited with `thresholdColumns`
 * (default lower limit 0), then `lowerBound` / `upperBound` are derived as
 * predicted -/+ 2 * stddev, with `lowerBound` limited the same way.
 *
 *  - `level == "product"`: stddev is joined from `prodLevelSigma` on PRODUCTID and the output is
 *    written to `$basePath/<brick>/productLevelPredictions`.
 *  - any other `level`: predictions are summed per (model, query, SIMILAR_GROUP_LEVEL) through
 *    `productQueryMap`, stddev is joined from `queryLevelSigma`, and the output is written to
 *    `$basePath/<brick>/queryAggregatedPredictions`.
 *
 * @param dateForWhichPredictionsAreMade currently unused; kept so existing call sites keep
 *                                       working. It used to drive a read of the actual-sales
 *                                       data whose result was never consumed, so that read
 *                                       has been removed.
 * @param basePath  directory holding one sub-directory per brick
 * @param level     "product" for product-level output, anything else for query-level output
 * @param localFile name of the prediction file inside each brick directory
 * @param fileType  "csv" reads through CSVToDF with schema inference; anything else reads parquet
 */
def convertLocalToRequiredFormat(dateForWhichPredictionsAreMade: String, basePath: String, level: String, localFile: String, fileType: String = "csv"): Unit = {

    val isProductLevel = level == "product"
    val fileName = if (isProductLevel) "productLevelPredictions" else "queryAggregatedPredictions"

    // Adds the predicted -/+ 2 * stddev band and limits the lower bound via thresholdColumns.
    def withBounds(df: DataFrame): DataFrame = {
        df
        .withColumn("lowerBound", col("predicted") - (lit(2.0) * col("stddev")))
        .withColumn("upperBound", col("predicted") + (lit(2.0) * col("stddev")))
        .transform(thresholdColumns(Array("lowerBound")))
    }

    // Reads one brick's local predictions and shapes them for the requested level.
    def doConversion(brick: String, brickId: String): DataFrame = {
        val localPredictionsPath = s"$basePath/$brick/$localFile"
        val raw = if (fileType == "csv") {
            CSVToDF.getDF(localPredictionsPath, inferSchema = true)
        } else {
            ParquetToDF.getDF(localPredictionsPath)
        }

        val predictions = raw
        .withColumn(SIMILAR_GROUP_LEVEL, lit(brickId))
        .withColumn("model", lit("lr"))
        .withColumn("predicted", col("predictedyQuantity"))
        .select(PRODUCTID, SIMILAR_GROUP_LEVEL, "predicted", "model")
        .transform(thresholdColumns(Array("predicted")))

        if (isProductLevel) {
            withBounds(predictions.join(prodLevelSigma, Seq(PRODUCTID)))
        } else {
            val perQuery = predictions
            .join(productQueryMap, Seq(SIMILAR_GROUP_LEVEL, PRODUCTID))
            .groupBy("model", "query", SIMILAR_GROUP_LEVEL)
            .agg(sum("predicted") as "predicted")
            .join(queryLevelSigma, Seq(SIMILAR_GROUP_LEVEL, "query"))
            withBounds(perQuery)
        }
    }

    // Brick directory name -> brick id, processed in this order.
    val bricks = Seq("menShirts" -> "830216013", "womenKurtas" -> "830303011")
    bricks.foreach { case (brick, brickId) =>
        DFToParquet.putDF(s"$basePath/$brick/$fileName", doConversion(brick, brickId), partitionedColumn = "model")
    }
}

In [None]:
/**
 * Converts one experiment's local predictions at both product and query level.
 *
 * Calls `convertLocalToRequiredFormat` twice (level "product", then "query") on the
 * experiment's `predictions/ModelForEachBrickProductLevel` directory.
 *
 * @param exp      experiment name; the directory used is `exp_<exp>`
 * @param date     prediction date forwarded to `convertLocalToRequiredFormat`
 * @param filename name of the local prediction file inside each brick directory
 * @param format   file type forwarded to `convertLocalToRequiredFormat` ("csv" or parquet otherwise)
 */
def convertLocal(exp: String, date: String, filename: String, format: String): Unit = {
    val predictionsDir = s"/data/Archive/bhavesh/inventoryPrediction/experiments/exp_$exp/predictions/ModelForEachBrickProductLevel"
    Seq("product", "query").foreach { level =>
        convertLocalToRequiredFormat(date, predictionsDir, level, filename, format)
    }
}

In [None]:
// Convert the CSV predictions (predictions_pca.csv) of experiment "pca" for 2023-04-16.
convertLocal(
    exp = "pca",
    date = "2023-04-16",
    filename = "predictions_pca.csv",
    format = "csv"
)

In [None]:
// Convert the CSV predictions (predictions_4_months.csv) of experiment "trials" for 2023-04-16.
convertLocal(
    exp = "trials",
    date = "2023-04-16",
    filename = "predictions_4_months.csv",
    format = "csv"
)

In [None]:
// Convert the parquet predictions ("predictions") of experiment "neural_network_new" for 2023-04-16.
convertLocal(
    exp = "neural_network_new",
    date = "2023-04-16",
    filename = "predictions",
    format = "parquet"
)

In [None]:
/**
 * Prints and exports the metrics of an experiment, first at product level and then at query level.
 *
 * For each level the metrics CSV under
 * `exp_<exp>/metrics/productLevelModelForEachBrick/<level>` is loaded, its schema is printed,
 * and the values are pivoted into one row per model and one column per metric, each cell
 * reading "Men Shirts: <value>\nWomen Kurtas: <value>". The table is shown on the console and
 * written as a single-partition CSV to `/data/Archive/plp/bhavesh/temp/ip/pl_show`
 * (product level) or `.../ql_show` (query level).
 *
 * Both the loaded metrics and the pivoted table are cached while in use and released
 * afterwards; the pivoted table was previously left cached.
 *
 * @param exp experiment name; the directory used is `exp_<exp>`
 */
def metricsShow(exp: String): Unit = {
    val metricsDir = s"/data/Archive/bhavesh/inventoryPrediction/experiments/exp_${exp}/metrics/productLevelModelForEachBrick"

    // Loads, pivots, shows and exports the metrics of one level.
    def showLevel(level: String, outputName: String): Unit = {
        val metrics = CSVToDF.getDF(s"$metricsDir/$level", inferSchema = true)
            .orderBy("model", "similargrouplevel")
            .drop(DATE)
            .persist()
        try {
            metrics.printSchema()

            // First pivot: one column per brick id; second pivot: one column per metric.
            val summary = metrics.orderBy("metric")
                .groupBy("model", "metric").pivot("similargrouplevel").agg(first("value"))
                .orderBy("model", "metric")
                .withColumn("final", concat(lit("Men Shirts: "), col("830216013"), lit("\n"), lit("Women Kurtas: "), col("830303011")))
                .groupBy("model").pivot("metric").agg(first("final") as "final")
                .persist()
            try {
                summary.orderBy("model").show(false)
                DFToCSV.putDF(s"/data/Archive/plp/bhavesh/temp/ip/$outputName", summary.orderBy("model").coalesce(1))
            } finally {
                summary.unpersist()
            }
        } finally {
            metrics.unpersist()
        }
    }

    showLevel("productLevel", "pl_show")
    showLevel("queryLevel", "ql_show")
}

In [None]:
// Show and export the product-level and query-level metrics of experiment "neural_network_new".
metricsShow("neural_network_new")

### Miscellaneous

In [None]:
// Source dataset of the 2023-04-16 prediction date and the location of the transformed copy.
val originalDatasetPath: String = "/data/Archive/bhavesh/inventoryPrediction/OriginalDataset/date_when_prediction_is_made=2023-04-16"
val transformedDatasetPath: String = "/data/Archive/bhavesh/inventoryPrediction/temp/local"

// Regenerate both splits with normalization switched off.
Seq("train", "test").foreach { split =>
    generateTransformedTrainForProductLevelDataset(split, originalDatasetPath, transformedDatasetPath, performNormalization = false)
}

In [None]:
// Selected product attributes, joined onto the training YData on PRODUCTID.
var productAttrs = (
    ParquetToDF.getDF("/data/Archive/inventory/productAttributesLegosFNL")
        .select(PRODUCTID, SIMILAR_GROUP_LEVEL, "colorfamily", "brandname", "sleeve", "pattern", "styletype")
)
var df = (
    ParquetToDF.getDF("/data/Archive/bhavesh/inventoryPrediction/OriginalDataset/date_when_prediction_is_made=2023-04-16/train/YData")
        .join(productAttrs, Seq(PRODUCTID))
)

In [None]:
// Print the column names and types of the current `df`.
df.printSchema

In [None]:
// Count the rows of the current `df`.
df.count

In [None]:
// Write the current `df` as parquet under the embeddings directory.
DFToParquet.putDF("/data/Archive/bhavesh/inventoryPrediction/embeddings/data", df)

In [None]:
import org.apache.spark.sql.functions.{udf, explode}
import org.apache.spark.sql.types._


var predictionsDate = "2023-04-16"
var rundate = "2023-04-15"
var suffix = s"$rundate/queryProductMap_colorfamily_pricebucket_styletype_pattern_sleeve_brandname"

// Actual vs predicted values per query; predictions that are not greater than 0 become 0.0.
var actualDf = (
    spark.read.parquet("/data/Archive/bhavesh/inventoryPrediction/experiments/exp_combined_arima_my_exp/metricsDF/date=2023-04-16")
        .select("similargrouplevel", "query", "actual", "predicted")
        .withColumn("predicted", when(col("predicted") > 0.0, col("predicted")).otherwise(0.0))
)

// Distinct per-query stddev rows; `.na.fill(0.0)` is applied to the selection.
var statssdf = (
    spark.read.parquet(s"/data/Archive/inventory/$suffix/past12MonthsStats")
        .select("query", "similargrouplevel", "stddev")
        .distinct()
        .na.fill(0.0)
)

actualDf = actualDf.join(statssdf, Seq("query", "similargrouplevel"))

// Distinct query/template rows, labelled with a category name for the two known brick ids.
var queriesTemplateMap = (
    spark.read.parquet(s"/data/ecomm/ajio/processed/inventory/$suffix")
        .select("query", "template", "similargrouplevel")
        .distinct()
        .withColumn("category_name",
            when(col("similargrouplevel") === "830216013", lit("Men-Shirts"))
                .when(col("similargrouplevel") === "830303011", lit("Women-Kurtas")))
)

// Right join keeps every row of queriesTemplateMap; `.na.fill(0.0)` is applied to the joined frame.
var finalDfSimulator = (
    actualDf.join(queriesTemplateMap, Seq("similargrouplevel", "query"), "right")
        .na.fill(0.0)
        .withColumn("actual", col("actual").cast(DoubleType))
        .withColumn("predicted", col("predicted").cast(DoubleType))
)
finalDfSimulator.show(false)

finalDfSimulator.write.mode("overwrite").parquet("/data/ecomm/ajio/processed/inventory/integrationSimulator/predictions")

In [None]:
var suffix = "2023-04-15/queryProductMap_colorfamily_pricebucket_styletype_pattern_sleeve_brandname"

// Attributes and price of the products of the two brick ids; rows with a null productid are dropped.
var productAttr = (
    spark.read.parquet(s"/data/Archive/inventory/$suffix/temp/priceproductAttr")
        .filter(col("similargrouplevel").isin(List("830216013", "830303011"): _*))
        .select(
            col("productid") as "productid",
            col("similargrouplevel"),
            col("colorfamily"),
            col("brandname"),
            col("styletype"),
            col("pattern"),
            col("sleeve").as("sleeve"),
            col("WeightedAvgPrice").as("price"))
        .na.drop(Seq("productid"))
)

// Distinct (productid, title) pairs of the same two brick ids.
var productTitle = (
    spark.read.parquet("/data/Archive/inventory/productAttributesLegosFNL_unnormalised")
        .filter(col("similargrouplevel").isin(List("830216013", "830303011"): _*))
        .select(col("itemid") as "productid", col("title"))
        .distinct
)

productAttr = productAttr.join(productTitle, Seq("productid"))

var brickDetails = spark.read.parquet("/data/ecomm/ajio/metadata/BrickDetailsLegos")
var imageUrls = (
    spark.read.parquet("/data/ecomm/ajio/processed/productImageURLs")
        .withColumnRenamed("pcode", "productid")
)

// Product details joined with brick details and image URLs, labelled with a category name.
(
    productAttr
        .join(brickDetails, Seq("similargrouplevel"))
        .join(imageUrls, Seq("productid"))
        .withColumn("category_name",
            when(col("similargrouplevel") === "830216013", lit("Men-Shirts"))
                .when(col("similargrouplevel") === "830303011", lit("Women-Kurtas")))
        .write.mode("overwrite")
        .parquet("/data/ecomm/ajio/processed/inventory/integrationSimulator/productDetails")
)

// Query-level monthly quantities with pastmonth > 0, labelled with a category name.
var queryMonthly = (
    spark.read.parquet(s"/data/Archive/inventory/$suffix/queryLevelMonthlyquantity")
        .filter(col("pastmonth") > 0)
)

(
    queryMonthly
        .withColumn("category_name",
            when(col("similargrouplevel") === "830216013", lit("Men-Shirts"))
                .when(col("similargrouplevel") === "830303011", lit("Women-Kurtas")))
        .write.mode("overwrite")
        .parquet("/data/ecomm/ajio/processed/inventory/integrationSimulator/monthlySales")
)