In [None]:
%%init_spark
launcher.jars = ["/app/setup/commons.jar"]
launcher.conf.spark.app.name = "bhavesh_notebook"
launcher.conf.spark.queue = "default"
launcher.conf.spark.local.dir = "/app/tmp"
launcher.conf.spark.sql.shuffle.partitions = 210
launcher.conf.spark.sql.shuffle.minPartitions = 20
launcher.conf.spark.driver.memory = "50g"
launcher.conf.spark.ui.showConsoleProgress = "true"
launcher.master = "local[15]"

In [None]:
import ai.couture.obelisk.commons.Constants.{DB_PRODUCT_INTERACTIONS_PATH, PROCESSED_GA_DATA, GA_USER_CLICK_DATA}
import ai.couture.obelisk.commons.Constants.STANDARD_COL_NAMES._
import ai.couture.obelisk.commons.Constants._
import ai.couture.obelisk.commons.io._
import ai.couture.obelisk.commons.utils.DateTimeUtil._
import org.apache.spark.sql._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import spark.implicits._

// ---------------------------------------------------------------------------
// Name constants (column / key names) used by the cells below.
// ---------------------------------------------------------------------------
val DAYS: String               = "daysInHis"
val DATASET: String            = "dataset"
val MODEL: String              = "model"
val TARGET: String             = "target"
val REVENUE: String            = "revenue"
val DISTINCT_PURCHASERS: String = "distinctPurchasers"
val SALES: String              = "sales"
val REVENUE_PER_VIEW: String   = "RevenuePerView"
val QUANTITY_PER_VIEW: String  = "QuantityPerView"
val WISHLIST_PER_VIEW: String  = "WishlistPerView"
val CONSIDERATION: String      = "Consideration"
val CONVERSION: String         = "Conversion"
val PRODUCT_CLICKS: String     = "productClicks"
val PRODUCT_VIEWS: String      = "productViews"
val L1_NAME: String            = "L1_NAME"
val L2_NAME: String            = "L2_NAME"

// ---------------------------------------------------------------------------
// preprod paths
// ---------------------------------------------------------------------------
// DAG init metadata
val B2B_PLP_RELEVANCE_DAG_INIT_PATH: String = "/data/ecomm/ajiob2b/metadata/plp/init"
val B2B_PLP_UC_DAG_INIT_PATH: String = "/data/ecomm/ajiob2b/metadata/plp/uc/init"

// Locations under processed/20230511
val STYLECODE_TO_JIOCODE_MAPPING_PATH: String = "/data/ecomm/ajiob2b/processed/20230511/StyleCodeToJioCodeMapping"
val B2B_BRICK_DETAILS_PATH: String = "/data/ecomm/ajiob2b/processed/20230511/metadata/BrickDetails"
val B2B_PROCESSED_BRICK_DETAILS_PATH: String = "/data/ecomm/ajiob2b/processed/20230511/metadata/processedBrickDetailsPLP"
val COMBINED_USER_PRODUCT_INTERACTIONS_PATH: String = "/data/ecomm/ajiob2b/processed/20230511/CombinedUPI"
val COMBINED_PACK_ATTRIBUTES_PATH: String = "/data/ecomm/ajiob2b/processed/20230511/CombinedPackAttributes"
val USER_CLUSTER_MAPPING_PATH: String = "/data/ecomm/ajiob2b/processed/20230511/AssignedUsersClusters.json"
val WISHLIST_PATH: String = "/data/ecomm/ajiob2b/processed/20230511/WishlistMaster"

// Locations under processed/20230512
val B2B_IMAGE_PER_STYLE_CODE_PATH: String = "/data/ecomm/ajiob2b/processed/20230512/ImagePerStyleCode"
val REFINED_STYLE_ATTRIBUTES_PATH: String = "/data/ecomm/ajiob2b/processed/20230512/RefinedStyleAttributes"

// Locations without a dated folder
val LIVE_STYLES_DATE_WISE_PATH: String = "/data/ecomm/ajiob2b/processed/LiveStyleCodesDateWise"
val LIVE_STYLES_DATE_WISE_PATH_V2: String = "/data/ecomm/ajiob2b/processed/LiveStyleCodesDateWiseV2"
val GA_INTERACTIONS_PATH: String = "/data/ecomm/ajiob2b/processed/ga/ULevelProductGA"

// Alternative locations, kept commented out for reference:
// val B2B_PLP_RELEVANCE_DAG_INIT_PATH: String = "/data/ecomm/ajiob2b/metadata/plp/init"
// val LIVE_STYLES_DATE_WISE_PATH = "/data/ecomm/ajiob2b/processed/LiveStyleCodesDateWise"
// val LIVE_STYLES_DATE_WISE_PATH_V2 = "/data/ecomm/ajiob2b/processed/LiveStyleCodesDateWiseV2"
// val COMBINED_USER_PRODUCT_INTERACTIONS_PATH = "/data/ecomm/ajiob2b/processed/CombinedUPI"
// val GA_INTERACTIONS_PATH = "/data/ecomm/ajiob2b/processed/ga/ULevelProductGA"
// val USER_CLUSTER_MAPPING_PATH = "/data/ecomm/ajiob2b/processed/output/clustering/AssignedUsersClusters.json"
// val WISHLIST_PATH = "/data/ecomm/ajiob2b/processed/WishlistMaster"

In [None]:
// "Order" interactions dated 2023-04-07..2023-05-06 (inclusive) whose versionId is null,
// reduced to the rows on the earliest DATE of each (INTERACTION_CODE, "styleproduct") pair.
// dense_rank gives every row on that earliest date rank 1, so same-date ties are all kept.
var upi = {
  val byPairOldestFirst = Window.partitionBy(INTERACTION_CODE, "styleproduct").orderBy(DATE)

  ParquetToDF
    .getDF(COMBINED_USER_PRODUCT_INTERACTIONS_PATH)
    .where(col(DATE).between("2023-04-07", "2023-05-06"))
    .where(col(INTERACTIONTYPE) === "Order")
    .where(col("versionId").isNull)
    .withColumn("rank", dense_rank().over(byPairOldestFirst))
    .where(col("rank") === 1)
    .drop("rank")
}

// GA rows with DATE re-rendered from "yyyyMMdd" to "yyyy-MM-dd",
// then limited to 2023-04-07..2023-05-06 (inclusive).
var ga = {
  val dashedDate = date_format(to_timestamp(col(DATE).cast(StringType), "yyyyMMdd"), "yyyy-MM-dd")

  ParquetToDF
    .getDF(GA_INTERACTIONS_PATH)
    .withColumn(DATE, dashedDate)
    .where(col(DATE).between("2023-04-07", "2023-05-06"))
}

// Date-wise live style codes (V2 location) with DATE re-rendered from "yyyyMMdd" to "yyyy-MM-dd".
var liveStyles = {
  val dashedDate = date_format(to_timestamp(col(DATE).cast(StringType), "yyyyMMdd"), "yyyy-MM-dd")

  ParquetToDF.getDF(LIVE_STYLES_DATE_WISE_PATH_V2).withColumn(DATE, dashedDate)
}

// Live-styles dataset written under processed/plp.
var liveStylesData = ParquetToDF.getDF("/data/ecomm/ajiob2b/processed/plp/liveStyles")

### Train/test individual checks

In [None]:
// Train split of the "GARelated" individual features (Algorithm=GlobalPLP); print its schema.
var df = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/features/individual/GARelated/dataset=train/Algorithm=GlobalPLP"
)
df.printSchema()

In [None]:
// Row counts bucketed by the character length of "stylecode".
(
  df.groupBy(length(col("stylecode")))
    .count()
    .show()
)

In [None]:
// Train split of the "UPIRelated" individual features (Algorithm=GlobalPLP); print its schema.
var df = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/features/individual/UPIRelated/dataset=train/Algorithm=GlobalPLP"
)
df.printSchema()

In [None]:
// Per DAYS value: distinct style codes, summed "revenue" and summed "quantity", ordered by DAYS.
(
  df.groupBy(DAYS)
    .agg(
      countDistinct(STYLECODE).as("count"),
      sum("revenue").as("revenue"),
      sum("quantity").as("quantity")
    )
    .orderBy(DAYS)
    .show(truncate = false)
)

In [None]:
// Per-DATE order totals, restricted to style codes that have a live-style row on 2023-05-09.
// Orders are matched to live-style rows on (STYLECODE, DATE).
{
  val codesLiveOnCutoff = liveStyles.filter(col(DATE) === "2023-05-09").select(STYLECODE)
  val liveRowsForThoseCodes = liveStyles.join(codesLiveOnCutoff, Seq(STYLECODE))

  upi
    .withColumnRenamed("styleproduct", STYLECODE)
    .join(liveRowsForThoseCodes, Seq(STYLECODE, DATE))
    .groupBy(DATE)
    .agg(
      countDistinct(STYLECODE).as("count"),
      sum("orderprice").as("revenue"),
      sum("orderquantity").as("quantity")
    )
    .orderBy(DATE)
    .show(30, false)
}

### yData individual checks

In [None]:
// yData split of the "UPIRelated" individual features (Algorithm=GlobalPLP); print its schema.
var df = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/features/individual/UPIRelated/dataset=yData/Algorithm=GlobalPLP"
)
df.printSchema()

In [None]:
// Same summary as for the train split: distinct styles, revenue and quantity per DAYS value.
{
  val distinctStyles = countDistinct(STYLECODE).as("count")
  val totalRevenue = sum("revenue").as("revenue")
  val totalQuantity = sum("quantity").as("quantity")

  df.groupBy(DAYS)
    .agg(distinctStyles, totalRevenue, totalQuantity)
    .orderBy(DAYS)
    .show(truncate = false)
}

In [None]:
// Overall distinct styles / revenue / quantity for order rows matched on (STYLECODE, DATE)
// to the live-style rows dated 2023-05-09.
// NOTE(review): DATE is a join key and `upi` is restricted to 2023-04-07..2023-05-06 where it is
// built, so matching against rows pinned to 2023-05-09 looks like it can never succeed.
// Confirm the intended date or join keys.
{
  val liveOnCutoff = liveStyles.filter(col(DATE) === "2023-05-09")

  upi
    .withColumnRenamed("styleproduct", STYLECODE)
    .join(liveOnCutoff, Seq(STYLECODE, DATE))
    .agg(
      countDistinct(STYLECODE).as("count"),
      sum("orderprice").as("revenue"),
      sum("orderquantity").as("quantity")
    )
    .show()
}

### XData sanity checks

In [None]:
// Train split of the XData feature set (Algorithm=GlobalPLP); print its schema.
var features = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/features/XData/dataset=train/Algorithm=GlobalPLP"
)
features.printSchema()

In [None]:
// Narrow `features` to the style code plus the "1_"-prefixed columns inspected below.
var df = {
  val wantedColumns = Seq(
    STYLECODE,
    "1_RevenuePerView",
    "1_Consideration",
    "1_QuantityPerView",
    "1_WishlistPerView",
    "1_revenue",
    "1_quantity",
    "1_productViews"
  )
  features.select(wantedColumns.map(col): _*)
}
df.printSchema()

In [None]:
// First five rows whose "1_RevenuePerView" differs from 0.0, untruncated.
(
  df.where(col("1_RevenuePerView") =!= 0.0)
    .show(numRows = 5, truncate = false)
)

In [None]:
// Summary statistics for the three raw "1_" columns.
(
  features
    .select("1_revenue", "1_quantity", "1_productViews")
    .describe()
    .show(truncate = false)
)

In [None]:
// Hand calculation: first ratio divided by (2.0 - second ratio).
// Presumably reproduces one style's per-view feature value; TODO confirm which one.
{
  val firstRatio = 33092.64 / 339110.92000000004
  val secondRatio = 302.0 / 3076.0
  firstRatio / (2.0 - secondRatio)
}

In [None]:
// Summed "orderprice" for styleproduct 420392462 on 2023-04-28.
(
  upi
    .where(col(DATE) === "2023-04-28")
    .where(col("styleproduct") === "420392462")
    .agg(sum("orderprice"))
    .show()
)

### YData sanity checks

In [None]:
// YData for Algorithm=GlobalPLP, clusterid=global; print its schema.
var y = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/features/YData/Algorithm=GlobalPLP/clusterid=global"
)
y.printSchema()

In [None]:
// Summary statistics for every column of `y`, untruncated.
y.describe().show(truncate = false)

In [None]:
// Revenue and quantity per styleproduct on 2023-04-30 (first five groups shown).
(
  upi
    .where(col(DATE) === "2023-04-30")
    .groupBy("styleproduct")
    .agg(
      sum("orderprice").as("revenue"),
      sum("orderquantity").as(QUANTITY)
    )
    .show(numRows = 5, truncate = false)
)

In [None]:
// Summed product views on 2023-04-30 for three specific style codes.
{
  val stylesOfInterest = Seq("420392467", "464589772", "465586796")

  ga.where(col(DATE) === "2023-04-30")
    .groupBy(STYLECODE)
    .agg(sum(PRODUCT_VIEWS).as(PRODUCT_VIEWS))
    .where(col(STYLECODE).isin(stylesOfInterest: _*))
    .show()
}

In [None]:
// Largest per-style product-view total on 2023-04-30.
{
  val viewsPerStyle = ga
    .where(col(DATE) === "2023-04-30")
    .groupBy(STYLECODE)
    .agg(sum(PRODUCT_VIEWS).as(PRODUCT_VIEWS))

  viewsPerStyle.agg(max("productViews")).show()
}

In [None]:
// Largest per-style revenue and quantity totals on 2023-04-30.
{
  val totalsPerStyle = upi
    .where(col(DATE) === "2023-04-30")
    .groupBy("styleproduct")
    .agg(sum("orderprice").as("revenue"), sum("orderquantity").as(QUANTITY))

  totalsPerStyle
    .agg(max("revenue"), max(QUANTITY))
    .show(numRows = 5, truncate = false)
}

In [None]:
// `y` rows for the same three style codes checked against GA above.
{
  val stylesOfInterest = Seq("420392467", "464589772", "465586796")
  y.where(col(STYLECODE).isin(stylesOfInterest: _*)).show()
}

In [None]:
// Three hand-computed ratios; nr and nq are each divided by (2 - nv) and printed as one pair.
val nv = 237 / 2603.0
val nr = 66405.73000000001 / 415225.44000000006
val nq = 77 / 240.0
print {
  val divisor = 2 - nv
  (nr / divisor, nq / divisor)
}

### Combined rankings checks

In [None]:
// Three datasets under processed/plp used by the combined-ranking checks.
var rankings =
  ParquetToDF.getDF("/data/ecomm/ajiob2b/processed/plp/rankings")
var weights =
  ParquetToDF.getDF("/data/ecomm/ajiob2b/processed/plp/weights")
var combinedRanking =
  ParquetToDF.getDF("/data/ecomm/ajiob2b/processed/plp/combinedRanking")

In [None]:
// Column layout of the rankings dataset.
rankings.printSchema()

In [None]:
// Column layout of the combined-ranking dataset.
combinedRanking.printSchema()

In [None]:
// Rows with PRODUCTRANK of at most 5, ordered by ALGORITHM then CLUSTERID.
(
  combinedRanking
    .where(col(PRODUCTRANK) <= 5)
    .orderBy(ALGORITHM, CLUSTERID)
    .show(truncate = false)
)

In [None]:
// GlobalPLP ranking rows for style code 465645536.
(
  rankings
    .where(col("Algorithm") === "GlobalPLP" && col(STYLECODE) === "465645536")
    .show(numRows = 5, truncate = false)
)

In [None]:
// Number of rows after joining rankings to weights on ("target", "Algorithm").
{
  val joinKeys = Seq("target", "Algorithm")
  rankings.join(weights, joinKeys).count()
}

In [None]:
// Test split of the standardized dataset; print its schema.
var XData = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/standardizedDataset/test"
)
XData.printSchema()

In [None]:
// Prefix each target name with "yData" unless it already starts with that prefix.
var targets = Array("RevenuePerView", "QuantityPerView")
targets.map {
  case alreadyPrefixed if alreadyPrefixed.startsWith("yData") => alreadyPrefixed
  case bareName => s"yData$bareName"
}

In [None]:
// Combined ranking for model=xgb, Algorithm=SegmentPLP; print its schema.
var df = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/combinedRanking/model=xgb/Algorithm=SegmentPLP"
)
df.printSchema()

### Target data analysis

In [None]:
// Whole YData feature directory (all partitions); print its schema.
var df = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/features/YData"
)
df.printSchema()

In [None]:
// "global" cluster rows whose yDataRevenuePerView exceeds 0.5.
(
  df.where(col("yDataRevenuePerView") > 0.5 && col(CLUSTERID) === "global")
    .show(numRows = 5, truncate = false)
)

In [None]:
// Day-by-day revenue (orders) and views (GA) for style 420441999, newest date first.
(
  upi
    .where(col("styleproduct") === "420441999")
    .groupBy(DATE)
    .agg(sum("orderprice").as("revenue"))
    .orderBy(col(DATE).desc)
    .show(truncate = false)
)
(
  ga
    .where(col(STYLECODE) === "420441999")
    .groupBy(DATE)
    .agg(sum("productViews").as("views"))
    .orderBy(col(DATE).desc)
    .show(truncate = false)
)

### Brick details transformation

In [None]:
// Brick hierarchy flattening.
//   Input:  B2B_BRICK_DETAILS_PATH           (processed/20230511/metadata/BrickDetails)
//   Output: B2B_PROCESSED_BRICK_DETAILS_PATH (processed/20230511/metadata/processedBrickDetailsPLP)
//
// The brick table is self-joined to attach, to every brick whose id is longer than 6 characters,
// the name of the brick whose id equals its first 4 characters (L1) and the name of the brick
// whose id equals its first 6 characters (L2).
// Both joins are inner joins, so a brick without a matching L1 or L2 row is dropped.
var brickDetails = {
  // substring positions are 1-based: position 1 is the first character.
  val withParentCodes = spark.read
    .parquet(B2B_BRICK_DETAILS_PATH)
    .withColumn("L1_CODE", substring(col("brickid"), 1, 4))
    .withColumn("L2_CODE", substring(col("brickid"), 1, 6))

  // Lookup views of the same table: brick id -> name, exposed under the L1 / L2 column names.
  val l1Lookup = withParentCodes.select(col("brickname").as(L1_NAME), col("brickid")).as("bd2")
  val l2Lookup = withParentCodes.select(col("brickname").as(L2_NAME), col("brickid")).as("bd3")

  withParentCodes
    .as("bd1")
    .join(l1Lookup, col("bd2.brickid") === col("bd1.L1_CODE"))
    .join(l2Lookup, col("bd3.brickid") === col("bd1.L2_CODE"))
    .filter(length(col("bd1.brickid")) > 6)
    .select("bd1.brickid", L1_NAME, L2_NAME, "brickname")
    .dropDuplicates()
    .toDF("brickid", "l1name", "l2name", "brickname")
}

brickDetails.show(5, false)
DFToParquet.putDF(B2B_PROCESSED_BRICK_DETAILS_PATH, brickDetails)

In [None]:
// Read back the processed brick details and print the schema.
// Uses the shared path constant so this cell cannot drift from the location the data is written to.
var brickDetails = ParquetToDF.getDF(B2B_PROCESSED_BRICK_DETAILS_PATH)
brickDetails.printSchema()

### Miscellaneous

In [None]:
// Combined pack attributes; print the schema.
var df = ParquetToDF.getDF(
  COMBINED_PACK_ATTRIBUTES_PATH
)
df.printSchema()

In [None]:
// Peek at the first five rows.
df.show(numRows = 5)

In [None]:
// One column name per line.
for (columnName <- df.columns) println(columnName)

In [None]:
// Column layout of the filtered order interactions.
upi.printSchema()

In [None]:
// A single untruncated sample row.
upi.show(numRows = 1, truncate = false)

In [None]:
// Order-level and pack-level price/quantity sums per styleproduct on 2023-05-06.
(
  upi
    .where(col(DATE) === "2023-05-06")
    .groupBy("styleproduct")
    .agg(
      sum("orderprice"),
      sum("packprice"),
      sum("orderquantity"),
      sum("packquantity")
    )
    .show(truncate = false)
)

In [None]:
// Up to 33 rows for interactionCode 49408387 on 2023-05-06.
(
  upi
    .where(col("interactionCode") === "49408387")
    .where(col(DATE) === "2023-05-06")
    .show(numRows = 33, truncate = false)
)

In [None]:
// Pack price and pack quantity totals for interactionCode 49408387.
(
  upi
    .where(col("interactionCode") === "49408387")
    .agg(sum("packprice"), sum("packquantity"))
    .show(truncate = false)
)

In [None]:
// Unfiltered combined user-product interactions, read straight from parquet.
var upi_full = spark.read
  .parquet(COMBINED_USER_PRODUCT_INTERACTIONS_PATH)

In [None]:
// Distinct "date" values on which interactionCode 49408387 appears.
(
  upi_full
    .where($"interactionCode" === "49408387")
    .select("date")
    .distinct()
    .show(truncate = false)
)

In [None]:
// The row with the greatest "date" value.
(
  upi_full
    .orderBy(col("date").desc)
    .show(numRows = 1, truncate = false)
)

In [None]:
// Number of distinct rows for interactionCode 49408387 with a non-zero packquantity.
(
  upi_full
    .where($"interactionCode" === "49408387")
    .where($"packquantity" =!= 0)
    .distinct()
    .count()
)

In [None]:
// Pack quantity total for interactionCode 49408387 in the filtered orders.
(
  upi
    .where(col("interactionCode") === "49408387")
    .agg(sum("packquantity"))
    .show(truncate = false)
)

In [None]:
// Distinct interaction types recorded for interactionCode 49408387.
(
  upi
    .where(col("interactionCode") === "49408387")
    .select("interactiontype")
    .distinct()
    .show(truncate = false)
)

In [None]:
// Distinct unfiltered rows for interactionCode 49408387.
(
  upi_full
    .where(col("interactionCode") === "49408387")
    .distinct()
    .show(truncate = false)
)

In [None]:
// The five styleproducts with the most rows for interactionCode 49408387.
(
  upi_full
    .where(col("interactionCode") === "49408387")
    .groupBy("styleproduct")
    .count()
    .orderBy(col("count").desc)
    .show(numRows = 5, truncate = false)
)

In [None]:
// Rows where both interactionCode and styleproduct equal 420377466.
{
  val code = "420377466"
  upi_full
    .where(col("interactionCode") === code && col("styleproduct") === code)
    .show(numRows = 5, truncate = false)
}

In [None]:
// Item ids of interactionCode 420377466 seen with packquantity 0 but never with packquantity 1.
{
  val rowsForCode = upi_full.where(col("interactionCode") === "420377466")
  val itemsWithZero = rowsForCode.where(col("packquantity") === 0).select("itemid").distinct()
  val itemsWithOne = rowsForCode.where(col("packquantity") === 1).select("itemid").distinct()

  itemsWithZero.except(itemsWithOne).show()
}

In [None]:
// Refined style attributes; print the schema.
// Reads via the shared path constant instead of repeating the literal location.
var refineStyleAttributes = spark.read.parquet(REFINED_STYLE_ATTRIBUTES_PATH)
refineStyleAttributes.printSchema()

In [None]:
// One row per style code with the max "stylename" and max "brandname" in its group.
(
  refineStyleAttributes
    .groupBy(STYLECODE)
    .agg(max("stylename"), max("brandname"))
    .show(numRows = 5, truncate = false)
)

In [None]:
// UC ranking output joined with style attributes; print the schema.
var df = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/output/uc/RankingWithStyleAttributes"
)
df.printSchema()

In [None]:
// Per cluster: distinct style codes ("count") next to the plain style-code count ("count2").
(
  df.groupBy(CLUSTERID)
    .agg(
      countDistinct(STYLECODE).as("count"),
      count(STYLECODE).as("count2")
    )
    .show()
)

In [None]:
// Combined ranking for model=xgb (all algorithms); print the schema.
var df2 = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/combinedRanking/model=xgb"
)
df2.printSchema()

In [None]:
// Row count per cluster.
(
  df2.groupBy(CLUSTERID)
    .count()
    .show()
)

In [None]:
// Image-per-style-code dataset; print the schema.
var images = ParquetToDF.getDF(
  B2B_IMAGE_PER_STYLE_CODE_PATH
)
images.printSchema()

In [None]:
// Daily view and click totals for style code 466037126, newest date first.
(
  ga.where(col(STYLECODE) === "466037126")
    .groupBy(DATE)
    .agg(
      sum(PRODUCT_VIEWS).as(PRODUCT_VIEWS),
      sum(PRODUCT_CLICKS).as(PRODUCT_CLICKS)
    )
    .orderBy(col(DATE).desc)
    .show(truncate = false)
)

In [None]:
// Reload the processed brick details and print the schema.
// Reads via the shared path constant instead of repeating the literal location.
var brickDetails = ParquetToDF.getDF(B2B_PROCESSED_BRICK_DETAILS_PATH)
brickDetails.printSchema()

In [None]:
// Rows whose "l2name" equals "830502".
// NOTE(review): "l2name" is filled from "brickname" when this dataset is built, while "830502"
// looks like a brick id. Confirm whether a brickid-based filter was intended.
(
  brickDetails
    .where(col("l2name") === "830502")
    .show()
)

In [None]:
// Raw (untransformed) brick details; print the schema.
// Reads via the shared path constant instead of repeating the literal location.
val df = ParquetToDF.getDF(B2B_BRICK_DETAILS_PATH)
df.printSchema()

In [None]:
// Bricks whose id starts with 830311001.
(
  df.where(col("brickid").startsWith("830311001"))
    .show(truncate = false)
)

In [None]:
// GlobalPLP combined rankings from the two models (lr and xgb); print the lr schema.
var df1 = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/combinedRanking/model=lr/Algorithm=GlobalPLP"
)
var df2 = ParquetToDF.getDF(
  "/data/ecomm/ajiob2b/processed/plp/combinedRanking/model=xgb/Algorithm=GlobalPLP"
)
df1.printSchema()

In [None]:
// Five lowest-PRODUCTRANK rows of each model's ranking: lr first, then xgb.
for (ranking <- Seq(df1, df2)) {
  ranking.orderBy(PRODUCTRANK).show(numRows = 5, truncate = false)
}