diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala b/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala index e4ef7dda4..5b0267d7d 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala +++ b/aggregator/src/main/scala/ai/chronon/aggregator/base/TimedAggregators.scala @@ -16,12 +16,14 @@ package ai.chronon.aggregator.base +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.TimeTuple.typ import ai.chronon.api._ import java.util object TimeTuple extends Ordering[util.ArrayList[Any]] { + private val logger = LoggerFactory.getLogger(getClass) type typ = util.ArrayList[Any] def `type`(inputType: DataType): DataType = @@ -53,6 +55,7 @@ object TimeTuple extends Ordering[util.ArrayList[Any]] { } abstract class TimeOrdered(inputType: DataType) extends TimedAggregator[Any, TimeTuple.typ, Any] { + private val logger = LoggerFactory.getLogger(getClass) override def outputType: DataType = inputType override def irType: DataType = TimeTuple.`type`(inputType) @@ -72,6 +75,7 @@ abstract class TimeOrdered(inputType: DataType) extends TimedAggregator[Any, Tim } class First(inputType: DataType) extends TimeOrdered(inputType) { + private val logger = LoggerFactory.getLogger(getClass) //mutating override def update( ir: util.ArrayList[Any], @@ -92,6 +96,7 @@ class First(inputType: DataType) extends TimeOrdered(inputType) { } class Last(inputType: DataType) extends TimeOrdered(inputType) { + private val logger = LoggerFactory.getLogger(getClass) //mutating override def update( ir: util.ArrayList[Any], @@ -119,6 +124,7 @@ class OrderByLimitTimed( limit: Int, ordering: Ordering[TimeTuple.typ] ) extends TimedAggregator[Any, util.ArrayList[TimeTuple.typ], util.ArrayList[Any]] { + private val logger = LoggerFactory.getLogger(getClass) type Container = util.ArrayList[TimeTuple.typ] private val minHeap = new MinHeap[TimeTuple.typ](limit, ordering) @@ -129,7 +135,7 @@ class OrderByLimitTimed( override final def prepare(input: Any, ts: Long): Container = { // val gson = new Gson() val tuple = TimeTuple.make(ts, input) -// println(s"init: ${gson.toJson(tuple)}") +// logger.info(s"init: ${gson.toJson(tuple)}") val arr = new Container() arr.add(tuple) arr diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala index dea5faa20..e55133fb8 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala +++ b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/HopsAggregator.scala @@ -16,6 +16,7 @@ package ai.chronon.aggregator.windowing +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.RowAggregator import ai.chronon.aggregator.windowing.HopsAggregator._ import ai.chronon.api.Extensions.{AggregationOps, AggregationsOps, WindowOps, WindowUtils} @@ -31,6 +32,7 @@ import java.util // t class HopsAggregatorBase(aggregations: Seq[Aggregation], inputSchema: Seq[(String, DataType)], resolution: Resolution) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) @transient lazy val rowAggregator = new RowAggregator(inputSchema, aggregations.flatMap(_.unWindowed)) @@ -93,6 +95,7 @@ class HopsAggregator(minQueryTs: Long, inputSchema: Seq[(String, DataType)], resolution: Resolution) extends HopsAggregatorBase(aggregations, inputSchema, resolution) { + private val logger = LoggerFactory.getLogger(getClass) val leftBoundaries: 
Array[Option[Long]] = { // Nikhil is pretty confident we won't call this when aggregations is empty @@ -135,7 +138,7 @@ class HopsAggregator(minQueryTs: Long, .zip(readableLeftBounds) .map { case (hop, left) => s"$hop->$left" } .mkString(", ") - println(s"""Left bounds: $readableHopsToBoundsMap + logger.info(s"""Left bounds: $readableHopsToBoundsMap |minQueryTs = ${TsUtils.toStr(minQueryTs)}""".stripMargin) result } diff --git a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala index b36951281..6af83a824 100644 --- a/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala +++ b/aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothOnlineAggregator.scala @@ -16,6 +16,7 @@ package ai.chronon.aggregator.windowing +import org.slf4j.LoggerFactory import scala.collection.Seq import ai.chronon.api.Extensions.{AggregationPartOps, WindowOps} import ai.chronon.api._ @@ -31,6 +32,7 @@ class SawtoothOnlineAggregator(val batchEndTs: Long, inputSchema: Seq[(String, DataType)], resolution: Resolution, tailBufferMillis: Long) { + private val logger = LoggerFactory.getLogger(getClass) // logically, batch response is arranged like so // sum-90d => sum_ir_88d, [(sum_ir_1d, ts)] -> 1d is the hopSize for 90d @@ -46,10 +48,10 @@ class SawtoothOnlineAggregator(val batchEndTs: Long, val batchTailTs: Array[Option[Long]] = tailTs(batchEndTs) - println(s"Batch End: ${TsUtils.toStr(batchEndTs)}") - println("Window Tails: ") + logger.info(s"Batch End: ${TsUtils.toStr(batchEndTs)}") + logger.info("Window Tails: ") for (i <- windowMappings.indices) { - println(s" ${windowMappings(i).aggregationPart.outputColumnName} -> ${batchTailTs(i).map(TsUtils.toStr)}") + logger.info(s" ${windowMappings(i).aggregationPart.outputColumnName} -> ${batchTailTs(i).map(TsUtils.toStr)}") } def update(batchIr: BatchIr, row: Row): BatchIr = update(batchEndTs, batchIr, row, batchTailTs) diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala index 2416a894f..4b82e5eb2 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxDistinctTest.scala @@ -16,11 +16,13 @@ package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.ApproxDistinctCount import junit.framework.TestCase import org.junit.Assert._ class ApproxDistinctTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) def testErrorBound(uniques: Int, errorBound: Int, lgK: Int): Unit = { val uniqueElems = 1 to uniques val duplicates = uniqueElems ++ uniqueElems ++ uniqueElems @@ -28,7 +30,7 @@ class ApproxDistinctTest extends TestCase { val ir = counter.prepare(duplicates.head) duplicates.tail.foreach { elem => counter.update(ir, elem) } val estimated = counter.finalize(ir) - // println(s"estimated - $estimated, actual - $uniques, bound - $errorBound") + // logger.info(s"estimated - $estimated, actual - $uniques, bound - $errorBound") assertTrue(Math.abs(estimated - uniques) < errorBound) } @@ -46,7 +48,7 @@ class ApproxDistinctTest extends TestCase { } val ir = irList.reduceLeft(counter.merge) val estimated = counter.finalize(ir) - // println(s"estimated - $estimated, actual - $uniques, bound - $errorBound") + // logger.info(s"estimated - 
$estimated, actual - $uniques, bound - $errorBound") assertTrue(Math.abs(estimated - uniques) < errorBound) } diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala index 9799991e0..a1ed5672c 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/ApproxPercentilesTest.scala @@ -16,6 +16,7 @@ package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.ApproxPercentiles import ai.chronon.aggregator.row.StatsGenerator import com.yahoo.sketches.kll.KllFloatsSketch @@ -25,6 +26,7 @@ import org.junit.Assert._ import scala.util.Random class ApproxPercentilesTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) def testBasicImpl(nums: Int, slide: Int, k: Int, percentiles: Array[Double], errorPercent: Float): Unit = { val sorted = (0 to nums).map(_.toFloat) val elems = Random.shuffle(sorted.toList).toArray @@ -42,7 +44,7 @@ class ApproxPercentilesTest extends TestCase { val expected = result.indices.map(_ * step).map(_.toFloat).toArray val diffs = result.indices.map(i => Math.abs(result(i) - expected(i))) val errorMargin = (nums.toFloat * errorPercent) / 100.0 - println(s""" + logger.info(s""" |sketch size: ${merged.getSerializedSizeBytes} |result: ${result.toVector} |result size: ${result.size} @@ -66,7 +68,7 @@ sample1.map(sketch1.update) sample2.map(sketch2.update) val drift = StatsGenerator.PSIKllSketch(sketch1.toByteArray, sketch2.toByteArray).asInstanceOf[Double] - println(s"PSI drift: $drift") + logger.info(s"PSI drift: $drift") drift } diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala index 76205f669..ced62d8f7 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/RowAggregatorTest.scala @@ -16,6 +16,7 @@ package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.RowAggregator import ai.chronon.api._ import junit.framework.TestCase @@ -25,6 +26,7 @@ import java.util import scala.collection.JavaConverters._ class TestRow(val fieldsSeq: Any*)(tsIndex: Int = 0) extends Row { + private val logger = LoggerFactory.getLogger(getClass) val fields: util.List[Any] = new java.util.ArrayList[Any](fieldsSeq.asJava) override val length: Int = fields.size() @@ -39,7 +41,7 @@ class TestRow(val fieldsSeq: Any*)(tsIndex: Int = 0) extends Row { override def mutationTs: Long = timeStamp - def print(): Unit = println(fieldsSeq) + def print(): Unit = logger.info(fieldsSeq.toString) def set(index: Int, any: Any): Unit = fields.set(index, any) } diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala index 73dfec18e..f41fa0a78 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/SawtoothAggregatorTest.scala @@ -16,6 +16,7 @@ package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.RowAggregator import ai.chronon.aggregator.test.SawtoothAggregatorTest.sawtoothAggregate import
ai.chronon.aggregator.windowing._ @@ -30,6 +31,7 @@ import scala.collection.mutable import scala.collection.Seq class Timer { + private val logger = LoggerFactory.getLogger(getClass) var ts: Long = System.currentTimeMillis() @@ -38,7 +40,7 @@ class Timer { // TODO: Write this out into a file checked into git // or incorporate proper benchmarks def publish(name: String, reset: Boolean = true): Unit = { - println(s"${name.padTo(25, ' ')} ${System.currentTimeMillis() - ts} ms") + logger.info(s"${name.padTo(25, ' ')} ${System.currentTimeMillis() - ts} ms") if (reset) ts = System.currentTimeMillis() } } diff --git a/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala b/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala index 71cdedf6b..d1a831250 100644 --- a/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala +++ b/aggregator/src/test/scala/ai/chronon/aggregator/test/VarianceTest.scala @@ -16,11 +16,13 @@ package ai.chronon.aggregator.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.Variance import junit.framework.TestCase import org.junit.Assert._ class VarianceTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) def mean(elems: Seq[Double]): Double = elems.sum / elems.length def naive(elems: Seq[Double]): Double = { @@ -52,8 +54,8 @@ class VarianceTest extends TestCase { val nums = (0 until cardinality).map { _ => min + math.random * (max - min) } val naiveResult = naive(nums) val welfordResult = welford(nums) - println(s"naive $naiveResult - welford $welfordResult - sum of squares ${sumOfSquares(nums)}") - println((naiveResult - welfordResult) / naiveResult) + logger.info(s"naive $naiveResult - welford $welfordResult - sum of squares ${sumOfSquares(nums)}") + logger.info(((naiveResult - welfordResult) / naiveResult).toString) assertTrue((naiveResult - welfordResult) / naiveResult < 0.0000001) } diff --git a/api/src/main/scala/ai/chronon/api/Extensions.scala b/api/src/main/scala/ai/chronon/api/Extensions.scala index 76c781d5a..bae181c31 100644 --- a/api/src/main/scala/ai/chronon/api/Extensions.scala +++ b/api/src/main/scala/ai/chronon/api/Extensions.scala @@ -16,6 +16,7 @@ package ai.chronon.api +import org.slf4j.LoggerFactory import ai.chronon.api.DataModel._ import ai.chronon.api.Operation._ import com.fasterxml.jackson.core.`type`.TypeReference @@ -31,8 +32,10 @@ import scala.util.ScalaJavaConversions.{IteratorOps, ListOps, MapOps} import scala.util.{Failure, Success, Try} object Extensions { + private val logger = LoggerFactory.getLogger(getClass) implicit class TimeUnitOps(timeUnit: TimeUnit) { + private val logger = LoggerFactory.getLogger(getClass) def str: String = timeUnit match { case TimeUnit.HOURS => "h" @@ -47,6 +50,7 @@ object Extensions { } implicit class OperationOps(operation: Operation) { + private val logger = LoggerFactory.getLogger(getClass) def isSimple: Boolean = operation match { case Operation.FIRST | Operation.LAST | Operation.LAST_K | Operation.FIRST_K => false @@ -58,6 +62,7 @@ object Extensions { } implicit class WindowOps(window: Window) { + private val logger = LoggerFactory.getLogger(getClass) private def unbounded: Boolean = window.length == Int.MaxValue || window.length <= 0 def str: String = @@ -70,6 +75,7 @@ object Extensions { } object WindowUtils { + private val logger = LoggerFactory.getLogger(getClass) val Unbounded: Window = new Window(Int.MaxValue, TimeUnit.DAYS) val Hour: Window = new Window(1, TimeUnit.HOURS) val Day: Window = new Window(1,
TimeUnit.DAYS) @@ -93,6 +99,7 @@ object Extensions { } implicit class MetadataOps(metaData: MetaData) { + private val logger = LoggerFactory.getLogger(getClass) def cleanName: String = metaData.name.sanitize def outputTable = s"${metaData.outputNamespace}.${metaData.cleanName}" @@ -150,6 +157,7 @@ object Extensions { // one per output column - so single window // not exposed to users implicit class AggregationPartOps(aggregationPart: AggregationPart) { + private val logger = LoggerFactory.getLogger(getClass) def getInt(arg: String, default: Option[Int] = None): Int = { val argOpt = Option(aggregationPart.argMap) @@ -177,6 +185,7 @@ object Extensions { } implicit class AggregationOps(aggregation: Aggregation) { + private val logger = LoggerFactory.getLogger(getClass) // one agg part per bucket per window // unspecified windows are translated to one unbounded window @@ -232,6 +241,7 @@ object Extensions { case class UnpackedAggregations(perBucket: Array[AggregationPart], perWindow: Array[WindowMapping]) object UnpackedAggregations { + private val logger = LoggerFactory.getLogger(getClass) def from(aggregations: Seq[Aggregation]): UnpackedAggregations = { var counter = 0 val perBucket = new mutable.ArrayBuffer[AggregationPart] @@ -277,6 +287,7 @@ object Extensions { } implicit class AggregationsOps(aggregations: Seq[Aggregation]) { + private val logger = LoggerFactory.getLogger(getClass) def hasTimedAggregations: Boolean = aggregations.exists(_.operation match { case LAST_K | FIRST_K | LAST | FIRST => true @@ -300,6 +311,7 @@ object Extensions { } implicit class SourceOps(source: Source) { + private val logger = LoggerFactory.getLogger(getClass) def dataModel: DataModel = { assert(source.isSetEntities || source.isSetEvents || source.isSetJoinSource, "Source type is not specified") if (source.isSetEntities) Entities @@ -412,6 +424,7 @@ object Extensions { } implicit class GroupByOps(groupBy: GroupBy) extends GroupBy(groupBy) { + private val logger = LoggerFactory.getLogger(getClass) def maxWindow: Option[Window] = { val allWindowsOpt = Option(groupBy.aggregations) .flatMap(_.toScala.toSeq.allWindowsOpt) @@ -612,12 +625,14 @@ object Extensions { } implicit class StringOps(string: String) { + private val logger = LoggerFactory.getLogger(getClass) def sanitize: String = Option(string).map(_.replaceAll("[^a-zA-Z0-9_]", "_")).orNull def cleanSpec: String = string.split("/").head } implicit class ExternalSourceOps(externalSource: ExternalSource) extends ExternalSource(externalSource) { + private val logger = LoggerFactory.getLogger(getClass) private def schemaNames(schema: TDataType): Array[String] = schemaFields(schema).map(_.name) private def schemaFields(schema: TDataType): Array[StructField] = @@ -634,6 +649,7 @@ object Extensions { } object KeyMappingHelper { + private val logger = LoggerFactory.getLogger(getClass) // key mapping is defined as {left_col1: right_col1}, on the right there can be two keys [right_col1, right_col2] // Left is implicitly assumed to have right_col2 // We need to convert a map {left_col1: a, right_col2: b, irrelevant_col: c} into {right_col1: a, right_col2: b} @@ -649,6 +665,7 @@ object Extensions { } implicit class ExternalPartOps(externalPart: ExternalPart) extends ExternalPart(externalPart) { + private val logger = LoggerFactory.getLogger(getClass) lazy val fullName: String = Constants.ExternalPrefix + "_" + Option(externalPart.prefix).map(_ + "_").getOrElse("") + @@ -690,6 +707,7 @@ object Extensions { } implicit class JoinPartOps(joinPart: JoinPart) extends 
JoinPart(joinPart) { + private val logger = LoggerFactory.getLogger(getClass) lazy val fullPrefix = (Option(prefix) ++ Some(groupBy.getMetaData.cleanName)).mkString("_") lazy val leftToRight: Map[String, String] = rightToLeft.map { case (key, value) => value -> key } @@ -719,6 +737,7 @@ object Extensions { } implicit class LabelPartOps(val labelPart: LabelPart) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) def leftKeyCols: Array[String] = { labelPart.labels.toScala .flatMap { @@ -747,6 +766,7 @@ object Extensions { } implicit class BootstrapPartOps(val bootstrapPart: BootstrapPart) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) /** * Compress the info such that the hash can be stored at record and @@ -778,6 +798,7 @@ object Extensions { } object JoinOps { + private val logger = LoggerFactory.getLogger(getClass) private val identifierRegex: Pattern = Pattern.compile("[a-zA-Z_][a-zA-Z0-9_]*") def isIdentifier(s: String): Boolean = identifierRegex.matcher(s).matches() } @@ -923,7 +944,7 @@ object Extensions { } .filter(_.nonEmpty) .mkString(joiner) - println(s"Generated join left side skew filter:\n $result") + logger.info(s"Generated join left side skew filter:\n $result") result } } @@ -943,7 +964,7 @@ object Extensions { .mkString(joiner) if (result.nonEmpty) { - println(s"Generated join part skew filter for ${joinPart.groupBy.metaData.name}:\n $result") + logger.info(s"Generated join part skew filter for ${joinPart.groupBy.metaData.name}:\n $result") Some(result) } else None } diff --git a/api/src/main/scala/ai/chronon/api/ParametricMacro.scala b/api/src/main/scala/ai/chronon/api/ParametricMacro.scala index 637e948ff..dacc9d411 100644 --- a/api/src/main/scala/ai/chronon/api/ParametricMacro.scala +++ b/api/src/main/scala/ai/chronon/api/ParametricMacro.scala @@ -16,6 +16,7 @@ package ai.chronon.api +import org.slf4j.LoggerFactory import scala.collection.mutable // takes a map of macro names and functions and applies the functions on macro arguments @@ -38,7 +39,7 @@ case class ParametricMacro(value: String, func: Map[String, String] => String) { argSeq.tail :+ (argSeq.head + "," + token) } } - println(parsed) + ParametricMacro.logger.info(parsed.toString) parsed.map(_.split("=").map(_.trim)).map(x => x(0) -> x(1)).toMap } val result = func(argMap.getOrElse(Map.empty[String, String])) @@ -51,10 +52,11 @@ case class ParametricMacro(value: String, func: Map[String, String] => String) { } object ParametricMacro { + private val logger = LoggerFactory.getLogger(getClass) def main(args: Array[String]): Unit = { val mc = ParametricMacro("something", { x => "st:" + x.keys.mkString("/") + "|" + x.values.mkString("/") }) val str = "something nothing-{{ something( a_1=b,, 3.1, c=d) }}-something after-{{ thing:a1=b1 }}{{ something }}" val replaced = mc.replace(str) - println(replaced) + logger.info(replaced) } } diff --git a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala index 79c8e8f24..a13f7ab3d 100644 --- a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala +++ b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala @@ -16,6 +16,7 @@ package ai.chronon.api +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.StringsOps import com.fasterxml.jackson.databind.{DeserializationFeature, JsonNode, ObjectMapper} import org.apache.thrift.protocol.{TCompactProtocol, TSimpleJSONProtocol} @@ -28,6 +29,7 @@ import scala.reflect.ClassTag import scala.util.ScalaJavaConversions.ListOps
object ThriftJsonCodec { + private val logger = LoggerFactory.getLogger(getClass) def serializer = new TSerializer(new TSimpleJSONProtocol.Factory()) @@ -63,7 +65,7 @@ object ThriftJsonCodec { base } catch { case e: Exception => { - println("Failed to deserialize using compact protocol, trying Json.") + logger.info("Failed to deserialize using compact protocol, trying Json.") fromJsonStr(new String(bytes), check = false, base.getClass) } } diff --git a/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala b/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala index 09cfe648c..1640d8f98 100644 --- a/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala +++ b/api/src/test/scala/ai/chronon/api/test/DataTypeConversionTest.scala @@ -16,6 +16,7 @@ package ai.chronon.api.test +import org.slf4j.LoggerFactory import ai.chronon.api._ import org.apache.thrift.TSerializer import org.apache.thrift.protocol.TSimpleJSONProtocol @@ -23,6 +24,7 @@ import org.junit.Assert._ import org.junit.Test class DataTypeConversionTest { + private val logger = LoggerFactory.getLogger(getClass) @Test def testDataTypeToThriftAndBack(): Unit = { // build some complex type @@ -45,7 +47,7 @@ class DataTypeConversionTest { // serialize with TSimpleJson - this is what python code will do val jsonSerializer = new TSerializer(new TSimpleJSONProtocol.Factory()) val json = new String(jsonSerializer.serialize(thriftType)) - println(json) + logger.info(json) val reversedTType = ThriftJsonCodec.fromJsonStr[TDataType](json, check = true, classOf[TDataType]) val reversed = DataType.fromTDataType(reversedTType) diff --git a/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala b/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala index dafb7664e..9072b3b31 100644 --- a/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala +++ b/flink/src/main/scala/ai/chronon/flink/AsyncKVStoreWriter.scala @@ -1,5 +1,6 @@ package ai.chronon.flink +import org.slf4j.LoggerFactory import ai.chronon.online.{Api, KVStore} import ai.chronon.online.KVStore.PutRequest import org.apache.flink.configuration.Configuration @@ -16,6 +17,7 @@ import scala.util.{Failure, Success} case class WriteResponse(putRequest: PutRequest, status: Boolean) object AsyncKVStoreWriter { + private val logger = LoggerFactory.getLogger(getClass) private val kvStoreConcurrency = 10 private val defaultTimeoutMillis = 1000L @@ -88,7 +90,7 @@ class AsyncKVStoreWriter(onlineImpl: Api, featureGroupName: String) } override def timeout(input: PutRequest, resultFuture: ResultFuture[WriteResponse]): Unit = { - println(s"Timed out writing to KV Store for object: $input") + logger.info(s"Timed out writing to KV Store for object: $input") errorCounter.inc() resultFuture.complete(util.Arrays.asList[WriteResponse](WriteResponse(input, status = false))) } @@ -102,7 +104,7 @@ class AsyncKVStoreWriter(onlineImpl: Api, featureGroupName: String) successCounter.inc() } else { errorCounter.inc() - println(s"Failed to write to KVStore for object: $input") + logger.info(s"Failed to write to KVStore for object: $input") } resultFuture.complete(util.Arrays.asList[WriteResponse](WriteResponse(input, status = succeeded))) case Failure(exception) => @@ -110,7 +112,7 @@ class AsyncKVStoreWriter(onlineImpl: Api, featureGroupName: String) // in the KVStore - we log the exception and skip the object to // not fail the app errorCounter.inc() - println(s"Caught exception writing to KVStore for object: $input - $exception") + logger.info(s"Caught 
exception writing to KVStore for object: $input - $exception") resultFuture.complete(util.Arrays.asList[WriteResponse](WriteResponse(input, status = false))) } } diff --git a/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala b/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala index d10bb26cc..d9b92224a 100644 --- a/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala +++ b/flink/src/main/scala/ai/chronon/flink/AvroCodecFn.scala @@ -1,5 +1,6 @@ package ai.chronon.flink +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.GroupByOps import ai.chronon.api.{Constants, DataModel, Query, StructType => ChrononStructType} import ai.chronon.online.{AvroConversions, GroupByServingInfoParsed} @@ -86,7 +87,7 @@ case class AvroCodecFn[T](groupByServingInfoParsed: GroupByServingInfoParsed) case e: Exception => // To improve availability, we don't rethrow the exception. We just drop the event // and track the errors in a metric. If there are too many errors we'll get alerted/paged. - println(s"Error converting to Avro bytes - $e") + logger.info(s"Error converting to Avro bytes - $e") avroConversionErrorCounter.inc() } diff --git a/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala b/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala index 1bcec1f6a..6c5a32dcf 100644 --- a/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala +++ b/flink/src/main/scala/ai/chronon/flink/SparkExpressionEvalFn.scala @@ -1,5 +1,6 @@ package ai.chronon.flink +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.{GroupByOps, MetadataOps} import ai.chronon.api.{Constants, GroupBy, Query, StructType => ChrononStructType} import ai.chronon.online.{CatalystUtil, SparkConversions} @@ -27,6 +28,7 @@ import scala.jdk.CollectionConverters.{asScalaBufferConverter, mapAsScalaMapConv * @tparam T The type of the input data. */ class SparkExpressionEvalFn[T](encoder: Encoder[T], groupBy: GroupBy) extends RichFlatMapFunction[T, Map[String, Any]] { + private val logger = LoggerFactory.getLogger(getClass) private val query: Query = groupBy.streamingSource.get.getEvents.query @@ -100,7 +102,7 @@ class SparkExpressionEvalFn[T](encoder: Encoder[T], groupBy: GroupBy) extends Ri case e: Exception => // To improve availability, we don't rethrow the exception. We just drop the event // and track the errors in a metric. If there are too many errors we'll get alerted/paged. 
- println(s"Error evaluating Spark expression - $e") + logger.info(s"Error evaluating Spark expression - $e") exprEvalErrorCounter.inc() } } diff --git a/online/src/main/scala/ai/chronon/online/Api.scala b/online/src/main/scala/ai/chronon/online/Api.scala index 78342a659..706f74129 100644 --- a/online/src/main/scala/ai/chronon/online/Api.scala +++ b/online/src/main/scala/ai/chronon/online/Api.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.api.{Constants, StructType} import ai.chronon.online.KVStore.{GetRequest, GetResponse, PutRequest} import org.apache.spark.sql.SparkSession @@ -27,6 +28,7 @@ import scala.concurrent.{Await, ExecutionContext, Future} import scala.util.{Failure, Success, Try} object KVStore { + private val logger = LoggerFactory.getLogger(getClass) // a scan request essentially for the keyBytes // afterTsMillis - is used to limit the scan to more recent data case class GetRequest(keyBytes: Array[Byte], dataset: String, afterTsMillis: Option[Long] = None) @@ -69,7 +71,7 @@ trait KVStore { .map(_.head) .recover { case e: java.util.NoSuchElementException => - println( + logger.info( s"Failed request against ${request.dataset} check the related task to the upload of the dataset (GroupByUpload or MetadataUpload)") throw e } diff --git a/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala b/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala index ed819c2ba..e16f87730 100644 --- a/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala +++ b/online/src/main/scala/ai/chronon/online/DataStreamBuilder.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.{Constants, DataModel} import ai.chronon.api.DataModel.DataModel @@ -27,6 +28,7 @@ import scala.util.{Failure, Success, Try} case class TopicInfo(name: String, topicType: String, params: Map[String, String]) object TopicInfo { + private val logger = LoggerFactory.getLogger(getClass) // default topic type is kafka // kafka://topic_name/schema=my_schema/host=X/port=Y should parse into TopicInfo(topic_name, kafka, {schema: my_schema, host: X, port Y}) def parse(topic: String): TopicInfo = { @@ -55,9 +57,9 @@ case class DataStream(df: DataFrame, partitions: Int, topicInfo: TopicInfo) { Option(query.setups).map(_.toScala.map { setup => Try(df.sparkSession.sql(setup)) match { case Failure(ex) => - println(s"[Failure] Setup command: ($setup) failed with exception: ${ex.toString}") + logger.info(s"[Failure] Setup command: ($setup) failed with exception: ${ex.toString}") ex.printStackTrace(System.out) - case Success(value) => println(s"[SUCCESS] Setup command: $setup") + case Success(value) => logger.info(s"[SUCCESS] Setup command: $setup") } }) @@ -76,7 +78,7 @@ case class DataStream(df: DataFrame, partitions: Int, topicInfo: TopicInfo) { } yield (keyMap ++ selectMap ++ timeSelects) val selectClauses = selectsOption.map { _.map { case (name, expr) => s"($expr) AS `$name`" }.toSeq } - println(s"Applying select clauses: $selectClauses") + logger.info(s"Applying select clauses: $selectClauses") val selectedDf = selectClauses.map { selects => df.selectExpr(selects: _*) }.getOrElse(df) // enrich where clauses @@ -93,7 +95,7 @@ case class DataStream(df: DataFrame, partitions: Int, topicInfo: TopicInfo) { val baseWheres = Option(query.wheres).map(_.toScala).getOrElse(Seq.empty[String]) val whereClauses = baseWheres ++ atLeastOneKeyIsPresent :+ timeIsPresent - println(s"Applying where clauses: 
$whereClauses") + logger.info(s"Applying where clauses: $whereClauses") val filteredDf = whereClauses.foldLeft(selectedDf)(_.where(_)) DataStream(filteredDf, partitions, topicInfo) } diff --git a/online/src/main/scala/ai/chronon/online/Fetcher.scala b/online/src/main/scala/ai/chronon/online/Fetcher.scala index 57ea5b0ed..8d2c81f22 100644 --- a/online/src/main/scala/ai/chronon/online/Fetcher.scala +++ b/online/src/main/scala/ai/chronon/online/Fetcher.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.{ColumnAggregator, StatsGenerator} import ai.chronon.api import ai.chronon.api.Constants.UTF8 @@ -35,6 +36,7 @@ import scala.concurrent.Future import scala.util.{Failure, Success, Try} object Fetcher { + private val logger = LoggerFactory.getLogger(getClass) case class Request(name: String, keys: Map[String, AnyRef], atMillis: Option[Long] = None, @@ -161,8 +163,8 @@ class Fetcher(val kvStore: KVStore, internalResponses.zip(externalResponses).map { case (internalResponse, externalResponse) => if (debug) { - println(internalResponse.values.get.keys.toSeq) - println(externalResponse.values.get.keys.toSeq) + logger.info(internalResponse.values.get.keys.toSeq) + logger.info(externalResponse.values.get.keys.toSeq) } val cleanInternalRequest = internalResponse.request.copy(context = None) assert( @@ -264,10 +266,10 @@ class Fetcher(val kvStore: KVStore, } if (debug) { - println(s"Logging ${resp.request.keys} : ${hash % 100000}: $samplePercent") + logger.info(s"Logging ${resp.request.keys} : ${hash % 100000}: $samplePercent") val gson = new Gson() val valuesFormatted = values.map { case (k, v) => s"$k -> ${gson.toJson(v)}" }.mkString(", ") - println(s"""Sampled join fetch + logger.info(s"""Sampled join fetch |Key Map: ${resp.request.keys} |Value Map: [${valuesFormatted}] |""".stripMargin) @@ -291,7 +293,7 @@ class Fetcher(val kvStore: KVStore, context.distribution("logging_request.overall.latency.millis", System.currentTimeMillis() - ts)) if (debug) { - println(s"Logged data with schema_hash ${codec.loggingSchemaHash}") + logger.info(s"Logged data with schema_hash ${codec.loggingSchemaHash}") } } } @@ -300,7 +302,7 @@ class Fetcher(val kvStore: KVStore, // to handle GroupByServingInfo staleness that results in encoding failure getJoinCodecs.refresh(resp.request.name) joinContext.foreach(_.incrementException(exception)) - println(s"logging failed due to ${exception.traceString}") + logger.info(s"logging failed due to ${exception.traceString}") } Response(resp.request, Success(resp.derivedValues)) } @@ -410,7 +412,7 @@ class Fetcher(val kvStore: KVStore, if (logFunc != null) { logFunc.accept(controlEvent) if (debug) { - println(s"schema data logged successfully with schema_hash ${enc.loggingSchemaHash}") + logger.info(s"schema data logged successfully with schema_hash ${enc.loggingSchemaHash}") } } } diff --git a/online/src/main/scala/ai/chronon/online/FetcherBase.scala b/online/src/main/scala/ai/chronon/online/FetcherBase.scala index dd845464e..d013b8020 100644 --- a/online/src/main/scala/ai/chronon/online/FetcherBase.scala +++ b/online/src/main/scala/ai/chronon/online/FetcherBase.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.ColumnAggregator import ai.chronon.aggregator.windowing import ai.chronon.aggregator.windowing.{FinalBatchIr, SawtoothOnlineAggregator, TsUtils} @@ -42,6 +43,7 @@ class FetcherBase(kvStore: KVStore, timeoutMillis: Long = 10000, debug: Boolean = 
false) extends MetadataStore(kvStore, metaDataSet, timeoutMillis) { + private val logger = LoggerFactory.getLogger(getClass) private case class GroupByRequestMeta( groupByServingInfoParsed: GroupByServingInfoParsed, @@ -87,7 +89,7 @@ class FetcherBase(kvStore: KVStore, case DataModel.Entities => servingInfo.mutationValueAvroCodec } if (batchBytes == null && (streamingResponses == null || streamingResponses.isEmpty)) { - if (debug) println("Both batch and streaming data are null") + if (debug) logger.info("Both batch and streaming data are null") null } else { val streamingRows: Array[Row] = streamingResponses.iterator @@ -103,7 +105,7 @@ class FetcherBase(kvStore: KVStore, val output = aggregator.lambdaAggregateFinalized(batchIr, streamingRows.iterator, queryTimeMs, mutations) if (debug) { val gson = new Gson() - println(s""" + logger.info(s""" |batch ir: ${gson.toJson(batchIr)} |streamingRows: ${gson.toJson(streamingRows)} |batchEnd in millis: ${servingInfo.batchEndTsMillis} @@ -139,14 +141,14 @@ class FetcherBase(kvStore: KVStore, groupByServingInfo: GroupByServingInfoParsed): GroupByServingInfoParsed = { val name = groupByServingInfo.groupBy.metaData.name if (batchEndTs > groupByServingInfo.batchEndTsMillis) { - println(s"""$name's value's batch timestamp of $batchEndTs is + logger.info(s"""$name's value's batch timestamp of $batchEndTs is |ahead of schema timestamp of ${groupByServingInfo.batchEndTsMillis}. |Forcing an update of schema.""".stripMargin) getGroupByServingInfo .force(name) .recover { case ex: Throwable => - println( + logger.info( s"Couldn't update GroupByServingInfo of $name due to ${ex.getMessage}. Proceeding with the old one.") ex.printStackTrace() groupByServingInfo @@ -255,7 +257,7 @@ class FetcherBase(kvStore: KVStore, val queryTs = request.atMillis.getOrElse(System.currentTimeMillis()) try { if (debug) - println( + logger.info( s"Constructing response for groupBy: ${groupByServingInfo.groupByOps.metaData.getName} " + s"for keys: ${request.keys}") constructGroupByResponse(batchResponseTryAll, @@ -369,7 +371,7 @@ class FetcherBase(kvStore: KVStore, .recover { // capture exception as a key case ex: Throwable => if (debug || Math.random() < 0.001) { - println(s"Failed to fetch $groupByRequest with \n${ex.traceString}") + logger.info(s"Failed to fetch $groupByRequest with \n${ex.traceString}") } Map(groupByRequest.name + "_exception" -> ex.traceString) } @@ -448,7 +450,7 @@ class FetcherBase(kvStore: KVStore, .recoverWith { // capture exception as a key case ex: Throwable => if (debug || Math.random() < 0.001) { - println(s"Failed to fetch $request with \n${ex.traceString}") + logger.info(s"Failed to fetch $request with \n${ex.traceString}") } Failure(ex) } diff --git a/online/src/main/scala/ai/chronon/online/MetadataStore.scala b/online/src/main/scala/ai/chronon/online/MetadataStore.scala index 685cf76ff..6e016133a 100644 --- a/online/src/main/scala/ai/chronon/online/MetadataStore.scala +++ b/online/src/main/scala/ai/chronon/online/MetadataStore.scala @@ -16,6 +16,7 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.api.Constants.{ChrononMetadataKey, UTF8} import ai.chronon.api.Extensions.{JoinOps, MetadataOps, StringOps, WindowOps, WindowUtils} import ai.chronon.api._ @@ -36,6 +37,7 @@ import scala.util.{Failure, Success, Try} case class DataMetrics(series: Seq[(Long, SortedMap[String, Any])]) class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, timeoutMillis: Long) { + private val logger = 
LoggerFactory.getLogger(getClass) private var partitionSpec = PartitionSpec(format = "yyyy-MM-dd", spanMillis = WindowUtils.Day.millis) private val CONF_BATCH_SIZE = 50 @@ -73,7 +75,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, val result = getConf[Join](s"joins/$name") .recover { case e: java.util.NoSuchElementException => - println( + logger.info( s"Failed to fetch conf for join $name at joins/$name, please check metadata upload to make sure the join metadata for $name has been uploaded") throw e } @@ -92,7 +94,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, { join => Metrics.Context(environment = "join.meta.fetch", join = join) }) def putJoinConf(join: Join): Unit = { - println(s"uploading join conf to dataset: $dataset by key: joins/${join.metaData.nameToFilePath}") + logger.info(s"uploading join conf to dataset: $dataset by key: joins/${join.metaData.nameToFilePath}") kvStore.put( PutRequest(s"joins/${join.metaData.nameToFilePath}".getBytes(Constants.UTF8), ThriftJsonCodec.toJsonStr(join).getBytes(Constants.UTF8), @@ -104,7 +106,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, .getString(key, dataset, timeoutMillis) .recover { case e: java.util.NoSuchElementException => - println(s"Failed to retrieve $key for $dataset. Is it possible that hasn't been uploaded?") + logger.info(s"Failed to retrieve $key for $dataset. Is it possible that hasn't been uploaded?") throw e } .map(AvroCodec.of(_)) @@ -125,11 +127,11 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, val metaData = kvStore.getString(Constants.GroupByServingInfoKey, batchDataset, timeoutMillis).recover { case e: java.util.NoSuchElementException => - println( + logger.info( s"Failed to fetch metadata for $batchDataset, is it possible Group By Upload for $name has not succeeded?") throw e } - println(s"Fetched ${Constants.GroupByServingInfoKey} from : $batchDataset") + logger.info(s"Fetched ${Constants.GroupByServingInfoKey} from : $batchDataset") if (metaData.isFailure) { Failure( new RuntimeException(s"Couldn't fetch group by serving info for $batchDataset, " + @@ -158,13 +160,13 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, def putConf(configPath: String): Future[Seq[Boolean]] = { val configFile = new File(configPath) assert(configFile.exists(), s"$configFile does not exist") - println(s"Uploading Chronon configs from $configPath") + logger.info(s"Uploading Chronon configs from $configPath") val fileList = listFiles(configFile) val puts = fileList .filter { file => val name = parseName(file.getPath) - if (name.isEmpty) println(s"Skipping invalid file ${file.getPath}") + if (name.isEmpty) logger.info(s"Skipping invalid file ${file.getPath}") name.isDefined } .flatMap { file => @@ -173,11 +175,11 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, case value if value.contains("staging_queries/") => loadJson[StagingQuery](value) case value if value.contains("joins/") => loadJson[Join](value) case value if value.contains("group_bys/") => loadJson[GroupBy](value) - case _ => println(s"unknown config type in file $path"); None + case _ => logger.info(s"unknown config type in file $path"); None } val key = pathToKey(path) confJsonOpt.map { conf => - println(s"""Putting metadata for + logger.info(s"""Putting metadata for |key: $key |conf: $conf""".stripMargin) PutRequest(keyBytes = key.getBytes(), @@ -187,7 +189,7 @@ class 
MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, } } val putsBatches = puts.grouped(CONF_BATCH_SIZE).toSeq - println(s"Putting ${puts.size} configs to KV Store, dataset=$dataset") + logger.info(s"Putting ${puts.size} configs to KV Store, dataset=$dataset") val futures = putsBatches.map(batch => kvStore.multiPut(batch)) Future.sequence(futures).map(_.flatten) } @@ -215,7 +217,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, Some(ThriftJsonCodec.toJsonStr(configConf)) } catch { case e: Throwable => - println(s"Failed to parse compiled Chronon config file: $file, \nerror=${e.getMessage}") + logger.info(s"Failed to parse compiled Chronon config file: $file, \nerror=${e.getMessage}") None } } @@ -232,7 +234,7 @@ class MetadataStore(kvStore: KVStore, val dataset: String = ChrononMetadataKey, .map(_.asInstanceOf[String]) } catch { case ex: Throwable => - println(s"Failed to parse Chronon config file at $path as JSON with error: ${ex.getMessage}") + logger.info(s"Failed to parse Chronon config file at $path as JSON with error: ${ex.getMessage}") ex.printStackTrace() None } diff --git a/online/src/test/scala/ai/chronon/online/TileCodecTest.scala b/online/src/test/scala/ai/chronon/online/TileCodecTest.scala index 9124e5622..b60e0053c 100644 --- a/online/src/test/scala/ai/chronon/online/TileCodecTest.scala +++ b/online/src/test/scala/ai/chronon/online/TileCodecTest.scala @@ -16,12 +16,14 @@ package ai.chronon.online +import org.slf4j.LoggerFactory import ai.chronon.api.{Aggregation, Builders, FloatType, IntType, ListType, LongType, Operation, Row, StringType, TimeUnit, Window} import org.junit.Assert.assertEquals import org.junit.Test import scala.collection.JavaConverters._ class TileCodecTest { + private val logger = LoggerFactory.getLogger(getClass) private val histogram = Map[String, Int]("A" -> 3, "B" -> 2).asJava private val aggregationsAndExpected: Array[(Aggregation, Seq[Any])] = Array( @@ -105,7 +107,7 @@ class TileCodecTest { val windowedRowAggregator = TileCodec.buildWindowedRowAggregator(groupBy, schema) expectedFlattenedVals.zip(finalResults).zip(windowedRowAggregator.outputSchema.map(_._1)).foreach { case ((expected, actual), name) => - println(s"Checking: $name") + logger.info(s"Checking: $name") assertEquals(expected, actual) } } @@ -138,7 +140,7 @@ class TileCodecTest { val windowedRowAggregator = TileCodec.buildWindowedRowAggregator(groupBy, schema) expectedBucketedResults.zip(finalResults).zip(windowedRowAggregator.outputSchema.map(_._1)).foreach { case ((expected, actual), name) => - println(s"Checking: $name") + logger.info(s"Checking: $name") assertEquals(expected, actual) } } diff --git a/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala b/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala index af1ec3d82..2b6661027 100644 --- a/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala +++ b/online/src/test/scala/ai/chronon/online/test/DataStreamBuilderTest.scala @@ -16,6 +16,7 @@ package ai.chronon.online.test +import org.slf4j.LoggerFactory import ai.chronon.api.{Builders, DataModel, LongType, StringType, StructField, StructType} import ai.chronon.online.{DataStream, SparkConversions, TopicInfo} import ai.chronon.online.TopicInfo.parse @@ -26,6 +27,7 @@ import org.junit.Test import scala.util.ScalaJavaConversions.JListOps class DataStreamBuilderTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = { 
System.setSecurityManager(null) val spark = SparkSession @@ -64,7 +66,7 @@ class DataStreamBuilderTest { def checkTopicInfo(actual: TopicInfo, expected: TopicInfo): Unit = { if (actual != expected) { - println(s"Actual topicInfo != expected topicInfo. Actual: $actual, expected: $expected") + logger.info(s"Actual topicInfo != expected topicInfo. Actual: $actual, expected: $expected") } assert(actual == expected) } diff --git a/project/FolderCleaner.scala b/project/FolderCleaner.scala index 0d975c2eb..96c2b213b 100644 --- a/project/FolderCleaner.scala +++ b/project/FolderCleaner.scala @@ -1,9 +1,11 @@ +import org.slf4j.LoggerFactory import java.io.File import scala.reflect.io.Directory object Folder { + private val logger = LoggerFactory.getLogger(getClass) def clean(files: File*): Unit = { - println(s"Removing folders ${files.map(_.getAbsolutePath)}") + logger.info(s"Removing folders ${files.map(_.getAbsolutePath)}") files.foreach { file => if (file.exists() && file.isDirectory) { val directory = new Directory(file) diff --git a/project/ThriftGen.scala b/project/ThriftGen.scala index b92f49060..3e9a3f945 100644 --- a/project/ThriftGen.scala +++ b/project/ThriftGen.scala @@ -1,17 +1,19 @@ +import org.slf4j.LoggerFactory import sbt._ import sys.process._ object Thrift { + private val logger = LoggerFactory.getLogger(getClass) def gen(inputPath: String, outputPath: String, language: String, cleanupSuffixPath: String = "", extension: String = null): Seq[File] = { s"""echo "Generating files from thrift file: $outputPath \ninto folder $inputPath" """ !; s"rm -rf $outputPath/$cleanupSuffixPath" !; s"mkdir -p $outputPath" !; s"thrift --gen $language -out $outputPath $inputPath" !; val files = (PathFinder(new File(outputPath)) ** s"*.${Option(extension).getOrElse(language)}").get() - println("Generated files list") - files.map(_.getPath).foreach { path => println(s" $path") } - println("\n") + logger.info("Generated files list") + files.map(_.getPath).foreach { path => logger.info(s" $path") } + logger.info("\n") files } } diff --git a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala index 8e6949844..ce2247c22 100644 --- a/spark/src/main/scala/ai/chronon/spark/Analyzer.scala +++ b/spark/src/main/scala/ai/chronon/spark/Analyzer.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.{Accuracy, AggregationPart, Constants, DataType, TimeUnit, Window} import ai.chronon.api.Extensions._ @@ -38,6 +39,7 @@ import scala.util.ScalaJavaConversions.ListOps //class ItemSketchSerializable(var mapSize: Int) extends ItemsSketch[String](mapSize) with Serializable {} class ItemSketchSerializable extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) var sketch: ItemsSketch[String] = null def init(mapSize: Int): ItemSketchSerializable = { sketch = new ItemsSketch[String](mapSize) @@ -69,6 +71,7 @@ class Analyzer(tableUtils: TableUtils, sample: Double = 0.1, enableHitter: Boolean = false, silenceMode: Boolean = false) { + private val logger = LoggerFactory.getLogger(getClass) // include ts into heavy hitter analysis - useful to surface timestamps that have wrong units // include total approx row count - so it is easy to understand the percentage of skewed data def heavyHittersWithTsAndCount(df: DataFrame, @@ -189,7 +192,7 @@ class Analyzer(tableUtils: TableUtils, groupByConf.setups.foreach(tableUtils.sql) val groupBy = GroupBy.from(groupByConf, range, tableUtils, 
computeDependency = enableHitter, finalize = true) val name = "group_by/" + prefix + groupByConf.metaData.name - println(s"""|Running GroupBy analysis for $name ...""".stripMargin) + logger.info(s"""|Running GroupBy analysis for $name ...""".stripMargin) val analysis = if (enableHitter) analyze(groupBy.inputDf, @@ -215,20 +218,20 @@ class Analyzer(tableUtils: TableUtils, groupBy.outputSchema } if (silenceMode) { - println(s"""ANALYSIS completed for group_by/${name}.""".stripMargin) + logger.info(s"""ANALYSIS completed for group_by/${name}.""".stripMargin) } else { - println(s""" + logger.info(s""" |ANALYSIS for $name: |$analysis """.stripMargin) if (includeOutputTableName) - println(s""" + logger.info(s""" |----- OUTPUT TABLE NAME ----- |${groupByConf.metaData.outputTable} """.stripMargin) val keySchema = groupBy.keySchema.fields.map { field => s" ${field.name} => ${field.dataType}" } schema.fields.map { field => s" ${field.name} => ${field.fieldType}" } - println(s""" + logger.info(s""" |----- KEY SCHEMA ----- |${keySchema.mkString("\n")} |----- OUTPUT SCHEMA ----- @@ -251,7 +254,7 @@ class Analyzer(tableUtils: TableUtils, def analyzeJoin(joinConf: api.Join, enableHitter: Boolean = false, validationAssert: Boolean = false) : (Map[String, DataType], ListBuffer[AggregationMetadata], Map[String, DataType]) = { val name = "joins/" + joinConf.metaData.name - println(s"""|Running join analysis for $name ...""".stripMargin) + logger.info(s"""|Running join analysis for $name ...""".stripMargin) // run SQL environment setups such as UDFs and JARs joinConf.setups.foreach(tableUtils.sql) @@ -269,7 +272,7 @@ class Analyzer(tableUtils: TableUtils, val rangeToFill = JoinUtils.getRangesToFill(joinConf.left, tableUtils, endDate, historicalBackfill = joinConf.historicalBackfill) - println(s"[Analyzer] Join range to fill $rangeToFill") + logger.info(s"[Analyzer] Join range to fill $rangeToFill") val unfilledRanges = tableUtils .unfilledRanges(joinConf.metaData.outputTable, rangeToFill, Some(Seq(joinConf.left.table))) .getOrElse(Seq.empty) @@ -302,9 +305,9 @@ class Analyzer(tableUtils: TableUtils, aggregationsMetadata.map(aggregation => (aggregation.name, aggregation.columnType)).toMap val statsSchema = StatsGenerator.statsIrSchema(api.StructType.from("Stats", rightSchema.toArray)) if (silenceMode) { - println(s"""ANALYSIS completed for join/${joinConf.metaData.cleanName}.""".stripMargin) + logger.info(s"""ANALYSIS completed for join/${joinConf.metaData.cleanName}.""".stripMargin) } else { - println(s""" + logger.info(s""" |ANALYSIS for join/${joinConf.metaData.cleanName}: |$analysis |----- OUTPUT TABLE NAME ----- @@ -319,26 +322,26 @@ class Analyzer(tableUtils: TableUtils, |""".stripMargin) } - println(s"----- Validations for join/${joinConf.metaData.cleanName} -----") + logger.info(s"----- Validations for join/${joinConf.metaData.cleanName} -----") if (!gbStartPartitions.isEmpty) { - println( + logger.info( "----- Following Group_Bys contains a startPartition. Please check if any startPartition will conflict with your backfill. -----") gbStartPartitions.foreach { case (gbName, startPartitions) => - println(s"$gbName : ${startPartitions.mkString(",")}") + logger.info(s"$gbName : ${startPartitions.mkString(",")}") } } if (keysWithError.isEmpty && noAccessTables.isEmpty && dataAvailabilityErrors.isEmpty) { - println("----- Backfill validation completed. No errors found. -----") + logger.info("----- Backfill validation completed. No errors found. -----") } else { - println(s"----- Schema validation completed. 
Found ${keysWithError.size} errors") + logger.info(s"----- Schema validation completed. Found ${keysWithError.size} errors") val keyErrorSet: Set[(String, String)] = keysWithError.toSet - println(keyErrorSet.map { case (key, errorMsg) => s"$key => $errorMsg" }.mkString("\n")) - println(s"---- Table permission check completed. Found permission errors in ${noAccessTables.size} tables ----") - println(noAccessTables.mkString("\n")) - println(s"---- Data availability check completed. Found issue in ${dataAvailabilityErrors.size} tables ----") + logger.info(keyErrorSet.map { case (key, errorMsg) => s"$key => $errorMsg" }.mkString("\n")) + logger.info(s"---- Table permission check completed. Found permission errors in ${noAccessTables.size} tables ----") + logger.info(noAccessTables.mkString("\n")) + logger.info(s"---- Data availability check completed. Found issue in ${dataAvailabilityErrors.size} tables ----") dataAvailabilityErrors.foreach(error => - println(s"Table ${error._1} : Group_By ${error._2} : Expected start ${error._3}")) + logger.info(s"Table ${error._1} : Group_By ${error._2} : Expected start ${error._3}")) } if (validationAssert) { @@ -377,7 +380,7 @@ class Analyzer(tableUtils: TableUtils, // validate the table permissions for given list of tables // return a list of tables that the user doesn't have access to def runTablePermissionValidation(sources: Set[String]): Set[String] = { - println(s"Validating ${sources.size} tables permissions ...") + logger.info(s"Validating ${sources.size} tables permissions ...") val today = tableUtils.partitionSpec.at(System.currentTimeMillis()) //todo: handle offset-by-1 depending on temporal vs snapshot accuracy val partitionFilter = tableUtils.partitionSpec.minus(today, new Window(2, TimeUnit.DAYS)) @@ -394,7 +397,7 @@ class Analyzer(tableUtils: TableUtils, groupBy: api.GroupBy, unfilledRanges: Seq[PartitionRange]): List[(String, String, String)] = { if (unfilledRanges.isEmpty) { - println("No unfilled ranges found.") + logger.info("No unfilled ranges found.") List.empty } else { val firstUnfilledPartition = unfilledRanges.min.start @@ -418,7 +421,7 @@ class Analyzer(tableUtils: TableUtils, } groupBy.sources.toScala.flatMap { source => val table = source.table - println(s"Checking table $table for data availability ... Expected start partition: $expectedStart") + logger.info(s"Checking table $table for data availability ... 
Expected start partition: $expectedStart") //check if partition available or table is cumulative if (!tableUtils.ifPartitionExistsInTable(table, expectedStart) && !source.isCumulative) { Some((table, groupBy.getMetaData.getName, expectedStart)) diff --git a/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala b/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala index 7d9877103..4a4301b3f 100644 --- a/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala +++ b/spark/src/main/scala/ai/chronon/spark/BootstrapInfo.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions._ import ai.chronon.api.{Constants, ExternalPart, JoinPart, StructField} @@ -68,6 +69,7 @@ case class BootstrapInfo( } object BootstrapInfo { + private val logger = LoggerFactory.getLogger(getClass) // Build metadata for the join that contains schema information for join parts, external parts and bootstrap parts def from(joinConf: api.Join, @@ -77,7 +79,7 @@ object BootstrapInfo { mutationScan: Boolean = true): BootstrapInfo = { // Enrich each join part with the expected output schema - println(s"\nCreating BootstrapInfo for GroupBys for Join ${joinConf.metaData.name}") + logger.info(s"\nCreating BootstrapInfo for GroupBys for Join ${joinConf.metaData.name}") var joinParts: Seq[JoinPartMetadata] = Option(joinConf.joinParts.toScala) .getOrElse(Seq.empty) .map(part => { @@ -110,7 +112,7 @@ object BootstrapInfo { }) // Enrich each external part with the expected output schema - println(s"\nCreating BootstrapInfo for ExternalParts for Join ${joinConf.metaData.name}") + logger.info(s"\nCreating BootstrapInfo for ExternalParts for Join ${joinConf.metaData.name}") val externalParts: Seq[ExternalPartMetadata] = Option(joinConf.onlineExternalParts.toScala) .getOrElse(Seq.empty) .map(part => ExternalPartMetadata(part, part.keySchemaFull, part.valueSchemaFull)) @@ -166,7 +168,7 @@ object BootstrapInfo { val exceptionList = mutable.ListBuffer[Throwable]() def collectException(assertion: => Unit): Unit = Try(assertion).failed.foreach(exceptionList += _) - println(s"\nCreating BootstrapInfo for Log Based Bootstraps for Join ${joinConf.metaData.name}") + logger.info(s"\nCreating BootstrapInfo for Log Based Bootstraps for Join ${joinConf.metaData.name}") // Verify that join keys are valid columns on the log table logBootstrapParts .foreach(part => { @@ -188,7 +190,7 @@ object BootstrapInfo { .toSeq }.toMap - println(s"\nCreating BootstrapInfo for Table Based Bootstraps for Join ${joinConf.metaData.name}") + logger.info(s"\nCreating BootstrapInfo for Table Based Bootstraps for Join ${joinConf.metaData.name}") // Verify that join keys are valid columns on the bootstrap source table val tableHashes = tableBootstrapParts .map(part => { @@ -293,13 +295,13 @@ object BootstrapInfo { } if (exceptionList.nonEmpty) { - exceptionList.foreach(t => println(t.traceString)) + exceptionList.foreach(t => logger.info(t.traceString)) throw new Exception(s"Validation failed for bootstrapInfo construction for join ${joinConf.metaData.name}") } - println(s"\n======= Finalized Bootstrap Info ${joinConf.metaData.name} =======\n") + logger.info(s"\n======= Finalized Bootstrap Info ${joinConf.metaData.name} =======\n") joinParts.foreach { metadata => - println(s"""Bootstrap Info for Join Part `${metadata.joinPart.groupBy.metaData.name}` + logger.info(s"""Bootstrap Info for Join Part `${metadata.joinPart.groupBy.metaData.name}` |Key Schema: |${stringify(metadata.keySchema)} 
|Value Schema: @@ -307,7 +309,7 @@ object BootstrapInfo { |""".stripMargin) } externalParts.foreach { metadata => - println(s"""Bootstrap Info for External Part `${metadata.externalPart.fullName}` + logger.info(s"""Bootstrap Info for External Part `${metadata.externalPart.fullName}` |Key Schema: |${stringify(metadata.keySchema)} |Value Schema: @@ -315,16 +317,16 @@ object BootstrapInfo { |""".stripMargin) } if (derivedSchema.nonEmpty) { - println(s"""Bootstrap Info for Derivations + logger.info(s"""Bootstrap Info for Derivations |${stringify(derivedSchema.map(_._1))} |""".stripMargin) } - println(s"""Bootstrap Info for Log Bootstraps + logger.info(s"""Bootstrap Info for Log Bootstraps |Log Hashes: ${logHashes.keys.prettyInline} |""".stripMargin) tableHashes.foreach { case (hash, (schema, _, query)) => - println(s"""Bootstrap Info for Table Bootstraps + logger.info(s"""Bootstrap Info for Table Bootstraps |Table Hash: ${hash} |Bootstrap Query: |\n${query}\n @@ -333,7 +335,7 @@ object BootstrapInfo { |""".stripMargin) } - println(s"\n======= Finalized Bootstrap Info ${joinConf.metaData.name} END =======\n") + logger.info(s"\n======= Finalized Bootstrap Info ${joinConf.metaData.name} END =======\n") bootstrapInfo } diff --git a/spark/src/main/scala/ai/chronon/spark/Comparison.scala b/spark/src/main/scala/ai/chronon/spark/Comparison.scala index ff445cf0d..bbd67ecf1 100644 --- a/spark/src/main/scala/ai/chronon/spark/Comparison.scala +++ b/spark/src/main/scala/ai/chronon/spark/Comparison.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.online.Extensions.StructTypeOps import com.google.gson.Gson import org.apache.spark.sql.DataFrame @@ -24,6 +25,7 @@ import org.apache.spark.sql.types.{DecimalType, DoubleType, FloatType, MapType} import java.util object Comparison { + private val logger = LoggerFactory.getLogger(getClass) // used for comparison def sortedJson(m: Map[String, Any]): String = { @@ -58,8 +60,8 @@ object Comparison { aName: String = "a", bName: String = "b"): DataFrame = { - println("====== side-by-side comparison ======") - println(s"keys: $keys\na_schema:\n${a.schema.pretty}\nb_schema:\n${b.schema.pretty}") + logger.info("====== side-by-side comparison ======") + logger.info(s"keys: $keys\na_schema:\n${a.schema.pretty}\nb_schema:\n${b.schema.pretty}") val prefixedExpectedDf = prefixColumnName(stringifyMaps(a), s"${aName}_") val prefixedOutputDf = prefixColumnName(stringifyMaps(b), s"${bName}_") @@ -98,7 +100,7 @@ object Comparison { } else { s"($left <> $right)" } Seq(s"(($left IS NULL AND $right IS NOT NULL) OR ($right IS NULL AND $left IS NOT NULL) OR $compareExpression)") } - println(s"Using comparison filter:\n ${comparisonFilters.mkString("\n ")}") + logger.info(s"Using comparison filter:\n ${comparisonFilters.mkString("\n ")}") if (comparisonFilters.nonEmpty) { finalDf.filter(comparisonFilters.mkString(" or ")) } else { diff --git a/spark/src/main/scala/ai/chronon/spark/Driver.scala b/spark/src/main/scala/ai/chronon/spark/Driver.scala index 9f2d431e0..17b2f42d9 100644 --- a/spark/src/main/scala/ai/chronon/spark/Driver.scala +++ b/spark/src/main/scala/ai/chronon/spark/Driver.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions.{GroupByOps, MetadataOps, SourceOps} import ai.chronon.api.ThriftJsonCodec @@ -50,11 +51,13 @@ import scala.util.{Failure, Success, Try} // useful to override spark.sql.extensions args - there is no good way to unset that conf 
apparently // so we give it dummy extensions class DummyExtensions extends (SparkSessionExtensions => Unit) { + private val logger = LoggerFactory.getLogger(getClass) override def apply(extensions: SparkSessionExtensions): Unit = {} } // The mega chronon cli object Driver { + private val logger = LoggerFactory.getLogger(getClass) def parseConf[T <: TBase[_, _]: Manifest: ClassTag](confPath: String): T = ThriftJsonCodec.fromJsonFile[T](confPath, check = true) @@ -202,21 +205,23 @@ object Driver { val result = CompareJob.getConsolidatedData(metrics, tableUtils.partitionSpec) if (result.nonEmpty) { - println("[Validation] Failed. Please try exporting the result and investigate.") + logger.info("[Validation] Failed. Please try exporting the result and investigate.") false } else { - println("[Validation] Success.") + logger.info("[Validation] Success.") true } } } object JoinBackfill { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("join") with OfflineSubcommand with LocalExportTableAbility with ResultValidationAbility { + private val logger = LoggerFactory.getLogger(getClass) val stepDays: ScallopOption[Int] = opt[Int](required = false, descr = "Runs backfill in steps, step-days at a time. Default is 30 days", @@ -253,11 +258,12 @@ object Driver { } df.show(numRows = 3, truncate = 0, vertical = true) - println(s"\nShowing three rows of output above.\nQuery table `${args.joinConf.metaData.outputTable}` for more.\n") + logger.info(s"\nShowing three rows of output above.\nQuery table `${args.joinConf.metaData.outputTable}` for more.\n") } } object GroupByBackfill { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("group-by-backfill") with OfflineSubcommand @@ -292,6 +298,7 @@ object Driver { } object LabelJoin { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("label-join") with OfflineSubcommand with LocalExportTableAbility { val stepDays: ScallopOption[Int] = opt[Int](required = false, @@ -317,6 +324,7 @@ object Driver { } object Analyzer { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("analyze") with OfflineSubcommand { val startDate: ScallopOption[String] = opt[String](required = false, @@ -357,6 +365,7 @@ object Driver { } object MetadataExport { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("metadata-export") with OfflineSubcommand { val inputRootPath: ScallopOption[String] = opt[String](required = true, descr = "Base path of config repo to export from") @@ -371,6 +380,7 @@ object Driver { } object StagingQueryBackfill { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("staging-query-backfill") with OfflineSubcommand with LocalExportTableAbility { val stepDays: ScallopOption[Int] = opt[Int](required = false, @@ -400,6 +410,7 @@ object Driver { } object DailyStats { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("stats-summary") with OfflineSubcommand { val stepDays: ScallopOption[Int] = opt[Int](required = false, @@ -420,6 +431,7 @@ object Driver { } object LogStats { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("log-summary") with OfflineSubcommand { val stepDays: ScallopOption[Int] = opt[Int](required = false, @@ -439,6 +451,7 @@ object Driver { } object GroupByUploader { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("group-by-upload") 
with OfflineSubcommand { override def subcommandName() = "group-by-upload" } @@ -449,6 +462,7 @@ object Driver { } object ConsistencyMetricsCompute { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("consistency-metrics-compute") with OfflineSubcommand { override def subcommandName() = "consistency-metrics-compute" } @@ -464,6 +478,7 @@ object Driver { } object CompareJoinQuery { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("compare-join-query") with OfflineSubcommand { val queryConf: ScallopOption[String] = opt[String](required = true, descr = "Conf to the Staging Query to compare with") @@ -521,6 +536,7 @@ object Driver { } object FetcherCli { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("fetch") with OnlineSubcommand { val keyJson: ScallopOption[String] = opt[String](required = false, descr = "json of the keys to fetch") @@ -560,7 +576,7 @@ object Driver { ) series.get(keyMap("statsKey").asInstanceOf[String]) else series - println(s"--- [FETCHED RESULT] ---\n${objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(toPrint)}") + logger.info(s"--- [FETCHED RESULT] ---\n${objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(toPrint)}") } def run(args: Args): Unit = { @@ -582,19 +598,19 @@ object Driver { if (args.keyJson.isDefined) { Try(readMapList(args.keyJson())).toOption.getOrElse(Seq(readMap(args.keyJson()))) } else { - println(s"Reading requests from ${args.keyJsonFile()}") + logger.info(s"Reading requests from ${args.keyJsonFile()}") val file = Source.fromFile(args.keyJsonFile()) val mapList = file.getLines().map(json => readMap(json)).toList file.close() mapList } if (keyMapList.length > 1) { - println(s"Plan to send ${keyMapList.length} fetches with ${args.interval()} seconds interval") + logger.info(s"Plan to send ${keyMapList.length} fetches with ${args.interval()} seconds interval") } val fetcher = args.impl(args.serializableProps).buildFetcher(true) def iterate(): Unit = { keyMapList.foreach(keyMap => { - println(s"--- [START FETCHING for ${keyMap}] ---") + logger.info(s"--- [START FETCHING for ${keyMap}] ---") if (args.`type`() == "join-stats") { fetchStats(args, objectMapper, keyMap, fetcher) } else { @@ -614,13 +630,13 @@ object Driver { r.values match { case Success(valMap) => { if (valMap == null) { - println("No data present for the provided key.") + logger.info("No data present for the provided key.") } else { valMap.foreach { case (k, v) => tMap.put(k, v) } - println( + logger.info( s"--- [FETCHED RESULT] ---\n${objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(tMap)}") } - println(s"Fetched in: $awaitTimeMs ms") + logger.info(s"Fetched in: $awaitTimeMs ms") } case Failure(exception) => { exception.printStackTrace() @@ -632,13 +648,14 @@ object Driver { } iterate() while (args.loop()) { - println("loop is set to true, start next iteration. will only exit if manually killed.") + logger.info("loop is set to true, start next iteration. 
will only exit if manually killed.") iterate() } } } object MetadataUploader { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("metadata-upload") with OnlineSubcommand { val confPath: ScallopOption[String] = opt[String](required = true, descr = "Path to the Chronon config file or directory") @@ -647,12 +664,13 @@ object Driver { def run(args: Args): Unit = { val putRequest = args.metaDataStore.putConf(args.confPath()) val res = Await.result(putRequest, 1.hour) - println( + logger.info( s"Uploaded Chronon Configs to the KV store, success count = ${res.count(v => v)}, failure count = ${res.count(!_)}") } } object LogFlattener { + private val logger = LoggerFactory.getLogger(getClass) class Args extends Subcommand("log-flattener") with OfflineSubcommand { val logTable: ScallopOption[String] = opt[String](required = true, descr = "Hive table with partitioned raw logs") @@ -683,17 +701,18 @@ object Driver { } object GroupByStreaming { + private val logger = LoggerFactory.getLogger(getClass) def dataStream(session: SparkSession, host: String, topic: String): DataFrame = { TopicChecker.topicShouldExist(topic, host) session.streams.addListener(new StreamingQueryListener() { override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = { - println("Query started: " + queryStarted.id) + logger.info("Query started: " + queryStarted.id) } override def onQueryTerminated(queryTerminated: QueryTerminatedEvent): Unit = { - println("Query terminated: " + queryTerminated.id) + logger.info("Query terminated: " + queryTerminated.id) } override def onQueryProgress(queryProgress: QueryProgressEvent): Unit = { - println("Query made progress: " + queryProgress.progress) + logger.info("Query made progress: " + queryProgress.progress) } }) session.readStream @@ -742,7 +761,7 @@ object Driver { } s"$file $suffix" } - println(s"File Statuses:\n ${messages.mkString("\n ")}") + logger.info(s"File Statuses:\n ${messages.mkString("\n ")}") statuses.find(_._2 == true).map(_._1) } @@ -850,9 +869,9 @@ object Driver { case args.LogStatsArgs => LogStats.run(args.LogStatsArgs) case args.MetadataExportArgs => MetadataExport.run(args.MetadataExportArgs) case args.LabelJoinArgs => LabelJoin.run(args.LabelJoinArgs) - case _ => println(s"Unknown subcommand: $x") + case _ => logger.info(s"Unknown subcommand: $x") } - case None => println(s"specify a subcommand please") + case None => logger.info(s"specify a subcommand please") } if (shouldExit) { System.exit(0) diff --git a/spark/src/main/scala/ai/chronon/spark/Extensions.scala b/spark/src/main/scala/ai/chronon/spark/Extensions.scala index 84e0581bc..09bb2cf62 100644 --- a/spark/src/main/scala/ai/chronon/spark/Extensions.scala +++ b/spark/src/main/scala/ai/chronon/spark/Extensions.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Constants import ai.chronon.online.{AvroCodec, AvroConversions, SparkConversions} @@ -32,8 +33,10 @@ import scala.collection.Seq import scala.reflect.ClassTag object Extensions { + private val logger = LoggerFactory.getLogger(getClass) implicit class StructTypeOps(schema: StructType) { + private val logger = LoggerFactory.getLogger(getClass) def pretty: String = { val schemaTuples = schema.fields.map { field => field.dataType.simpleString -> field.name @@ -57,6 +60,7 @@ object Extensions { case class DfStats(count: Long, partitionRange: PartitionRange) // helper class to maintain datafram stats that are necessary for downstream operations 
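
The pattern above declares private val logger = LoggerFactory.getLogger(getClass) separately in every object and class, including implicit classes that may never log. A minimal sketch of one way to centralize that boilerplate, assuming a hypothetical ChrononLogging trait that is not part of this patch:

import org.slf4j.{Logger, LoggerFactory}

// Hypothetical helper (illustration only): mixing this in gives each concrete
// class an SLF4J logger named after it. The field is @transient and lazy so
// that Serializable Spark classes do not attempt to serialize the logger.
trait ChrononLogging extends Serializable {
  @transient protected lazy val logger: Logger = LoggerFactory.getLogger(getClass)
}

// Usage sketch:
class ExampleJob extends ChrononLogging {
  def run(): Unit = logger.info("Running {}", getClass.getSimpleName)
}

Keeping each declaration local, as this patch does, avoids touching class hierarchies; the trait is only one possible follow-up refactor.
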
case class DfWithStats(df: DataFrame, partitionCounts: Map[String, Long])(implicit val tableUtils: TableUtils) { + private val logger = LoggerFactory.getLogger(getClass) private val minPartition: String = partitionCounts.keys.min private val maxPartition: String = partitionCounts.keys.max val partitionRange: PartitionRange = PartitionRange(minPartition, maxPartition) @@ -71,6 +75,7 @@ object Extensions { } object DfWithStats { + private val logger = LoggerFactory.getLogger(getClass) def apply(dataFrame: DataFrame)(implicit tableUtils: TableUtils): DfWithStats = { val partitionCounts = dataFrame .groupBy(col(TableUtils(dataFrame.sparkSession).partitionColumn)) @@ -83,6 +88,7 @@ object Extensions { } implicit class DataframeOps(df: DataFrame) { + private val logger = LoggerFactory.getLogger(getClass) private implicit val tableUtils: TableUtils = TableUtils(df.sparkSession) // This is safe to call on dataframes that are un-shuffled from their disk sources - @@ -98,7 +104,7 @@ object Extensions { def prunePartition(partitionRange: PartitionRange): DataFrame = { val pruneFilter = partitionRange.whereClauses().mkString(" AND ") - println(s"Pruning using $pruneFilter") + logger.info(s"Pruning using $pruneFilter") df.filter(pruneFilter) } @@ -121,7 +127,7 @@ object Extensions { val minMaxRow = minMaxRows(0) df.sparkSession.catalog.dropTempView(viewName) val (min, max) = (minMaxRow.getAs[T](0), minMaxRow.getAs[T](1)) - println(s"Computed Range for $columnName - min: $min, max: $max") + logger.info(s"Computed Range for $columnName - min: $min, max: $max") (min, max) } @@ -188,10 +194,10 @@ object Extensions { val approxCount = df.filter(df.col(col).isNotNull).select(approx_count_distinct(col)).collect()(0).getLong(0) if (approxCount == 0) { - println( + logger.info( s"Warning: approxCount for col ${col} from table ${tableName} is 0. Please double check your input data.") } - println(s""" [STARTED] Generating bloom filter on key `$col` for range $partitionRange from $tableName + logger.info(s""" [STARTED] Generating bloom filter on key `$col` for range $partitionRange from $tableName | Approximate distinct count of `$col`: $approxCount | Total count of rows: $totalCount |""".stripMargin) @@ -200,7 +206,7 @@ object Extensions { .stat .bloomFilter(col, approxCount + 1, fpp) // expectedNumItems must be positive - println(s""" + logger.info(s""" | [FINISHED] Generating bloom filter on key `$col` for range $partitionRange from $tableName | Approximate distinct count of `$col`: $approxCount | Total count of rows: $totalCount @@ -210,7 +216,7 @@ object Extensions { } def removeNulls(cols: Seq[String]): DataFrame = { - println(s"filtering nulls from columns: [${cols.mkString(", ")}]") + logger.info(s"filtering nulls from columns: [${cols.mkString(", ")}]") // do not use != or <> operator with null, it doesn't return false ever! 
df.filter(cols.map(_ + " IS NOT NULL").mkString(" AND ")) } @@ -272,7 +278,7 @@ object Extensions { def prettyPrint(timeColumns: Seq[String] = Seq(Constants.TimeColumn, Constants.MutationTimeColumn)): Unit = { val availableColumns = timeColumns.filter(df.schema.names.contains) - println(s"schema: ${df.schema.fieldNames.mkString("Array(", ", ", ")")}") + logger.info(s"schema: ${df.schema.fieldNames.mkString("Array(", ", ", ")")}") df.replaceWithReadableTime(availableColumns, dropOriginal = true).show(truncate = false) } } diff --git a/spark/src/main/scala/ai/chronon/spark/FastHashing.scala b/spark/src/main/scala/ai/chronon/spark/FastHashing.scala index 1cf405954..d25a23633 100644 --- a/spark/src/main/scala/ai/chronon/spark/FastHashing.scala +++ b/spark/src/main/scala/ai/chronon/spark/FastHashing.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.spark.Extensions._ import com.google.common.hash.{Hasher, Hashing} import org.apache.spark.sql.Row @@ -41,11 +42,12 @@ case class KeyWithHash(data: Array[Any], hash: Array[Byte], hashInt: Int) extend } object FastHashing { + private val logger = LoggerFactory.getLogger(getClass) // function to generate a fast-ish hasher // the approach tries to accumulate several tiny closures to compute the final hash def generateKeyBuilder(keys: Array[String], schema: StructType): Row => KeyWithHash = { val keySchema = StructType(schema.filter { sf => keys.contains(sf.name) }) - println(s"Generating key builder over keys:\n${keySchema.pretty}\n") + logger.info(s"Generating key builder over keys:\n${keySchema.pretty}\n") val keyIndices: Array[Int] = keys.map(schema.fieldIndex) // the hash function generation won't be in the hot path - so its okay to val hashFunctions: Array[(Hasher, Row) => Unit] = keys.zip(keyIndices).map { diff --git a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala index 258e9db4c..796e4e233 100644 --- a/spark/src/main/scala/ai/chronon/spark/GroupBy.scala +++ b/spark/src/main/scala/ai/chronon/spark/GroupBy.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.TimeTuple import ai.chronon.aggregator.row.RowAggregator import ai.chronon.aggregator.windowing._ @@ -43,6 +44,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation], skewFilter: Option[String] = None, finalize: Boolean = true) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) protected[spark] val tsIndex: Int = inputDf.schema.fieldNames.indexOf(Constants.TimeColumn) protected val selectedSchema: Array[(String, api.DataType)] = SparkConversions.toChrononSchema(inputDf.schema) @@ -119,8 +121,8 @@ class GroupBy(val aggregations: Seq[api.Aggregation], inputDf -> updateFunc } - println("prepped input schema") - println(preppedInputDf.schema.pretty) + logger.info("prepped input schema") + logger.info(preppedInputDf.schema.pretty) tableUtils .preAggRepartition(preppedInputDf) @@ -404,7 +406,7 @@ object GroupBy { val result = groupByConf.deepCopy() val newSources: java.util.List[api.Source] = groupByConf.sources.toScala.map { source => if (source.isSetJoinSource) { - println("Join source detected. Materializing the join.") + logger.info("Join source detected. Materializing the join.") val joinSource = source.getJoinSource val joinConf = joinSource.join // materialize the table with the right end date. 
QueryRange.end could be shifted for temporal events @@ -417,7 +419,7 @@ object GroupBy { if (computeDependency) { val df = join.computeJoin() if (showDf) { - println( + logger.info( s"printing output data from groupby::join_source: ${groupByConf.metaData.name}::${joinConf.metaData.name}") df.prettyPrint() } @@ -458,7 +460,7 @@ object GroupBy { finalize: Boolean = true, mutationScan: Boolean = true, showDf: Boolean = false): GroupBy = { - println(s"\n----[Processing GroupBy: ${groupByConfOld.metaData.name}]----") + logger.info(s"\n----[Processing GroupBy: ${groupByConfOld.metaData.name}]----") val groupByConf = replaceJoinSource(groupByConfOld, queryRange, tableUtils, computeDependency, showDf) val inputDf = groupByConf.sources.toScala .map { source => @@ -492,7 +494,7 @@ object GroupBy { val keyColumns = groupByConf.getKeyColumns.toScala val skewFilteredDf = skewFilter .map { sf => - println(s"$logPrefix filtering using skew filter:\n $sf") + logger.info(s"$logPrefix filtering using skew filter:\n $sf") val filtered = inputDf.filter(sf) filtered } @@ -504,7 +506,7 @@ object GroupBy { val nullFilterClause = groupByConf.keyColumns.toScala.map(key => s"($key IS NOT NULL)").mkString(" OR ") val nullFiltered = processedInputDf.filter(nullFilterClause) if (showDf) { - println(s"printing input date for groupBy: ${groupByConf.metaData.name}") + logger.info(s"printing input date for groupBy: ${groupByConf.metaData.name}") nullFiltered.prettyPrint() } @@ -536,7 +538,7 @@ object GroupBy { } else null if (showDf && mutationDf != null) { - println(s"printing mutation data for groupBy: ${groupByConf.metaData.name}") + logger.info(s"printing mutation data for groupBy: ${groupByConf.metaData.name}") mutationDf.prettyPrint() } @@ -576,7 +578,7 @@ object GroupBy { val queryableDataRange = PartitionRange(dataProfile.earliestRequired, Seq(queryEnd, dataProfile.latestAllowed).max)(tableUtils) val intersectedRange = sourceRange.intersect(queryableDataRange) - println(s""" + logger.info(s""" |Computing intersected range as: | query range: $queryRange | query window: $window @@ -624,14 +626,14 @@ object GroupBy { Some(Constants.TimeColumn -> Option(source.query.timeColumn).getOrElse(dsBasedTimestamp)) } } - println(s""" + logger.info(s""" |Time Mapping: $timeMapping |""".stripMargin) metaColumns ++= timeMapping val partitionConditions = intersectedRange.map(_.whereClauses()).getOrElse(Seq.empty) - println(s""" + logger.info(s""" |Rendering source query: | intersected/effective scan range: $intersectedRange | partitionConditions: $partitionConditions @@ -684,25 +686,25 @@ object GroupBy { if (isAnySourceCumulative) None else Some(inputTables)) if (groupByUnfilledRangesOpt.isEmpty) { - println(s"""Nothing to backfill for $outputTable - given + logger.info(s"""Nothing to backfill for $outputTable - given |endPartition of $endPartition |backfill start of ${groupByConf.backfillStartDate} |Exiting...""".stripMargin) return } val groupByUnfilledRanges = groupByUnfilledRangesOpt.get - println(s"group by unfilled ranges: $groupByUnfilledRanges") + logger.info(s"group by unfilled ranges: $groupByUnfilledRanges") val exceptions = mutable.Buffer.empty[String] groupByUnfilledRanges.foreach { case groupByUnfilledRange => try { val stepRanges = stepDays.map(groupByUnfilledRange.steps).getOrElse(Seq(groupByUnfilledRange)) - println(s"Group By ranges to compute: ${stepRanges.map { + logger.info(s"Group By ranges to compute: ${stepRanges.map { _.toString }.pretty}") stepRanges.zipWithIndex.foreach { case (range, index) => - 
println(s"Computing group by for range: $range [${index + 1}/${stepRanges.size}]") + logger.info(s"Computing group by for range: $range [${index + 1}/${stepRanges.size}]") val groupByBackfill = from(groupByConf, range, tableUtils, computeDependency = true) val outputDf = groupByConf.dataModel match { // group by backfills have to be snapshot only @@ -716,9 +718,9 @@ object GroupBy { val result = outputDf.select(finalOutputColumns: _*) result.save(outputTable, tableProps) } - println(s"Wrote to table $outputTable, into partitions: $range") + logger.info(s"Wrote to table $outputTable, into partitions: $range") } - println(s"Wrote to table $outputTable for range: $groupByUnfilledRange") + logger.info(s"Wrote to table $outputTable for range: $groupByUnfilledRange") } catch { case err: Throwable => diff --git a/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala b/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala index 1f2ee5cd9..ba1c63d8a 100644 --- a/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala +++ b/spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.aggregator.windowing.{FinalBatchIr, FiveMinuteResolution, Resolution, SawtoothOnlineAggregator} import ai.chronon.api import ai.chronon.api.{Accuracy, Constants, DataModel, GroupByServingInfo, QueryUtils, ThriftJsonCodec} @@ -34,6 +35,7 @@ import scala.util.ScalaJavaConversions.{ListOps, MapOps} import scala.util.Try class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) implicit val sparkSession: SparkSession = groupBy.sparkSession implicit private val tableUtils: TableUtils = TableUtils(sparkSession) private def fromBase(rdd: RDD[(Array[Any], Array[Any])]): KvRdd = { @@ -62,7 +64,7 @@ class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable // Shared between events and mutations (temporal entities). 
def temporalEvents(resolution: Resolution = FiveMinuteResolution): KvRdd = { val endTs = tableUtils.partitionSpec.epochMillis(endPartition) - println(s"TemporalEvents upload end ts: $endTs") + logger.info(s"TemporalEvents upload end ts: $endTs") val sawtoothOnlineAggregator = new SawtoothOnlineAggregator( endTs, groupBy.aggregations, @@ -71,7 +73,7 @@ class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable val irSchema = SparkConversions.fromChrononSchema(sawtoothOnlineAggregator.batchIrSchema) val keyBuilder = FastHashing.generateKeyBuilder(groupBy.keyColumns.toArray, groupBy.inputDf.schema) - println(s""" + logger.info(s""" |BatchIR Element Size: ${SparkEnv.get.serializer .newInstance() .serialize(sawtoothOnlineAggregator.init) @@ -151,12 +153,12 @@ object GroupByUpload { } groupByServingInfo.setInputAvroSchema(inputSchema.toAvroSchema(name = "Input").toString(true)) } else { - println("Not setting InputAvroSchema to GroupByServingInfo as there is no streaming source defined.") + logger.info("Not setting InputAvroSchema to GroupByServingInfo as there is no streaming source defined.") } val result = new GroupByServingInfoParsed(groupByServingInfo, tableUtils.partitionSpec) val firstSource = groupByConf.sources.get(0) - println(s""" + logger.info(s""" |Built GroupByServingInfo for ${groupByConf.metaData.name}: |table: ${firstSource.table} / data-model: ${firstSource.dataModel} | keySchema: ${Try(result.keyChrononSchema.catalogString)} @@ -206,7 +208,7 @@ object GroupByUpload { // for mutations I need the snapshot from the previous day, but a batch end date of ds +1 lazy val otherGroupByUpload = new GroupByUpload(batchEndDate, groupBy) - println(s""" + logger.info(s""" |GroupBy upload for: ${groupByConf.metaData.team}.${groupByConf.metaData.name} |Accuracy: ${groupByConf.inferredAccuracy} |Data Model: ${groupByConf.dataModel} diff --git a/spark/src/main/scala/ai/chronon/spark/Join.scala b/spark/src/main/scala/ai/chronon/spark/Join.scala index 0adff47f4..50d758a52 100644 --- a/spark/src/main/scala/ai/chronon/spark/Join.scala +++ b/spark/src/main/scala/ai/chronon/spark/Join.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -44,6 +45,7 @@ import scala.util.ScalaJavaConversions.{IterableOps, ListOps, MapOps} case class CoveringSet(hashes: Seq[String], rowCount: Long, isCovering: Boolean) object CoveringSet { + private val logger = LoggerFactory.getLogger(getClass) def toFilterExpression(coveringSets: Seq[CoveringSet]): String = { val coveringSetHashExpression = "(" + coveringSets @@ -65,6 +67,7 @@ class Join(joinConf: api.Join, mutationScan: Boolean = true, showDf: Boolean = false) extends JoinBase(joinConf, endPartition, tableUtils, skipFirstHole, mutationScan, showDf) { + private val logger = LoggerFactory.getLogger(getClass) private val bootstrapTable = joinConf.metaData.bootstrapTable @@ -169,16 +172,16 @@ class Join(joinConf: api.Join, (joinPartMetadata, coveringSets) } - println( + logger.info( s"\n======= CoveringSet for JoinPart ${joinConf.metaData.name} for PartitionRange(${leftRange.start}, ${leftRange.end}) =======\n") coveringSetsPerJoinPart.foreach { case (joinPartMetadata, coveringSets) => - println(s"Bootstrap sets for join part ${joinPartMetadata.joinPart.groupBy.metaData.name}") + logger.info(s"Bootstrap sets for join part ${joinPartMetadata.joinPart.groupBy.metaData.name}") coveringSets.foreach { coveringSet => - println( + logger.info( 
s"CoveringSet(hash=${coveringSet.hashes.prettyInline}, rowCount=${coveringSet.rowCount}, isCovering=${coveringSet.isCovering})") } - println() + logger.info() } coveringSetsPerJoinPart @@ -344,7 +347,7 @@ class Join(joinConf: api.Join, val result = baseDf.select(finalOutputColumns: _*) if (showDf) { - println(s"printing results for join: ${joinConf.metaData.name}") + logger.info(s"printing results for join: ${joinConf.metaData.name}") result.prettyPrint() } result @@ -422,7 +425,7 @@ class Join(joinConf: api.Join, val joinedDf = parts.foldLeft(initDf) { case (partialDf, part) => { - println(s"\nProcessing Bootstrap from table ${part.table} for range ${unfilledRange}") + logger.info(s"\nProcessing Bootstrap from table ${part.table} for range ${unfilledRange}") val bootstrapRange = if (part.isSetQuery) { unfilledRange.intersect(PartitionRange(part.startPartition, part.endPartition)(tableUtils)) @@ -430,7 +433,7 @@ class Join(joinConf: api.Join, unfilledRange } if (!bootstrapRange.valid) { - println(s"partition range of bootstrap table ${part.table} is beyond unfilled range") + logger.info(s"partition range of bootstrap table ${part.table} is beyond unfilled range") partialDf } else { var bootstrapDf = tableUtils.sql( @@ -476,7 +479,7 @@ class Join(joinConf: api.Join, }) val elapsedMins = (System.currentTimeMillis() - startMillis) / (60 * 1000) - println(s"Finished computing bootstrap table ${joinConf.metaData.bootstrapTable} in ${elapsedMins} minutes") + logger.info(s"Finished computing bootstrap table ${joinConf.metaData.bootstrapTable} in ${elapsedMins} minutes") tableUtils.sql(range.genScanQuery(query = null, table = bootstrapTable)) } @@ -495,7 +498,7 @@ class Join(joinConf: api.Join, return Some(bootstrapDfWithStats) } val filterExpr = CoveringSet.toFilterExpression(coveringSets) - println(s"Using covering set filter: $filterExpr") + logger.info(s"Using covering set filter: $filterExpr") val filteredDf = bootstrapDf.where(filterExpr) val filteredCount = filteredDf.count() if (bootstrapDfWithStats.count == filteredCount) { // counting is faster than computing stats diff --git a/spark/src/main/scala/ai/chronon/spark/JoinBase.scala b/spark/src/main/scala/ai/chronon/spark/JoinBase.scala index 56c6e1fa0..c1d1c7f07 100644 --- a/spark/src/main/scala/ai/chronon/spark/JoinBase.scala +++ b/spark/src/main/scala/ai/chronon/spark/JoinBase.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.DataModel.{Entities, Events} import ai.chronon.api.Extensions._ @@ -38,6 +39,7 @@ abstract class JoinBase(joinConf: api.Join, skipFirstHole: Boolean, mutationScan: Boolean = true, showDf: Boolean = false) { + private val logger = LoggerFactory.getLogger(getClass) assert(Option(joinConf.metaData.outputNamespace).nonEmpty, s"output namespace could not be empty or null") val metrics: Metrics.Context = Metrics.Context(Metrics.Environment.JoinOffline, joinConf) private val outputTable = joinConf.metaData.outputTable @@ -98,14 +100,14 @@ abstract class JoinBase(joinConf: api.Join, keyRenamedRightDf } - println(s""" + logger.info(s""" |Join keys for ${joinPart.groupBy.metaData.name}: ${keys.mkString(", ")} |Left Schema: |${leftDf.schema.pretty} |Right Schema: |${joinableRightDf.schema.pretty}""".stripMargin) val joinedDf = coalescedJoin(leftDf, joinableRightDf, keys) - println(s"""Final Schema: + logger.info(s"""Final Schema: |${joinedDf.schema.pretty} |""".stripMargin) @@ -154,18 +156,18 @@ abstract class JoinBase(joinConf: api.Join, 
computeJoinPart(prunedLeft, joinPart, joinLevelBloomMapOpt) // Cache join part data into intermediate table if (filledDf.isDefined) { - println(s"Writing to join part table: $partTable for partition range $unfilledRange") + logger.info(s"Writing to join part table: $partTable for partition range $unfilledRange") filledDf.get.save(partTable, tableProps, stats = prunedLeft.map(_.stats)) } }) val elapsedMins = (System.currentTimeMillis() - start) / 60000 partMetrics.gauge(Metrics.Name.LatencyMinutes, elapsedMins) partMetrics.gauge(Metrics.Name.PartitionCount, partitionCount) - println(s"Wrote ${partitionCount} partitions to join part table: $partTable in $elapsedMins minutes") + logger.info(s"Wrote ${partitionCount} partitions to join part table: $partTable in $elapsedMins minutes") } } catch { case e: Exception => - println(s"Error while processing groupBy: ${joinConf.metaData.name}/${joinPart.groupBy.getMetaData.getName}") + logger.info(s"Error while processing groupBy: ${joinConf.metaData.name}/${joinPart.groupBy.getMetaData.getName}") throw e } if (tableUtils.tableExists(partTable)) { @@ -183,7 +185,7 @@ abstract class JoinBase(joinConf: api.Join, if (leftDfWithStats.isEmpty) { // happens when all rows are already filled by bootstrap tables - println(s"\nBackfill is NOT required for ${joinPart.groupBy.metaData.name} since all rows are bootstrapped.") + logger.info(s"\nBackfill is NOT required for ${joinPart.groupBy.metaData.name} since all rows are bootstrapped.") return None } @@ -191,7 +193,7 @@ abstract class JoinBase(joinConf: api.Join, val rowCount = leftDfWithStats.get.count val unfilledRange = leftDfWithStats.get.partitionRange - println(s"\nBackfill is required for ${joinPart.groupBy.metaData.name} for $rowCount rows on range $unfilledRange") + logger.info(s"\nBackfill is required for ${joinPart.groupBy.metaData.name} for $rowCount rows on range $unfilledRange") val rightBloomMap = JoinUtils.genBloomFilterIfNeeded(leftDf, joinPart, @@ -216,7 +218,7 @@ abstract class JoinBase(joinConf: api.Join, lazy val unfilledTimeRange = { val timeRange = leftDf.timeRange - println(s"left unfilled time range: $timeRange") + logger.info(s"left unfilled time range: $timeRange") timeRange } @@ -226,7 +228,7 @@ abstract class JoinBase(joinConf: api.Join, lazy val skewFilteredLeft = leftSkewFilter .map { sf => val filtered = leftDf.filter(sf) - println(s"""Skew filtering left-df for + logger.info(s"""Skew filtering left-df for |GroupBy: ${joinPart.groupBy.metaData.name} |filterClause: $sf |""".stripMargin) @@ -276,7 +278,7 @@ abstract class JoinBase(joinConf: api.Join, rightDf } if (showDf) { - println(s"printing results for joinPart: ${joinConf.metaData.name}::${joinPart.groupBy.metaData.name}") + logger.info(s"printing results for joinPart: ${joinConf.metaData.name}::${joinPart.groupBy.metaData.name}") rightDfWithDerivations.prettyPrint() } Some(rightDfWithDerivations) @@ -300,15 +302,15 @@ abstract class JoinBase(joinConf: api.Join, try { analyzer.analyzeJoin(joinConf, validationAssert = true) metrics.gauge(Metrics.Name.validationSuccess, 1) - println("Join conf validation succeeded. No error found.") + logger.info("Join conf validation succeeded. No error found.") } catch { case ex: AssertionError => metrics.gauge(Metrics.Name.validationFailure, 1) - println(s"Validation failed. Please check the validation error in log.") + logger.info(s"Validation failed. 
Please check the validation error in log.") if (tableUtils.backfillValidationEnforced) throw ex case e: Throwable => metrics.gauge(Metrics.Name.validationFailure, 1) - println(s"An unexpected error occurred during validation. ${e.getMessage}") + logger.info(s"An unexpected error occurred during validation. ${e.getMessage}") } // First run command to archive tables that have changed semantically since the last run @@ -325,14 +327,14 @@ abstract class JoinBase(joinConf: api.Join, endPartition, overrideStartPartition, joinConf.historicalBackfill) - println(s"Join range to fill $rangeToFill") + logger.info(s"Join range to fill $rangeToFill") val unfilledRanges = tableUtils .unfilledRanges(outputTable, rangeToFill, Some(Seq(joinConf.left.table)), skipFirstHole = skipFirstHole) .getOrElse(Seq.empty) def finalResult: DataFrame = tableUtils.sql(rangeToFill.genScanQuery(null, outputTable)) if (unfilledRanges.isEmpty) { - println(s"\nThere is no data to compute based on end partition of ${rangeToFill.end}.\n\n Exiting..") + logger.info(s"\nThere is no data to compute based on end partition of ${rangeToFill.end}.\n\n Exiting..") return finalResult } @@ -345,12 +347,12 @@ abstract class JoinBase(joinConf: api.Join, // build bootstrap info once for the entire job val bootstrapInfo = BootstrapInfo.from(joinConf, rangeToFill, tableUtils, leftSchema, mutationScan = mutationScan) - println(s"Join ranges to compute: ${stepRanges.map { _.toString }.pretty}") + logger.info(s"Join ranges to compute: ${stepRanges.map { _.toString }.pretty}") stepRanges.zipWithIndex.foreach { case (range, index) => val startMillis = System.currentTimeMillis() val progress = s"| [${index + 1}/${stepRanges.size}]" - println(s"Computing join for range: $range $progress") + logger.info(s"Computing join for range: $range $progress") leftDf(joinConf, range, tableUtils).map { leftDfInRange => if (showDf) leftDfInRange.prettyPrint() // set autoExpand = true to ensure backward compatibility due to column ordering changes @@ -358,10 +360,10 @@ abstract class JoinBase(joinConf: api.Join, val elapsedMins = (System.currentTimeMillis() - startMillis) / (60 * 1000) metrics.gauge(Metrics.Name.LatencyMinutes, elapsedMins) metrics.gauge(Metrics.Name.PartitionCount, range.partitions.length) - println(s"Wrote to table $outputTable, into partitions: $range $progress in $elapsedMins mins") + logger.info(s"Wrote to table $outputTable, into partitions: $range $progress in $elapsedMins mins") } } - println(s"Wrote to table $outputTable, into partitions: $unfilledRanges") + logger.info(s"Wrote to table $outputTable, into partitions: $unfilledRanges") finalResult } } diff --git a/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala b/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala index f30b4cfc9..017c8d99a 100644 --- a/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala +++ b/spark/src/main/scala/ai/chronon/spark/JoinUtils.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api.Constants import ai.chronon.api.DataModel.Events import ai.chronon.api.Extensions.{JoinOps, _} @@ -30,6 +31,7 @@ import scala.collection.Seq import scala.util.ScalaJavaConversions.MapOps object JoinUtils { + private val logger = LoggerFactory.getLogger(getClass) /*** * Util methods for join computation @@ -53,12 +55,12 @@ object JoinUtils { val skewFilter = joinConf.skewFilter() val result = skewFilter .map(sf => { - println(s"left skew filter: $sf") + logger.info(s"left skew filter: $sf") df.filter(sf) }) .getOrElse(df) if 
(result.isEmpty) { - println(s"Left side query below produced 0 rows in range $range. Query:\n$scanQuery") + logger.info(s"Left side query below produced 0 rows in range $range. Query:\n$scanQuery") if (!allowEmpty) { return None } @@ -103,7 +105,7 @@ object JoinUtils { val overrideStart = if (historicalBackfill) { overrideStartPartition } else { - println(s"Historical backfill is set to false. Backfill latest single partition only: $endPartition") + logger.info(s"Historical backfill is set to false. Backfill latest single partition only: $endPartition") Some(endPartition) } lazy val defaultLeftStart = Option(leftSource.query.startPartition) @@ -294,12 +296,12 @@ object JoinUtils { unfilledRange: PartitionRange, tableUtils: TableUtils, joinLevelBloomMapOpt: Option[Map[String, BloomFilter]]): Option[Map[String, BloomFilter]] = { - println( + logger.info( s"\nRow count to be filled for ${joinPart.groupBy.metaData.name}. BloomFilter Threshold: ${tableUtils.bloomFilterThreshold}") // apply bloom filter when left row count is below threshold if (leftRowCount > tableUtils.bloomFilterThreshold) { - println("Row count is above threshold. Skip gen bloom filter.") + logger.info("Row count is above threshold. Skip gen bloom filter.") Option.empty } else { @@ -312,7 +314,7 @@ object JoinUtils { val rightBloomMap = joinPart.rightToLeft.mapValues(leftBlooms(_)).toMap val bloomSizes = rightBloomMap.map { case (col, bloom) => s"$col -> ${bloom.bitSize()}" }.pretty - println(s""" + logger.info(s""" Generating bloom filter for joinPart: | part name : ${joinPart.groupBy.metaData.name}, | left type : ${joinConf.left.dataModel}, @@ -342,7 +344,7 @@ object JoinUtils { oldSemanticJson <- props.get(Constants.SemanticHashKey); oldSemanticHash = gson.fromJson(oldSemanticJson, classOf[java.util.HashMap[String, String]]).toScala ) yield { - println(s"Comparing Hashes:\nNew: ${joinConf.semanticHash},\nOld: $oldSemanticHash") + logger.info(s"Comparing Hashes:\nNew: ${joinConf.semanticHash},\nOld: $oldSemanticHash") joinConf.tablesToDrop(oldSemanticHash) }).getOrElse(collection.Seq.empty) } diff --git a/spark/src/main/scala/ai/chronon/spark/KvRdd.scala b/spark/src/main/scala/ai/chronon/spark/KvRdd.scala index 669eef312..e85b57a89 100644 --- a/spark/src/main/scala/ai/chronon/spark/KvRdd.scala +++ b/spark/src/main/scala/ai/chronon/spark/KvRdd.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.online.{AvroCodec, AvroConversions, SparkConversions} import ai.chronon.spark.Extensions._ @@ -26,6 +27,7 @@ import org.apache.spark.sql.types.{BinaryType, LongType, StringType, StructField import org.apache.spark.sql.{DataFrame, Row, SparkSession} object GenericRowHandler { + private val logger = LoggerFactory.getLogger(getClass) val func: Any => Array[Any] = { case x: GenericRowWithSchema => { val result = new Array[Any](x.length) @@ -85,7 +87,7 @@ case class KvRdd(data: RDD[(Array[Any], Array[Any])], keySchema: StructType, val val result: Array[Any] = Array(keyToBytes(keys), valueToBytes(values), keyJson, valueJson) new GenericRow(result) } - println(s""" + logger.info(s""" |key schema: | ${AvroConversions.fromChrononSchema(keyZSchema).toString(true)} |value schema: @@ -127,7 +129,7 @@ case class TimedKvRdd(data: RDD[(Array[Any], Array[Any], Long)], } val schemasStr = Seq(keyZSchema, valueZSchema).map(AvroConversions.fromChrononSchema(_).toString(true)) - println(s""" + logger.info(s""" |key schema: | ${schemasStr(0)} |value schema: diff --git 
a/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala b/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala index 3eef6799f..cc21503a7 100644 --- a/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala +++ b/spark/src/main/scala/ai/chronon/spark/LabelJoin.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.DataModel.{Entities, Events} import ai.chronon.api.Extensions._ @@ -31,6 +32,7 @@ import scala.collection.Seq import scala.util.ScalaJavaConversions.IterableOps class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { + private val logger = LoggerFactory.getLogger(getClass) assert(Option(joinConf.metaData.outputNamespace).nonEmpty, s"output namespace could not be empty or null") assert( @@ -92,7 +94,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { labelTable } else { // creating final join view with feature join output table - println(s"Joining label table : ${outputLabelTable} with joined output table : ${joinConf.metaData.outputTable}") + logger.info(s"Joining label table : ${outputLabelTable} with joined output table : ${joinConf.metaData.outputTable}") JoinUtils.createOrReplaceView( joinConf.metaData.outputFinalView, leftTable = joinConf.metaData.outputTable, @@ -102,11 +104,11 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { viewProperties = Map(Constants.LabelViewPropertyKeyLabelTable -> outputLabelTable, Constants.LabelViewPropertyFeatureTable -> joinConf.metaData.outputTable) ) - println(s"Final labeled view created: ${joinConf.metaData.outputFinalView}") + logger.info(s"Final labeled view created: ${joinConf.metaData.outputFinalView}") JoinUtils.createLatestLabelView(joinConf.metaData.outputLatestLabelView, baseView = joinConf.metaData.outputFinalView, tableUtils) - println(s"Final view with latest label created: ${joinConf.metaData.outputLatestLabelView}") + logger.info(s"Final view with latest label created: ${joinConf.metaData.outputLatestLabelView}") labelTable } } @@ -114,18 +116,18 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { def compute(leftRange: PartitionRange, stepDays: Option[Int] = None, labelDS: Option[String] = None): DataFrame = { val today = tableUtils.partitionSpec.at(System.currentTimeMillis()) val sanitizedLabelDs = labelDS.getOrElse(today) - println(s"Label join range to fill $leftRange") + logger.info(s"Label join range to fill $leftRange") def finalResult = tableUtils.sql(leftRange.genScanQuery(null, outputLabelTable)) val leftFeatureRange = leftRange stepDays.foreach(metrics.gauge("step_days", _)) val stepRanges = stepDays.map(leftFeatureRange.steps).getOrElse(Seq(leftFeatureRange)) - println(s"Label Join left ranges to compute: ${stepRanges.map { _.toString }.pretty}") + logger.info(s"Label Join left ranges to compute: ${stepRanges.map { _.toString }.pretty}") stepRanges.zipWithIndex.foreach { case (range, index) => val startMillis = System.currentTimeMillis() val progress = s"| [${index + 1}/${stepRanges.size}]" - println(s"Computing label join for range: $range Label DS: ${labelDS.getOrElse(today)} $progress") + logger.info(s"Computing label join for range: $range Label DS: ${labelDS.getOrElse(today)} $progress") JoinUtils.leftDf(joinConf, range, tableUtils).map { leftDfInRange => computeRange(leftDfInRange, range, sanitizedLabelDs) .save(outputLabelTable, @@ -135,10 +137,10 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) 
{ val elapsedMins = (System.currentTimeMillis() - startMillis) / (60 * 1000) metrics.gauge(Metrics.Name.LatencyMinutes, elapsedMins) metrics.gauge(Metrics.Name.PartitionCount, range.partitions.length) - println(s"Wrote to table $outputLabelTable, into partitions: $range $progress in $elapsedMins mins") + logger.info(s"Wrote to table $outputLabelTable, into partitions: $range $progress in $elapsedMins mins") } } - println(s"Wrote to table $outputLabelTable, into partitions: $leftFeatureRange") + logger.info(s"Wrote to table $outputLabelTable, into partitions: $leftFeatureRange") finalResult } @@ -168,7 +170,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { .foreach(leftRange => { val labeledDf = computeLabelPart(labelJoinPart, leftRange, leftBlooms) // Cache label part data into intermediate table - println(s"Writing to join part table: $partTable for partition range $leftRange") + logger.info(s"Writing to join part table: $partTable for partition range $leftRange") labeledDf.save(tableName = partTable, tableProperties = confTableProps, partitionColumns = Seq(Constants.LabelPartitionColumn)) @@ -176,11 +178,11 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { val elapsedMins = (System.currentTimeMillis() - start) / 60000 labelJoinPartMetrics.gauge(Metrics.Name.LatencyMinutes, elapsedMins) labelJoinPartMetrics.gauge(Metrics.Name.PartitionCount, partitionCount) - println(s"Wrote ${partitionCount} partitions to label part table: $partTable in $elapsedMins minutes") + logger.info(s"Wrote ${partitionCount} partitions to label part table: $partTable in $elapsedMins minutes") } } catch { case e: Exception => - println( + logger.info( s"Error while processing groupBy: " + s"${joinConf.metaData.name}/${labelJoinPart.groupBy.getMetaData.getName}") throw e @@ -191,7 +193,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { } val rowIdentifier = labelJoinConf.rowIdentifier(joinConf.rowIds, tableUtils.partitionColumn) - println("Label Join filtering left df with only row identifier:", rowIdentifier.mkString(", ")) + logger.info(s"Label Join filtering left df with only row identifier: ${rowIdentifier.mkString(", ")}") val leftFiltered = JoinUtils.filterColumns(leftDf, rowIdentifier) val joined = rightDfs.zip(labelJoinConf.labels.asScala).foldLeft(leftFiltered) { @@ -212,7 +214,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { val rightSkewFilter = joinConf.partSkewFilter(joinPart) val rightBloomMap = joinPart.rightToLeft.mapValues(leftBlooms(_)).toMap val bloomSizes = rightBloomMap.map { case (col, bloom) => s"$col -> ${bloom.bitSize()}" }.pretty - println(s""" + logger.info(s""" |Label JoinPart Info: | part name : ${joinPart.groupBy.metaData.name}, | left type : ${joinConf.left.dataModel}, @@ -266,7 +268,7 @@ class LabelJoin(joinConf: api.Join, tableUtils: TableUtils, labelDS: String) { val partName = joinPart.groupBy.metaData.name - println(s"""Join keys for $partName: ${partLeftKeys.mkString(", ")} + logger.info(s"""Join keys for $partName: ${partLeftKeys.mkString(", ")} |Left Schema: |${updatedLeftDf.schema.pretty} | diff --git a/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala b/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala index b980bfc09..4d152ddb5 100644 --- a/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala +++ b/spark/src/main/scala/ai/chronon/spark/LocalDataLoader.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import 
org.slf4j.LoggerFactory import ai.chronon.spark.Extensions.StructTypeOps import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.util.FailFastMode @@ -25,12 +26,13 @@ import org.apache.spark.sql.types.{StringType, TimestampType} import java.io.File object LocalDataLoader { + private val logger = LoggerFactory.getLogger(getClass) def writeTableFromFile(file: File, tableName: String, session: SparkSession): Unit = { - println(s"Checking table: ${tableName}") + logger.info(s"Checking table: ${tableName}") if (session.catalog.tableExists(tableName)) return val extension = file.getName.split("\\.").last if (!Seq("csv", "json", "jsonl").contains(extension)) { - println(s"Unable to load file due to invalid extension from file: ${file.getPath}") + logger.info(s"Unable to load file due to invalid extension from file: ${file.getPath}") return } @@ -53,9 +55,9 @@ object LocalDataLoader { .drop("ts_string") } - println(s"Loading data from ${file.getPath} into $tableName. Sample data and schema shown below") + logger.info(s"Loading data from ${file.getPath} into $tableName. Sample data and schema shown below") df.show(100) - println(df.schema.pretty) + logger.info(df.schema.pretty) if (df.schema.map(_.name).contains("ds")) { df.write.partitionBy("ds").saveAsTable(tableName) @@ -100,7 +102,7 @@ object LocalDataLoader { ): Unit = { assert(file.exists(), s"Non existent file: ${file.getPath}") assert(file.isFile, s"Cannot load a directory as a local table: ${file.getPath}") - println(s"Loading file(${file.getPath}) as $namespace.$tableName") + logger.info(s"Loading file(${file.getPath}) as $namespace.$tableName") if (!session.catalog.databaseExists(namespace)) session.sql(s"CREATE DATABASE $namespace") diff --git a/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala b/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala index e6ed12af4..39f8bc8f8 100644 --- a/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/LogFlattenerJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -50,6 +51,7 @@ class LogFlattenerJob(session: SparkSession, schemaTable: String, stepDays: Option[Int] = None) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) implicit val tableUtils: TableUtils = TableUtils(session) val joinTblProps: Map[String, String] = Option(joinConf.metaData.tableProperties) .map(_.toScala) @@ -69,13 +71,13 @@ class LogFlattenerJob(session: SparkSession, val ranges = unfilledRangeTry match { case Failure(_: AssertionError) => { - println(s""" + logger.info(s""" |The join name ${joinConf.metaData.nameToFilePath} does not have available logged data yet. |Please double check your logging status""".stripMargin) Seq() } case Success(None) => { - println( + logger.info( s"$outputTable seems to be caught up - to either " + s"$inputTable(latest ${tableUtils.lastAvailablePartition(inputTable)}) or $endDate.") Seq() @@ -196,7 +198,7 @@ class LogFlattenerJob(session: SparkSession, def buildLogTable(): Unit = { if (!joinConf.metaData.isSetSamplePercent) { - println(s"samplePercent is unset for ${joinConf.metaData.name}. Exit.") + logger.info(s"samplePercent is unset for ${joinConf.metaData.name}. 
Exit.") return } val unfilledRanges = getUnfilledRanges(logTable, joinConf.metaData.loggedTable) @@ -216,8 +218,8 @@ class LogFlattenerJob(session: SparkSession, val flattenedDf = flattenKeyValueBytes(rawDf, schemaMap) val schemaTblProps = buildTableProperties(schemaStringsMap) - println("======= Log table schema =======") - println(flattenedDf.schema.pretty) + logger.info("======= Log table schema =======") + logger.info(flattenedDf.schema.pretty) tableUtils.insertPartitions(flattenedDf, joinConf.metaData.loggedTable, tableProperties = @@ -237,7 +239,7 @@ class LogFlattenerJob(session: SparkSession, val failureCount = totalInputRowCount - totalOutputRowCount metrics.gauge(Metrics.Name.RowCount, totalOutputRowCount) metrics.gauge(Metrics.Name.FailureCount, failureCount) - println(s"Processed logs: ${totalOutputRowCount} rows success, ${failureCount} rows failed.") + logger.info(s"Processed logs: ${totalOutputRowCount} rows success, ${failureCount} rows failed.") metrics.gauge(Metrics.Name.ColumnBeforeCount, columnBeforeCount) metrics.gauge(Metrics.Name.ColumnAfterCount, columnAfterCount) val elapsedMins = (System.currentTimeMillis() - start) / 60000 diff --git a/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala b/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala index cae3b775b..02c00dd4d 100644 --- a/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala +++ b/spark/src/main/scala/ai/chronon/spark/MetadataExporter.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import java.io.{BufferedWriter, File, FileWriter} import ai.chronon.api import ai.chronon.api.{DataType, ThriftJsonCodec} @@ -28,6 +29,7 @@ import java.nio.file.Paths import scala.collection.immutable.Map object MetadataExporter { + private val logger = LoggerFactory.getLogger(getClass) val GROUPBY_PATH_SUFFIX = "/group_bys" val JOIN_PATH_SUFFIX = "/joins" @@ -63,7 +65,7 @@ object MetadataExporter { } } catch { case exception: Throwable => - println(s"Exception while processing entity $path: ${ExceptionUtils.getStackTrace(exception)}") + logger.info(s"Exception while processing entity $path: ${ExceptionUtils.getStackTrace(exception)}") configData } mapper.writeValueAsString(enrichedData) @@ -76,7 +78,7 @@ object MetadataExporter { val writer = new BufferedWriter(new FileWriter(file)) writer.write(data) writer.close() - println(s"${path} : Wrote to output directory successfully") + logger.info(s"${path} : Wrote to output directory successfully") } def processEntities(inputPath: String, outputPath: String, suffix: String): Unit = { @@ -90,7 +92,7 @@ object MetadataExporter { } } val failuresAndTraces = processSuccess.filter(!_._2) - println( + logger.info( s"Successfully processed ${processSuccess.filter(_._2).length} from $suffix \n " + s"Failed to process ${failuresAndTraces.length}: \n ${failuresAndTraces.mkString("\n")}") } diff --git a/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala b/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala index 6a3aa3678..64b7c7372 100644 --- a/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala +++ b/spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import org.apache.spark.sql.SparkSession import org.apache.spark.SPARK_VERSION @@ -25,6 +26,7 @@ import scala.reflect.io.Path import scala.util.Properties object SparkSessionBuilder { + private val logger = LoggerFactory.getLogger(getClass) val DefaultWarehouseDir = 
new File("/tmp/chronon/spark-warehouse") @@ -67,7 +69,7 @@ object SparkSessionBuilder { } val builder = if (local) { - println(s"Building local spark session with warehouse at $warehouseDir") + logger.info(s"Building local spark session with warehouse at $warehouseDir") val metastoreDb = s"jdbc:derby:;databaseName=$warehouseDir/metastore_db;create=true" baseBuilder // use all threads - or the tests will be slow diff --git a/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala b/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala index a19dfb455..ba79a188a 100644 --- a/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala +++ b/spark/src/main/scala/ai/chronon/spark/StagingQuery.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.ParametricMacro import ai.chronon.api.Extensions._ @@ -25,6 +26,7 @@ import scala.collection.mutable import scala.util.ScalaJavaConversions._ class StagingQuery(stagingQueryConf: api.StagingQuery, endPartition: String, tableUtils: TableUtils) { + private val logger = LoggerFactory.getLogger(getClass) assert(Option(stagingQueryConf.metaData.outputNamespace).nonEmpty, s"output namespace could not be empty or null") private val outputTable = stagingQueryConf.metaData.outputTable private val tableProps = Option(stagingQueryConf.metaData.tableProperties) @@ -48,31 +50,31 @@ class StagingQuery(stagingQueryConf: api.StagingQuery, endPartition: String, tab tableUtils.unfilledRanges(outputTable, PartitionRange(stagingQueryConf.startPartition, endPartition)(tableUtils)) if (unfilledRanges.isEmpty) { - println(s"""No unfilled range for $outputTable given + logger.info(s"""No unfilled range for $outputTable given |start partition of ${stagingQueryConf.startPartition} |end partition of $endPartition |""".stripMargin) return } val stagingQueryUnfilledRanges = unfilledRanges.get - println(s"Staging Query unfilled ranges: $stagingQueryUnfilledRanges") + logger.info(s"Staging Query unfilled ranges: $stagingQueryUnfilledRanges") val exceptions = mutable.Buffer.empty[String] stagingQueryUnfilledRanges.foreach { stagingQueryUnfilledRange => try { val stepRanges = stepDays.map(stagingQueryUnfilledRange.steps).getOrElse(Seq(stagingQueryUnfilledRange)) - println(s"Staging query ranges to compute: ${stepRanges.map { _.toString }.pretty}") + logger.info(s"Staging query ranges to compute: ${stepRanges.map { _.toString }.pretty}") stepRanges.zipWithIndex.foreach { case (range, index) => val progress = s"| [${index + 1}/${stepRanges.size}]" - println(s"Computing staging query for range: $range $progress") + logger.info(s"Computing staging query for range: $range $progress") val renderedQuery = StagingQuery.substitute(tableUtils, stagingQueryConf.query, range.start, range.end, endPartition) - println(s"Rendered Staging Query to run is:\n$renderedQuery") + logger.info(s"Rendered Staging Query to run is:\n$renderedQuery") val df = tableUtils.sql(renderedQuery) tableUtils.insertPartitions(df, outputTable, tableProps, partitionCols, autoExpand = enableAutoExpand.get) - println(s"Wrote to table $outputTable, into partitions: $range $progress") + logger.info(s"Wrote to table $outputTable, into partitions: $range $progress") } - println(s"Finished writing Staging Query data to $outputTable") + logger.info(s"Finished writing Staging Query data to $outputTable") } catch { case err: Throwable => exceptions.append(s"Error handling range $stagingQueryUnfilledRange : ${err.getMessage}\n${err.traceString}") diff --git 
a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala index 22a82084e..71a2c6b6d 100644 --- a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala +++ b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala @@ -16,6 +16,7 @@ package ai.chronon.spark +import org.slf4j.LoggerFactory import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.api.{Constants, PartitionSpec} import ai.chronon.api.Extensions._ @@ -161,10 +162,10 @@ case class TableUtils(sparkSession: SparkSession) { sparkSession.read.format("iceberg").load(tableName) } match { case Success(_) => - println(s"IcebergCheck: Detected iceberg formatted table $tableName.") + logger.info(s"IcebergCheck: Detected iceberg formatted table $tableName.") true case _ => - println(s"IcebergCheck: Checked table $tableName is not iceberg format.") + logger.info(s"IcebergCheck: Checked table $tableName is not iceberg format.") false } @@ -231,7 +232,7 @@ case class TableUtils(sparkSession: SparkSession) { def checkTablePermission(tableName: String, fallbackPartition: String = partitionSpec.before(partitionSpec.at(System.currentTimeMillis()))): Boolean = { - println(s"Checking permission for table $tableName...") + logger.info(s"Checking permission for table $tableName...") try { // retrieve one row from the table val partitionFilter = lastAvailablePartition(tableName).getOrElse(fallbackPartition) @@ -240,14 +241,14 @@ case class TableUtils(sparkSession: SparkSession) { } catch { case e: RuntimeException => if (e.getMessage.contains("ACCESS DENIED")) - println(s"[Error] No access to table: $tableName ") + logger.info(s"[Error] No access to table: $tableName ") else { - println(s"[Error] Encountered exception when reading table: $tableName.") + logger.info(s"[Error] Encountered exception when reading table: $tableName.") e.printStackTrace() } false case ex: Exception => - println(s"[Error] Encountered exception when reading table: $tableName.") + logger.info(s"[Error] Encountered exception when reading table: $tableName.") ex.printStackTrace() true } @@ -284,7 +285,7 @@ case class TableUtils(sparkSession: SparkSession) { sql(creationSql) } catch { case e: Exception => - println(s"Failed to create table $tableName with error: ${e.getMessage}") + logger.info(s"Failed to create table $tableName with error: ${e.getMessage}") throw e } } else { @@ -318,7 +319,7 @@ case class TableUtils(sparkSession: SparkSession) { def sql(query: String): DataFrame = { val partitionCount = sparkSession.sparkContext.getConf.getInt("spark.default.parallelism", 1000) - println( + logger.info( s"\n----[Running query coalesced into at most $partitionCount partitions]----\n$query\n----[End of Query]----\n") val df = sparkSession.sql(query).coalesce(partitionCount) df @@ -354,13 +355,13 @@ case class TableUtils(sparkSession: SparkSession) { def wrapWithCache[T](opString: String, dataFrame: DataFrame)(func: => T): Try[T] = { val start = System.currentTimeMillis() cacheLevel.foreach { level => - println(s"Starting to cache dataframe before $opString - start @ ${TsUtils.toStr(start)}") + logger.info(s"Starting to cache dataframe before $opString - start @ ${TsUtils.toStr(start)}") dataFrame.persist(level) } def clear(): Unit = { cacheLevel.foreach(_ => dataFrame.unpersist(blockingCacheEviction)) val end = System.currentTimeMillis() - println( + logger.info( s"Cleared the dataframe cache after $opString - start @ ${TsUtils.toStr(start)} end @ ${TsUtils.toStr(end)}") } Try { @@ -400,7 +401,7 @@ case class 
TableUtils(sparkSession: SparkSession) { (df.count(), 1) } - println(s"$rowCount rows requested to be written into table $tableName") + logger.info(s"$rowCount rows requested to be written into table $tableName") if (rowCount > 0) { val columnSizeEstimate = columnSizeEstimator(df.schema) @@ -428,7 +429,7 @@ case class TableUtils(sparkSession: SparkSession) { .flatMap(value => if (value > 0) Some(value) else None) if (outputParallelism.isDefined) { - println(s"Using custom outputParallelism ${outputParallelism.get}") + logger.info(s"Using custom outputParallelism ${outputParallelism.get}") } val dailyFileCount = outputParallelism.getOrElse(dailyFileCountBounded) @@ -437,7 +438,7 @@ case class TableUtils(sparkSession: SparkSession) { val saltCol = "random_partition_salt" val saltedDf = df.withColumn(saltCol, round(rand() * (dailyFileCount + 1))) - println( + logger.info( s"repartitioning data for table $tableName by $shuffleParallelism spark tasks into $tablePartitionCount table partitions and $dailyFileCount files per partition") val repartitionCols = if (df.schema.fieldNames.contains(partitionColumn)) { @@ -449,7 +450,7 @@ case class TableUtils(sparkSession: SparkSession) { .write .mode(saveMode) .insertInto(tableName) - println(s"Finished writing to $tableName") + logger.info(s"Finished writing to $tableName") } } @@ -558,7 +559,7 @@ case class TableUtils(sparkSession: SparkSession) { val inputMissing = fillablePartitions -- allInputExisting val missingPartitions = outputMissing -- inputMissing val missingChunks = chunk(missingPartitions) - println(s""" + logger.info(s""" |Unfilled range computation: | Output table: $outputTable | Missing output partitions: ${outputMissing.toSeq.sorted.prettyInline} @@ -582,14 +583,14 @@ case class TableUtils(sparkSession: SparkSession) { def dropTableIfExists(tableName: String): Unit = { val command = s"DROP TABLE IF EXISTS $tableName" - println(s"Dropping table with command: $command") + logger.info(s"Dropping table with command: $command") sql(command) } def archiveOrDropTableIfExists(tableName: String, timestamp: Option[Instant]): Unit = { val archiveTry = Try(archiveTableIfExists(tableName, timestamp)) archiveTry.failed.foreach { e => - println(s"""Fail to archive table ${tableName} + logger.info(s"""Fail to archive table ${tableName} |${e.getMessage} |Proceed to dropping the table instead. 
|""".stripMargin) @@ -602,7 +603,7 @@ case class TableUtils(sparkSession: SparkSession) { val humanReadableTimestamp = archiveTimestampFormatter.format(timestamp.getOrElse(Instant.now())) val finalArchiveTableName = s"${tableName}_${humanReadableTimestamp}" val command = s"ALTER TABLE $tableName RENAME TO $finalArchiveTableName" - println(s"Archiving table with command: $command") + logger.info(s"Archiving table with command: $command") sql(command) } } @@ -624,7 +625,7 @@ case class TableUtils(sparkSession: SparkSession) { val earliestHoleOpt = (inputPartitions -- outputPartitions).reduceLeftOption(Ordering[String].min) earliestHoleOpt.foreach { hole => val toDrop = outputPartitions.filter(_ > hole) - println(s""" + logger.info(s""" |Earliest hole at $hole in output table $outputTable, relative to $inputTable |Input Parts : ${inputPartitions.toArray.sorted.mkString("Array(", ", ", ")")} |Output Parts : ${outputPartitions.toArray.sorted.mkString("Array(", ", ", ")")} @@ -653,7 +654,7 @@ case class TableUtils(sparkSession: SparkSession) { val dropSql = s"ALTER TABLE $tableName DROP IF EXISTS $partitionSpecs" sql(dropSql) } else { - println(s"$tableName doesn't exist, please double check before drop partitions") + logger.info(s"$tableName doesn't exist, please double check before drop partitions") } } @@ -665,7 +666,7 @@ case class TableUtils(sparkSession: SparkSession) { val toDrop = Stream.iterate(startDate)(partitionSpec.after).takeWhile(_ <= endDate) dropPartitions(tableName, toDrop, partitionColumn, subPartitionFilters) } else { - println(s"$tableName doesn't exist, please double check before drop partitions") + logger.info(s"$tableName doesn't exist, please double check before drop partitions") } } @@ -728,7 +729,7 @@ case class TableUtils(sparkSession: SparkSession) { if (excludedFields.nonEmpty) { val excludedFieldsStr = excludedFields.map(tuple => s"columnName: ${tuple._1} dataType: ${tuple._2.dataType.catalogString}") - println( + logger.info( s"""Warning. Detected columns that exist in Hive table but not in updated schema. These are ignored in DDL. |${excludedFieldsStr.mkString("\n")} |""".stripMargin) diff --git a/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala index 0ae31c525..78ec9aaae 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/CompareBaseJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.stats +import org.slf4j.LoggerFactory import ai.chronon.api._ import ai.chronon.online.{SparkConversions, _} import ai.chronon.spark.Extensions._ @@ -26,6 +27,7 @@ import org.apache.spark.sql.types.DataType import scala.collection.mutable.ListBuffer object CompareBaseJob { + private val logger = LoggerFactory.getLogger(getClass) def checkConsistency( leftFields: Map[String, DataType], @@ -132,10 +134,10 @@ object CompareBaseJob { } else { leftDf } - println(s"Pruning fields from the left source for equivalent comparison - ${prunedColumns.mkString(",")}") + logger.info(s"Pruning fields from the left source for equivalent comparison - ${prunedColumns.mkString(",")}") // 3. 
Build comparison dataframe - println(s"""Join keys: ${keys.mkString(", ")} + logger.info(s"""Join keys: ${keys.mkString(", ")} |Left Schema: |${prunedLeftDf.schema.pretty} | diff --git a/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala index 87f7ba809..5e20f59b7 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/CompareJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.stats +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.{Constants, PartitionSpec} import ai.chronon.api.DataModel.Events @@ -38,6 +39,7 @@ class CompareJob( startDate: String, endDate: String ) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) val tableProps: Map[String, String] = Option(joinConf.metaData.tableProperties) .map(_.toScala) .orNull @@ -68,21 +70,21 @@ class CompareJob( CompareBaseJob.compare(leftDf, rightDf, getJoinKeys(joinConf, tableUtils), tableUtils, migrationCheck = true) // Save the comparison table - println("Saving comparison output..") - println(s"Comparison schema ${compareDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") + logger.info("Saving comparison output..") + logger.info(s"Comparison schema ${compareDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") tableUtils.insertUnPartitioned(compareDf, comparisonTableName, tableProps, saveMode = SaveMode.Overwrite) // Save the metrics table - println("Saving metrics output..") + logger.info("Saving metrics output..") val metricsDf = metricsTimedKvRdd.toFlatDf - println(s"Metrics schema ${metricsDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") + logger.info(s"Metrics schema ${metricsDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") tableUtils.insertUnPartitioned(metricsDf, metricsTableName, tableProps, saveMode = SaveMode.Overwrite) - println("Printing basic comparison results..") - println("(Note: This is just an estimation and not a detailed analysis of results)") + logger.info("Printing basic comparison results..") + logger.info("(Note: This is just an estimation and not a detailed analysis of results)") CompareJob.printAndGetBasicMetrics(metrics, tableUtils.partitionSpec) - println("Finished compare stats.") + logger.info("Finished compare stats.") (compareDf, metricsDf, metrics) } @@ -150,13 +152,13 @@ object CompareJob { val consolidatedData = getConsolidatedData(metrics, partitionSpec) if (consolidatedData.size == 0) { - println( + logger.info( s"No discrepancies found for data mismatches and missing counts. 
" + s"It is highly recommended to explore the full metrics.") } else { consolidatedData.foreach { case (date, mismatchCount) => - println(s"Found ${mismatchCount} mismatches on date '${date}'") + logger.info(s"Found ${mismatchCount} mismatches on date '${date}'") } } consolidatedData diff --git a/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala index 2b7e81521..85159a53b 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/ConsistencyJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.stats +import org.slf4j.LoggerFactory import ai.chronon import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -28,6 +29,7 @@ import java.util import scala.util.ScalaJavaConversions.{JListOps, ListOps, MapOps} class ConsistencyJob(session: SparkSession, joinConf: Join, endDate: String) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) val tblProperties: Map[String, String] = Option(joinConf.metaData.tableProperties) .map(_.toScala) @@ -36,7 +38,7 @@ class ConsistencyJob(session: SparkSession, joinConf: Join, endDate: String) ext // Replace join's left side with the logged table events to determine offline values of the aggregations. private def buildComparisonJoin(): Join = { - println("Building Join With left as logged") + logger.info("Building Join With left as logged") val copiedJoin = joinConf.deepCopy() val loggedSource: Source = new Source() val loggedEvents: EventSource = new EventSource() @@ -76,26 +78,26 @@ class ConsistencyJob(session: SparkSession, joinConf: Join, endDate: String) ext .getOrElse(Seq.empty) if (unfilledRanges.isEmpty) return val join = new chronon.spark.Join(buildComparisonJoin(), unfilledRanges.last.end, TableUtils(session)) - println("Starting compute Join for comparison table") + logger.info("Starting compute Join for comparison table") val compareDf = join.computeJoin(Some(30)) - println("======= side-by-side comparison schema =======") - println(compareDf.schema.pretty) + logger.info("======= side-by-side comparison schema =======") + logger.info(compareDf.schema.pretty) } def buildConsistencyMetrics(): DataMetrics = { // migrate legacy configs without consistencySamplePercent param if (!joinConf.metaData.isSetConsistencySamplePercent) { - println("consistencySamplePercent is unset and will default to 100") + logger.info("consistencySamplePercent is unset and will default to 100") joinConf.metaData.consistencySamplePercent = 100 } if (joinConf.metaData.consistencySamplePercent == 0) { - println(s"Exit ConsistencyJob because consistencySamplePercent = 0 for join conf ${joinConf.metaData.name}") + logger.info(s"Exit ConsistencyJob because consistencySamplePercent = 0 for join conf ${joinConf.metaData.name}") return DataMetrics(Seq()) } buildComparisonTable() - println("Determining Range between consistency table and comparison table") + logger.info("Determining Range between consistency table and comparison table") val unfilledRanges = tableUtils .unfilledRanges(joinConf.metaData.consistencyTable, PartitionRange(null, endDate), @@ -108,22 +110,22 @@ class ConsistencyJob(session: SparkSession, joinConf: Join, endDate: String) ext tableUtils.sql(unfilled.genScanQuery(null, joinConf.metaData.loggedTable)).drop(Constants.SchemaHash) // there could be external columns that are logged during online env, therefore they could not be used for computing OOC val loggedDfNoExternalCols = 
loggedDf.select(comparisonDf.columns.map(org.apache.spark.sql.functions.col): _*) - println("Starting compare job for stats") + logger.info("Starting compare job for stats") val joinKeys = if (joinConf.isSetRowIds) { joinConf.rowIds.toScala } else { JoinCodec.timeFields.map(_.name).toList ++ joinConf.leftKeyCols } - println(s"Using ${joinKeys.mkString("[", ",", "]")} as join keys between log and backfill.") + logger.info(s"Using ${joinKeys.mkString("[", ",", "]")} as join keys between log and backfill.") val (compareDf, metricsKvRdd, metrics) = CompareBaseJob.compare(comparisonDf, loggedDfNoExternalCols, keys = joinKeys, tableUtils, name = joinConf.metaData.nameToFilePath) - println("Saving output.") + logger.info("Saving output.") val outputDf = metricsKvRdd.toFlatDf.withTimeBasedColumn("ds") - println(s"output schema ${outputDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") + logger.info(s"output schema ${outputDf.schema.fields.map(sb => (sb.name, sb.dataType)).toMap.mkString("\n - ")}") tableUtils.insertPartitions(outputDf, joinConf.metaData.consistencyTable, tableProperties = tblProperties, diff --git a/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala b/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala index f6fdca014..175b096fd 100644 --- a/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala +++ b/spark/src/main/scala/ai/chronon/spark/stats/SummaryJob.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.stats +import org.slf4j.LoggerFactory import ai.chronon.online.SparkConversions import ai.chronon.aggregator.row.StatsGenerator import ai.chronon.api.Extensions._ @@ -31,6 +32,7 @@ import org.apache.spark.sql.SparkSession * Follow pattern of OOC for computing offline and uploading online as well. */ class SummaryJob(session: SparkSession, joinConf: Join, endDate: String) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) val tableUtils: TableUtils = TableUtils(session) private val loggingStatsTable = joinConf.metaData.loggingStatsTable @@ -50,17 +52,17 @@ class SummaryJob(session: SparkSession, joinConf: Join, endDate: String) extends .unfilledRanges(outputTable, PartitionRange(null, endDate)(tableUtils), Some(Seq(inputTable))) .getOrElse(Seq.empty) if (unfilledRanges.isEmpty) { - println(s"No data to compute for $outputTable") + logger.info(s"No data to compute for $outputTable") return } unfilledRanges.foreach { computeRange => - println(s"Daily output statistics table $outputTable unfilled range: $computeRange") + logger.info(s"Daily output statistics table $outputTable unfilled range: $computeRange") val stepRanges = stepDays.map(computeRange.steps).getOrElse(Seq(computeRange)) - println(s"Ranges to compute: ${stepRanges.map(_.toString).pretty}") + logger.info(s"Ranges to compute: ${stepRanges.map(_.toString).pretty}") // We are going to build the aggregator to denormalize sketches for hive. 
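The conversion applied across these files follows one recurring shape, sketched below with illustrative names only (SummaryJobLike, run, outputTable and ranges are not from the codebase). The interpolated s"..." form used throughout this diff builds the message string eagerly; SLF4J's {} placeholders are an alternative that defers formatting until INFO is known to be enabled.

import org.slf4j.LoggerFactory

class SummaryJobLike {
  // one logger per class, mirroring the declarations added in this diff
  private val logger = LoggerFactory.getLogger(getClass)

  def run(outputTable: String, ranges: Seq[String]): Unit = {
    if (ranges.isEmpty) {
      // interpolated message, evaluated eagerly (the style used in this diff)
      logger.info(s"No data to compute for $outputTable")
      return
    }
    // parameterized message, formatted only if INFO is enabled
    logger.info("Ranges to compute: {}", ranges.mkString(", "))
  }
}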
stepRanges.zipWithIndex.foreach { case (range, index) => - println(s"Computing range [${index + 1}/${stepRanges.size}]: $range") + logger.info(s"Computing range [${index + 1}/${stepRanges.size}]: $range") val joinOutputDf = tableUtils.sql(s""" |SELECT * |FROM ${inputTable} @@ -82,10 +84,10 @@ class SummaryJob(session: SparkSession, joinConf: Join, endDate: String) extends stats .addDerivedMetrics(summaryKvRdd.toFlatDf, aggregator) .save(outputTable, tableProps) - println(s"Finished range [${index + 1}/${stepRanges.size}].") + logger.info(s"Finished range [${index + 1}/${stepRanges.size}].") } } - println("Finished writing stats.") + logger.info("Finished writing stats.") } /** diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala b/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala index b1f0f911d..f3fc5cf0a 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/GroupBy.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon import ai.chronon.api import ai.chronon.api.{Row => _, _} @@ -40,6 +41,7 @@ class GroupBy(inputStream: DataFrame, onlineImpl: Api, debug: Boolean = false) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) private def buildStreamingQuery(inputTable: String): String = { val streamingSource = groupByConf.streamingSource.get @@ -102,7 +104,7 @@ class GroupBy(inputStream: DataFrame, streamDecoder.decode(arr) } catch { case ex: Throwable => - println( + logger.info( s"Error while decoding streaming events for ${groupByConf.getMetaData.getName} with " + s"schema ${streamDecoder.schema.catalogString}" + s" \n${ex.traceString}") @@ -114,7 +116,7 @@ class GroupBy(inputStream: DataFrame, mutation != null && (!(mutation.before != null && mutation.after != null) || !(mutation.before sameElements mutation.after))) val streamSchema = SparkConversions.fromChrononSchema(streamDecoder.schema) - println(s""" + logger.info(s""" | group by serving info: $groupByServingInfo | Streaming source: $streamingSource | streaming Query: $streamingQuery @@ -171,7 +173,7 @@ class GroupBy(inputStream: DataFrame, val gson = new Gson() val formatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.from(ZoneOffset.UTC)) val pstFormatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.of("America/Los_Angeles")) - println(s""" + logger.info(s""" |streaming dataset: $streamingDataset |keys: ${gson.toJson(keys)} |values: ${gson.toJson(values)} diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala index 57ecef9be..cf460c093 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/JoinSourceRunner.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon.api import ai.chronon.api.Extensions.{GroupByOps, SourceOps} import ai.chronon.api._ @@ -41,6 +42,7 @@ import scala.util.ScalaJavaConversions.{IteratorOps, JIteratorOps, ListOps, MapO // micro batching destroys and re-creates these objects repeatedly through ForeachBatchWriter and MapFunction // this allows for re-use object LocalIOCache { + private val logger = LoggerFactory.getLogger(getClass) private var fetcher: Fetcher = null private var kvStore: KVStore = null def getOrSetFetcher(builderFunc: () => Fetcher): Fetcher = { @@ 
-63,6 +65,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map session: SparkSession, apiImpl: Api) extends Serializable { + private val logger = LoggerFactory.getLogger(getClass) val context: Metrics.Context = Metrics.Context(Metrics.Environment.GroupByStreaming, groupByConf) @@ -131,7 +134,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val gson = new Gson() val formatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.from(ZoneOffset.UTC)) val pstFormatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.of("America/Los_Angeles")) - println(s""" + logger.info(s""" |dataset: $streamingDataset |keys: ${gson.toJson(keys)} |values: ${gson.toJson(values)} @@ -194,7 +197,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map // GroupBy -> JoinSource (Join + outer_query) // Join -> // Join.left -> (left.(table, mutation_stream, etc) + inner_query) - println(s""" + logger.info(s""" |Schemas across chain of transformations |leftSchema: | ${leftSchema.catalogString} @@ -230,7 +233,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map streamDecoder.decode(arr) } catch { case ex: Throwable => - println(s"Error while decoding streaming events from stream: ${dataStream.topicInfo.name}") + logger.info(s"Error while decoding streaming events from stream: ${dataStream.topicInfo.name}") ex.printStackTrace() ingressContext.incrementException(ex) null @@ -242,7 +245,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map (mutation != null) && (!bothNull || !bothSame) } val streamSchema = SparkConversions.fromChrononSchema(streamDecoder.schema) - println(s""" + logger.info(s""" | streaming source: ${groupByConf.streamingSource.get} | streaming dataset: ${groupByConf.streamingDataset} | stream schema: ${streamSchema.catalogString} @@ -293,7 +296,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map def applyQuery(df: DataFrame, query: api.Query): DataFrame = { val queryParts = groupByConf.buildQueryParts(query) - println(s""" + logger.info(s""" |decoded schema: ${decoded.df.schema.catalogString} |queryParts: $queryParts |df schema: ${df.schema.prettyJson} @@ -307,7 +310,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val leftSource: Dataset[Row] = applyQuery(decoded.df, left.query) // key format joins//join_name val joinRequestName = joinSource.join.metaData.getName.replaceFirst("\\.", "/") - println(s"Upstream join request name: $joinRequestName") + logger.info(s"Upstream join request name: $joinRequestName") val tableUtils = TableUtils(session) // the decoded schema is in lower case @@ -329,7 +332,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val joinEncoder: Encoder[Row] = RowEncoder(schemas.joinSchema) val joinFields = schemas.joinSchema.fieldNames val leftColumns = schemas.leftSourceSchema.fieldNames - println(s""" + logger.info(s""" |left columns ${leftColumns.mkString(",")} |reqColumns ${reqColumns.mkString(",")} |Fetching upstream join to enrich the stream... Fetching lag time: $lagMillis @@ -342,7 +345,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map override def call(rows: util.Iterator[Row]): util.Iterator[Row] = { val shouldSample = Math.random() <= 0.1 val fetcher = LocalIOCache.getOrSetFetcher { () => - println(s"Initializing Fetcher. 
${System.currentTimeMillis()}") + logger.info(s"Initializing Fetcher. ${System.currentTimeMillis()}") context.increment("chain.fetcher.init") apiImpl.buildFetcher(debug = debug) } @@ -370,7 +373,7 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map } if (debug && shouldSample) { - requests.foreach(request => println(s"request: ${request.keys}, ts: ${request.atMillis}")) + requests.foreach(request => logger.info(s"request: ${request.keys}, ts: ${request.atMillis}")) } val responsesFuture = fetcher.fetchJoin(requests = requests.toSeq) @@ -379,9 +382,9 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val responses = Await.result(responsesFuture, 5.second) if (debug && shouldSample) { - println(s"responses/request size: ${responses.size}/${requests.size}\n responses: ${responses}") + logger.info(s"responses/request size: ${responses.size}/${requests.size}\n responses: ${responses}") responses.foreach(response => - println( + logger.info( s"request: ${response.request.keys}, ts: ${response.request.atMillis}, values: ${response.values}")) } responses.iterator.map { response => @@ -419,8 +422,8 @@ class JoinSourceRunner(groupByConf: api.GroupBy, conf: Map[String, String] = Map val data = df.collect() val putRequests = data.map(putRequestHelper.toPutRequest) if (debug) { - println(s" Final df size to write: ${data.length}") - println(s" Size of putRequests to kv store- ${putRequests.length}") + logger.info(s" Final df size to write: ${data.length}") + logger.info(s" Size of putRequests to kv store- ${putRequests.length}") } else { putRequests.foreach(request => emitRequestMetric(request, context.withSuffix("egress"))) kvStore.multiPut(putRequests) diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala b/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala index 8f205cfcf..35515a9f5 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/KafkaStreamBuilder.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon.online.{DataStream, StreamBuilder, TopicInfo} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.StreamingQueryListener @@ -26,21 +27,22 @@ import org.apache.spark.sql.streaming.StreamingQueryListener.{ } object KafkaStreamBuilder extends StreamBuilder { + private val logger = LoggerFactory.getLogger(getClass) override def from(topicInfo: TopicInfo)(implicit session: SparkSession, conf: Map[String, String]): DataStream = { val conf = topicInfo.params val bootstrap = conf.getOrElse("bootstrap", conf("host") + conf.get("port").map(":" + _).getOrElse("")) TopicChecker.topicShouldExist(topicInfo.name, bootstrap) session.streams.addListener(new StreamingQueryListener() { override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = { - println("Query started: " + queryStarted.id) + logger.info("Query started: " + queryStarted.id) } override def onQueryTerminated(queryTerminated: QueryTerminatedEvent): Unit = { - println("Query terminated: " + queryTerminated.id) + logger.info("Query terminated: " + queryTerminated.id) } override def onQueryProgress(queryProgress: QueryProgressEvent): Unit = { - println("Query made progress: " + queryProgress.progress) + logger.info("Query made progress: " + queryProgress.progress) } }) val df = session.readStream diff --git 
a/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala b/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala index 23d200446..ea621d89a 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/StreamingStats.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon.online.KVStore.PutRequest import com.yahoo.sketches.kll.KllFloatsSketch import org.apache.commons.io.FileUtils @@ -24,6 +25,7 @@ import java.time.format.DateTimeFormatter import java.time.{Instant, ZoneId, ZoneOffset} class StreamingStats(val publishDelaySeconds: Int) { + private val logger = LoggerFactory.getLogger(getClass) private var latencyHistogram: KllFloatsSketch = new KllFloatsSketch() private var latencyMsTotal: Long = 0 private var writesTotal: Long = 0 @@ -43,7 +45,7 @@ class StreamingStats(val publishDelaySeconds: Int) { val medianLatency = latencyHistogram.getQuantile(.5) val p95Latency = latencyHistogram.getQuantile(.95) val p99Latency = latencyHistogram.getQuantile(.99) - println(s""" + logger.info(s""" |[$threadName][${timeString(utc, now)}] Wrote $writesTotal records in last ${now - startMs} ms. | Latency ms: ${latencyMsTotal / writesTotal} (avg) / $medianLatency (median) / $p95Latency (p95) / $p99Latency (p99) | Key Size: ${keyBytesTotal / writesTotal} bytes (avg) / ${readable(keyBytesTotal)} (total) @@ -56,7 +58,7 @@ class StreamingStats(val publishDelaySeconds: Int) { latencyHistogram = new KllFloatsSketch() startMs = now } else { - println("No writes registered") + logger.info("No writes registered") } } diff --git a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala index d25004575..d99ac61e8 100644 --- a/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala +++ b/spark/src/main/scala/ai/chronon/spark/streaming/TopicChecker.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.streaming +import org.slf4j.LoggerFactory import ai.chronon.aggregator.base.BottomK import ai.chronon.api import ai.chronon.api.Extensions.{GroupByOps, SourceOps} @@ -34,6 +35,7 @@ import scala.reflect.ClassTag import scala.util.Try object TopicChecker { + private val logger = LoggerFactory.getLogger(getClass) def getPartitions(topic: String, bootstrap: String): Int = { val props = new Properties() @@ -76,7 +78,7 @@ object TopicChecker { | ------ End ------ |""".stripMargin) } else { - println(s"Found topic $topic in bootstrap $bootstrap.") + logger.info(s"Found topic $topic in bootstrap $bootstrap.") } } catch { case ex: Exception => throw new RuntimeException(s"Failed to check for topic ${topic} in ${bootstrap}", ex) @@ -84,6 +86,7 @@ object TopicChecker { } class Args(arguments: Seq[String]) extends ScallopConf(arguments) { + private val logger = LoggerFactory.getLogger(getClass) val conf: ScallopOption[String] = opt[String](descr = "Conf to pull topic and bootstrap server information") val bootstrap: ScallopOption[String] = opt[String](descr = "Kafka bootstrap server in host:port format") val topic: ScallopOption[String] = opt[String](descr = "kafka topic to check metadata for") @@ -106,7 +109,7 @@ object TopicChecker { } else { args.topic() -> args.bootstrap() } - println(getPartitions(topic, bootstrap)) + logger.info(getPartitions(topic, bootstrap).toString) System.exit(0) } } diff --git a/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala
b/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala index 8989f6d2b..98d3ac8af 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/AnalyzerTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api import ai.chronon.api.{Accuracy, Builders, Operation, TimeUnit, Window} @@ -26,6 +27,7 @@ import org.junit.Assert.assertTrue import org.junit.Test class AnalyzerTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("AnalyzerTest", local = true) private val tableUtils = TableUtils(spark) @@ -68,8 +70,8 @@ class AnalyzerTest { val join = new Join(joinConf = joinConf, endPartition = oneMonthAgo, tableUtils) val computed = join.computeJoin() val expectedSchema = computed.schema.fields.map(field => s"${field.name} => ${field.dataType}").sorted - println("=== expected schema =====") - println(expectedSchema.mkString("\n")) + logger.info("=== expected schema =====") + logger.info(expectedSchema.mkString("\n")) assertTrue(expectedSchema sameElements analyzerSchema) } diff --git a/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala b/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala index 5623d7823..99ee21fa1 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/ChainingFetcherTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.api import ai.chronon.api.Constants.ChrononMetadataKey @@ -35,11 +36,12 @@ import java.lang import java.util.TimeZone import java.util.concurrent.Executors import scala.collection.Seq -import scala.Console.println import scala.concurrent.ExecutionContext import scala.util.ScalaJavaConversions._ class ChainingFetcherTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) val sessionName = "ChainingFetcherTest" val spark: SparkSession = SparkSessionBuilder.build(sessionName, local = true) private val tableUtils = TableUtils(spark) @@ -103,7 +105,7 @@ class ChainingFetcherTest extends TestCase { .save(s"$namespace.${schema.name}") } - println("saved all data hand written for fetcher test") + logger.info("saved all data hand written for fetcher test") val startPartition = "2021-04-13" val endPartition = "2021-04-16" @@ -169,7 +171,7 @@ class ChainingFetcherTest extends TestCase { ).toList TestUtils.makeDf(spark, searchSchema, searchData).save(s"$namespace.${searchSchema.name}") - println("Created user search table.") + logger.info("Created user search table.") // construct chaining join val startPartition = "2021-04-14" @@ -217,7 +219,7 @@ class ChainingFetcherTest extends TestCase { val joinedDf = new ai.chronon.spark.Join(joinConf, endDs, tableUtils).computeJoin() val joinTable = s"$namespace.join_test_expected_${joinConf.metaData.cleanName}" joinedDf.save(joinTable) - println("=== Expected join table computed: === " + joinTable) + logger.info("=== Expected join table computed: === " + joinTable) joinedDf.show() val endDsExpected = tableUtils.sql(s"SELECT * FROM $joinTable WHERE ds='$endDs'") @@ -230,7 +232,7 @@ class ChainingFetcherTest extends TestCase { s"SELECT * FROM $joinTable WHERE ts >= unix_timestamp('$endDs', '${tableUtils.partitionSpec.format}')") } val
endDsQueries = endDsEvents.drop(endDsEvents.schema.fieldNames.filter(_.contains("fetcher")): _*) - println("Queries:") + logger.info("Queries:") endDsQueries.show() val keys = joinConf.leftKeyCols @@ -268,7 +270,7 @@ class ChainingFetcherTest extends TestCase { .asInstanceOf[GenericRow] } - println(endDsExpected.schema.pretty) + logger.info(endDsExpected.schema.pretty) (endDsExpected, responseRows) } @@ -286,9 +288,9 @@ class ChainingFetcherTest extends TestCase { if (endDs != today) { responseDf = responseDf.drop("ds").withColumn("ds", lit(endDs)) } - println("expected:") + logger.info("expected:") expectedDf.show() - println("response:") + logger.info("response:") responseDf.show() // remove user during comparison since `user` is not the key @@ -299,9 +301,9 @@ class ChainingFetcherTest extends TestCase { bName = "offline") assertEquals(expectedDf.count(), responseDf.count()) if (diff.count() > 0) { - println(s"Total count: ${responseDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows:") + logger.info(s"Total count: ${responseDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows:") diff .withTimeBasedColumn("ts_string", "ts", "yy-MM-dd HH:mm") .select("ts_string", diff.schema.fieldNames: _*) diff --git a/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala b/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala index 317320f14..ba02f7335 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/CompareTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.online.DataMetrics import ai.chronon.spark.stats.CompareBaseJob @@ -24,6 +25,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import org.junit.Test class CompareTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("CompareTest", local = true) private val tableUtils = TableUtils(spark) @@ -62,7 +64,7 @@ class CompareTest { CompareBaseJob.compare(leftDf, rightDf, keys, tableUtils) val metricsDf = metricsKvRdd.toFlatDf metricsDf.show() - println(result) + logger.info(result.toString) assert(result.series.length == 4, "Invalid result length") for (rowIndex <- 0 until leftData.length) { for ((colName, index) <- leftColumns.zipWithIndex) { @@ -99,7 +101,7 @@ class CompareTest { ) val metricsDf = metricsKvRdd.toFlatDf metricsDf.show() - println(result) + logger.info(result.toString) assert(result.series.length == 4, "Invalid result length") for (rowIndex <- 0 until leftData.length) { for ((colName, index) <- leftColumns.zipWithIndex) { diff --git a/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala b/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala index 9bad4e3d7..c05aa74b9 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/FeatureWithLabelJoinTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.{LabelPartOps, MetadataOps} import ai.chronon.api.{Builders, LongType, StringType, StructField, StructType} import ai.chronon.spark.{Comparison, LabelJoin, SparkSessionBuilder, TableUtils} @@ -25,6 +26,7 @@ import org.junit.Assert.assertEquals import org.junit.Test class FeatureWithLabelJoinTest { + private val logger =
LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("FeatureWithLabelJoinTest", local = true) private val namespace = "final_join" @@ -52,11 +54,11 @@ class FeatureWithLabelJoinTest { val runner = new LabelJoin(joinConf, tableUtils, labelDS) val labelDf = runner.computeLabelJoin() - println(" == First Run Label version 2022-10-30 == ") + logger.info(" == First Run Label version 2022-10-30 == ") prefixColumnName(labelDf, exceptions = labelJoinConf.rowIdentifier(null, tableUtils.partitionColumn)) .show() val featureDf = tableUtils.sparkSession.table(joinConf.metaData.outputTable) - println(" == Features == ") + logger.info(" == Features == ") featureDf.show() val computed = tableUtils.sql(s"select * from ${joinConf.metaData.outputFinalView}") val expectedFinal = featureDf.join(prefixColumnName(labelDf, @@ -69,7 +71,7 @@ class FeatureWithLabelJoinTest { // add another label version val secondRun = new LabelJoin(joinConf, tableUtils, "2022-11-11") val secondLabel = secondRun.computeLabelJoin() - println(" == Second Run Label version 2022-11-11 == ") + logger.info(" == Second Run Label version 2022-11-11 == ") secondLabel.show() val view = tableUtils.sql(s"select * from ${joinConf.metaData.outputFinalView} order by label_ds") view.show() @@ -137,11 +139,11 @@ class FeatureWithLabelJoinTest { val runner = new LabelJoin(joinConf, tableUtils, "2022-10-06") val labelDf = runner.computeLabelJoin() - println(" == Label DF == ") + logger.info(" == Label DF == ") prefixColumnName(labelDf, exceptions = labelJoinConf.rowIdentifier(null, tableUtils.partitionColumn)) .show() val featureDf = tableUtils.sparkSession.table(joinConf.metaData.outputTable) - println(" == Features DF == ") + logger.info(" == Features DF == ") featureDf.show() val computed = tableUtils.sql(s"select * from ${joinConf.metaData.outputFinalView}") val expectedFinal = featureDf.join(prefixColumnName(labelDf, @@ -179,16 +181,16 @@ class FeatureWithLabelJoinTest { } private def assertResult(computed: DataFrame, expected: DataFrame): Unit = { - println(" == Computed == ") + logger.info(" == Computed == ") computed.show() - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() val diff = Comparison.sideBySide(computed, expected, List("listing", "ds", "label_ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) @@ -197,8 +199,8 @@ class FeatureWithLabelJoinTest { private def prefixColumnName(df: DataFrame, prefix: String = "label_", exceptions: Array[String] = null): DataFrame = { - println("exceptions") - println(exceptions.mkString(", ")) + logger.info("exceptions") + logger.info(exceptions.mkString(", ")) val renamedColumns = df.columns .map(col => { if (exceptions.contains(col) || col.startsWith(prefix)) { diff --git a/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala b/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala index b4d5648e3..3d4288d99 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/FetchStatsTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import 
ai.chronon.aggregator.test.Column import ai.chronon.api import ai.chronon.api.{Accuracy, Builders, Operation, TimeUnit, Window} @@ -46,6 +47,7 @@ import scala.concurrent.{Await, ExecutionContext} * Fetch stats. */ class FetchStatsTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("FetchStatsTest", local = true) val tableUtils = TableUtils(spark) @@ -138,17 +140,17 @@ class FetchStatsTest extends TestCase { // Stats fetchStatsSeries(request, mockApi, true) val fetchedSeries = fetchStatsSeries(request, mockApi) - println(gson.toJson(fetchedSeries.values.get)) + logger.info(gson.toJson(fetchedSeries.values.get)) // LogStats fetchLogStatsSeries(request, mockApi, true) val fetchedLogSeries = fetchLogStatsSeries(request, mockApi) - println(gson.toJson(fetchedLogSeries.values.get)) + logger.info(gson.toJson(fetchedLogSeries.values.get)) // Online Offline Consistency fetchOOCSeries(request, mockApi, true) val fetchedOOCSeries = fetchOOCSeries(request, mockApi) - println(gson.toJson(fetchedOOCSeries.values.get)) + logger.info(gson.toJson(fetchedOOCSeries.values.get)) // Appendix: Incremental run to check incremental updates for summary job. OnlineUtils.serveStats(tableUtils, inMemoryKvStore, today, joinConf) @@ -160,7 +162,7 @@ class FetchStatsTest extends TestCase { // Request drifts val driftRequest = StatsRequest(joinConf.metaData.nameToFilePath + "/drift", None, None) val fetchedDriftSeries = fetchStatsSeries(driftRequest, mockApi) - println(gson.toJson(fetchedDriftSeries.values.get)) + logger.info(gson.toJson(fetchedDriftSeries.values.get)) } def fetchStatsSeries(request: StatsRequest, diff --git a/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala b/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala index 9248df979..201e6241f 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.api @@ -38,7 +39,7 @@ import org.junit.Assert.{assertEquals, assertFalse, assertTrue} import java.lang import java.util.TimeZone import java.util.concurrent.Executors -import scala.Console.println import scala.collection.Seq import scala.compat.java8.FutureConverters import scala.concurrent.duration.{Duration, SECONDS} @@ -47,6 +48,7 @@ import scala.io.Source import scala.util.ScalaJavaConversions._ class FetcherTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) val sessionName = "FetcherTest" val spark: SparkSession = SparkSessionBuilder.build(sessionName, local = true) private val tableUtils = TableUtils(spark) @@ -183,7 +185,7 @@ class FetcherTest extends TestCase { .save(s"$namespace.${schema.name}") } - println("saved all data hand written for fetcher test") + logger.info("saved all data hand written for fetcher test") val startPartition = "2021-04-08" val endPartition = "2021-04-10" @@ -459,7 +461,7 @@ class FetcherTest extends TestCase { .withColumn("ts_lagged", laggedResponseDf.col("ts_millis") + lagMs) .withColumn("ts_millis", col("ts_lagged")) .drop("ts_lagged") - println("corrected lagged response") + logger.info("corrected lagged response") correctedLaggedResponse.show() correctedLaggedResponse.save(mockApi.logTable, partitionColumns = Seq(tableUtils.partitionColumn, "name"))
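These tests previously sent their diagnostic output (gson.toJson payloads, schema dumps) to stdout; with SLF4J that output only appears if a logging binding is on the test classpath and INFO is enabled. The fragment below is a hypothetical build.sbt sketch that assumes slf4j-simple is chosen as the test-time binding; the binding this repository actually uses is not visible in this diff, and the version shown is illustrative.

// assumption: slf4j-simple as a test-only binding
libraryDependencies += "org.slf4j" % "slf4j-simple" % "1.7.36" % Test

// slf4j-simple reads its default level from a system property; forking is
// required for javaOptions to reach the test JVM
Test / fork := true
Test / javaOptions += "-Dorg.slf4j.simpleLogger.defaultLogLevel=info"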
@@ -471,14 +473,14 @@ class FetcherTest extends TestCase { // build consistency metrics val consistencyJob = new ConsistencyJob(spark, joinConf, today) val metrics = consistencyJob.buildConsistencyMetrics() - println(s"ooc metrics: $metrics".stripMargin) + logger.info(s"ooc metrics: $metrics".stripMargin) OnlineUtils.serveConsistency(tableUtils, inMemoryKvStore, today, joinConf) val fetcher = mockApi.buildFetcher() val consistencyFetch = fetcher.fetchConsistencyMetricsTimeseries(StatsRequest(joinConf.metaData.nameToFilePath, None, None)) val response = Await.result(consistencyFetch, Duration.Inf) val gson = new GsonBuilder().setPrettyPrinting().serializeNulls().create() - println(s""" + logger.info(s""" | | Fetched Consistency Metrics | ${gson.toJson(response.values.get)} @@ -503,7 +505,7 @@ class FetcherTest extends TestCase { .asInstanceOf[GenericRow] } - println(endDsExpected.schema.pretty) + logger.info(endDsExpected.schema.pretty) val keyishColumns = keys.toList ++ List(tableUtils.partitionColumn, Constants.TimeColumn) val responseRdd = tableUtils.sparkSession.sparkContext.parallelize(responseRows.toSeq) @@ -511,19 +513,19 @@ class FetcherTest extends TestCase { if (endDs != today) { responseDf = responseDf.drop("ds").withColumn("ds", lit(endDs)) } - println("expected:") + logger.info("expected:") endDsExpected.show() - println("response:") + logger.info("response:") responseDf.show() val diff = Comparison.sideBySide(responseDf, endDsExpected, keyishColumns, aName = "online", bName = "offline") assertEquals(endDsQueries.count(), responseDf.count()) if (diff.count() > 0) { - println("queries:") + logger.info("queries:") endDsQueries.show() - println(s"Total count: ${responseDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows:") + logger.info(s"Total count: ${responseDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows:") diff .withTimeBasedColumn("ts_string", "ts", "yy-MM-dd HH:mm") .select("ts_string", diff.schema.fieldNames: _*) @@ -565,14 +567,15 @@ class FetcherTest extends TestCase { val (responses, _) = FetcherTestUtil.joinResponses(spark, Array(request), mockApi) val responseMap = responses.head.values.get - println("====== Empty request response map ======") - println(responseMap) + logger.info("====== Empty request response map ======") + logger.info(responseMap) assertEquals(joinConf.joinParts.size() + joinConf.derivations.toScala.derivationsWithoutStar.size, responseMap.size) assertEquals(responseMap.keys.count(_.endsWith("_exception")), joinConf.joinParts.size()) } } object FetcherTestUtil { + private val logger = LoggerFactory.getLogger(getClass) def joinResponses(spark: SparkSession, requests: Array[Request], mockApi: MockApi, @@ -639,7 +642,7 @@ object FetcherTestUtil { } val fetcherNameString = if (useJavaFetcher) "Java" else "Scala" - println(s""" + logger.info(s""" |Averaging fetching stats for $fetcherNameString Fetcher over ${requests.length} requests $runCount times |with batch size: $chunkSize |average qps: ${qpsSum / runCount} @@ -654,7 +657,7 @@ object FetcherTestUtil { ) } if (samplePercent > 0) { - println(s"logged count: ${loggedDf.count()}") + logger.info(s"logged count: ${loggedDf.count()}") loggedDf.show() } result -> loggedDf diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala index f8c9876b2..835dd5486 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala +++ 
b/spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.{CStream, Column, NaiveAggregator} import ai.chronon.aggregator.windowing.FiveMinuteResolution import ai.chronon.api.Extensions._ @@ -45,6 +46,7 @@ import org.junit.Test import scala.collection.mutable class GroupByTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest", local = true) implicit val tableUtils = TableUtils(spark) @@ -78,7 +80,7 @@ class GroupByTest { val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", tableUtils.partitionColumn)) if (diff.count() > 0) { diff.show() - println("diff result rows") + logger.info("diff result rows") } assertEquals(0, diff.count()) } @@ -130,7 +132,7 @@ class GroupByTest { val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", tableUtils.partitionColumn)) if (diff.count() > 0) { diff.show() - println("diff result rows") + logger.info("diff result rows") } assertEquals(0, diff.count()) } @@ -177,10 +179,10 @@ class GroupByTest { val diff = Comparison.sideBySide(computed, expected, List("user", "ts")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows last_k_test") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows last_k_test") diff.show() diff.rdd.foreach { row => val gson = new Gson() @@ -195,7 +197,7 @@ class GroupByTest { val computedStr = gson.toJson(computed) val expectedStr = gson.toJson(expected) if (computedStr != expectedStr) { - println(s""" + logger.info(s""" |computed [$computedCount]: ${gson.toJson(computed)} |expected [$expectedCount]: ${gson.toJson(expected)} |""".stripMargin) @@ -263,7 +265,7 @@ class GroupByTest { val diff = Comparison.sideBySide(naiveDf, resultDf, List("user", Constants.TimeColumn)) if (diff.count() > 0) { diff.show() - println("diff result rows") + logger.info("diff result rows") } assertEquals(0, diff.count()) } @@ -544,16 +546,16 @@ class GroupByTest { | latestB.listing = COALESCE(C.listing, '--null--') AND latestB.ts = C.ts |""".stripMargin val expectedInputDf = spark.sql(expectedSQL) - println("Expected input DF: ") + logger.info("Expected input DF: ") expectedInputDf.show() - println("Computed input DF: ") + logger.info("Computed input DF: ") newGroupBy.inputDf.show() val diff = Comparison.sideBySide(newGroupBy.inputDf, expectedInputDf, List("listing", "user", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${newGroupBy.inputDf.count()}") - println(s"Expected count: ${expectedInputDf.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${newGroupBy.inputDf.count()}") + logger.info(s"Expected count: ${expectedInputDf.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) diff --git a/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala b/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala index 59ffa6f20..111aafb8a 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/GroupByUploadTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory 
import ai.chronon.aggregator.test.Column import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.api.Extensions._ @@ -33,6 +34,7 @@ import scala.concurrent.Await import scala.util.ScalaJavaConversions.{JMapOps, ListOps, MapOps} class GroupByUploadTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByUploadTest", local = true) private val namespace = "group_by_upload_test" @@ -333,8 +335,8 @@ class GroupByUploadTest { null, cRating(4.0, 2.0) ) - println(gson.toJson(categoryRatingResults)) - println(gson.toJson(expectedCategoryRatings)) + logger.info(gson.toJson(categoryRatingResults)) + logger.info(gson.toJson(expectedCategoryRatings)) categoryRatingResults.zip(expectedCategoryRatings).foreach { case (actual, expected) => assertEquals(actual, expected) diff --git a/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala b/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala index f50e27256..e1f15ff63 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/InMemoryKvStore.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.Constants import ai.chronon.online.KVStore import ai.chronon.online.KVStore.{PutRequest, TimedValue} @@ -29,6 +30,7 @@ import scala.concurrent.Future import scala.util.Try class InMemoryKvStore(tableUtils: () => TableUtils) extends KVStore with Serializable { + private val logger = LoggerFactory.getLogger(getClass) //type aliases for readability type Key = String type Data = Array[Byte] @@ -129,7 +131,7 @@ class InMemoryKvStore(tableUtils: () => TableUtils) extends KVStore with Seriali val value = tableEntry.getValue value.foreach { case (version, data) => - println(s"table: $tableName, key: $key, value: $data, version: $version") + logger.info(s"table: $tableName, key: $key, value: $data, version: $version") } } } @@ -147,7 +149,7 @@ object InMemoryKvStore { testName, new function.Function[String, InMemoryKvStore] { override def apply(name: String): InMemoryKvStore = { - println(s"Missing in-memory store for name: $name. Creating one") + logger.info(s"Missing in-memory store for name: $name. Creating one") new InMemoryKvStore(tableUtils) } } diff --git a/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala b/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala index ce3c7c6a1..8c075619c 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/InMemoryStream.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.{Constants, StructType} import ai.chronon.online.{AvroConversions, Mutation, SparkConversions} import ai.chronon.online.Extensions.StructTypeOps @@ -34,6 +35,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, Encoder, Encoders, Row, SparkSe import java.util.Base64 class InMemoryStream { + private val logger = LoggerFactory.getLogger(getClass) private def encode(schema: org.apache.avro.Schema)(row: Row): Array[Byte] = { val gr: GenericRecord = new GenericData.Record(schema) @@ -61,7 +63,7 @@ class InMemoryStream { // encode input as avro byte array and insert into memory stream. 
def getInMemoryStreamDF(spark: SparkSession, inputDf: Dataset[Row]): DataFrame = { val schema: StructType = StructType.from("input", SparkConversions.toChrononSchema(inputDf.schema)) - println(s"Creating in-memory stream with schema: ${SparkConversions.fromChrononSchema(schema).catalogString}") + logger.info(s"Creating in-memory stream with schema: ${SparkConversions.fromChrononSchema(schema).catalogString}") val avroSchema = AvroConversions.fromChrononSchema(schema) import spark.implicits._ val input: MemoryStream[Array[Byte]] = @@ -83,7 +85,7 @@ class InMemoryStream { val inputDf = noDs.selectExpr(baseFields ++ mutationFields: _*) // encode and write - println(s"encoding stream with schema: ${inputDf.schema.catalogString}") + logger.info(s"encoding stream with schema: ${inputDf.schema.catalogString}") inputDf.show() val schema: StructType = StructType.from("input", SparkConversions.toChrononSchema(inputDf.schema)) val avroSchema = AvroConversions.fromChrononSchema(schema) diff --git a/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala b/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala index 2f93e4b96..6460d6fff 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api import ai.chronon.api.{Accuracy, Builders, Constants, LongType, Operation, StringType, TimeUnit, Window} @@ -35,6 +36,7 @@ import scala.collection.JavaConverters._ import scala.util.ScalaJavaConversions.ListOps class JoinTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build( "JoinTest", @@ -83,7 +85,7 @@ class JoinTest { snapshotTable = dollarTable ) - //println("Rupee Source start partition $month") + //logger.info("Rupee Source start partition $month") val rupeeSource = Builders.Source.entities( query = Builders.Query( @@ -147,7 +149,7 @@ class JoinTest { dropStart, dropEnd ) - println(tableUtils.partitions(s"$namespace.test_user_transaction_features")) + logger.info(tableUtils.partitions(s"$namespace.test_user_transaction_features")) joinConf.joinParts.toScala .map(jp => joinConf.partOutputTable(jp)) @@ -161,7 +163,7 @@ class JoinTest { resetUDFs() val runner2 = new Join(joinConf, end, tableUtils) val computed = runner2.computeJoin(Some(3)) - println(s"join start = $start") + logger.info(s"join start = $start") val expectedQuery = s""" |WITH @@ -211,11 +213,11 @@ class JoinTest { val diff = Comparison.sideBySide(computed, expected, List("user_name", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"Queries count: ${queries.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"Queries count: ${queries.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) @@ -229,7 +231,7 @@ class JoinTest { val endMinus2 = tableUtils.partitionSpec.minus(end, new Window(2, TimeUnit.DAYS)) tableUtils.dropPartitionRange(s"$namespace.test_user_transaction_features", endMinus1, endMinus1) - println(tableUtils.partitions(s"$namespace.test_user_transaction_features")) + 
logger.info(tableUtils.partitions(s"$namespace.test_user_transaction_features")) joinConf.joinParts.asScala .map(jp => joinConf.partOutputTable(jp)) @@ -243,11 +245,11 @@ val diff2 = Comparison.sideBySide(computed2, expected2, List("user_name", "user", "ts", "ds")) if (diff2.count() > 0) { - println(s"Actual count: ${computed2.count()}") - println(s"Expected count: ${expected2.count()}") - println(s"Diff count: ${diff2.count()}") - println(s"Queries count: ${queries.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed2.count()}") + logger.info(s"Expected count: ${expected2.count()}") + logger.info(s"Diff count: ${diff2.count()}") + logger.info(s"Queries count: ${queries.count()}") + logger.info(s"diff result rows") diff2.show() } assertEquals(0, diff2.count()) @@ -342,18 +344,18 @@ | AND countries.country = grouped_heights.country """.stripMargin) - println("showing join result") + logger.info("showing join result") computed.show() - println("showing query result") + logger.info("showing query result") expected.show() - println( + logger.info( s"Left side count: ${spark.sql(s"SELECT country, ds from $countryTable where ds >= '$start' and ds <= '$end'").count()}") - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") val diff = Comparison.sideBySide(computed, expected, List("country", "ds")) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -362,14 +364,14 @@ * should not trigger a backfill and exit the program properly */ - // use console to redirect println message to Java IO + // use Console to capture stdout from the join re-run below val stream = new java.io.ByteArrayOutputStream() Console.withOut(stream) { // rerun the same join job runner.computeJoin(Some(7)) } val stdOutMsg = stream.toString() - println(s"std out message =\n $stdOutMsg") + logger.info(s"std out message =\n $stdOutMsg") // make sure that the program exits with target print statements assertTrue(stdOutMsg.contains(s"There is no data to compute based on end partition of $end.")) } @@ -415,12 +417,12 @@ val runner = new Join(joinConf, end, tableUtils) val computed = runner.computeJoin(Some(7)) - println("showing join result") + logger.info("showing join result") computed.show() val leftSideCount = spark.sql(s"SELECT country, ds from $countryTable where ds == '$end'").count() - println(s"Left side expected count: $leftSideCount") - println(s"Actual count: ${computed.count()}") + logger.info(s"Left side expected count: $leftSideCount") + logger.info(s"Actual count: ${computed.count()}") assertEquals(leftSideCount, computed.count()) // There should be only one partition in computed df which equals to end partition val allPartitions = computed.select("ds").rdd.map(row => row(0)).collect().toSet @@ -491,8 +493,8 @@ val diff = Comparison.sideBySide(computed, expected, List("item", "ts", "ds")) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -566,8 +568,8 @@ tableUtils.sql(s"SELECT item, ts, ds from 
$itemQueriesTable where ds >= '$start' and ds <= '$dayAndMonthBefore'") assertEquals(queriesBare.count(), computed.count()) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff .replaceWithReadableTime(Seq("ts", "a_user_unit_test_item_views_ts_max", "b_user_unit_test_item_views_ts_max"), dropOriginal = true) @@ -587,7 +589,7 @@ class JoinTest { // Run job val itemQueriesTable = s"$namespace.item_queries" - println("Item Queries DF: ") + logger.info("Item Queries DF: ") val q = s""" |SELECT @@ -628,8 +630,8 @@ class JoinTest { tableUtils.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start' and ds <= '$dayAndMonthBefore'") assertEquals(queriesBare.count(), computed.count()) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -702,7 +704,7 @@ class JoinTest { None, viewsGroupByCumulative.inferredAccuracy ) - println(renderedIncremental) + logger.info(renderedIncremental) assert(renderedIncremental.contains(s"(ds >= '2021-01-01') AND (ds <= '2021-01-01')")) } @@ -752,7 +754,7 @@ class JoinTest { val runner = new Join(joinConf, end, tableUtils) val computed = runner.computeJoin(Some(7)) - println(s"join start = $start") + logger.info(s"join start = $start") val expected = tableUtils.sql(s""" |WITH | users AS (SELECT user, ds from $usersTable where ds >= '$start' and ds <= '$end'), @@ -770,18 +772,18 @@ class JoinTest { | AND users.ds = grouped_names.ds """.stripMargin) - println("showing join result") + logger.info("showing join result") computed.show() - println("showing query result") + logger.info("showing query result") expected.show() - println( + logger.info( s"Left side count: ${spark.sql(s"SELECT user, ds from $namesTable where ds >= '$start' and ds <= '$end'").count()}") - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") val diff = Comparison.sideBySide(computed, expected, List("user", "ds")) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(diff.count(), 0) @@ -805,7 +807,7 @@ class JoinTest { val leftChangeJoin = new Join(joinConf = leftChangeJoinConf, endPartition = dayAndMonthBefore, tableUtils) val leftChangeRecompute = JoinUtils.tablesToRecompute(leftChangeJoinConf, leftChangeJoinConf.metaData.outputTable, tableUtils) - println(leftChangeRecompute) + logger.info(leftChangeRecompute) assertEquals(leftChangeRecompute.size, 3) val partTable = s"${leftChangeJoinConf.metaData.outputTable}_user_unit_test_item_views" assertEquals(leftChangeRecompute, @@ -872,8 +874,8 @@ class JoinTest { tableUtils.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start' and ds <= '$dayAndMonthBefore'") assertEquals(queriesBare.count(), computed.count()) if (diff.count() > 0) { - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff .replaceWithReadableTime( Seq("ts", "a_user_3_unit_test_item_views_ts_max", "b_user_3_unit_test_item_views_ts_max"), @@ -1006,7 +1008,7 @@ class 
JoinTest { ) val skipBloomComputed = new Join(joinConf, today, testTableUtils).computeJoin() val leftSideCount = testSpark.sql(s"SELECT item, ts, ds from $itemQueriesTable where ds >= '$start'").count() - println("computed count: " + skipBloomComputed.count()) + logger.info("computed count: " + skipBloomComputed.count()) assertEquals(leftSideCount, skipBloomComputed.count()) } diff --git a/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala b/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala index e9a093e8c..70ac0d5f8 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.{Accuracy, Builders, Constants, Operation, TimeUnit, Window} import ai.chronon.spark._ import org.apache.spark.sql.{Row, SparkSession} @@ -23,6 +24,7 @@ import org.junit.Assert.assertEquals import org.junit.Test class LabelJoinTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("LabelJoinTest", local = true) @@ -47,7 +49,7 @@ class LabelJoinTest { ) val runner = new LabelJoin(joinConf, tableUtils, labelDS) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == Computed == ") + logger.info(" == Computed == ") computed.show() val expected = tableUtils.sql(s""" SELECT v.listing_id as listing, @@ -58,7 +60,7 @@ class LabelJoinTest { LEFT OUTER JOIN label_join.listing_attributes as a ON v.listing_id = a.listing_id WHERE a.ds = '2022-10-30'""".stripMargin) - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() assertEquals(computed.count(), expected.count()) assertEquals(computed.select("label_ds").first().get(0), labelDS) @@ -67,9 +69,9 @@ class LabelJoinTest { expected, List("listing", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) @@ -88,7 +90,7 @@ class LabelJoinTest { ) val runner = new LabelJoin(joinConf, tableUtils, labelDS) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == Computed == ") + logger.info(" == Computed == ") computed.show() val expected = tableUtils.sql(s""" |SELECT listing, @@ -113,16 +115,16 @@ class LabelJoinTest { |) b |ON aa.listing = b.listing_id """.stripMargin) - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() assertEquals(computed.count(), expected.count()) assertEquals(computed.select("label_ds").first().get(0), labelDS) val diff = Comparison.sideBySide(computed, expected, List("listing", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) @@ -141,7 +143,7 @@ class LabelJoinTest { // label ds does not exist in label table, labels should be null val runner = new LabelJoin(joinConf, tableUtils, "2022-11-01") val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == Computed == ") + 
logger.info(" == Computed == ") computed.show() assertEquals(computed.select("label_ds").first().get(0), "2022-11-01") assertEquals(computed @@ -164,7 +166,7 @@ class LabelJoinTest { val runner = new LabelJoin(joinConf, tableUtils, labelDS) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == Computed == ") + logger.info(" == Computed == ") computed.show() assertEquals(computed.count(), 6) val computedRows = computed.collect() @@ -175,7 +177,7 @@ class LabelJoinTest { val runner2 = new LabelJoin(joinConf, tableUtils, labelDS) val refreshed = runner2.computeLabelJoin(skipFinalJoin = true) - println(" == Refreshed == ") + logger.info(" == Refreshed == ") refreshed.show() assertEquals(refreshed.count(), 6) val refreshedRows = refreshed.collect() @@ -195,7 +197,7 @@ class LabelJoinTest { ) val runner = new LabelJoin(joinConf, tableUtils, labelDS) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == First Run == ") + logger.info(" == First Run == ") computed.show() assertEquals(computed.count(), 6) @@ -216,7 +218,7 @@ class LabelJoinTest { ) val runner2 = new LabelJoin(updatedJoinConf, tableUtils, "2022-11-01") val updated = runner2.computeLabelJoin(skipFinalJoin = true) - println(" == Updated Run == ") + logger.info(" == Updated Run == ") updated.show() assertEquals(updated.count(), 12) assertEquals(updated.where(updated("label_ds") === "2022-11-01").count(), 6) @@ -337,7 +339,7 @@ class LabelJoinTest { ) val runner = new LabelJoin(joinConf, tableUtils, "2022-10-06") val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == computed == ") + logger.info(" == computed == ") computed.show() val expected = tableUtils.sql( @@ -354,16 +356,16 @@ class LabelJoinTest { | WHERE v.ds == '2022-10-02' | GROUP BY v.listing_id, v.ds) |""".stripMargin) - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() assertEquals(computed.count(), expected.count()) assertEquals(computed.select("label_ds").first().get(0), "2022-10-06") val diff = Comparison.sideBySide(computed, expected, List("listing", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) @@ -394,7 +396,7 @@ class LabelJoinTest { val today = tableUtils.partitionSpec.at(now) val runner = new LabelJoin(joinConf, tableUtils, today) val computed = runner.computeLabelJoin(skipFinalJoin = true) - println(" == computed == ") + logger.info(" == computed == ") computed.show() // For window based label, given specific label_ds and window, only one ds will be updated with label. 
@@ -414,13 +416,13 @@ class LabelJoinTest { | WHERE v.ds == DATE_SUB(from_unixtime(round($now / 1000), 'yyyy-MM-dd'), 4) | GROUP BY v.listing_id, v.ds) |""".stripMargin) - println(" == Expected == ") + logger.info(" == Expected == ") expected.show() val diff = Comparison.sideBySide(computed, expected, List("listing_id", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") diff.show() } assertEquals(0, diff.count()) diff --git a/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala b/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala index a9ec34db1..5f5b49cf6 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api import ai.chronon.spark.Extensions._ @@ -30,6 +31,7 @@ import scala.io.Source import java.io.File class MetadataExporterTest extends TestCase { + private val logger = LoggerFactory.getLogger(getClass) val sessionName = "MetadataExporter" val spark: SparkSession = SparkSessionBuilder.build(sessionName, local = true) @@ -39,22 +41,22 @@ class MetadataExporterTest extends TestCase { val directory = new File(directoryPath) if (directory.exists && directory.isDirectory) { - println("Valid Directory") + logger.info("Valid Directory") val files = directory.listFiles for (file <- files) { - println(file.getPath) + logger.info(file.getPath) if (file.isFile) { - println(s"File: ${file.getName}") + logger.info(s"File: ${file.getName}") val source = Source.fromFile(file) val fileContents = source.getLines.mkString("\n") source.close() - println(fileContents) - println("----------------------------------------") + logger.info(fileContents) + logger.info("----------------------------------------") } } } else { - println("Invalid directory path!") + logger.info("Invalid directory path!") } } diff --git a/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala b/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala index 5745f35e4..b704ffe32 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MockApi.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.{GroupByOps, SourceOps} import ai.chronon.api.{Constants, StructType} import ai.chronon.online.Fetcher.Response @@ -38,6 +39,7 @@ import scala.util.ScalaJavaConversions.{IteratorOps, JListOps, JMapOps} import scala.util.Success class MockDecoder(inputSchema: StructType) extends StreamDecoder { + private val logger = LoggerFactory.getLogger(getClass) private def byteArrayToAvro(avro: Array[Byte], schema: Schema): GenericRecord = { val reader = new SpecificDatumReader[GenericRecord](schema) @@ -65,9 +67,10 @@ class MockDecoder(inputSchema: StructType) extends StreamDecoder { } class MockStreamBuilder extends StreamBuilder { + private val logger = LoggerFactory.getLogger(getClass) override def from(topicInfo: TopicInfo)(implicit session: SparkSession, props: Map[String, String]): DataStream = { val tableUtils = TableUtils(session) - println(s"""building stream from topic: 
${topicInfo.name}""") + logger.info(s"""building stream from topic: ${topicInfo.name}""") val ds = topicInfo.params("ds") val df = tableUtils.sql(s"select * from ${topicInfo.name} where ds >= '$ds'") val encodedDf = (new InMemoryStream).getContinuousStreamDF(session, df.drop("ds")) @@ -77,7 +80,9 @@ class MockStreamBuilder extends StreamBuilder { } class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { + private val logger = LoggerFactory.getLogger(getClass) class PlusOneExternalHandler extends ExternalSourceHandler { + private val logger = LoggerFactory.getLogger(getClass) override def fetch(requests: collection.Seq[Fetcher.Request]): Future[collection.Seq[Fetcher.Response]] = { Future( requests.map(req => @@ -87,6 +92,7 @@ class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { } class AlwaysFailsHandler extends JavaExternalSourceHandler { + private val logger = LoggerFactory.getLogger(getClass) override def fetchJava(requests: util.List[JavaRequest]): CompletableFuture[util.List[JavaResponse]] = { CompletableFuture.completedFuture[util.List[JavaResponse]]( requests @@ -106,6 +112,7 @@ class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { } class JavaPlusOneExternalHandler extends JavaExternalSourceHandler { + private val logger = LoggerFactory.getLogger(getClass) override def fetchJava(requests: util.List[JavaRequest]): CompletableFuture[util.List[JavaResponse]] = { CompletableFuture.completedFuture( requests @@ -132,7 +139,7 @@ class MockApi(kvStore: () => KVStore, val namespace: String) extends Api(null) { new ConcurrentLinkedQueue[LoggableResponseBase64] override def streamDecoder(parsedInfo: GroupByServingInfoParsed): StreamDecoder = { - println( + logger.info( s"decoding stream ${parsedInfo.groupBy.streamingSource.get.topic} with " + s"schema: ${SparkConversions.fromChrononSchema(parsedInfo.streamChrononSchema).catalogString}") new MockDecoder(parsedInfo.streamChrononSchema) diff --git a/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala b/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala index e061856bf..5bd56c710 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MutationsTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.aggregator.windowing.TsUtils import ai.chronon.api @@ -32,6 +33,7 @@ import org.junit.Test * Join is the events and the entity value at the exact timestamp of the ts. 
*/ class MutationsTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build( "MutationsTest", @@ -124,15 +126,15 @@ class MutationsTest { } val joinRdd = expectedRdd.join(computedRdd) if (totalExpectedRows == joinRdd.count()) return true - println("Failed to assert equality!") - println("== Joined RDD (listing_id, ts, rating_average)") + logger.info("Failed to assert equality!") + logger.info("== Joined RDD (listing_id, ts, rating_average)") val readableRDD = joinRdd.map { case ((id, ts, event, avg, ds), _) => Row(id, ts, event, avg, ds) } spark.createDataFrame(readableRDD, expectedSchema).show() - println("== Expected") + logger.info("== Expected") df.replaceWithReadableTime(Seq("ts"), false).show() - println("== Computed") + logger.info("== Computed") computed.replaceWithReadableTime(Seq("ts"), false).show() false } @@ -990,15 +992,15 @@ class MutationsTest { val expected = computeSimpleAverageThroughSql(testNamespace) val diff = Comparison.sideBySide(result, expected, List("listing_id", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${result.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${result.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() val recomputedResult = computeJoinFromTables(suffix, minDs, maxDs, null, Operation.AVERAGE) val recomputedDiff = Comparison.sideBySide(recomputedResult, expected, List("listing_id", "ts", "ds")) - println("Checking second run of the same data.") - println(s"recomputed diff result rows") + logger.info("Checking second run of the same data.") + logger.info(s"recomputed diff result rows") recomputedDiff.show() assert(recomputedDiff.count() == 0) } diff --git a/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala b/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala index c828cdc17..53a449306 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.test.Column import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -26,6 +27,7 @@ import org.junit.Assert.assertEquals import org.junit.Test class StagingQueryTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession = SparkSessionBuilder.build("StagingQueryTest", local = true) implicit private val tableUtils: TableUtils = TableUtils(spark) @@ -44,7 +46,7 @@ class StagingQueryTest { val df = DataFrameGen .events(spark, schema, count = 100000, partitions = 100) .dropDuplicates("ts") // duplicates can create issues in comparisons - println("Generated staging query data:") + logger.info("Generated staging query data:") df.show() val viewName = s"$namespace.test_staging_query_compare" df.save(viewName) @@ -67,12 +69,12 @@ class StagingQueryTest { val computed = tableUtils.sql(s"select * from ${stagingQueryConf.metaData.outputTable} WHERE user IS NOT NULL") val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${expected.count()}") - println(expected.show()) - println(s"Computed count: ${computed.count()}") - println(computed.show()) - println(s"Diff count: 
${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${expected.count()}") + logger.info(expected.show()) + logger.info(s"Computed count: ${computed.count()}") + logger.info(computed.show()) + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) @@ -91,7 +93,7 @@ class StagingQueryTest { val df = DataFrameGen .events(spark, schema, count = 30, partitions = 8) .dropDuplicates("ts") // duplicates can create issues in comparisons - println("Generated staging query data:") + logger.info("Generated staging query data:") df.show() val viewName = s"$namespace.test_staging_query_view" df.save(viewName) @@ -141,12 +143,12 @@ class StagingQueryTest { val diffV2 = Comparison.sideBySide(expectedUpdated, computedUpdated, List("user", "ts", "ds")) if (diffV2.count() > 0) { - println(s"Actual count: ${expectedUpdated.count()}") - println(expectedUpdated.show()) - println(s"Computed count: ${computedUpdated.count()}") - println(computedUpdated.show()) - println(s"Diff count: ${diffV2.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${expectedUpdated.count()}") + logger.info(expectedUpdated.show()) + logger.info(s"Computed count: ${computedUpdated.count()}") + logger.info(computedUpdated.show()) + logger.info(s"Diff count: ${diffV2.count()}") + logger.info(s"diff result rows") diffV2.show() } assertEquals(0, diffV2.count()) @@ -197,12 +199,12 @@ class StagingQueryTest { |""".stripMargin) val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${expected.count()}") - println(expected.show()) - println(s"Computed count: ${computed.count()}") - println(computed.show()) - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${expected.count()}") + logger.info(expected.show()) + logger.info(s"Computed count: ${computed.count()}") + logger.info(computed.show()) + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) @@ -249,12 +251,12 @@ class StagingQueryTest { |""".stripMargin) val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${expected.count()}") - println(expected.show()) - println(s"Computed count: ${computed.count()}") - println(computed.show()) - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${expected.count()}") + logger.info(expected.show()) + logger.info(s"Computed count: ${computed.count()}") + logger.info(computed.show()) + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) diff --git a/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala b/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala index bf724d130..d079260bc 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/StatsComputeTest.scala @@ -15,6 +15,7 @@ */ package ai.chronon.spark.test +import org.slf4j.LoggerFactory import ai.chronon.aggregator.row.StatsGenerator import ai.chronon.aggregator.test.Column import ai.chronon.api._ @@ -27,6 +28,7 @@ import ai.chronon.spark.stats.StatsCompute import org.apache.spark.sql.functions.lit class StatsComputeTest { + private val logger = LoggerFactory.getLogger(getClass) lazy val spark: SparkSession 
= SparkSessionBuilder.build("StatsComputeTest", local = true) implicit val tableUtils = TableUtils(spark) val namespace: String = "stats_compute_test" @@ -85,18 +87,18 @@ class StatsComputeTest { StructType.from("generatedTest", toChrononSchema(stats.selectedDf.schema))) val daily = stats.dailySummary(aggregator, timeBucketMinutes = 0).toFlatDf - println("Daily Stats") + logger.info("Daily Stats") daily.show() val bucketed = stats .dailySummary(aggregator) .toFlatDf .replaceWithReadableTime(Seq(Constants.TimeColumn), false) - println("Bucketed Stats") + logger.info("Bucketed Stats") bucketed.show() val denormalized = stats.addDerivedMetrics(bucketed, aggregator) - println("With Derived Data") + logger.info("With Derived Data") denormalized.show(truncate = false) } @@ -115,15 +117,15 @@ class StatsComputeTest { StructType.from("noTsTest", toChrononSchema(stats.selectedDf.schema))) val daily = stats.dailySummary(aggregator, timeBucketMinutes = 0).toFlatDf - println("Daily Stats") + logger.info("Daily Stats") daily.show() val bucketed = stats.dailySummary(aggregator).toFlatDf - println("Bucketed Stats") + logger.info("Bucketed Stats") bucketed.show() val denormalized = stats.addDerivedMetrics(bucketed, aggregator) - println("With Derived Data") + logger.info("With Derived Data") denormalized.show(truncate = false) } @@ -147,18 +149,18 @@ class StatsComputeTest { StructType.from("byteTest", toChrononSchema(stats.selectedDf.schema))) val daily = stats.dailySummary(aggregator, timeBucketMinutes = 0).toFlatDf - println("Daily Stats") + logger.info("Daily Stats") daily.show() val bucketed = stats .dailySummary(aggregator) .toFlatDf .replaceWithReadableTime(Seq(Constants.TimeColumn), false) - println("Bucketed Stats") + logger.info("Bucketed Stats") bucketed.show() val denormalized = stats.addDerivedMetrics(bucketed, aggregator) - println("With Derived Data") + logger.info("With Derived Data") denormalized.show(truncate = false) } } diff --git a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala index c34c85b41..7e74cf0de 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/DerivationTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test.bootstrap +import org.slf4j.LoggerFactory import ai.chronon.api.Builders.Derivation import ai.chronon.api.Extensions._ import ai.chronon.api._ @@ -35,6 +36,7 @@ import scala.concurrent.duration.Duration import scala.util.ScalaJavaConversions.JListOps class DerivationTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("DerivationTest", local = true) private val tableUtils = TableUtils(spark) @@ -277,10 +279,10 @@ class DerivationTest { val diff = Comparison.sideBySide(computed, expected, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } @@ -351,10 +353,10 @@ class DerivationTest { val diff = Comparison.sideBySide(outputDf, bootstrapDf, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: 
${outputDf.count()}") - println(s"Expected count: ${bootstrapDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${outputDf.count()}") + logger.info(s"Expected count: ${bootstrapDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } @@ -485,10 +487,10 @@ class DerivationTest { val diff = Comparison.sideBySide(computedDf, expectedDf, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computedDf.count()}") - println(s"Expected count: ${expectedDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computedDf.count()}") + logger.info(s"Expected count: ${expectedDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } @@ -656,10 +658,10 @@ class DerivationTest { val diff = Comparison.sideBySide(actualDf, expectedDf, List("user", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${actualDf.count()}") - println(s"Expected count: ${expectedDf.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${actualDf.count()}") + logger.info(s"Expected count: ${expectedDf.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } assertEquals(0, diff.count()) diff --git a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala index 099c0d566..71337c0e1 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/LogBootstrapTest.scala @@ -16,6 +16,7 @@ package ai.chronon.spark.test.bootstrap +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions._ import ai.chronon.api._ import ai.chronon.online.Fetcher.Request @@ -33,6 +34,7 @@ import scala.concurrent.duration.Duration import scala.util.ScalaJavaConversions._ class LogBootstrapTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("BootstrapTest", local = true) val namespace = "test_log_bootstrap" @@ -170,17 +172,17 @@ class LogBootstrapTest { val computed = joinJob.computeJoin() val overlapCount = baseOutput.join(logDf, Seq("request_id", "ds")).count() - println(s"""Debugging information: + logger.info(s"""Debugging information: |base count: ${baseOutput.count()} |overlap keys between base and log: ${overlapCount} |""".stripMargin) val diff = Comparison.sideBySide(computed, expected, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() } diff --git a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala index f693734b3..f30f0b20d 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/bootstrap/TableBootstrapTest.scala @@ -16,6 +16,7 @@ package 
ai.chronon.spark.test.bootstrap +import org.slf4j.LoggerFactory import ai.chronon.api.Extensions.JoinOps import ai.chronon.api._ import ai.chronon.spark.Extensions._ @@ -28,6 +29,7 @@ import org.junit.Test import scala.util.ScalaJavaConversions.JListOps class TableBootstrapTest { + private val logger = LoggerFactory.getLogger(getClass) val spark: SparkSession = SparkSessionBuilder.build("BootstrapTest", local = true) private val tableUtils = TableUtils(spark) @@ -136,7 +138,7 @@ class TableBootstrapTest { val overlapBaseBootstrap1 = baseOutput.join(bootstrapDf1, Seq("request_id", "ds")).count() val overlapBaseBootstrap2 = baseOutput.join(bootstrapDf2, Seq("request_id", "ds")).count() val overlapBootstrap12 = bootstrapDf1.join(bootstrapDf2, Seq("request_id", "ds")).count() - println(s"""Debug information: + logger.info(s"""Debug information: |base count: ${baseOutput.count()} |overlap keys between base and bootstrap1 count: ${overlapBaseBootstrap1} |overlap keys between base and bootstrap2 count: ${overlapBaseBootstrap2} @@ -145,10 +147,10 @@ class TableBootstrapTest { val diff = Comparison.sideBySide(computed, expected, List("request_id", "user", "ts", "ds")) if (diff.count() > 0) { - println(s"Actual count: ${computed.count()}") - println(s"Expected count: ${expected.count()}") - println(s"Diff count: ${diff.count()}") - println(s"diff result rows") + logger.info(s"Actual count: ${computed.count()}") + logger.info(s"Expected count: ${expected.count()}") + logger.info(s"Diff count: ${diff.count()}") + logger.info(s"diff result rows") diff.show() }
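
The conversions above all apply the same SLF4J logger-per-class idiom. For reference, here is a minimal, self-contained sketch of that idiom, assuming an SLF4J binding (e.g. logback or log4j) is on the test classpath; the names ExampleJob, run, partitions and rowCount are hypothetical and only illustrate the call shapes, they are not taken from the Chronon codebase.

import org.slf4j.{Logger, LoggerFactory}

// Sketch of the logging idiom used throughout this patch: one private SLF4J
// logger per class, resolved from the runtime class so each log line carries
// the concrete class name.
class ExampleJob {
  private val logger: Logger = LoggerFactory.getLogger(getClass)

  def run(partitions: Seq[String], rowCount: Long): Unit = {
    // Same shape as the converted call sites: build a String, then log it.
    logger.info(s"Row count: $rowCount")
    // Unlike println, Logger.info takes a String, so collections are
    // rendered explicitly before logging.
    logger.info(s"Partitions: ${partitions.mkString(", ")}")
  }
}

object ExampleJob {
  // A companion object needs its own logger; the class member above is not
  // in scope here without an instance.
  private val logger: Logger = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    logger.info("Starting example run")
    new ExampleJob().run(Seq("2023-01-01", "2023-01-02"), rowCount = 42L)
  }
}

Resolving the logger via getClass keeps the one-liner copy-pasteable across classes and companion objects while still tagging every log line with the concrete class name.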