-
Notifications
You must be signed in to change notification settings - Fork 3.7k
/
S3BaseParquetDestinationAcceptanceTest.kt
97 lines (85 loc) · 4.11 KB
/
S3BaseParquetDestinationAcceptanceTest.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/*
* Copyright (c) 2023 Airbyte, Inc., all rights reserved.
*/
package io.airbyte.cdk.integrations.destination.s3
import com.fasterxml.jackson.databind.JsonNode
import com.fasterxml.jackson.databind.ObjectReader
import io.airbyte.cdk.integrations.destination.s3.avro.AvroConstants
import io.airbyte.cdk.integrations.destination.s3.parquet.S3ParquetWriter
import io.airbyte.cdk.integrations.destination.s3.util.AvroRecordHelper
import io.airbyte.cdk.integrations.standardtest.destination.comparator.TestDataComparator
import io.airbyte.commons.json.Jsons
import java.io.IOException
import java.net.URI
import java.net.URISyntaxException
import java.util.*
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroReadSupport
import org.apache.parquet.hadoop.ParquetReader
/**
 * Base destination acceptance test for S3 destinations that write Parquet files.
 *
 * Reads back the Parquet objects synced to S3 via an `s3a://` [Path] and a Hadoop-backed
 * [ParquetReader], converting each Avro [GenericData.Record] to JSON for comparison.
 */
abstract class S3BaseParquetDestinationAcceptanceTest protected constructor() :
    S3AvroParquetDestinationAcceptanceTest(S3Format.PARQUET) {
    // Format section of the connector config: Parquet with GZIP compression.
    override val formatConfig: JsonNode?
        get() = Jsons.jsonNode(mapOf("format_type" to "Parquet", "compression_codec" to "GZIP"))

    /**
     * Reads every Parquet object synced for ([streamName], [namespace]) and returns its rows as
     * JSON nodes, with Avro-mangled field names restored and Airbyte metadata columns pruned.
     *
     * @throws IOException if a Parquet object cannot be read
     * @throws URISyntaxException if an s3a URI cannot be formed from a bucket/key pair
     */
    @Throws(IOException::class, URISyntaxException::class)
    override fun retrieveRecords(
        testEnv: TestDestinationEnv?,
        streamName: String,
        namespace: String,
        streamSchema: JsonNode
    ): List<JsonNode> {
        val nameUpdater =
            AvroRecordHelper.getFieldNameUpdater(streamName, namespace, streamSchema)
        // Loop-invariant: the Hadoop config depends only on the destination config.
        val hadoopConfig = S3ParquetWriter.getHadoopConfig(s3DestinationConfig)
        val jsonReader: ObjectReader = S3DestinationAcceptanceTest.Companion.MAPPER.reader()
        val jsonRecords: MutableList<JsonNode> = ArrayList()
        for (objectSummary in getAllSyncedObjects(streamName, namespace)) {
            // The summary already carries bucket and key; the previous s3Client.getObject()
            // call opened an S3 object stream that was never closed (connection leak) and
            // was only used to re-read those two fields.
            val uri =
                URI(String.format("s3a://%s/%s", objectSummary.bucketName, objectSummary.key))
            ParquetReader.builder(AvroReadSupport<GenericData.Record>(), Path(uri))
                .withConf(hadoopConfig)
                .build()
                .use { parquetReader ->
                    var record: GenericData.Record?
                    while ((parquetReader.read().also { record = it }) != null) {
                        val jsonBytes = AvroConstants.JSON_CONVERTER.convertToJson(record)
                        var jsonRecord = jsonReader.readTree(jsonBytes)
                        jsonRecord = nameUpdater.getJsonWithOriginalFieldNames(jsonRecord)
                        jsonRecords.add(AvroRecordHelper.pruneAirbyteJson(jsonRecord))
                    }
                }
        }
        return jsonRecords
    }

    override fun getTestDataComparator(): TestDataComparator = S3AvroParquetTestDataComparator()

    /**
     * Scans every synced Parquet object for ([streamName], [namespace]) and collects, per field
     * name, the set of Avro schema types observed in the records.
     *
     * Later objects overwrite earlier entries for the same field (putAll semantics), matching the
     * original behavior.
     */
    @Throws(Exception::class)
    override fun retrieveDataTypesFromPersistedFiles(
        streamName: String,
        namespace: String
    ): Map<String, Set<Schema.Type>> {
        // Loop-invariant: hoisted for the same reason as in retrieveRecords.
        val hadoopConfig = S3ParquetWriter.getHadoopConfig(s3DestinationConfig)
        val resultDataTypes: MutableMap<String, Set<Schema.Type>> = HashMap()
        for (objectSummary in getAllSyncedObjects(streamName, namespace)) {
            // As above: no getObject() round-trip — avoids leaking an unclosed S3Object.
            val uri =
                URI(String.format("s3a://%s/%s", objectSummary.bucketName, objectSummary.key))
            ParquetReader.builder(AvroReadSupport<GenericData.Record>(), Path(uri))
                .withConf(hadoopConfig)
                .build()
                .use { parquetReader ->
                    var record: GenericData.Record?
                    while ((parquetReader.read().also { record = it }) != null) {
                        resultDataTypes.putAll(getTypes(record!!))
                    }
                }
        }
        return resultDataTypes
    }
}