diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/metadata_column_resolution.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/metadata_column_resolution.sql.out new file mode 100644 index 0000000000000..a4635a58fc6ef --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/metadata_column_resolution.sql.out @@ -0,0 +1,542 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +CREATE DATABASE IF NOT EXISTS testdb +-- !query analysis +CreateNamespace true ++- ResolvedNamespace V2SessionCatalog(spark_catalog), [testdb] + + +-- !query +USE testdb +-- !query analysis +SetCatalogAndNamespace ++- ResolvedNamespace V2SessionCatalog(spark_catalog), [testdb] + + +-- !query +CREATE TABLE t1(id INT, name STRING, value DOUBLE) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`testdb`.`t1`, false + + +-- !query +CREATE TABLE t2(id INT, category STRING, amount BIGINT) USING CSV +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`testdb`.`t2`, false + + +-- !query +CREATE TABLE t3(a INT, b STRING, c DOUBLE) USING ORC +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`testdb`.`t3`, false + + +-- !query +SELECT _metadata.file_name FROM t1 +-- !query analysis +Project [_metadata#x.file_name AS file_name#x] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.file_path FROM t1 +-- !query analysis +Project [_metadata#x.file_path AS file_path#x] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.file_size FROM t1 +-- !query analysis +Project [_metadata#x.file_size AS file_size#xL] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.file_modification_time FROM t1 +-- 
!query analysis +Project [_metadata#x.file_modification_time AS file_modification_time#x] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.row_index FROM t1 +-- !query analysis +Project [_metadata#x.row_index AS row_index#xL] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.file_block_start FROM t1 +-- !query analysis +Project [_metadata#x.file_block_start AS file_block_start#xL] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.file_block_length FROM t1 +-- !query analysis +Project [_metadata#x.file_block_length AS file_block_length#xL] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT t1._metadata.file_name FROM t1 +-- !query analysis +Project [_metadata#x.file_name AS file_name#x] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT testdb.t1._metadata.file_name FROM testdb.t1 +-- !query analysis +Project [_metadata#x.file_name AS file_name#x] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.file_name, t1._metadata.file_path FROM t1 +-- !query analysis +Project [_metadata#x.file_name AS file_name#x, _metadata#x.file_path AS file_path#x] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT t1._metadata.file_name, testdb.t1._metadata.file_path FROM testdb.t1 +-- !query analysis +Project [_metadata#x.file_name AS file_name#x, _metadata#x.file_path AS file_path#x] ++- SubqueryAlias 
spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT * FROM t1 WHERE _metadata.file_size > 1000 +-- !query analysis +Project [id#x, name#x, value#x] ++- Project [id#x, name#x, value#x] + +- Filter (_metadata#x.file_size > cast(1000 as bigint)) + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT id, name FROM t1 WHERE _metadata.row_index < 100 +-- !query analysis +Project [id#x, name#x] ++- Project [id#x, name#x, value#x] + +- Filter (_metadata#x.row_index < cast(100 as bigint)) + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT * FROM t1 WHERE _metadata.file_name = 'data.parquet' +-- !query analysis +Project [id#x, name#x, value#x] ++- Project [id#x, name#x, value#x] + +- Filter (_metadata#x.file_name = data.parquet) + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT * FROM t1 WHERE _metadata.file_modification_time > timestamp'2024-01-01 00:00:00' +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + +-- !query +SELECT * FROM t1 WHERE t1._metadata.file_size > 1000 +-- !query analysis +Project [id#x, name#x, value#x] ++- Project [id#x, name#x, value#x] + +- Filter (_metadata#x.file_size > cast(1000 as bigint)) + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT * FROM testdb.t1 WHERE testdb.t1._metadata.row_index < 100 +-- !query analysis +Project [id#x, name#x, value#x] ++- Project [id#x, name#x, value#x] + +- Filter (_metadata#x.row_index < cast(100 as bigint)) + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query 
+SELECT id, name, _metadata.file_name FROM t1 ORDER BY _metadata.file_size +-- !query analysis +Project [id#x, name#x, file_name#x] ++- Sort [_metadata#x.file_size ASC NULLS FIRST], true + +- Project [id#x, name#x, _metadata#x.file_name AS file_name#x, _metadata#x] + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT * FROM t1 ORDER BY _metadata.row_index DESC +-- !query analysis +Project [id#x, name#x, value#x] ++- Sort [_metadata#x.row_index DESC NULLS LAST], true + +- Project [id#x, name#x, value#x, _metadata#x] + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT id, _metadata.file_name FROM t1 ORDER BY _metadata.file_modification_time, _metadata.row_index +-- !query analysis +Project [id#x, file_name#x] ++- Sort [_metadata#x.file_modification_time ASC NULLS FIRST, _metadata#x.row_index ASC NULLS FIRST], true + +- Project [id#x, _metadata#x.file_name AS file_name#x, _metadata#x] + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT t1._metadata.file_name, COUNT(*) FROM t1 GROUP BY t1._metadata.file_name +-- !query analysis +Aggregate [_metadata#x.file_name], [_metadata#x.file_name AS file_name#x, count(1) AS count(1)#xL] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT testdb.t1._metadata.file_path, SUM(value) FROM testdb.t1 GROUP BY testdb.t1._metadata.file_path +-- !query analysis +Aggregate [_metadata#x.file_path], [_metadata#x.file_path AS file_path#x, sum(value#x) AS sum(value)#x] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.file_name, MIN(_metadata.row_index), MAX(_metadata.row_index) 
FROM t1 +GROUP BY _metadata.file_name +-- !query analysis +Aggregate [_metadata#x.file_name], [_metadata#x.file_name AS file_name#x, min(_metadata#x.row_index) AS min(_metadata.row_index)#xL, max(_metadata#x.row_index) AS max(_metadata.row_index)#xL] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.file_path, COUNT(DISTINCT _metadata.file_name) FROM t1 +GROUP BY _metadata.file_path +-- !query analysis +Aggregate [_metadata#x.file_path], [_metadata#x.file_path AS file_path#x, count(distinct _metadata#x.file_name) AS count(DISTINCT _metadata.file_name)#xL] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT t1.id, t2.category, t1._metadata.file_name +FROM t1 JOIN t2 ON t1.id = t2.id +-- !query analysis +Project [id#x, category#x, _metadata#x.file_name AS file_name#x] ++- Join Inner, (id#x = id#x) + :- SubqueryAlias spark_catalog.testdb.t1 + : +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + +- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL] csv + + +-- !query +SELECT t1.id, t1._metadata.file_name, t2._metadata.file_name +FROM t1 JOIN t2 ON t1.id = t2.id +-- !query analysis +Project [id#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_name AS file_name#x] ++- Join Inner, (id#x = id#x) + :- SubqueryAlias spark_catalog.testdb.t1 + : +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + +- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + + +-- !query +SELECT t1.id, t1._metadata.row_index, t2._metadata.row_index +FROM t1 JOIN t2 ON t1.id = t2.id +WHERE t1._metadata.file_size > 1000 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "FIELD_NOT_FOUND", + "sqlState" 
: "42704", + "messageParameters" : { + "fieldName" : "`row_index`", + "fields" : "`file_path`, `file_name`, `file_size`, `file_block_start`, `file_block_length`, `file_modification_time`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 39, + "stopIndex" : 60, + "fragment" : "t2._metadata.row_index" + } ] +} + + +-- !query +SELECT t1.name, t1._metadata.file_name, t2._metadata.file_name +FROM t1 LEFT JOIN t2 ON t1.id = t2.id +-- !query analysis +Project [name#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_name AS file_name#x] ++- Join LeftOuter, (id#x = id#x) + :- SubqueryAlias spark_catalog.testdb.t1 + : +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + +- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + + +-- !query +SELECT t1._metadata.file_path, t2._metadata.file_path, t1.id +FROM t1 RIGHT JOIN t2 ON t1.id = t2.id +-- !query analysis +Project [_metadata#x.file_path AS file_path#x, _metadata#x.file_path AS file_path#x, id#x] ++- Join RightOuter, (id#x = id#x) + :- SubqueryAlias spark_catalog.testdb.t1 + : +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + +- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + + +-- !query +SELECT t1._metadata.file_name, t2._metadata.file_name, t1.id, t2.id +FROM t1 FULL OUTER JOIN t2 ON t1.id = t2.id +-- !query analysis +Project [_metadata#x.file_name AS file_name#x, _metadata#x.file_name AS file_name#x, id#x, id#x] ++- Join FullOuter, (id#x = id#x) + :- SubqueryAlias spark_catalog.testdb.t1 + : +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + +- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + + +-- !query +SELECT a.id, a._metadata.file_name, b._metadata.file_name +FROM t1 a JOIN t1 b 
ON a.id = b.id +WHERE a._metadata.row_index < b._metadata.row_index +-- !query analysis +Project [id#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_name AS file_name#x] ++- Filter (_metadata#x.row_index < _metadata#x.row_index) + +- Join Inner, (id#x = id#x) + :- SubqueryAlias a + : +- SubqueryAlias spark_catalog.testdb.t1 + : +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + +- SubqueryAlias b + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT t._metadata.file_name, t.id FROM t1 t +-- !query analysis +Project [_metadata#x.file_name AS file_name#x, id#x] ++- SubqueryAlias t + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT x._metadata.file_name, x._metadata.row_index FROM t1 x +WHERE x._metadata.file_size > 500 +-- !query analysis +Project [_metadata#x.file_name AS file_name#x, _metadata#x.row_index AS row_index#xL] ++- Project [id#x, name#x, value#x, _metadata#x] + +- Filter (_metadata#x.file_size > cast(500 as bigint)) + +- SubqueryAlias x + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT * FROM (SELECT id, _metadata.file_name FROM t1) sub +-- !query analysis +Project [id#x, file_name#x] ++- SubqueryAlias sub + +- Project [id#x, _metadata#x.file_name AS file_name#x] + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT id FROM t1 WHERE _metadata.file_name IN + (SELECT _metadata.file_name FROM t2 WHERE amount > 100) +-- !query analysis +Project [id#x] ++- Project [id#x, name#x, value#x] + +- Filter _metadata#x.file_name IN (list#x []) + : +- Project [_metadata#x.file_name AS file_name#x] + : +- Filter (amount#xL > cast(100 as bigint)) + : +- SubqueryAlias 
spark_catalog.testdb.t2 + : +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.file_name, id FROM t1 +UNION +SELECT _metadata.file_name, id FROM t2 +-- !query analysis +Distinct ++- Union false, false + :- Project [_metadata#x.file_name AS file_name#x, id#x] + : +- SubqueryAlias spark_catalog.testdb.t1 + : +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + +- Project [_metadata#x.file_name AS file_name#x, id#x] + +- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + + +-- !query +SELECT _metadata.* FROM t1 +-- !query analysis +Project [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x, _metadata#x.row_index AS row_index#xL] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.* FROM t2 +-- !query analysis +Project [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x] ++- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + + +-- !query +SELECT _metadata.* FROM t3 +-- !query analysis +Project [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, 
_metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x] ++- SubqueryAlias spark_catalog.testdb.t3 + +- Relation spark_catalog.testdb.t3[a#x,b#x,c#x,_metadata#x] orc + + +-- !query +SELECT t1._metadata.* FROM t1 +-- !query analysis +Project [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x, _metadata#x.row_index AS row_index#xL] ++- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT t2._metadata.* FROM t2 +-- !query analysis +Project [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x] ++- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + + +-- !query +SELECT t3._metadata.* FROM t3 +-- !query analysis +Project [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x] ++- SubqueryAlias spark_catalog.testdb.t3 + +- Relation spark_catalog.testdb.t3[a#x,b#x,c#x,_metadata#x] orc + + +-- !query +SELECT id, name, _metadata.* FROM t1 ORDER BY id +-- !query analysis +Sort [id#x ASC NULLS FIRST], true ++- Project [id#x, name#x, _metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start 
AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x, _metadata#x.row_index AS row_index#xL] + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT id, category, _metadata.* FROM t2 ORDER BY id +-- !query analysis +Sort [id#x ASC NULLS FIRST], true ++- Project [id#x, category#x, _metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x] + +- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + + +-- !query +SELECT sub.file_name, sub.file_size FROM (SELECT _metadata.* FROM t1) sub ORDER BY sub.file_name +-- !query analysis +Sort [file_name#x ASC NULLS FIRST], true ++- Project [file_name#x, file_size#xL] + +- SubqueryAlias sub + +- Project [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x, _metadata#x.row_index AS row_index#xL] + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT id, _metadata.* FROM t1 WHERE _metadata.file_size >= 0 ORDER BY id +-- !query analysis +Sort [id#x ASC NULLS FIRST], true ++- Project [id#x, _metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS 
file_modification_time#x, _metadata#x.row_index AS row_index#xL] + +- Project [id#x, name#x, value#x, _metadata#x] + +- Filter (_metadata#x.file_size >= cast(0 as bigint)) + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.*, COUNT(*) AS cnt FROM t1 GROUP BY ALL ORDER BY _metadata.file_name +-- !query analysis +Sort [file_name#x ASC NULLS FIRST], true ++- Aggregate [_metadata#x.file_path, _metadata#x.file_name, _metadata#x.file_size, _metadata#x.file_block_start, _metadata#x.file_block_length, _metadata#x.file_modification_time, _metadata#x.row_index], [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x, _metadata#x.row_index AS row_index#xL, count(1) AS cnt#xL] + +- SubqueryAlias spark_catalog.testdb.t1 + +- Relation spark_catalog.testdb.t1[id#x,name#x,value#x,_metadata#x] parquet + + +-- !query +SELECT _metadata.*, COUNT(*) AS cnt FROM t2 GROUP BY ALL ORDER BY _metadata.file_name +-- !query analysis +Sort [file_name#x ASC NULLS FIRST], true ++- Aggregate [_metadata#x.file_path, _metadata#x.file_name, _metadata#x.file_size, _metadata#x.file_block_start, _metadata#x.file_block_length, _metadata#x.file_modification_time], [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x, count(1) AS cnt#xL] + +- SubqueryAlias spark_catalog.testdb.t2 + +- Relation spark_catalog.testdb.t2[id#x,category#x,amount#xL,_metadata#x] csv + + +-- !query +SELECT _metadata.*, COUNT(*) AS cnt FROM t3 GROUP BY ALL ORDER BY 
_metadata.file_name +-- !query analysis +Sort [file_name#x ASC NULLS FIRST], true ++- Aggregate [_metadata#x.file_path, _metadata#x.file_name, _metadata#x.file_size, _metadata#x.file_block_start, _metadata#x.file_block_length, _metadata#x.file_modification_time], [_metadata#x.file_path AS file_path#x, _metadata#x.file_name AS file_name#x, _metadata#x.file_size AS file_size#xL, _metadata#x.file_block_start AS file_block_start#xL, _metadata#x.file_block_length AS file_block_length#xL, _metadata#x.file_modification_time AS file_modification_time#x, count(1) AS cnt#xL] + +- SubqueryAlias spark_catalog.testdb.t3 + +- Relation spark_catalog.testdb.t3[a#x,b#x,c#x,_metadata#x] orc + + +-- !query +DROP TABLE t1 +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), testdb.t1 + + +-- !query +DROP TABLE t2 +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), testdb.t2 + + +-- !query +DROP TABLE t3 +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), testdb.t3 + + +-- !query +DROP DATABASE testdb +-- !query analysis +DropNamespace false, false ++- ResolvedNamespace V2SessionCatalog(spark_catalog), [testdb] diff --git a/sql/core/src/test/resources/sql-tests/inputs/metadata_column_resolution.sql b/sql/core/src/test/resources/sql-tests/inputs/metadata_column_resolution.sql new file mode 100644 index 0000000000000..2a77a3abc4158 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/metadata_column_resolution.sql @@ -0,0 +1,129 @@ +-- Tests for _metadata virtual column resolution across different file formats, +-- qualifications, clauses, and join types. +-- t1 uses PARQUET, t2 uses CSV, t3 uses ORC. 
+ +CREATE DATABASE IF NOT EXISTS testdb; +USE testdb; + +CREATE TABLE t1(id INT, name STRING, value DOUBLE) USING PARQUET; +CREATE TABLE t2(id INT, category STRING, amount BIGINT) USING CSV; +CREATE TABLE t3(a INT, b STRING, c DOUBLE) USING ORC; + +-- Basic metadata field selection on PARQUET t1 (the only table here that resolves row_index) +SELECT _metadata.file_name FROM t1; +SELECT _metadata.file_path FROM t1; +SELECT _metadata.file_size FROM t1; +SELECT _metadata.file_modification_time FROM t1; +SELECT _metadata.row_index FROM t1; +SELECT _metadata.file_block_start FROM t1; +SELECT _metadata.file_block_length FROM t1; + +-- Qualified table references with metadata +SELECT t1._metadata.file_name FROM t1; +SELECT testdb.t1._metadata.file_name FROM testdb.t1; + +-- Mixed qualification levels in one query (unqualified, table-qualified, database-qualified) +SELECT _metadata.file_name, t1._metadata.file_path FROM t1; +SELECT t1._metadata.file_name, testdb.t1._metadata.file_path FROM testdb.t1; + +-- Metadata in WHERE clause +SELECT * FROM t1 WHERE _metadata.file_size > 1000; +SELECT id, name FROM t1 WHERE _metadata.row_index < 100; +SELECT * FROM t1 WHERE _metadata.file_name = 'data.parquet'; +SELECT * FROM t1 WHERE _metadata.file_modification_time > timestamp'2024-01-01 00:00:00'; + +-- Qualified metadata in WHERE clause +SELECT * FROM t1 WHERE t1._metadata.file_size > 1000; +SELECT * FROM testdb.t1 WHERE testdb.t1._metadata.row_index < 100; + +-- Metadata in ORDER BY clause +SELECT id, name, _metadata.file_name FROM t1 ORDER BY _metadata.file_size; +SELECT * FROM t1 ORDER BY _metadata.row_index DESC; +SELECT id, _metadata.file_name FROM t1 ORDER BY _metadata.file_modification_time, _metadata.row_index; + +-- Qualified metadata in GROUP BY +SELECT t1._metadata.file_name, COUNT(*) FROM t1 GROUP BY t1._metadata.file_name; +SELECT testdb.t1._metadata.file_path, SUM(value) FROM testdb.t1 GROUP BY testdb.t1._metadata.file_path; + +-- Metadata with aggregations +SELECT _metadata.file_name, MIN(_metadata.row_index), MAX(_metadata.row_index) FROM t1 +GROUP BY
_metadata.file_name; +SELECT _metadata.file_path, COUNT(DISTINCT _metadata.file_name) FROM t1 +GROUP BY _metadata.file_path; + +-- Metadata of both INNER JOIN sides (third query expects FIELD_NOT_FOUND: CSV t2 has no row_index) +SELECT t1.id, t2.category, t1._metadata.file_name +FROM t1 JOIN t2 ON t1.id = t2.id; + +SELECT t1.id, t1._metadata.file_name, t2._metadata.file_name +FROM t1 JOIN t2 ON t1.id = t2.id; + +SELECT t1.id, t1._metadata.row_index, t2._metadata.row_index +FROM t1 JOIN t2 ON t1.id = t2.id +WHERE t1._metadata.file_size > 1000; + +-- Metadata in LEFT/RIGHT JOIN +SELECT t1.name, t1._metadata.file_name, t2._metadata.file_name +FROM t1 LEFT JOIN t2 ON t1.id = t2.id; + +SELECT t1._metadata.file_path, t2._metadata.file_path, t1.id +FROM t1 RIGHT JOIN t2 ON t1.id = t2.id; + +-- Metadata in FULL OUTER JOIN +SELECT t1._metadata.file_name, t2._metadata.file_name, t1.id, t2.id +FROM t1 FULL OUTER JOIN t2 ON t1.id = t2.id; + +-- Metadata in self-join +SELECT a.id, a._metadata.file_name, b._metadata.file_name +FROM t1 a JOIN t1 b ON a.id = b.id +WHERE a._metadata.row_index < b._metadata.row_index; + +-- Metadata with table aliases +SELECT t._metadata.file_name, t.id FROM t1 t; +SELECT x._metadata.file_name, x._metadata.row_index FROM t1 x +WHERE x._metadata.file_size > 500; + +-- Metadata in subqueries +SELECT * FROM (SELECT id, _metadata.file_name FROM t1) sub; +SELECT id FROM t1 WHERE _metadata.file_name IN + (SELECT _metadata.file_name FROM t2 WHERE amount > 100); + +-- Metadata in UNION +SELECT _metadata.file_name, id FROM t1 +UNION +SELECT _metadata.file_name, id FROM t2; + +-- === _metadata.* struct expansion === + +-- Expand all metadata fields per format (only PARQUET t1 yields row_index) +SELECT _metadata.* FROM t1; +SELECT _metadata.* FROM t2; +SELECT _metadata.* FROM t3; + +-- qualified _metadata.* expansion +SELECT t1._metadata.* FROM t1; +SELECT t2._metadata.* FROM t2; +SELECT t3._metadata.* FROM t3; + +-- _metadata.* alongside regular columns (parquet) +SELECT id, name, _metadata.* FROM t1 ORDER BY id; + +-- _metadata.* alongside regular columns
(csv) +SELECT id, category, _metadata.* FROM t2 ORDER BY id; + +-- _metadata.* in subquery +SELECT sub.file_name, sub.file_size FROM (SELECT _metadata.* FROM t1) sub ORDER BY sub.file_name; + +-- _metadata.* with WHERE on expanded fields +SELECT id, _metadata.* FROM t1 WHERE _metadata.file_size >= 0 ORDER BY id; + +-- _metadata.* in Aggregate +SELECT _metadata.*, COUNT(*) AS cnt FROM t1 GROUP BY ALL ORDER BY _metadata.file_name; +SELECT _metadata.*, COUNT(*) AS cnt FROM t2 GROUP BY ALL ORDER BY _metadata.file_name; +SELECT _metadata.*, COUNT(*) AS cnt FROM t3 GROUP BY ALL ORDER BY _metadata.file_name; + +-- Cleanup +DROP TABLE t1; +DROP TABLE t2; +DROP TABLE t3; +DROP DATABASE testdb; diff --git a/sql/core/src/test/resources/sql-tests/results/metadata_column_resolution.sql.out b/sql/core/src/test/resources/sql-tests/results/metadata_column_resolution.sql.out new file mode 100644 index 0000000000000..d9a1176a8c8e4 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/metadata_column_resolution.sql.out @@ -0,0 +1,493 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +CREATE DATABASE IF NOT EXISTS testdb +-- !query schema +struct<> +-- !query output + + + +-- !query +USE testdb +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t1(id INT, name STRING, value DOUBLE) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t2(id INT, category STRING, amount BIGINT) USING CSV +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t3(a INT, b STRING, c DOUBLE) USING ORC +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT _metadata.file_name FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.file_path FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.file_size FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.file_modification_time FROM t1 
+-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.row_index FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.file_block_start FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.file_block_length FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t1._metadata.file_name FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT testdb.t1._metadata.file_name FROM testdb.t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.file_name, t1._metadata.file_path FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t1._metadata.file_name, testdb.t1._metadata.file_path FROM testdb.t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT * FROM t1 WHERE _metadata.file_size > 1000 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT id, name FROM t1 WHERE _metadata.row_index < 100 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT * FROM t1 WHERE _metadata.file_name = 'data.parquet' +-- !query schema +struct +-- !query output + + + +-- !query +SELECT * FROM t1 WHERE _metadata.file_modification_time > timestamp'2024-01-01 00:00:00' +-- !query schema +struct +-- !query output + + + +-- !query +SELECT * FROM t1 WHERE t1._metadata.file_size > 1000 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT * FROM testdb.t1 WHERE testdb.t1._metadata.row_index < 100 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT id, name, _metadata.file_name FROM t1 ORDER BY _metadata.file_size +-- !query schema +struct +-- !query output + + + +-- !query +SELECT * FROM t1 ORDER BY _metadata.row_index DESC +-- !query schema +struct +-- !query output + + + +-- !query +SELECT id, _metadata.file_name FROM t1 ORDER BY _metadata.file_modification_time, _metadata.row_index +-- !query schema +struct +-- !query output + + + +-- !query 
+SELECT t1._metadata.file_name, COUNT(*) FROM t1 GROUP BY t1._metadata.file_name +-- !query schema +struct +-- !query output + + + +-- !query +SELECT testdb.t1._metadata.file_path, SUM(value) FROM testdb.t1 GROUP BY testdb.t1._metadata.file_path +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.file_name, MIN(_metadata.row_index), MAX(_metadata.row_index) FROM t1 +GROUP BY _metadata.file_name +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.file_path, COUNT(DISTINCT _metadata.file_name) FROM t1 +GROUP BY _metadata.file_path +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t1.id, t2.category, t1._metadata.file_name +FROM t1 JOIN t2 ON t1.id = t2.id +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t1.id, t1._metadata.file_name, t2._metadata.file_name +FROM t1 JOIN t2 ON t1.id = t2.id +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t1.id, t1._metadata.row_index, t2._metadata.row_index +FROM t1 JOIN t2 ON t1.id = t2.id +WHERE t1._metadata.file_size > 1000 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "FIELD_NOT_FOUND", + "sqlState" : "42704", + "messageParameters" : { + "fieldName" : "`row_index`", + "fields" : "`file_path`, `file_name`, `file_size`, `file_block_start`, `file_block_length`, `file_modification_time`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 39, + "stopIndex" : 60, + "fragment" : "t2._metadata.row_index" + } ] +} + + +-- !query +SELECT t1.name, t1._metadata.file_name, t2._metadata.file_name +FROM t1 LEFT JOIN t2 ON t1.id = t2.id +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t1._metadata.file_path, t2._metadata.file_path, t1.id +FROM t1 RIGHT JOIN t2 ON t1.id = t2.id +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t1._metadata.file_name, t2._metadata.file_name, t1.id, t2.id +FROM t1 FULL 
OUTER JOIN t2 ON t1.id = t2.id +-- !query schema +struct +-- !query output + + + +-- !query +SELECT a.id, a._metadata.file_name, b._metadata.file_name +FROM t1 a JOIN t1 b ON a.id = b.id +WHERE a._metadata.row_index < b._metadata.row_index +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t._metadata.file_name, t.id FROM t1 t +-- !query schema +struct +-- !query output + + + +-- !query +SELECT x._metadata.file_name, x._metadata.row_index FROM t1 x +WHERE x._metadata.file_size > 500 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT * FROM (SELECT id, _metadata.file_name FROM t1) sub +-- !query schema +struct +-- !query output + + + +-- !query +SELECT id FROM t1 WHERE _metadata.file_name IN + (SELECT _metadata.file_name FROM t2 WHERE amount > 100) +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.file_name, id FROM t1 +UNION +SELECT _metadata.file_name, id FROM t2 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.* FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.* FROM t2 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.* FROM t3 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t1._metadata.* FROM t1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t2._metadata.* FROM t2 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT t3._metadata.* FROM t3 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT id, name, _metadata.* FROM t1 ORDER BY id +-- !query schema +struct +-- !query output + + + +-- !query +SELECT id, category, _metadata.* FROM t2 ORDER BY id +-- !query schema +struct +-- !query output + + + +-- !query +SELECT sub.file_name, sub.file_size FROM (SELECT _metadata.* FROM t1) sub ORDER BY sub.file_name +-- !query schema +struct +-- !query output + + + +-- !query +SELECT id, _metadata.* FROM t1 WHERE _metadata.file_size >= 0 ORDER 
BY id +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.*, COUNT(*) AS cnt FROM t1 GROUP BY ALL ORDER BY _metadata.file_name +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.*, COUNT(*) AS cnt FROM t2 GROUP BY ALL ORDER BY _metadata.file_name +-- !query schema +struct +-- !query output + + + +-- !query +SELECT _metadata.*, COUNT(*) AS cnt FROM t3 GROUP BY ALL ORDER BY _metadata.file_name +-- !query schema +struct +-- !query output + + + +-- !query +DROP TABLE t1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE t3 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP DATABASE testdb +-- !query schema +struct<> +-- !query output +