Commit 81c5107

[Feature][Connector]add s3file save mode function (#6131)
1 parent 919a910 commit 81c5107

8 files changed, +404 −8 lines changed

docs/en/connector-v2/sink/S3File.md

Lines changed: 79 additions & 1 deletion

@@ -117,7 +117,12 @@ If write to `csv`, `text` file type, All column will be string.
 | max_rows_in_memory | int | no | - | Only used when file_format is excel. |
 | sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. |
 | hadoop_s3_properties | map | no | | If you need to add a other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) |
-| |
+| schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronization task starts, how to handle the target path |
+| data_save_mode | Enum | no | APPEND_DATA | Before the synchronization task starts, how to handle existing data files in the target path |
+
+### path [string]
+
+The target path of the data files. Variable replacement is supported, for example: path=/test/${database_name}/${schema_name}/${table_name}

 ### hadoop_s3_properties [map]

@@ -241,6 +246,22 @@ When File Format is Excel,The maximum number of data items that can be cached in
 Writer the sheet of the workbook

+### schema_save_mode [Enum]
+
+Before the synchronization task starts, choose how the target path is handled.
+Option introduction:
+`RECREATE_SCHEMA`: the path is created when it does not exist; if it already exists, it is deleted and recreated.
+`CREATE_SCHEMA_WHEN_NOT_EXIST`: the path is created when it does not exist; if it already exists, it is used as-is.
+`ERROR_WHEN_SCHEMA_NOT_EXIST`: an error is reported when the path does not exist.
+
+### data_save_mode [Enum]
+
+Before the synchronization task starts, choose how existing data files in the target path are handled.
+Option introduction:
+`DROP_DATA`: use the path but delete the data files already in it.
+`APPEND_DATA`: use the path and add new files to it as data is written.
+`ERROR_WHEN_DATA_EXISTS`: an error is reported when data files already exist in the path.
+
 ## Example

 ### Simple:

@@ -383,10 +404,67 @@ For orc file format simple config with `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider`
     access_key = "xxxxxxxxxxxxxxxxx"
     secret_key = "xxxxxxxxxxxxxxxxx"
     file_format_type = "orc"
+    schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST"
+    data_save_mode = "APPEND_DATA"
   }

 ```

+Multi-table writing and saveMode:
+
+```
+env {
+  "job.name" = "SeaTunnel_job"
+  "job.mode" = STREAMING
+}
+source {
+  MySQL-CDC {
+    "connect.max-retries" = 3
+    "connection.pool.size" = 6
+    "startup.mode" = INITIAL
+    "exactly_once" = "true"
+    "stop.mode" = NEVER
+    parallelism = 1
+    "result_table_name" = Table11519548644512
+    "dag-parsing.mode" = MULTIPLEX
+    catalog {
+      factory = Mysql
+    }
+    database-names = [
+      "wls_t1"
+    ]
+    table-names = [
+      "wls_t1.mysqlcdc_to_s3_t3",
+      "wls_t1.mysqlcdc_to_s3_t4",
+      "wls_t1.mysqlcdc_to_s3_t5",
+      "wls_t1.mysqlcdc_to_s3_t1",
+      "wls_t1.mysqlcdc_to_s3_t2"
+    ]
+    password = "xxxxxx"
+    username = "xxxxxxxxxxxxx"
+    base-url = "jdbc:mysql://localhost:3306/qa_source"
+    server-time-zone = UTC
+  }
+}
+transform {
+}
+sink {
+  S3File {
+    bucket = "s3a://seatunnel-test"
+    tmp_path = "/tmp/seatunnel"
+    path = "/test/${table_name}"
+    fs.s3a.endpoint = "s3.cn-north-1.amazonaws.com.cn"
+    fs.s3a.aws.credentials.provider = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
+    access_key = "xxxxxxxxxxxxxxxxx"
+    secret_key = "xxxxxxxxxxxxxxxxx"
+    file_format_type = "orc"
+    schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST"
+    data_save_mode = "APPEND_DATA"
+  }
+}
+```
+
 ## Changelog

 ### 2.3.0-beta 2022-10-20
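
Taken together, the two options define a small decision matrix that runs before any rows are written. Below is a minimal sketch of the schema_save_mode branch, written against the Catalog API added later in this commit; the handleSchemaSaveMode method and its wiring are hypothetical illustration, not code from the commit (SeaTunnel's own save-mode handler infrastructure does the real work):

```java
import org.apache.seatunnel.api.sink.SchemaSaveMode;
import org.apache.seatunnel.api.table.catalog.Catalog;
import org.apache.seatunnel.api.table.catalog.TablePath;

public class SchemaSaveModeSketch {
    // Hypothetical driver: what each schema_save_mode value means for the target path.
    static void handleSchemaSaveMode(SchemaSaveMode mode, Catalog catalog, TablePath path) {
        switch (mode) {
            case RECREATE_SCHEMA:
                // Delete the path if present, then recreate it empty.
                if (catalog.tableExists(path)) {
                    catalog.dropTable(path, true);
                }
                // S3FileCatalog ignores the CatalogTable argument, so null suffices here.
                catalog.createTable(path, null, true);
                break;
            case CREATE_SCHEMA_WHEN_NOT_EXIST:
                // Create the path only when it is missing; otherwise reuse it.
                if (!catalog.tableExists(path)) {
                    catalog.createTable(path, null, true);
                }
                break;
            case ERROR_WHEN_SCHEMA_NOT_EXIST:
                // Fail fast when the path is missing.
                if (!catalog.tableExists(path)) {
                    throw new IllegalStateException("Target path does not exist: " + path);
                }
                break;
            default:
                break; // other modes are not relevant to this connector
        }
    }
}
```
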

seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hadoop/HadoopFileSystemProxy.java

Lines changed: 19 additions & 0 deletions

@@ -28,7 +28,9 @@
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.security.UserGroupInformation;

 import lombok.NonNull;
@@ -140,6 +142,23 @@ public void createDir(@NonNull String filePath) throws IOException {
         }
     }

+    public List<LocatedFileStatus> listFile(String path) throws IOException {
+        if (fileSystem == null) {
+            initialize();
+        }
+        List<LocatedFileStatus> fileList = new ArrayList<>();
+        if (!fileExist(path)) {
+            return fileList;
+        }
+        Path fileName = new Path(path);
+        RemoteIterator<LocatedFileStatus> locatedFileStatusRemoteIterator =
+                fileSystem.listFiles(fileName, false);
+        while (locatedFileStatusRemoteIterator.hasNext()) {
+            fileList.add(locatedFileStatusRemoteIterator.next());
+        }
+        return fileList;
+    }
+
     public List<Path> getAllSubFiles(@NonNull String filePath) throws IOException {
         if (fileSystem == null) {
             initialize();
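
The new listFile helper is non-recursive and returns an empty list for a missing path, so callers need no separate existence check. A minimal usage sketch, assuming a HadoopConf already configured for the target bucket:

```java
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.seatunnel.connectors.seatunnel.file.config.HadoopConf;
import org.apache.seatunnel.connectors.seatunnel.file.hadoop.HadoopFileSystemProxy;

public class ListFileSketch {
    static boolean pathHasData(HadoopConf hadoopConf, String path) throws IOException {
        HadoopFileSystemProxy proxy = new HadoopFileSystemProxy(hadoopConf);
        // Non-recursive listing; a missing path yields an empty list rather
        // than an exception, which is exactly what the save-mode checks need.
        List<LocatedFileStatus> files = proxy.listFile(path);
        return !files.isEmpty();
    }
}
```
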
seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/catalog/S3FileCatalog.java

Lines changed: 127 additions & 0 deletions

@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.file.s3.catalog;
+
+import org.apache.seatunnel.api.configuration.ReadonlyConfig;
+import org.apache.seatunnel.api.table.catalog.Catalog;
+import org.apache.seatunnel.api.table.catalog.CatalogTable;
+import org.apache.seatunnel.api.table.catalog.TablePath;
+import org.apache.seatunnel.api.table.catalog.exception.CatalogException;
+import org.apache.seatunnel.api.table.catalog.exception.DatabaseAlreadyExistException;
+import org.apache.seatunnel.api.table.catalog.exception.DatabaseNotExistException;
+import org.apache.seatunnel.api.table.catalog.exception.TableAlreadyExistException;
+import org.apache.seatunnel.api.table.catalog.exception.TableNotExistException;
+import org.apache.seatunnel.connectors.seatunnel.file.hadoop.HadoopFileSystemProxy;
+import org.apache.seatunnel.connectors.seatunnel.file.s3.config.S3ConfigOptions;
+
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.hadoop.fs.LocatedFileStatus;
+
+import lombok.AllArgsConstructor;
+import lombok.SneakyThrows;
+
+import java.util.List;
+
+@AllArgsConstructor
+public class S3FileCatalog implements Catalog {
+
+    private final HadoopFileSystemProxy hadoopFileSystemProxy;
+    private final ReadonlyConfig readonlyConfig;
+
+    @Override
+    public void open() throws CatalogException {}
+
+    @Override
+    public void close() throws CatalogException {}
+
+    @Override
+    public String name() {
+        return "S3File";
+    }
+
+    @Override
+    public String getDefaultDatabase() throws CatalogException {
+        return null;
+    }
+
+    @Override
+    public boolean databaseExists(String databaseName) throws CatalogException {
+        return false;
+    }
+
+    @Override
+    public List<String> listDatabases() throws CatalogException {
+        return null;
+    }
+
+    @Override
+    public List<String> listTables(String databaseName)
+            throws CatalogException, DatabaseNotExistException {
+        return null;
+    }
+
+    @SneakyThrows
+    @Override
+    public boolean tableExists(TablePath tablePath) throws CatalogException {
+        return hadoopFileSystemProxy.fileExist(readonlyConfig.get(S3ConfigOptions.FILE_PATH));
+    }
+
+    @Override
+    public CatalogTable getTable(TablePath tablePath)
+            throws CatalogException, TableNotExistException {
+        return null;
+    }
+
+    @SneakyThrows
+    @Override
+    public void createTable(TablePath tablePath, CatalogTable table, boolean ignoreIfExists)
+            throws TableAlreadyExistException, DatabaseNotExistException, CatalogException {
+        hadoopFileSystemProxy.createDir(readonlyConfig.get(S3ConfigOptions.FILE_PATH));
+    }
+
+    @SneakyThrows
+    @Override
+    public void dropTable(TablePath tablePath, boolean ignoreIfNotExists)
+            throws TableNotExistException, CatalogException {
+        hadoopFileSystemProxy.deleteFile(readonlyConfig.get(S3ConfigOptions.FILE_PATH));
+    }
+
+    @Override
+    public void createDatabase(TablePath tablePath, boolean ignoreIfExists)
+            throws DatabaseAlreadyExistException, CatalogException {}
+
+    @Override
+    public void dropDatabase(TablePath tablePath, boolean ignoreIfNotExists)
+            throws DatabaseNotExistException, CatalogException {}
+
+    @SneakyThrows
+    @Override
+    public void truncateTable(TablePath tablePath, boolean ignoreIfNotExists)
+            throws TableNotExistException, CatalogException {
+        hadoopFileSystemProxy.deleteFile(readonlyConfig.get(S3ConfigOptions.FILE_PATH));
+        hadoopFileSystemProxy.createDir(readonlyConfig.get(S3ConfigOptions.FILE_PATH));
+    }
+
+    @SneakyThrows
+    @Override
+    public boolean isExistsData(TablePath tablePath) {
+        final List<LocatedFileStatus> locatedFileStatuses =
+                hadoopFileSystemProxy.listFile(readonlyConfig.get(S3ConfigOptions.FILE_PATH));
+        return CollectionUtils.isNotEmpty(locatedFileStatuses);
+    }
+}
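
Read against the options, each data_save_mode value resolves to one of the catalog methods above: DROP_DATA to truncateTable, APPEND_DATA to a no-op, and ERROR_WHEN_DATA_EXISTS to an isExistsData check. A hypothetical driver sketch, not part of the commit:

```java
import org.apache.seatunnel.api.sink.DataSaveMode;
import org.apache.seatunnel.api.table.catalog.TablePath;
import org.apache.seatunnel.connectors.seatunnel.file.s3.catalog.S3FileCatalog;

public class DataSaveModeSketch {
    static void handleDataSaveMode(DataSaveMode mode, S3FileCatalog catalog, TablePath path) {
        switch (mode) {
            case DROP_DATA:
                // truncateTable deletes the path and recreates it empty.
                catalog.truncateTable(path, true);
                break;
            case APPEND_DATA:
                // Keep existing files; new files are simply written next to them.
                break;
            case ERROR_WHEN_DATA_EXISTS:
                // isExistsData lists the path and fails if any file is present.
                if (catalog.isExistsData(path)) {
                    throw new IllegalStateException("Data files already exist under the target path");
                }
                break;
            default:
                throw new UnsupportedOperationException("Unsupported data_save_mode: " + mode);
        }
    }
}
```
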
seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/catalog/S3FileCatalogFactory.java

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.file.s3.catalog;
+
+import org.apache.seatunnel.api.configuration.ReadonlyConfig;
+import org.apache.seatunnel.api.configuration.util.OptionRule;
+import org.apache.seatunnel.api.table.catalog.Catalog;
+import org.apache.seatunnel.api.table.factory.CatalogFactory;
+import org.apache.seatunnel.api.table.factory.Factory;
+import org.apache.seatunnel.connectors.seatunnel.file.config.HadoopConf;
+import org.apache.seatunnel.connectors.seatunnel.file.hadoop.HadoopFileSystemProxy;
+import org.apache.seatunnel.connectors.seatunnel.file.s3.config.S3Conf;
+
+import com.google.auto.service.AutoService;
+
+@AutoService(Factory.class)
+public class S3FileCatalogFactory implements CatalogFactory {
+    @Override
+    public Catalog createCatalog(String catalogName, ReadonlyConfig options) {
+        HadoopConf hadoopConf = S3Conf.buildWithReadOnlyConfig(options);
+        HadoopFileSystemProxy fileSystemUtils = new HadoopFileSystemProxy(hadoopConf);
+        return new S3FileCatalog(fileSystemUtils, options);
+    }
+
+    @Override
+    public String factoryIdentifier() {
+        return "S3";
+    }
+
+    @Override
+    public OptionRule optionRule() {
+        return OptionRule.builder().build();
+    }
+}
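
Because the factory is registered via @AutoService(Factory.class), it can be discovered at runtime by its "S3" identifier. The sketch below uses plain java.util.ServiceLoader rather than assuming the exact signature of SeaTunnel's factory-discovery helpers:

```java
import java.util.ServiceLoader;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.table.catalog.Catalog;
import org.apache.seatunnel.api.table.factory.CatalogFactory;
import org.apache.seatunnel.api.table.factory.Factory;

public class CatalogLookupSketch {
    static Catalog createS3Catalog(ReadonlyConfig options) {
        // @AutoService registers the factory under the Factory SPI,
        // so a plain ServiceLoader scan can find it by identifier.
        for (Factory factory : ServiceLoader.load(Factory.class)) {
            if (factory instanceof CatalogFactory
                    && "S3".equals(factory.factoryIdentifier())) {
                return ((CatalogFactory) factory).createCatalog("S3", options);
            }
        }
        throw new IllegalStateException("S3 catalog factory not found on the classpath");
    }
}
```
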

seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/config/S3Conf.java

Lines changed: 25 additions & 0 deletions

@@ -19,6 +19,7 @@

 import org.apache.seatunnel.shade.com.typesafe.config.Config;

+import org.apache.seatunnel.api.configuration.ReadonlyConfig;
 import org.apache.seatunnel.common.config.CheckConfigUtil;
 import org.apache.seatunnel.connectors.seatunnel.file.config.HadoopConf;

@@ -70,6 +71,30 @@ public static HadoopConf buildWithConfig(Config config) {
         return hadoopConf;
     }

+    public static HadoopConf buildWithReadOnlyConfig(ReadonlyConfig readonlyConfig) {
+        Config config = readonlyConfig.toConfig();
+        HadoopConf hadoopConf = new S3Conf(readonlyConfig.get(S3ConfigOptions.S3_BUCKET));
+        String bucketName = readonlyConfig.get(S3ConfigOptions.S3_BUCKET);
+        if (bucketName.startsWith(S3A_SCHEMA)) {
+            SCHEMA = S3A_SCHEMA;
+        }
+        HashMap<String, String> s3Options = new HashMap<>();
+        putS3SK(s3Options, config);
+        if (CheckConfigUtil.isValidParam(config, S3ConfigOptions.S3_PROPERTIES.key())) {
+            config.getObject(S3ConfigOptions.S3_PROPERTIES.key())
+                    .forEach((key, value) -> s3Options.put(key, String.valueOf(value.unwrapped())));
+        }
+
+        s3Options.put(
+                S3ConfigOptions.S3A_AWS_CREDENTIALS_PROVIDER.key(),
+                readonlyConfig.get(S3ConfigOptions.S3A_AWS_CREDENTIALS_PROVIDER).getProvider());
+        s3Options.put(
+                S3ConfigOptions.FS_S3A_ENDPOINT.key(),
+                readonlyConfig.get(S3ConfigOptions.FS_S3A_ENDPOINT));
+        hadoopConf.setExtraOptions(s3Options);
+        return hadoopConf;
+    }
+
     private String switchHdfsImpl() {
         switch (SCHEMA) {
             case S3A_SCHEMA:
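
A usage sketch for the new overload, assuming ReadonlyConfig.fromMap from SeaTunnel's configuration API; the keys mirror the sink examples in the docs, and the credentials provider falls back to its declared default when unset:

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.connectors.seatunnel.file.config.HadoopConf;
import org.apache.seatunnel.connectors.seatunnel.file.s3.config.S3Conf;

public class S3ConfSketch {
    static HadoopConf buildExample() {
        Map<String, Object> raw = new HashMap<>();
        raw.put("bucket", "s3a://seatunnel-test");                    // S3_BUCKET
        raw.put("fs.s3a.endpoint", "s3.cn-north-1.amazonaws.com.cn"); // FS_S3A_ENDPOINT
        // No credentials provider set: the option's default
        // (InstanceProfileCredentialsProvider) applies.
        return S3Conf.buildWithReadOnlyConfig(ReadonlyConfig.fromMap(raw));
    }
}
```
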

seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/config/S3ConfigOptions.java

Lines changed: 23 additions & 0 deletions

@@ -19,10 +19,17 @@

 import org.apache.seatunnel.api.configuration.Option;
 import org.apache.seatunnel.api.configuration.Options;
+import org.apache.seatunnel.api.sink.DataSaveMode;
+import org.apache.seatunnel.api.sink.SchemaSaveMode;
 import org.apache.seatunnel.connectors.seatunnel.file.config.BaseSourceConfigOptions;

+import java.util.Arrays;
 import java.util.Map;

+import static org.apache.seatunnel.api.sink.DataSaveMode.APPEND_DATA;
+import static org.apache.seatunnel.api.sink.DataSaveMode.DROP_DATA;
+import static org.apache.seatunnel.api.sink.DataSaveMode.ERROR_WHEN_DATA_EXISTS;
+
 public class S3ConfigOptions extends BaseSourceConfigOptions {
     public static final Option<String> S3_ACCESS_KEY =
             Options.key("access_key")
@@ -48,6 +55,22 @@ public class S3ConfigOptions extends BaseSourceConfigOptions {
                     .defaultValue(S3aAwsCredentialsProvider.InstanceProfileCredentialsProvider)
                     .withDescription("s3a aws credentials provider");

+    public static final Option<SchemaSaveMode> SCHEMA_SAVE_MODE =
+            Options.key("schema_save_mode")
+                    .enumType(SchemaSaveMode.class)
+                    .defaultValue(SchemaSaveMode.CREATE_SCHEMA_WHEN_NOT_EXIST)
+                    .withDescription(
+                            "Before the synchronization task begins, process the existing path");
+
+    public static final Option<DataSaveMode> DATA_SAVE_MODE =
+            Options.key("data_save_mode")
+                    .singleChoice(
+                            DataSaveMode.class,
+                            Arrays.asList(DROP_DATA, APPEND_DATA, ERROR_WHEN_DATA_EXISTS))
+                    .defaultValue(APPEND_DATA)
+                    .withDescription(
+                            "Before the synchronization task begins, different processing of data files that already exist in the directory");
+
     /**
      * The current key for that config option. if you need to add a new option, you can add it here
      * and refer to this:
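
Note the deliberate asymmetry in the declarations: enumType accepts every SchemaSaveMode constant, while singleChoice restricts data_save_mode to the three listed values. A minimal read-side sketch of how a sink would resolve the options (illustrative, not from the commit):

```java
import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.sink.DataSaveMode;
import org.apache.seatunnel.api.sink.SchemaSaveMode;
import org.apache.seatunnel.connectors.seatunnel.file.s3.config.S3ConfigOptions;

public class SaveModeOptionsSketch {
    static void resolve(ReadonlyConfig config) {
        // Both lookups fall back to the declared defaults when the user
        // omits the keys from the sink block.
        SchemaSaveMode schemaSaveMode = config.get(S3ConfigOptions.SCHEMA_SAVE_MODE);
        DataSaveMode dataSaveMode = config.get(S3ConfigOptions.DATA_SAVE_MODE);
        System.out.println(schemaSaveMode + " / " + dataSaveMode);
    }
}
```
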
