awswrangler/glue.py (3 additions, 1 deletion)

@@ -379,6 +379,7 @@ def csv_partition_definition(partition, compression, extra_args=None):
     return {
         "StorageDescriptor": {
             "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
+            "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
             "Location": partition[0],
             "Compressed": compressed,
             "SerdeInfo": {
@@ -440,7 +441,8 @@ def parquet_partition_definition(partition, compression):
     compressed = False if compression is None else True
     return {
         "StorageDescriptor": {
-            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
+            "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
+            "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
             "Location": partition[0],
             "Compressed": compressed,
             "SerdeInfo": {
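Why this matters: a Glue partition's StorageDescriptor needs a matching InputFormat/OutputFormat pair, and the Parquet partitions were previously being registered with the CSV TextInputFormat. For context, a minimal sketch of registering a partition with the corrected CSV descriptor through boto3 (the database, table, S3 location, partition value, column list, and LazySimpleSerDe choice are all illustrative assumptions, not taken from this PR):

    import boto3

    glue = boto3.client("glue")

    # Mirrors csv_partition_definition() after this change: the text input
    # format is now paired with Hive's text output format.
    partition_input = {
        "Values": ["2019-11-25"],  # hypothetical partition value
        "StorageDescriptor": {
            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
            "Location": "s3://my-bucket/my-table/dt=2019-11-25/",  # hypothetical
            "Compressed": False,
            "Columns": [{"Name": "value", "Type": "string"}],
            "SerdeInfo": {
                "SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
                "Parameters": {"field.delim": ","},
            },
        },
    }

    glue.create_partition(
        DatabaseName="my_database",  # hypothetical
        TableName="my_table",        # hypothetical
        PartitionInput=partition_input,
    )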
awswrangler/pandas.py (3 additions, 3 deletions)

@@ -63,7 +63,7 @@ def read_csv(self, path: str, max_result_size: Optional[int] = None, **pd_additional_kwargs):
 
         :param path: Amazon S3 path (e.g. s3://bucket_name/key_name)
         :param max_result_size: Max number of bytes on each request to S3. It offers functionality similar to chunksize in pandas.read_csv(), but with higher performance
-        :param **pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
+        :param pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
         :return: Pandas Dataframe or Iterator of Pandas Dataframes if max_result_size != None
         """
 
@@ -1583,7 +1583,7 @@ def read_csv_list(
         :param paths: List of Amazon S3 paths (e.g. ['s3://bucket_name/key_name1', 's3://bucket_name/key_name2'])
         :param max_result_size: Max number of bytes on each request to S3. It offers functionality similar to chunksize in pandas.read_csv(), but with higher performance
         :param procs_cpu_bound: Number of cores used for CPU bound tasks
-        :param **pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
+        :param pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
         :return: Pandas Dataframe or Iterator of Pandas Dataframes if max_result_size != None
         """
         if max_result_size is not None:
@@ -1636,7 +1636,7 @@ def _read_csv_list_iterator(self, paths: List[str], max_result_size=None, **pd_additional_kwargs):
 
         :param paths: List of Amazon S3 paths (e.g. ['s3://bucket_name/key_name1', 's3://bucket_name/key_name2'])
        :param max_result_size: Max number of bytes on each request to S3. It offers functionality similar to chunksize in pandas.read_csv(), but with higher performance
-        :param **pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
+        :param pd_additional_kwargs: Additional parameters forwarded to pandas.read_csv
         :return: Iterator of iterators of Pandas Dataframes
         """
         for path in paths:
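The three docstring edits are cosmetic: a Sphinx :param: field should name the parameter without the ** prefix, otherwise the rendered docs show a bogus parameter name. For readers unfamiliar with the API being documented, a hedged usage sketch (the 0.x Session-style invocation is assumed, and the bucket and key are invented):

    import awswrangler

    session = awswrangler.Session()

    # pd_additional_kwargs (here sep and dtype) are forwarded verbatim
    # to pandas.read_csv.
    df = session.pandas.read_csv(
        path="s3://my-bucket/data.csv",  # hypothetical path
        sep=";",
        dtype={"id": "int64"},
    )

    # With max_result_size set, the same call returns an iterator of
    # DataFrames instead of a single DataFrame.
    for chunk in session.pandas.read_csv(path="s3://my-bucket/data.csv",
                                         max_result_size=128 * 1024 * 1024):
        print(len(chunk))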
awswrangler/redshift.py (2 additions, 2 deletions)

@@ -44,7 +44,7 @@ def _validate_connection(database,
                          password,
                          tcp_keepalive=True,
                          application_name="aws-data-wrangler-validation",
-                         validation_timeout=5):
+                         validation_timeout=10):
     conn = pg8000.connect(database=database,
                           host=host,
                           port=int(port),
@@ -66,7 +66,7 @@ def generate_connection(database,
                         application_name="aws-data-wrangler",
                         connection_timeout=1_200_000,
                         statement_timeout=1_200_000,
-                        validation_timeout=5):
+                        validation_timeout=10):
     """
     Generates a valid connection object to be passed to the load_table method
 
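Doubling the default validation_timeout from 5 to 10 seconds gives the pg8000 handshake inside _validate_connection more headroom against slow cluster endpoints. A hedged sketch of calling generate_connection with the new default (the endpoint and password are invented, and exposing generate_connection as a static helper on the Redshift class is an assumption):

    from awswrangler import Redshift

    conn = Redshift.generate_connection(
        database="test",
        host="my-cluster.abc123.us-east-1.redshift.amazonaws.com",  # hypothetical
        port=5439,
        user="test",
        password="my-password",  # hypothetical
        validation_timeout=10,   # the new default, shown explicitly here
    )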
testing/deploy-cloudformation.sh (8 additions, 4 deletions)

@@ -2,7 +2,11 @@
 set -e
 
 aws cloudformation deploy \
---template-file template.yaml \
---stack-name aws-data-wrangler-test \
---capabilities CAPABILITY_IAM \
---parameter-overrides $(cat parameters.properties)
+  --template-file template.yaml \
+  --stack-name aws-data-wrangler-test \
+  --capabilities CAPABILITY_IAM \
+  --parameter-overrides $(cat parameters.properties)
+
+aws cloudformation update-termination-protection \
+  --enable-termination-protection \
+  --stack-name aws-data-wrangler-test
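Besides re-indenting the continuation lines, the script now enables termination protection so the test stack cannot be torn down by an accidental delete-stack. For anyone driving this from Python rather than the CLI, the boto3 equivalent of the new call would be roughly:

    import boto3

    cloudformation = boto3.client("cloudformation")

    # Same effect as `aws cloudformation update-termination-protection ...`
    cloudformation.update_termination_protection(
        EnableTerminationProtection=True,
        StackName="aws-data-wrangler-test",
    )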
testing/parameters.properties (2 additions, 2 deletions)

@@ -2,5 +2,5 @@ VpcId=VPC_ID
 SubnetId=SUBNET_ID
 SubnetId2=SUBNET_ID2
 SubnetAz=AVAILABILITY_ZONE
-Password=REDSHIFT_PASSWORD
-TestUser=AWS_USER_THAT_WILL_RUN_THE_TESTS_ON_CLI
+DatabasesPassword=REDSHIFT_PASSWORD
+AWSUserForTests=AWS_USER_THAT_WILL_RUN_THE_TESTS_ON_CLI
testing/template.yaml (53 additions, 15 deletions)

@@ -16,10 +16,10 @@ Parameters:
   SubnetAz:
     Type: String
     Description: Subnet AZ
-  Password:
+  DatabasesPassword:
     Type: String
-    Description: Redshift Password
-  TestUser:
+    Description: Password for all databases
+  AWSUserForTests:
     Type: String
     Description: AWS User that will running the tests on the CLI
 
@@ -49,7 +49,7 @@ Resources:
         - Sid: "Allow administration of the key"
           Effect: "Allow"
           Principal:
-            AWS: !Join ["", ["arn:aws:iam::", !Ref "AWS::AccountId", ":user/", !Ref TestUser]]
+            AWS: !Join ["", ["arn:aws:iam::", !Ref "AWS::AccountId", ":user/", !Ref AWSUserForTests]]
           Action:
             - "kms:Create*"
             - "kms:Describe*"
@@ -95,7 +95,7 @@ Resources:
             - sts:AssumeRole
       Path: "/"
       Policies:
-        - PolicyName: S3GetAndList
+        - PolicyName: Root
           PolicyDocument:
             Version: 2012-10-17
             Statement:
@@ -107,6 +107,30 @@
               Resource:
                 - !Join ['', ['arn:aws:s3:::', !Ref Bucket]]
                 - !Join ['', ['arn:aws:s3:::', !Ref Bucket, /*]]
+            - Effect: Allow
+              Action:
+                - "lakeformation:GrantPermissions"
+              Resource: "*"
+            - Effect: Allow
+              Action:
+                - "glue:SearchTables"
+                - "glue:GetConnections"
+                - "glue:GetDataCatalogEncryptionSettings"
+                - "glue:GetTables"
+                - "glue:GetTableVersions"
+                - "glue:GetPartitions"
+                - "glue:DeleteTableVersion"
+                - "glue:BatchGetPartition"
+                - "glue:GetDatabases"
+                - "glue:GetTags"
+                - "glue:GetTable"
+                - "glue:GetDatabase"
+                - "glue:GetPartition"
+                - "glue:GetTableVersion"
+                - "glue:GetConnection"
+                - "glue:GetUserDefinedFunction"
+                - "glue:GetUserDefinedFunctions"
+              Resource: "*"
 
   RedshiftSubnetGroup:
     Type: AWS::Redshift::ClusterSubnetGroup
@@ -140,7 +164,7 @@
     Properties:
       DBName: test
       MasterUsername: test
-      MasterUserPassword: !Ref Password
+      MasterUserPassword: !Ref DatabasesPassword
       NodeType: dc2.large
       ClusterType: single-node
      VpcSecurityGroupIds:
@@ -223,7 +247,7 @@ Resources:
       Engine: aurora-postgresql
       DBClusterIdentifier : postgres-cluster-wrangler
       MasterUsername: test
-      MasterUserPassword: !Ref Password
+      MasterUserPassword: !Ref DatabasesPassword
       BackupRetentionPeriod: 1
       DBSubnetGroupName: !Ref RdsSubnetGroup
       VpcSecurityGroupIds:
@@ -264,19 +288,21 @@
       Engine: aurora-mysql
       DBClusterIdentifier: mysql-cluster-wrangler
       MasterUsername: test
-      MasterUserPassword: !Ref Password
+      MasterUserPassword: !Ref DatabasesPassword
       BackupRetentionPeriod: 1
       DBSubnetGroupName: !Ref RdsSubnetGroup
       VpcSecurityGroupIds:
         - !Ref DatabaseSecurityGroup
       DBClusterParameterGroupName: !Ref MysqlParameterGroup
       DatabaseName: test
+      AssociatedRoles:
+        - RoleArn: !GetAtt AuroraRole.Arn
 
   AuroraInstanceMysql:
     Type: AWS::RDS::DBInstance
     Properties:
       Engine: aurora-mysql
       # DBName: test
       DBInstanceIdentifier: mysql-instance-wrangler
       DBClusterIdentifier: !Ref AuroraClusterMysql
       DBInstanceClass: db.t3.medium
@@ -285,6 +311,9 @@
 
   RedshiftGlueConnection:
     Type: AWS::Glue::Connection
+    DependsOn:
+      - DatabaseSecurityGroup
+      - Redshift
     Properties:
       CatalogId: !Ref AWS::AccountId
       ConnectionInput:
@@ -310,12 +339,15 @@
               ],
             ],
           "USERNAME": test,
-          "PASSWORD": !Ref Password,
+          "PASSWORD": !Ref DatabasesPassword,
         }
       Name: "aws-data-wrangler-redshift"
 
   PostgresGlueConnection:
     Type: AWS::Glue::Connection
+    DependsOn:
+      - DatabaseSecurityGroup
+      - AuroraInstancePostgres
     Properties:
       CatalogId: !Ref AWS::AccountId
       ConnectionInput:
@@ -341,12 +373,15 @@
               ],
             ],
           "USERNAME": test,
-          "PASSWORD": !Ref Password,
+          "PASSWORD": !Ref DatabasesPassword,
         }
       Name: "aws-data-wrangler-postgres"
 
   MysqlGlueConnection:
     Type: AWS::Glue::Connection
+    DependsOn:
+      - DatabaseSecurityGroup
+      - AuroraInstanceMysql
     Properties:
       CatalogId: !Ref AWS::AccountId
       ConnectionInput:
@@ -372,7 +407,7 @@
               ],
             ],
           "USERNAME": test,
-          "PASSWORD": !Ref Password,
+          "PASSWORD": !Ref DatabasesPassword,
         }
       Name: "aws-data-wrangler-mysql"
 
@@ -398,12 +433,12 @@ Outputs:
     Description: Name of the S3 Bucket used for tests.
   RedshiftAddress:
     Value: !GetAtt Redshift.Endpoint.Address
-    Description: Redshift Password.
+    Description: Redshift address.
   RedshiftPort:
     Value: !GetAtt Redshift.Endpoint.Port
     Description: Redshift Endpoint Port.
-  Password:
-    Value: !Ref Password
+  DatabasesPassword:
+    Value: !Ref DatabasesPassword
     Description: Password.
   RedshiftRole:
     Value: !GetAtt RedshiftRole.Arn
@@ -434,4 +469,7 @@ Outputs:
     Description: Mysql Address
   DynamoDbTableARN:
     Value: !GetAtt DynamoDBTable.Arn
-    Description: DynamoDB table name
\ No newline at end of file
+    Description: DynamoDB table name
+  Region:
+    Value: !Ref AWS::Region
+    Description: AWS Region
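The renamed DatabasesPassword output and the new Region output are what the test suite below reads. A minimal sketch of collecting the stack outputs into a dict with boto3 (the stack name is taken from the deploy script above):

    import boto3

    client = boto3.client("cloudformation")
    stack = client.describe_stacks(StackName="aws-data-wrangler-test")["Stacks"][0]

    # e.g. outputs["DatabasesPassword"], outputs["Region"], outputs["MysqlAddress"]
    outputs = {o["OutputKey"]: o["OutputValue"] for o in stack.get("Outputs", [])}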
testing/test_awswrangler/test_aurora.py (7 additions, 7 deletions)

@@ -26,8 +26,8 @@ def postgres_parameters(cloudformation_outputs):
         postgres_parameters["PostgresAddress"] = cloudformation_outputs.get("PostgresAddress")
     else:
         raise Exception("You must deploy the test infrastructure using SAM!")
-    if "Password" in cloudformation_outputs:
-        postgres_parameters["Password"] = cloudformation_outputs.get("Password")
+    if "DatabasesPassword" in cloudformation_outputs:
+        postgres_parameters["DatabasesPassword"] = cloudformation_outputs.get("DatabasesPassword")
     else:
         raise Exception("You must deploy the test infrastructure using SAM!")
     yield postgres_parameters
@@ -40,8 +40,8 @@ def mysql_parameters(cloudformation_outputs):
         mysql_parameters["MysqlAddress"] = cloudformation_outputs.get("MysqlAddress")
     else:
         raise Exception("You must deploy the test infrastructure using SAM!")
-    if "Password" in cloudformation_outputs:
-        mysql_parameters["Password"] = cloudformation_outputs.get("Password")
+    if "DatabasesPassword" in cloudformation_outputs:
+        mysql_parameters["DatabasesPassword"] = cloudformation_outputs.get("DatabasesPassword")
     else:
         raise Exception("You must deploy the test infrastructure using SAM!")
     yield mysql_parameters
@@ -52,7 +52,7 @@ def test_postgres_connection(postgres_parameters):
         host=postgres_parameters["PostgresAddress"],
         port=3306,
         user="test",
-        password=postgres_parameters["Password"],
+        password=postgres_parameters["DatabasesPassword"],
         engine="postgres")
     cursor = conn.cursor()
     cursor.execute("SELECT 1 + 2, 3 + 4")
@@ -68,7 +68,7 @@ def test_mysql_connection(mysql_parameters):
         host=mysql_parameters["MysqlAddress"],
         port=3306,
         user="test",
-        password=mysql_parameters["Password"],
+        password=mysql_parameters["DatabasesPassword"],
         engine="mysql")
     cursor = conn.cursor()
     cursor.execute("SELECT 1 + 2, 3 + 4")
@@ -85,5 +85,5 @@ def test_invalid_engine(mysql_parameters):
         host=mysql_parameters["MysqlAddress"],
         port=3306,
         user="test",
-        password=mysql_parameters["Password"],
+        password=mysql_parameters["DatabasesPassword"],
         engine="foo")
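The cloudformation_outputs fixture these tests rely on is not part of this diff; a plausible conftest.py sketch, wrapping the describe_stacks call shown above, could look like this (the stack name and fixture scope are assumptions):

    import boto3
    import pytest


    @pytest.fixture(scope="module")
    def cloudformation_outputs():
        # Expose the stack outputs (MysqlAddress, PostgresAddress,
        # DatabasesPassword, ...) as a plain dict keyed by OutputKey.
        client = boto3.client("cloudformation")
        stack = client.describe_stacks(StackName="aws-data-wrangler-test")["Stacks"][0]
        yield {o["OutputKey"]: o["OutputValue"] for o in stack.get("Outputs", [])}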