[![AWS Data Wrangler](_static/logo.png "AWS Data Wrangler")](https://github.com/awslabs/aws-data-wrangler)

# 25 - Redshift - Loading Parquet files with Spectrum

## Enter your bucket name:

In [1]:
import getpass
# Read the bucket name through getpass so the typed value is not echoed into the notebook output.
bucket = getpass.getpass()
# S3 prefix that the later cells write the Parquet dataset to.
PATH = f"s3://{bucket}/files/"

 ············


## Mocking some Parquet Files on S3

In [2]:
import awswrangler as wr
import pandas as pd

# Ten-row sample frame: an integer column 0..9 paired with the letters a..j.
df = pd.DataFrame(
    {
        "col0": list(range(10)),
        "col1": list("abcdefghij"),
    }
)

df

Unnamed: 0,col0,col1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e
5,5,f
6,6,g
7,7,h
8,8,i
9,9,j


In [3]:
# Write the sample frame under PATH as a Parquet dataset, capped at two rows per file
# so the ten rows are spread over several files. The trailing ";" hides the call's return value.
wr.s3.to_parquet(
    df=df,
    path=PATH,
    max_rows_by_file=2,
    dataset=True,
    mode="overwrite",
);

## Crawling the metadata and adding it to the Glue Catalog

In [4]:
# Register the Parquet files under PATH as table "test" in the Glue database
# "aws_data_wrangler"; mode="overwrite" is passed, presumably replacing any existing
# definition of that table (confirm against the awswrangler docs).
# The call is the cell's last expression, so its return value is displayed: in the
# recorded run it is a 3-tuple whose first item maps column names to their catalog types.
wr.s3.store_parquet_metadata(
    path=PATH,
    database="aws_data_wrangler",
    table="test",
    dataset=True,
    mode="overwrite"
)

({'col0': 'bigint', 'col1': 'string'}, None, None)

## Running the CTAS query to load the data into Redshift storage

In [5]:
# Build the database engine from the Glue Catalog connection named
# "aws-data-wrangler-redshift"; the extra connect option sets statement_timeout for the session.
# NOTE(review): Redshift documents statement_timeout in milliseconds, so 900 would cap
# every statement at 0.9 s — confirm this is the intended limit.
eng = wr.catalog.get_engine(
    connection="aws-data-wrangler-redshift",
    connect_args={"options": "-c statement_timeout=900"},
)

In [6]:
# CTAS statement: create public.test in Redshift from every row of the external table.
# Presumably aws_data_wrangler_external is a Redshift external (Spectrum) schema mapped to
# the Glue database used earlier — it is not created in this notebook, so confirm it exists.
# Note: there is no DROP / IF NOT EXISTS here, so re-running requires public.test to be absent.
query = "CREATE TABLE public.test AS (SELECT * FROM aws_data_wrangler_external.test)"

In [7]:
# Run the statement currently held in `query` on a connection scoped to this with-block.
with eng.connect() as con:
    con.execute(query)

## Running an INSERT INTO query to load MORE data into Redshift storage

In [8]:
# Two additional rows to append to the Redshift table.
# Note: this rebinds `df`, so the ten-row frame from the earlier cell is no longer referenced.
df = pd.DataFrame({"col0": [10, 11], "col1": ["k", "l"]})

# mode="overwrite" is used so that, presumably, only these new rows remain under PATH
# and the following INSERT does not reload the rows already copied — confirm.
wr.s3.to_parquet(
    df=df,
    path=PATH,
    dataset=True,
    mode="overwrite",
);

In [9]:
# Append every row currently visible through the external table to the existing public.test table.
# This rebinds `query`, replacing the statement assigned to it in the earlier cell.
query = "INSERT INTO public.test (SELECT * FROM aws_data_wrangler_external.test)"

In [10]:
# Run the statement currently held in `query` on a connection scoped to this with-block.
with eng.connect() as con:
    con.execute(query)

## Checking the result

In [11]:
# Statement selecting every row of public.test.
# NOTE(review): the next visible cell reads the table by name via wr.db.read_sql_table and
# never references `query`, so this assignment appears to be unused — confirm before relying on it.
query = "SELECT * FROM public.test"

In [12]:
# Read public.test back through the engine; as the cell's last expression the result is displayed.
# No ordering is requested, so the row order in the output is whatever the database returns.
wr.db.read_sql_table(
    table="test",
    schema="public",
    con=eng,
)

Unnamed: 0,col0,col1
0,9,j
1,5,f
2,7,h
3,1,b
4,3,d
5,11,l
6,8,i
7,4,e
8,6,g
9,0,a
