In [2]:
import configparser
import boto3
import pandas as pd

# Load AWS credentials from the configuration file
config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means aws.cfg was not read. Fail here
# with a clear message instead of a later NoSectionError.
if not config.read('aws.cfg'):
    raise FileNotFoundError("Could not read 'aws.cfg' from the current working directory")

# Key pair stored under the [AWS] section of aws.cfg.
aws_access_key = config.get('AWS','aws_access_key_id')
aws_secret_key = config.get('AWS','aws_secret_access_key')

### Creating an S3 client and listing buckets

In [3]:
# S3 client authenticated with the access key pair held in
# aws_access_key / aws_secret_key. No region is passed explicitly here.
s3 = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)

In [4]:
## Find the practice bucket in the list_buckets() result and print its location

target_bucket = 'samee-awswrangler-practice'
response = s3.list_buckets()

for entry in response['Buckets']:
    # Skip every bucket except the one we are looking for.
    if entry["Name"] != target_bucket:
        continue

    # Prints the full get_bucket_location() response for the matching bucket.
    loca = s3.get_bucket_location(Bucket=target_bucket)
    print(loca)


### creating a new bucket

In [5]:
# s3.create_bucket(Bucket='andy-wrangler-bucket')
# s3.create_bucket(Bucket='andy-wrangler-bucket', CreateBucketConfiguration={
#     'LocationConstraint': 'us-west-2'})

In [6]:
# Name of the personal bucket used as the destination for this notebook's uploads and writes.
bucket = 'andy-wrangler-bucket'

In [7]:
# s3.upload_file('/workspaces/techcatalyst-DE-andy/activities/intro_awswrangler/upload.txt',bucket,'upload_andy.txt')

In [8]:
#pip install awswrangler

##### Import awswrangler and set up default session

In [9]:
import awswrangler as wr

# Configure boto3's default session once, reusing the key pair already loaded
# for the S3 client so the credentials are read from aws.cfg in one place only.
# The region comes from the same [AWS] section of the config file.
boto3.setup_default_session(
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
    region_name=config.get('AWS', 'region_name'),
)

In [10]:
# Load the yellow_tripdata_2024-01 Parquet object from the techcatalyst-raw bucket.
source_path = 's3://techcatalyst-raw/yellow_tripdata_2024-01.parquet'

try:
    df = wr.s3.read_parquet(path=source_path)
except Exception as e:
    print('error')
    print(e)
    # Re-raise after reporting: swallowing the error would leave `df` undefined
    # and the following cells would fail with an unrelated NameError.
    raise

In [11]:
# Summarize the loaded frame (index range, column names and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964624 entries, 0 to 2964623
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               Int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        Int64         
 4   trip_distance          float64       
 5   RatecodeID             Int64         
 6   store_and_fwd_flag     string        
 7   PULocationID           Int32         
 8   DOLocationID           Int32         
 9   payment_type           Int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [12]:
# wr.s3.to_parquet(
#     df= df,
#     path='s3://andy-wrangler-bucket/',
#     dataset=True,
#     mode='append'
# )

In [13]:
# Fetch the catalog's database listing as a DataFrame; the next cell uses
# `databases` to decide whether a new database has to be created.
databases = wr.catalog.databases()
print(databases)

                                      Database Description
0                       aryan_awswrangler_test            
1                         ben-awswrangler_test            
2                       jason-awswrangler_test            
3  studio_widget_input-stream_1721327027000_db            
4                        zayd-awswrangler_test            


In [14]:
# Create the database only when it is missing from the catalog listing.
# A single constant feeds both the existence check and the create call so the
# two names cannot drift apart (a mismatch makes the check always pass, and a
# re-run then tries to create a database that already exists).
database_name = 'andy-awswrangler-test'

# Compare against the "Database" column only, not every cell of the frame.
if database_name not in databases["Database"].values:
    wr.catalog.create_database(name=database_name)
    print(wr.catalog.databases())
else:
    print(f"Database {database_name} already exists")

                                      Database Description
0                        andy-awswrangler-test            
1                       aryan_awswrangler_test            
2                         ben-awswrangler_test            
3                       jason-awswrangler_test            
4  studio_widget_input-stream_1721327027000_db            
5                        zayd-awswrangler_test            


In [16]:
# Table-level metadata passed to GlueTableSettings below: a free-text
# description, key/value parameters, and comments for a subset of the columns.
desc = "This is test for wrangler table andy."

param = {"source": "NYC Taxi Web Service https://www.nyc.gov", "class": "e-commerce"}

comments = {
    "tpep_pickup_datetime": "The date and time when the meter was engaged.",
    "PULocationID": "TLC Taxi Zone in which the taximeter was engaged",
    "payment_type": "A numeric code signifying how the passenger paid for the trip",
    "fare_amount": "The time-and-distance fare calculated by the meter.",
}

# Write `df` as a Parquet dataset and register it as a table in the given database.
# The destination is built from `bucket` so the bucket name lives in one place.
# NOTE(review): mode="overwrite" targets the bucket root, so existing objects
# under that prefix may be replaced — confirm nothing else needs to live there.
res = wr.s3.to_parquet(
    df=df,
    path=f's3://{bucket}/',
    dataset=True,
    database="andy-awswrangler-test",
    table="wrangler-table",
    mode="overwrite",
    glue_table_settings=wr.typing.GlueTableSettings(
        description=desc,
        parameters=param,
        columns_comments=comments,
    ),
)