#### Setting Up Boto3 and Interacting with S3

**1. Import necessary libraries and read AWS credentials:**

In [1]:
import configparser
import boto3
import pandas as pd

# Import your credentials from your .cfg file

config = configparser.ConfigParser()
config.read('aws.cfg')

aws_access_key = config['AWS']['aws_access_key_id']
aws_secret_key = config['AWS']['aws_secret_access_key']

**2. Create an S3 client:**

In [2]:
# Initialize the S3 client
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key
)

# Verify the client is set up correctly by listing buckets
# response = s3.list_buckets()
# print('Existing buckets:')
# for bucket in response['Buckets']:
#     print(f'  {bucket["Name"]}')

**3. List all buckets and get the location of a specific bucket:**

In [3]:
bucket_name = 'aryan-techcatalyst-awswrangler-lab'
# if not us-east-1
location = config['AWS']['region_name']

# YOUR CODE TO CREATE A BUCKET
s3.create_bucket(Bucket=bucket_name)
print(f'{bucket_name} has been created in {location}')

aryan-techcatalyst-awswrangler-lab has been created in us-west-2


**4. Create a new bucket:**

In [4]:
filename = '/workspaces/boto3_lab/awswrangler/test.csv'
key = 'upload_file/test_s3.csv'

# UPLOAD_FILE CODE
s3.upload_file(filename, bucket_name, key)
# Print a confirmation message
print(f'File: {filename} uploaded to bucket {bucket_name} with key {key}.')


#PUT_OBJECT CODE
with open (filename, 'rb') as f:
    s3.put_object(Bucket=bucket_name, Key=key, Body=f)
# Print a confirmation message
print(f'File uploaded to bucket {bucket_name} with key {key}.')


File: /workspaces/boto3_lab/awswrangler/test.csv uploaded to bucket aryan-techcatalyst-awswrangler-lab with key upload_file/test_s3.csv.
File uploaded to bucket aryan-techcatalyst-awswrangler-lab with key upload_file/test_s3.csv.


**5. Upload files using `upload_file` and `put_object`: A file of your choice**

#### Transition to AWS SDK for Pandas (awswrangler)

**1. Install awswrangler:**

In [5]:
# !pip install awswrangler

Collecting numpy<2.0,>=1.18 (from awswrangler)
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.1
    Uninstalling numpy-2.0.1:
      Successfully uninstalled numpy-2.0.1
Successfully installed numpy-1.26.4


In [6]:
# !pip install numpy --upgrade

Collecting numpy
  Using cached numpy-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached numpy-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awswrangler 3.9.0 requires numpy<2.0,>=1.18; python_version < "3.12", but you have numpy 2.0.1 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.0.1


**2. Import awswrangler and set up default session:**

In [9]:
import awswrangler as wr

boto3.setup_default_session(
    aws_access_key_id=config['AWS']['aws_access_key_id'],
    aws_secret_access_key=config['AWS']['aws_secret_access_key'],
    region_name=config['AWS']['region_name']
)

**3. Read data from S3 directly into a Pandas DataFrame:**

In [11]:
try:
    df = wr.s3.read_csv('s3://aryan-techcatalyst-awswrangler-lab/upload_file/test_s3.csv') 
except Exception as e:
    print('error')
    print(e)

**4. Inspect the DataFrame: Check the type (is it a Pandas DataFrame), DataFrame Shape, print first few rows, inspect with info**

In [12]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


**5. Write DataFrame to S3 as a Parquet file:**

In [14]:
# complete the code
wr.s3.to_parquet(
    df=df, 
    path= "s3://aryan-techcatalyst-awswrangler-lab/",
    dataset=True,
    mode="overwrite"
)

{'paths': ['s3://aryan-techcatalyst-awswrangler-lab/c32cd76f4f774255b6e44a7a6f51064f.snappy.parquet'],
 'partitions_values': {}}

**6. Work with AWS Glue Catalog:**

* List databases: Note this will be showing Glue databases based on the region you specified when you created the session

In [15]:
databases = wr.catalog.databases()
print(databases)

                                      Database Description
0  studio_widget_input-stream_1721327027000_db            


* **Create a new database if it doesn't exist:** Call it <yourname>-awswrangler_test
* REPLACE with actual value

In [16]:
if "aryan_awswrangler_test" not in databases.values:
    wr.catalog.create_database("aryan_awswrangler_test")
    print(wr.catalog.databases())
else:
    print("Database aryan_awswrangler_test already exists")

                                      Database Description
0                       aryan_awswrangler_test            
1  studio_widget_input-stream_1721327027000_db            


* **List tables in a database:** The one you created as well as another Database of your choice that is in Glue

In [23]:
tables = wr.catalog.tables(name_contains="aryan")
print(tables)

Empty DataFrame
Columns: [Database, Table, Description, TableType, Columns, Partitions]
Index: []


* **Write DataFrame to Data Lake with Glue Catalog:**

In [24]:
desc = "This is the csv file that we used yesterday, expect I wrote it to parquet"
param = {"source": "https://github.com/tatwan/TechCatalyst_DE/tree/main/activities/wk6/awswrangler", "class": "sample data"}
comments = {
    "name" : "The name of the person",
    "favorite_num" : "The person's favorite number"
}

res = wr.s3.to_parquet(
    df=df,
    path= "s3://aryan-techcatalyst-awswrangler-lab/",
    dataset=True,
    database= "aryan_awswrangler_test",
    table="test_parquet",
    mode="overwrite",
    glue_table_settings=wr.typing.GlueTableSettings(description=desc, parameters =param, columns_comments=comments)
)

In [25]:
wr.catalog.tables(database="aryan_awswrangler_test")

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions
0,aryan_awswrangler_test,test_parquet,"This is the csv file that we used yesterday, e...",EXTERNAL_TABLE,"name, favorite_num",


In [29]:
wr.catalog.tables(name_contains="test")

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions
0,aryan_awswrangler_test,test_parquet,"This is the csv file that we used yesterday, e...",EXTERNAL_TABLE,"name, favorite_num",


**7. Query data using Athena:**

In [30]:
df_athena = wr.athena.read_sql_query(
    sql="SELECT * FROM test_parquet LIMIT 10",
    database="aryan_awswrangler_test",
    ctas_approach=True
)
print(df_athena)

     name  favorite_num
0  Vrinda            22
1   Tracy            28
2  Gareth            23
3   Chris            16
4    Emma            14
5  Carlos             7
6  Cooper            11
7  Praful             4
8   David            33
9  Shilpa             2
