### connect s3 using boto3

In [3]:
import boto3
import pandas as pd

In [4]:
#connect to s3 object storage
s3 = boto3.resource('s3')

In [9]:
#create a bucket called 'new-bucket-bxiao5050'
location = {'LocationConstraint': 'eu-central-1'}
new_bucket = s3.create_bucket(Bucket = 'new-bucket-bxiao5050', CreateBucketConfiguration=location)

In [11]:
#view all buckets in s3
for bucket in s3.buckets.all():
    print(bucket.name)

bxiaobxiao.bucket
new-bucket-bxiao5050


#### select a bucket, and upload, download and delete objects

In [14]:
#select bucket of 'bxiaobxiao.bucket'
bucket = s3.Bucket('bxiaobxiao.bucket')

In [22]:
#upload a file
local_file_path = '2022-04-25_BINS_XETR1415.csv'
key_object = 'added/a/new/object/test_data.csv'

bucket.upload_file(local_file_path, key_object)

In [24]:
#show all objects from 'bxiaobxiao.bucket'
for obj in bucket.objects.all():
    print(obj)

s3.ObjectSummary(bucket_name='bxiaobxiao.bucket', key='added/a/new/object/data.csv')
s3.ObjectSummary(bucket_name='bxiaobxiao.bucket', key='added/a/new/object/test_data.csv')


##### convert an object to dataframe

In [32]:
#get the content of this object, the content is in the string format
csv_obj = bucket.Object(key = 'added/a/new/object/data.csv').get().get('Body').read().decode('utf-8')
print(type(csv_obj))
csv_obj

<class 'str'>


In [36]:
#use package StringIO to convert string to dataframe
from io import StringIO
data = StringIO(csv_obj)

df = pd.read_csv(data, delimiter = ",")
df.head()

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
0,AT0000A0E9W5,SANT,S+T AG O.N.,Common stock,EUR,2504159,2022-01-20,08:00,15.14,15.25,15.11,15.25,14239,9
1,DE000A0DJ6J9,S92,SMA SOLAR TECHNOL.AG,Common stock,EUR,2504287,2022-01-20,08:00,32.8,32.8,32.8,32.8,357,1
2,DE000A0D6554,NDX1,NORDEX SE O.N.,Common stock,EUR,2504290,2022-01-20,08:00,13.97,14.1,13.96,14.09,18908,32
3,DE000A0D9PT0,MTX,MTU AERO ENGINES NA O.N.,Common stock,EUR,2504297,2022-01-20,08:00,198.6,199.15,198.1,198.65,2216,29
4,DE000A0HN5C6,DWNI,DEUTSCHE WOHNEN SE INH,Common stock,EUR,2504314,2022-01-20,08:00,35.49,35.49,35.38,35.43,469,6


#### convert dataframe to parquet and upload to s3 (bucket name: new-bucket-bxiao5050)

In [45]:
#need to install pyarrow package, and use io.BytesIO to get a buffer to the parquet content
from io import BytesIO

out_buffer = BytesIO()
df_new = df[['ISIN', 'EndPrice']]
df_new.to_parquet(out_buffer, index = False)

s3.Bucket('new-bucket-bxiao5050').put_object(Body=out_buffer.getvalue(), Key = 'upload/a/file/xxx.parquet')

s3.Object(bucket_name='new-bucket-bxiao5050', key='upload/a/file/xxx.parquet')

In [46]:
#list all objects from new-bucket-bxiao5050 bucket
[print(obj) for obj in s3.Bucket('new-bucket-bxiao5050').objects.all()]

s3.ObjectSummary(bucket_name='new-bucket-bxiao5050', key='upload/a/file/xxx.parquet')


[None]

#### read the uploaded parquet file

In [41]:
prq_obj = s3.Bucket('new-bucket-bxiao5050').Object(key='upload/a/file/xxx.parquet').get().get('Body').read()
data = BytesIO(prq_obj)
df_prq = pd.read_parquet(data)
df_prq.head()

ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet file size is 0 bytes

In [42]:
prq_obj = s3.Bucket('new-bucket-bxiao5050').Object(key='upload/a/file/xxx.parquet').get().get('Body').read()
prq_obj

b''