In [1]:
from dotenv import load_dotenv
import os
import boto3
import botocore
import pandas as pd
import pymongo



## Set up boto3 to access the AWS S3 bucket

In [None]:
# Run python-dotenv first so the lookups below see whatever it adds to the
# process environment (presumably the contents of a local .env file).
load_dotenv()

# Read the AWS settings from the environment; a name that is not set yields None.
ACCESS_KEY, SECRET_KEY, REGION = (
    os.environ.get(variable)
    for variable in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'REGION')
)

In [None]:
# Low-level S3 client built with the key pair read from the environment above.
# NOTE(review): REGION is loaded but not passed here — confirm whether that is intended.
client = boto3.client(
    service_name='s3',
    aws_secret_access_key=SECRET_KEY,
    aws_access_key_id=ACCESS_KEY,
)

In [4]:
# Resource-style S3 handle used by the cells below to address buckets and objects.
# No explicit credentials are passed, so boto3 presumably resolves them on its own
# (e.g. from the environment variables) — confirm if a specific key pair is required.
s3 = boto3.resource(service_name='s3')

## Show all files under the `mongo/` prefix in the bucket

In [5]:
# Print the key and size of every object stored under the 'mongo/' prefix.
my_bucket = s3.Bucket('legalthings-datalake')

# Let S3 do the prefix filtering: only matching keys are listed, instead of
# paging through the whole bucket and discarding the rest on the client.
for obj in my_bucket.objects.filter(Prefix='mongo/'):
    print(obj.key, obj.size)

mongo/company_sbi.csv 31368
mongo/company_sbi_new.csv 10
mongo/emails.json 39776579
mongo/incorporation-processes-short.json 257748397
mongo/incorporation-processes.json 7435914253
mongo/licenses.json 67158
mongo/organizations.json 40311014
mongo/packages.json 25228872
mongo/transactions.json 57612040
mongo/users.json 32239012


#### List files in a remote AWS directory

In [None]:
from risk_model import storage

In [None]:
# Ask the project's storage helper for the 'json' entries below 'mongo/' in the
# data-lake bucket; the call's return value is shown as the cell output.
# NOTE(review): exact matching rules for `remote` / `file_type` live in risk_model.storage.
storage.list_files(bucket='legalthings-datalake', remote='mongo/', file_type=['json'])

#### Download files from remote to local directory

In [None]:
# Use the project's storage helper to copy the 'csv' entries found below
# 'datastudio/BVshareholders' in the bucket into the local '../data/' directory.
# NOTE(review): behaviour for existing local files is defined in risk_model.storage.
storage.download_files_remote_to_local(
    bucket='legalthings-datalake', remote='datastudio/BVshareholders',
    local='../data/', file_type=['csv'],
)

#### Download file and save on local destination

In [None]:
# Download one specific object from the bucket to a fixed local path.
BUCKET_NAME = 'legalthings-datalake'
KEY = 'datastudio/BVshareholders/part-00000-20269f50-2b21-4c2d-b896-6bd87829a261-c000.csv'

s3 = boto3.resource('s3')

try:
    s3.Bucket(BUCKET_NAME).download_file(Key=KEY, Filename='../data/example.csv')
except botocore.exceptions.ClientError as err:
    # Only a "404" error code is reported as a missing object; every other
    # client error is re-raised unchanged.
    if err.response['Error']['Code'] != "404":
        raise
    print("The object does not exist.")

#### Explore the number of files per subdirectory

In [None]:
# Collect, for every object in the bucket, the first two '/'-separated segments
# of its key (e.g. 'a/b/c.csv' -> 'a/b'), so they can be counted afterwards.
my_bucket = s3.Bucket('legalthings-datalake')

bucket_categories = []

for obj in my_bucket.objects.all():
    # Split once and check the length explicitly rather than relying on a bare
    # `except:` — that would also hide unrelated errors and KeyboardInterrupt.
    segments = obj.key.split('/')
    if len(segments) < 2:
        # Key has no '/' at all, i.e. the object sits at the bucket root.
        print('No sub directory')
        continue
    bucket_categories.append(segments[0] + '/' + segments[1])

In [None]:
# One row per collected entry, held in a single 'directory' column.
df = pd.DataFrame({'directory': bucket_categories})

In [None]:
# Row count per distinct 'directory' value; only the first five groups are shown.
(
    df
    .groupby('directory')
    .size()
    .head(5)
)