# 5 - Uploading to S.A.M. bucket

## 1. Load .env variables



In [6]:
# load .env variables
from dotenv import load_dotenv
import os

load_dotenv()

GITHUB_PROFILE_NAME = os.getenv('GITHUB_PROFILE_NAME')
genre = os.getenv('genre')
file_count = os.getenv('file_count')
final_folder_name = os.getenv('final_folder_name')

## 2 Organizing dataframe files

In [None]:
# reorganize dataframes for uploading
!python scripts/reorganize_df_files.py

## 3. Uploading to S3 Bucket

### 3.1. You need to be in :
  - `../stable-audio-tools-sam/sam_files`

In [2]:
# run this cell (path move)
%cd ..
%cd ..

/home/arthurcornelio/code/arthurcornelio88/stable-audio-tools-sam/sam_files/notebooks
/home/arthurcornelio/code/arthurcornelio88/stable-audio-tools-sam/sam_files


### 3.3 Upload your AWS Credentials
- Put the "rootkey.csv" in /sam-files
- - ../Modèles/RunPod-arthur/runpod-repo/model_v2
- Run the cell. The values will go to the variables, the "rootkey.csv" will be deleted.

In [2]:
# upload AWS Credentials
import pandas as pd
import os

aws_key = pd.read_csv("rootkey.csv")

AWS_ACCESS_KEY_ID = aws_key['Access key ID'][0]
AWS_SECRET_ACCESS_KEY = aws_key['Secret access key'][0]

os.remove("rootkey.csv")

### 3.3  Define your BUCKET_NAME

In [3]:
BUCKET_NAME = 'runpod-sam-model'

### 3.2. Creating .tar file for uploading

In [3]:
# ERASE THIS
import os

tar_code = f'tar -cvf 5_test_classical_json6.tar 5_test_classical_6'
os.system(tar_code)

5_test_classical_6/
5_test_classical_6/4_majestic-voyage-200663.json
5_test_classical_6/4_majestic-voyage-200663.mp3
5_test_classical_6/5_drive-to-triumph-188794.mp3
5_test_classical_6/5_drive-to-triumph-188794.json
5_test_classical_6/3_science-documentary-169621.mp3
5_test_classical_6/2_risk-136788.json
5_test_classical_6/1_inspiring-cinematic-ambient-116199.mp3
5_test_classical_6/2_risk-136788.mp3
5_test_classical_6/1_inspiring-cinematic-ambient-116199.json
5_test_classical_6/3_science-documentary-169621.json


0

In [None]:
# create .tar file
import os

tar_code = f'tar -cvf {final_folder_name}.tar audio_files/by_genre/{final_folder_name}'
os.system(tar_code)

In [None]:
# if needed, install boto3
%pip install boto3

In [None]:
# upload bucket in amazon s3
import boto3
import os
import logging
from boto3.s3.transfer import S3Transfer, TransferConfig

load_dotenv()
TIMESTAMP_DF = os.getenv('TIMESTAMP_DF')

# Set up logging to a file
logging.basicConfig(filename='upload_log.txt', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Disable console logging from Boto3
logging.getLogger('boto3').setLevel(logging.CRITICAL)  # Set to CRITICAL to suppress all Boto3 messages
logging.getLogger('botocore').setLevel(logging.CRITICAL)
logging.getLogger('s3transfer').setLevel(logging.CRITICAL)

# Initialize S3 client
print("Establishing connection to S3... ", end="")
s3 = boto3.client('s3',
                  aws_access_key_id=AWS_ACCESS_KEY_ID,
                  aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
print("Connected!")

# Files to upload
tar_to_upload = f'{final_folder_name}.tar'
folder_to_upload = f'dataframes_{TIMESTAMP_DF}'

# Upload the folder first
total_files = sum([len(files) for root, dirs, files in os.walk(folder_to_upload)])
uploaded_files = 0

print(f"Uploading {total_files} files from '{folder_to_upload}' to {BUCKET_NAME}/{folder_to_upload}:")
for root, dirs, files in os.walk(folder_to_upload):
    for file in files:
        file_path = os.path.join(root, file)

        # Include the folder_to_upload in the S3 key
        s3_key = os.path.join(folder_to_upload, os.path.relpath(file_path, folder_to_upload))

        print(f"  Uploading {file_path}... ", end="")
        s3.upload_file(file_path, BUCKET_NAME, s3_key)
        print("Done!")

        uploaded_files += 1
        progress = (uploaded_files / total_files) * 100
        print(f"  Progress: {progress:.1f}%")

print("Folder upload complete!")

# Upload the zipped file using multipart upload with S3Transfer
print(f"\nUploading {tar_to_upload} to {BUCKET_NAME}...")

# Configure multipart upload and transfer
config = TransferConfig(multipart_threshold=1024 * 25,
                        max_concurrency=10,
                        multipart_chunksize=1024 * 25,
                        use_threads=True)
transfer = S3Transfer(s3, config)

# Use a mutable object (list) to store total_transferred
total_transferred = [0]

# Define a progress callback function
def progress_callback(bytes_transferred):
    total_transferred[0] += bytes_transferred
    percent = total_transferred[0] / file_size * 100
    bar = '█' * int(percent / 2) + '-' * int(50 - percent / 2)
    print(f"\r|{bar}| {percent:.1f}%", end="")

# Get the file size before uploading
file_size = os.path.getsize(tar_to_upload)

# Perform the multipart upload with progress callback
transfer.upload_file(tar_to_upload, BUCKET_NAME, tar_to_upload, callback=progress_callback)

print("\nTar file upload complete!")

print("All uploads complete!")

### 3.3. Verify if uploading operation is successful

In [None]:
# verify upload consistency
import boto3

# Initialize S3 client (assuming you have your credentials set up)
s3 = boto3.client('s3',
                  aws_access_key_id=AWS_ACCESS_KEY_ID,
                  aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

# List objects in your bucket
response = s3.list_objects_v2(Bucket=BUCKET_NAME)

# Extract object keys (filenames)
uploaded_objects = [obj['Key'] for obj in response.get('Contents', [])]

# Files and folders you expected to upload
expected_uploads = [tar_to_upload] + [os.path.join(root, file) for root, dirs, files in os.walk(folder_to_upload) for file in files]

# Convert expected uploads to S3 keys (relative paths)
expected_uploads = [os.path.relpath(path, folder_to_upload) if path.startswith(folder_to_upload) else path for path in expected_uploads]

# Compare uploaded objects with expected uploads
missing_uploads = set(expected_uploads) - set(uploaded_objects)
extra_uploads = set(uploaded_objects) - set(expected_uploads)

if missing_uploads or extra_uploads:
    if missing_uploads:
        print("The following files/folders were not uploaded:", missing_uploads)
    if extra_uploads:
        print("The following unexpected files/folders were found in the bucket:", extra_uploads)
else:
    print("All expected files/folders were uploaded successfully, and no unexpected uploads were found!")

## Delete final operation folders 

In [None]:
# delete operational folders and files

# Delete the folders and their contents
!rm -rf "notebooks/dataset for fine-tuning/myenv"

!rm -rf dataframes/* dataframes_${TIMESTAMP_DF}/* audio_files/final_backup/* audio_files/by_genre/*  json/*

# Delete .tar file
!rm ${final_folder_name}.tar
#!rm 498_classical_files_2024-08-28_01-13-37.tar

# All done, bravo ! (by Arthur Cornélio, 28th August 2024)