# 5 - Uploading to S.A.M. bucket

## 1. Load .env variables



In [27]:
from dotenv import load_dotenv
import os

# Pull the key/value pairs of the .env file into the process environment.
load_dotenv()

# Settings read from the environment; each one is None when its variable is not defined.
GITHUB_PROFILE_NAME, genre_folder, file_count = (
    os.environ.get(key)
    for key in ('GITHUB_PROFILE_NAME', 'genre_folder', 'file_count')
)

## 2. Manually move your .json credential file
  - Put it in `/sam_files`.
  - Open the `.env` file for sam_files (`sam_files/notebooks/dataset for fine-tuning/.env`).
  - Insert this line (and end it with a newline):
    - `gdrive_api_json_path={path-for-json-credential.json}`

## 3. Uploading to Google Cloud Bucket

### 3.1. The working directory must be:
  - `../stable-audio-tools-sam/sam_files`

In [2]:
%cd ..
%cd ..

/home/arthurcornelio/code/arthurcornelio88/stable-audio-tools-sam/sam_files/notebooks
/home/arthurcornelio/code/arthurcornelio88/stable-audio-tools-sam/sam_files


### 3.2. Copying folders to bucket

In [None]:
%%bash
# Upload the local dataset folders into a new timestamped folder of the bucket
# and record the bucket/folder names in the .env file for the verification cell.
# `file_count` and `genre_folder` are not defined in this script, so they must
# already be present in the environment inherited by this shell.

# Stop at the first failing command (including a failed gsutil copy) so that a
# broken upload is reported as a cell error instead of passing silently.
set -euo pipefail

# Replace with your actual bucket name
bucket_name="sam-dataset"

# Refuse to build a malformed folder name when a required variable is missing
: "${file_count:?file_count is not set - load the .env variables first}"
: "${genre_folder:?genre_folder is not set - load the .env variables first}"

# Get the current timestamp
timestamp=$(date +"%Y-%m-%d_%H-%M-%S")

# Create the folder name within the bucket
folder_name="${file_count}_${genre_folder}_files_${timestamp}"

env_file="notebooks/dataset for fine-tuning/.env"

# If the file does not end with a newline (typical after a manual edit), add
# one so the first appended entry is not glued onto the previous line.
if [ -s "$env_file" ] && [ -n "$(tail -c 1 "$env_file")" ]; then
    echo >> "$env_file"
fi

# Capture variables. This is done before the upload on purpose: the
# verification cell then always checks the upload attempted by this run.
# NOTE: every run appends a new set of entries to the file.
{
    echo "bucket_name=$bucket_name"
    echo "timestamp=$timestamp"
    echo "folder_name=$folder_name"
} >> "$env_file"

# List of source folders you want to upload
source_folders=("json" "dataframes" "audio_files")

# Upload each folder to the bucket; all copies share one log file per run
for source_folder in "${source_folders[@]}"
do
    gsutil -m cp -r -L "upload_log_${timestamp}.txt" "$source_folder" "gs://$bucket_name/$folder_name/"
done


### 3.3. Verify that the upload was successful

In [None]:
from google.cloud import storage
import os
from dotenv import load_dotenv

# Load environment variables from the .env file updated by the upload cell.
# override=True is required: without it load_dotenv leaves variables that are
# already present in os.environ untouched, so a `folder_name` loaded earlier in
# this kernel would be kept and the check would target a previous upload.
load_dotenv(dotenv_path='notebooks/dataset for fine-tuning/.env', override=True)

# Access the variables from the .env file (None when an entry is missing)
bucket_name = os.getenv('bucket_name')
folder_name = os.getenv('folder_name')

def verify_upload(bucket_name, folder_name, source_folders):
    """Report whether an uploaded folder and its subfolders exist in a GCS bucket.

    A folder is considered present when at least one object name starts with
    its prefix. The outcome of every check is printed; nothing is returned.

    Args:
        bucket_name: Name of the bucket to inspect.
        folder_name: Top-level folder (object-name prefix) created by the upload.
        source_folders: Iterable of sub-paths, relative to ``folder_name``,
            that are expected to contain at least one object.

    Raises:
        ValueError: If ``bucket_name`` or ``folder_name`` is empty, or if the
            ``gdrive_api_json_path`` environment variable is not set.
    """
    # Fail early with an explicit message instead of an opaque TypeError later on.
    if not bucket_name or not folder_name:
        raise ValueError(
            "bucket_name and folder_name must be non-empty; "
            "check that the upload cell wrote them to the .env file."
        )

    # Get the path to the service account credentials JSON file from the environment
    credentials_path = os.getenv('gdrive_api_json_path')
    if not credentials_path:
        raise ValueError(
            "Environment variable 'gdrive_api_json_path' is not set; "
            "add it to the .env file so the credentials JSON can be located."
        )

    # Explicitly create the storage client using the service account credentials
    storage_client = storage.Client.from_service_account_json(credentials_path)
    bucket = storage_client.bucket(bucket_name)

    def _has_objects(prefix):
        """Return True when at least one object name starts with ``prefix``."""
        # One result is enough to prove existence; this avoids listing every
        # uploaded file. Drop max_results to enumerate the files instead.
        return any(True for _ in bucket.list_blobs(prefix=prefix, max_results=1))

    # Check if the main folder exists; stop further checks if it is missing
    if not _has_objects(folder_name + '/'):
        print(f"Upload failed or incomplete. Main folder '{folder_name}' not found in bucket '{bucket_name}'.")
        return

    # Check for the existence of each subfolder within the main folder
    for source_folder in source_folders:
        if _has_objects(f"{folder_name}/{source_folder}/"):
            print(f"Upload successful! Subfolder '{source_folder}' and its contents found in bucket '{bucket_name}' under '{folder_name}'.")
        else:
            print(f"Upload failed or incomplete. Subfolder '{source_folder}' not found in bucket '{bucket_name}' under '{folder_name}'.")

# Sub-paths to look for in the bucket. Each entry is checked under the
# `<folder_name>/` prefix by verify_upload, so these are the paths verified,
# not the list of folders passed to the upload command.
source_folders = [
    'json',
    'dataframes/checked',
    'dataframes/filtered_by_genre',
    'audio_files/by_genre',
    'audio_files/final_backup',
]

# Prints one status line per entry (or a single line if the main folder is missing).
verify_upload(bucket_name, folder_name, source_folders)


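## 4. Delete the final operation folders

Run this cell only after step 3.3 has reported a successful upload for every subfolder: the deletion is permanent.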

In [42]:
%%bash

# Local paths to remove: the two audio sub-folders and everything inside json/
# (the json/ directory itself is kept because only its contents are matched).
cleanup_targets=(audio_files/final_backup audio_files/by_genre json/*)

# -f keeps the command quiet when a target is already gone.
rm -rf -- "${cleanup_targets[@]}"

# All done, bravo ! (by Arthur Cornélio, 12th August 2024)