<a href="https://colab.research.google.com/github/ayomide2021/Effectiveness-of-E-commerce-Tiered-Loyalty-Program-through-A-B-Testing/blob/main/%5Badvanced_capabilties%5D_upload_to_bucket.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Functions (only run once)

In [None]:
import os                                        # UPDATED: needed to check dirs & walk
import json
from google.cloud import storage
from google.oauth2 import service_account
import zipfile
import sys
from pathlib import Path
from typing import Tuple, List

def unzip_file(folder_path, extract_to='/content/'):
    """Extract a zip archive into a destination directory.

    Args:
        folder_path: Path to the ``.zip`` archive to extract. (The name is
            historical; it is the archive itself, not a folder.)
        extract_to: Directory that receives the archive contents. Defaults to
            ``'/content/'`` so existing callers keep their behavior. The
            directory is created if it does not exist.

    Returns:
        None. A confirmation line is printed once extraction finishes.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        zipfile.BadZipFile: If ``folder_path`` is not a valid zip archive.
    """
    # Create the destination up front so extraction never fails on a
    # missing parent directory.
    os.makedirs(extract_to, exist_ok=True)

    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(folder_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    print(f"Unzipped to: {extract_to}")

def upload_file_to_gcs(file_path, bucket_name, destination_blob_name, credentials):
    """
    Uploads a file or all files in a folder to a specified GCS bucket
    using service account credentials.

    Args:
        file_path: Local path to a single file or to a folder.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name for a single file, or the object
            name prefix under which a folder's files are stored.
        credentials: Service-account info, either as a dict or as a JSON
            string that decodes to one.

    Returns:
        - If `file_path` is a folder, uploads all contained files (skipping
          `.ipynb_checkpoints` folders), prints
              gs://<bucket>/<destination_blob_name>/ (N files uploaded)
          and returns "gs://<bucket>/<destination_blob_name>/".
          Files that fail to upload are reported on stdout and not counted.
        - Otherwise, uploads the single file and returns its gs:// URI,
          or None if the upload failed (the error is printed).
    """

    # —– parse credentials —–
    if isinstance(credentials, str):
        credentials = json.loads(credentials)

    creds = service_account.Credentials.from_service_account_info(credentials)
    client = storage.Client(credentials=creds, project=credentials.get("project_id"))
    bucket = client.bucket(bucket_name)

    # —– handle directory upload —–
    if os.path.isdir(file_path):
        # Normalise the prefix once instead of on every file.
        prefix = destination_blob_name.rstrip('/')
        file_count = 0
        for root, dirs, files in os.walk(file_path):
            # Pruning `dirs` in place stops os.walk from descending into
            # Colab/Jupyter checkpoint folders.
            if ".ipynb_checkpoints" in dirs:
                dirs.remove(".ipynb_checkpoints")
            for filename in files:
                local_path = os.path.join(root, filename)
                # Object names use "/" as the separator regardless of the
                # local OS, so convert any platform-specific separators.
                rel_path = os.path.relpath(local_path, start=file_path).replace(os.sep, "/")
                # An empty prefix must not produce a leading "/" in the name.
                blob_name = f"{prefix}/{rel_path}" if prefix else rel_path
                try:
                    bucket.blob(blob_name).upload_from_filename(local_path)
                except Exception as e:
                    # Best effort: report the failure and continue with the
                    # remaining files.
                    print(f"Error uploading '{local_path}':", e)
                else:
                    file_count += 1
        # build and print only the directory URI + count
        dir_uri = f"gs://{bucket_name}/{prefix}/" if prefix else f"gs://{bucket_name}/"
        print(f"{dir_uri} ({file_count} files uploaded)")
        return dir_uri

    # —– single-file upload —–
    blob = bucket.blob(destination_blob_name)
    try:
        blob.upload_from_filename(file_path)
    except Exception as e:
        print("Error uploading file:", e)
        return None
    return f"gs://{bucket_name}/{destination_blob_name}"

def validate_folder_structure(root_path: Path) -> Tuple[bool, List[str]]:
    """
    Validate the layout of a <data_row_id> folder before upload.

    root_path should be the <data_row_id> folder.
    Validates that:
      <data_row_id>/
        ├─ data/           → must exist, hold at least 2 entries, only .npy/.csv
        ├─ outputs/        → must exist, not empty, only .html
        ├─ scripts/        → exactly data_gen.py and viz.py (no nesting)
    No other top-level directories are allowed. Entries whose name contains
    ".ipynb_checkpoints" are ignored everywhere, since Colab/Jupyter creates
    them automatically.

    Args:
        root_path: Path to the <data_row_id> folder.

    Returns:
        (is_valid, list_of_error_messages). The list is empty when valid.
        A root_path that does not exist or is not a directory is reported
        as a single error rather than raised.
    """
    checkpoint_marker = ".ipynb_checkpoints"
    errors: List[str] = []

    # Without a root directory nothing else can be inspected.
    if not root_path.is_dir():
        return False, [f"Root folder not found or not a directory: {root_path}"]

    # 1) Top-level directories. This is the only place a missing directory
    #    is reported, so each one appears exactly once in the result.
    expected_dirs = {"data", "scripts", "outputs"}
    actual_dirs = {p.name for p in root_path.iterdir() if p.is_dir()}
    actual_dirs.discard(checkpoint_marker)
    # Sorted so the error order is deterministic across runs.
    for d in sorted(expected_dirs - actual_dirs):
        errors.append(f"Missing directory: {d}/")
    for d in sorted(actual_dirs - expected_dirs):
        errors.append(f"Unexpected directory: {d}/")

    # 2) data/ — at least two entries, all .npy or .csv.
    data_dir = root_path / "data"
    if data_dir.is_dir():
        # Drop checkpoint entries first so they do not count toward the minimum.
        entries = [p for p in data_dir.iterdir() if checkpoint_marker not in p.name]
        if not entries:
            errors.append("data/ must not be empty")
        elif len(entries) < 2:
            errors.append("data/ must have 2 or more CSV or NPY data files to be complex enough")
        for child in entries:
            if child.suffix.lower() not in {".npy", ".csv"}:
                errors.append(
                    f"Invalid file in data/ → {child.name} (must be .npy or .csv)"
                )

    # 3) outputs/ — not empty, all .html.
    outputs_dir = root_path / "outputs"
    if outputs_dir.is_dir():
        out_entries = [p for p in outputs_dir.iterdir() if checkpoint_marker not in p.name]
        if not out_entries:
            errors.append("outputs/ must not be empty")
        for child in out_entries:
            if child.suffix.lower() not in {".html"}:
                errors.append(
                    f"Invalid file in outputs/ → {child.name} (must be .html)"
                )

    # 4) scripts/ — exactly the two expected files and no sub-directories.
    scripts = root_path / "scripts"
    if scripts.is_dir():
        script_entries = list(scripts.iterdir())
        found = {p.name for p in script_entries if p.is_file()}
        want = {"data_gen.py", "viz.py"}
        for f in sorted(want - found):
            errors.append(f"Missing script file in scripts/ → {f}")
        for f in sorted(found - want):
            errors.append(f"Unexpected script file in scripts/ → {f}")
        for p in script_entries:
            if checkpoint_marker in p.name:
                continue
            if p.is_dir():
                errors.append(f"Nested directory not allowed in scripts/ → {p.name}/")

    return (len(errors) == 0), errors

## Put all the data in the following structure and upload using the calls below

```
<data row id>/
├── data/
    ├── sample.npy    
    ├── dataframe2.csv
    └── dataframe.csv          #Generated .csv and/or npy files

├── scripts/
    ├── data_gen.py            #Data generation script
    └── viz.py                 #Visualization script

├── outputs/
    └── golden_image.html      #Interactive HTML produced by Plotly, e.g. fig.write_html("./name_me_something_useful.html")
```

In [None]:
bucket_name = 'advanced_capabilities'

# SECURITY: never paste a service-account key into the notebook. A key that
# has been committed to a repository must be treated as compromised and
# revoked/rotated in the Google Cloud console.
# The credentials are read from the environment instead. Set ONE of:
#   GCS_SERVICE_ACCOUNT_JSON        – the service-account JSON itself, or a
#                                     path to the key file
#   GOOGLE_APPLICATION_CREDENTIALS  – path to the service-account key file
_creds_source = (
    os.environ.get("GCS_SERVICE_ACCOUNT_JSON")
    or os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
)
if not _creds_source:
    raise RuntimeError(
        "No GCS credentials found: set GCS_SERVICE_ACCOUNT_JSON or "
        "GOOGLE_APPLICATION_CREDENTIALS before running this cell."
    )
if os.path.isfile(_creds_source):
    # The variable holds a path to the key file.
    with open(_creds_source, encoding="utf-8") as _fh:
        credentials = json.load(_fh)
else:
    # The variable holds the JSON document itself.
    credentials = json.loads(_creds_source)


# Find this data row id in the datarow details: https://docs.labelbox.com/docs/label-data#data-row-information-panel
datarow_id = "cmce5hjxl10480798zr1e1j6u"

# Validate the local folder layout first; upload only when it is clean.
ok, errors = validate_folder_structure(Path("./" + datarow_id))
if not ok:
    for e in errors:
        print("ERROR: ", e)
else:
    # Upload each sub-folder to the same relative location in the bucket:
    #   1) generated data files, 2) scripts, 3) outputs.
    for _index, _folder in enumerate(("data", "scripts", "outputs")):
        if _index:
            # Separator between sections (not before the first one).
            print("**" * 50)
        print(f"{_folder} folder URIs: ", upload_file_to_gcs(
            file_path=f"{datarow_id}/{_folder}",
            bucket_name=bucket_name,
            destination_blob_name=f"{datarow_id}/{_folder}",
            credentials=credentials
        ))


gs://advanced_capabilities/cmce5hjxl10480798zr1e1j6u/data/ (2 files uploaded)
data folder URIs:  gs://advanced_capabilities/cmce5hjxl10480798zr1e1j6u/data/
****************************************************************************************************
gs://advanced_capabilities/cmce5hjxl10480798zr1e1j6u/scripts/ (2 files uploaded)
scripts folder URIs:  gs://advanced_capabilities/cmce5hjxl10480798zr1e1j6u/scripts/
****************************************************************************************************
gs://advanced_capabilities/cmce5hjxl10480798zr1e1j6u/outputs/ (1 files uploaded)
outputs folder URIs:  gs://advanced_capabilities/cmce5hjxl10480798zr1e1j6u/outputs/


# SINGLE FILE UPLOADS

In [None]:
# Upload one file to the same relative path inside the bucket.
# upload_file_to_gcs returns the gs:// URI, or None when the upload fails.
_single_file = f"{datarow_id}/data/sample.csv"
_single_file_uri = upload_file_to_gcs(
    file_path=_single_file,
    bucket_name=bucket_name,
    destination_blob_name=_single_file,
    credentials=credentials,
)
print("data folder URIs: ", _single_file_uri)

Error uploading file: [Errno 2] No such file or directory: 'cmce5hjxl10480798zr1e1j6u/data/sample.csv'
data folder URIs:  None
