# first run (21/03/2025)
- year 2009
- only one dataset exists for this year; each monthly file is around 500 MB, and the run took about 8 minutes in total
- Using 1 driver with 14 GB of memory and 4 cores

In [0]:
import os
from datetime import datetime

import requests
from azure.storage.blob import BlobServiceClient
from retry import retry

# Ingestion parameters
years = list(range(2009, 2015))  # 2009 through 2014 (the range end is exclusive)
months = [f"{m:02d}" for m in range(1, 13)]  # zero-padded "01" .. "12"
trip_types = ["yellow", "green", "fhv", "fhvhv"]

# Map each destination blob path to the source URL it is downloaded from.
# Entries are generated year by year, then month by month, then per trip type.
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
url_dict = dict(
    (
        f"staging/{trip}/{year}/{month}/{trip}_tripdata_{year}-{month}.parquet",
        f"{base_url}{trip}_tripdata_{year}-{month}.parquet",
    )
    for year in years
    for month in months
    for trip in trip_types
)

# Define your storage account details
storage_account_name = 'lagodedadosalttab'
# Read the account key from the environment so the secret never has to be
# written into the source; falls back to the previous empty value when unset.
storage_account_key = os.environ.get('AZURE_STORAGE_KEY', '')
container_name = 'lagodedadosv1'

# Create a BlobServiceClient bound to the storage account's blob endpoint
blob_service_client = BlobServiceClient(
    account_url=f"https://{storage_account_name}.blob.core.windows.net",
    credential=storage_account_key,
)

# Upload helper; failures are retried by the decorator
@retry(tries=3, delay=2)
def upload_file(blob_client, file_data):
    """Write ``file_data`` to the blob behind ``blob_client``.

    Any content already stored in the blob is overwritten. The ``retry``
    decorator is configured with ``tries=3`` and ``delay=2``.
    """
    blob_client.upload_blob(data=file_data, overwrite=True)

# Copy every source file into blob storage. A failure on one file is reported
# and logged, and the loop then moves on to the next file.
for blob_name, file_url in url_dict.items():
    try:
        # Skip blobs that already exist so the cell can be re-run safely
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
        if blob_client.exists():
            print(f"Blob {blob_name} already exists. Skipping download and upload.")
            continue

        # Download the file from the URL. The (connect, read) timeout keeps a
        # stalled connection from hanging the whole run.
        response = requests.get(file_url, timeout=(10, 300))
        # Check HTTP response status
        if response.status_code != 200:
            raise Exception(f"Failed to download {file_url}: HTTP {response.status_code}")

        file_data = response.content

        # Validate file content (simple check for an XML error document).
        # Only the head of the payload is inspected: scanning the full binary
        # file could match "<Error>" by coincidence, and decoding a binary
        # payload as UTF-8 would fail.
        payload_head = file_data[:1024]
        if b"<Error>" in payload_head:
            print(f"Failed to get data from {file_url}: XML error message found. The file does not exists or the url given is invalid.")
            raise Exception(
                "Downloaded file contains error message: "
                f"{payload_head.decode('utf-8', errors='replace')}"
            )

        upload_file(blob_client, file_data)
        print(f"Uploaded {blob_name} successfully.")
    except Exception as e:
        error_message = f"Failed to upload {blob_name}: {str(e)}\n"
        print(error_message)

        # Build a log blob name from the failed blob name and the current time.
        # blob_name itself contains "/" separators, which become part of the
        # log blob's path.
        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
        log_blob_name = f"staging/logs/upload_errors_{blob_name}_{current_time}.log"

        # Prepare log content
        log_content = f"Error: {str(e)}\nURL: {file_url}\n"

        # Write the error log to Azure Blob Storage (overwrites a blob of the
        # same name). A failure to write the log must not abort the remaining
        # files, so it is reported and the loop continues.
        try:
            log_blob_client = blob_service_client.get_blob_client(container=container_name, blob=log_blob_name)
            log_blob_client.upload_blob(log_content, overwrite=True)
        except Exception as log_error:
            print(f"Could not write error log {log_blob_name}: {log_error}")

print("File upload process completed.")

# Next steps: 

- Hide the secret key in an environment variable
- Parametrize everything
- Create a utils notebook
- Improve ingestion speed (parallelize)
- Add metadata