In [None]:
import os
from google.cloud import bigquery


In [None]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the local JSON key file before the
# BigQuery client is created below.
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "G:/Il mio Drive/PERSONALE/Boolean/Final projects/sql-sandbox-471915-9367768b0079.json"})

# Destination folder for the cleaned CSV files; created if it does not exist.
save_dir = "CSV2"
os.makedirs(name=save_dir, exist_ok=True)

client = bigquery.Client()

# Tables of the `thelook_ecommerce` dataset to export, in processing order.
tables = [
    "distribution_centers", "events", "inventory_items", "order_items",
    "orders", "products", "users",
]

# Per-table cleanup rules applied to each downloaded DataFrame:
#   "rename": mapping old column name -> new column name
#   "drop":   columns to discard
#   "round":  numeric columns rounded to two decimals
# A missing key means that step is skipped for the table.
table_cleanup_rules = {
    "distribution_centers": {
        "rename": {"id": "center_id", "name": "center_name"},
    },
    "events": {
        "rename": {"id": "event_id"},
        "drop": ["session_id"],
    },
    "inventory_items": {
        "rename": {
            "id": "inventory_item_id",
            "product_distribution_center_id": "center_id",
        },
        "drop": [
            "product_category", "product_name", "product_brand",
            "product_department", "product_sku",
        ],
        "round": ["cost", "product_retail_price"],
    },
    "order_items": {
        "rename": {"status": "items_status"},
        "drop": ["id"],
    },
    "orders": {
        "rename": {
            "status": "order_status",
            "created_at": "order_created_at",
            "returned_at": "order_returned_at",
            "shipped_at": "order_shipped_at",
            "delivered_at": "order_delivered_at",
        },
    },
    "products": {
        "rename": {"id": "product_id", "distribution_center_id": "center_id"},
        "drop": ["sku"],
        "round": ["cost", "retail_price"],
    },
    "users": {
        "rename": {"id": "user_id"},
        "drop": ["latitude", "longitude"],
    },
}


def _apply_cleanup_rules(df, rules):
    """Return a new DataFrame with one table's rename/drop/round rules applied.

    The renamed, dropped and rounded columns of every table are disjoint, so
    the order of the three steps does not affect the result.
    """
    df = df.rename(columns=rules.get("rename", {}))
    df = df.drop(columns=rules.get("drop", []))
    for col in rules.get("round", []):
        df[col] = df[col].round(2)
    return df


def _normalize_timestamps(df):
    """Make datetime columns timezone-naive and truncate them to whole seconds.

    Columns are picked with the documented ``"datetime"`` / ``"datetimetz"``
    selectors, so the match does not depend on the timestamp unit or zone.
    Modifies ``df`` in place and returns it.
    """
    date_cols = df.select_dtypes(include=["datetime", "datetimetz"]).columns
    for col in date_cols:
        # tz_localize(None) drops the zone but keeps the wall-clock values.
        df[col] = df[col].dt.tz_localize(None).dt.floor("s")
    return df


# Download, clean and save every table.
for table in tables:
    query = f"""  
        SELECT *  
        FROM `bigquery-public-data.thelook_ecommerce.{table}`  
    """
    df = client.query(query).to_dataframe()

    # Table-specific transformations, then timestamp cleanup.
    df = _apply_cleanup_rules(df, table_cleanup_rules.get(table, {}))
    df = _normalize_timestamps(df)

    # Write the cleaned table into the chosen folder, without the index column.
    file_path = os.path.join(save_dir, f"{table}_cleaned.csv")
    df.to_csv(file_path, index=False)
    print(f"✅ Salvato: {file_path}")

In [None]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the local JSON key file before the
# BigQuery client is created below.
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "G:/Il mio Drive/PERSONALE/Boolean/Final projects/sql-sandbox-471915-9367768b0079.json"})

# Destination folder for the cleaned CSV files; created if it does not exist.
save_dir = "CSV3"
os.makedirs(name=save_dir, exist_ok=True)

client = bigquery.Client()

# Maps each output table name to the SQL that extracts it from the
# `bigquery-public-data.thelook_ecommerce` dataset. Column selection, renaming
# (AS) and rounding (ROUND) are all done in SQL, so only the listed columns
# are fetched.
tables_queries = {
    # `id` and `name` are exposed as `center_id` / `center_name`.
    "distribution_centers": """
        SELECT
            id AS center_id,
            name AS center_name,
            latitude,
            longitude
        FROM `bigquery-public-data.thelook_ecommerce.distribution_centers`
    """,

    # `id` is exposed as `event_id`; only the listed columns are selected.
    "events": """
        SELECT
            id AS event_id,
            user_id,
            sequence_number,
            created_at,
            ip_address,
            city,
            state,
            postal_code,
            browser,
            traffic_source,
            uri,
            event_type
        FROM `bigquery-public-data.thelook_ecommerce.events`
    """,

    # `id` -> `inventory_item_id`, `product_distribution_center_id` ->
    # `center_id`; `cost` and `product_retail_price` rounded to 2 decimals.
    "inventory_items": """
        SELECT
            id AS inventory_item_id,
            product_id,
            product_distribution_center_id AS center_id,
            ROUND(cost, 2) AS cost,
            ROUND(product_retail_price, 2) AS product_retail_price,
            created_at,
            sold_at
        FROM `bigquery-public-data.thelook_ecommerce.inventory_items`
    """,

    # `status` is exposed as `items_status`.
    "order_items": """
        SELECT
            order_id,
            user_id,
            product_id,
            inventory_item_id,
            status AS items_status,
            created_at,
            shipped_at,
            delivered_at,
            returned_at,
            sale_price
        FROM `bigquery-public-data.thelook_ecommerce.order_items`
    """,

    # `status` and the four *_at timestamps get an `order_` prefix.
    "orders": """
        SELECT
            order_id,
            user_id,
            status AS order_status,
            gender,
            created_at AS order_created_at,
            returned_at AS order_returned_at,
            shipped_at AS order_shipped_at,
            delivered_at AS order_delivered_at,
            num_of_item
        FROM `bigquery-public-data.thelook_ecommerce.orders`
    """,

    # `id` -> `product_id`, `distribution_center_id` -> `center_id`;
    # `cost` and `retail_price` rounded to 2 decimals.
    "products": """
        SELECT
            id AS product_id,
            distribution_center_id AS center_id,
            name,
            category,
            brand,
            department,
            ROUND(cost, 2) AS cost,
            ROUND(retail_price, 2) AS retail_price
        FROM `bigquery-public-data.thelook_ecommerce.products`
    """,

    # `id` is exposed as `user_id`.
    "users": """
        SELECT
            id AS user_id,
            first_name,
            last_name,
            email,
            age,
            gender,
            state,
            street_address,
            postal_code,
            city,
            country,
            created_at,
            traffic_source
        FROM `bigquery-public-data.thelook_ecommerce.users`
    """,
}

# Run each query, clean its timestamp columns and export the result to CSV.
for table, query in tables_queries.items():
    df = client.query(query).to_dataframe()

    # Date cleanup: make every datetime column timezone-naive and truncate it
    # to whole seconds. "datetime" / "datetimetz" are the documented
    # select_dtypes selectors, so the match does not depend on the timestamp
    # unit or zone of the downloaded columns.
    date_cols = df.select_dtypes(include=["datetime", "datetimetz"]).columns
    for col in date_cols:
        # tz_localize(None) drops the zone but keeps the wall-clock values.
        df[col] = df[col].dt.tz_localize(None).dt.floor("s")

    # Save the table as <save_dir>/<table>_cleaned.csv without the index column.
    file_path = os.path.join(save_dir, f"{table}_cleaned.csv")
    df.to_csv(file_path, index=False)

