In [1]:
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import LoadJobConfig
from pandas_gbq import to_gbq
import pandas as pd

import re

from google_auth_oauthlib import flow

### TEST

In [3]:
# OAuth scope requested when loading the service-account credentials.
scopes=["https://www.googleapis.com/auth/cloud-platform"]
# Path to the service-account key file, relative to the notebook.
bq_credentials = './credenciales/client_secret_python_bq.json'
project_id = 'curso-bigquery-mide-403114'
# NOTE(review): 'bigquery-public-data' looks like a project name rather than a dataset,
# and table_id carries a "dataset.table" pair. The sample query joins both with "."
# to build the full table path — confirm this naming is intentional.
dataset_id = 'bigquery-public-data'
table_id = 'ga4_obfuscated_sample_ecommerce.events_20210131'

In [4]:
# Build a BigQuery client from the service-account key file, plus a reference to the sample table.
# The client's project is taken from the key file (credentials.project_id), not from `project_id`.
# Any failure is only printed, so `client` / `table_ref` may be left undefined for the cells below.
try:
    credentials = service_account.Credentials.from_service_account_file(bq_credentials, scopes=scopes)
    client = bigquery.Client(credentials=credentials, project=credentials.project_id)
    table_ref = client.dataset(dataset_id).table(table_id)
except Exception as e:
    print("Big query conection wrong: " + str(e))

In [8]:
# Sample query: select 10 rows from the table configured above.
query_ejemplo = """
    SELECT *
    FROM `{}.{}`
    Limit 10
""".format(dataset_id,table_id)

# Submit the query and fetch its result.
query_ejecutada_ejemplo = client.query(query_ejemplo)
resultado_ejemplo = query_ejecutada_ejemplo.result()

# Uncomment to print every returned row:
#for row_ejemplo in resultado_ejemplo:
#    print(row_ejemplo)

-----------
#### CREAR TABLA E INSERTAR DATOS

#### configuración

In [9]:
# VARIABLES
# OAuth scope and service-account key file used to build the client.
scopes=["https://www.googleapis.com/auth/cloud-platform"]
bq_credentials = './credenciales/client_secret_python_bq.json'

# Target project / dataset / table, and the local CSV that is uploaded into it.
project_id = 'curso-bigquery-mide-403114'
dataset_id = 'chicago_taxi_tips'
table_id = 'cars_upload_test'
csv_path = './files/cars.csv'

In [10]:
# CONNECTION
def bq_client(scopes, bq_credentials, project_id):
    """Create an authenticated BigQuery client from a service-account key file.

    Args:
        scopes: OAuth scopes requested for the credentials.
        bq_credentials: Path to the service-account JSON key file.
        project_id: Project the client should run jobs in. When falsy, the
            project recorded in the key file is used instead.

    Returns:
        A ``bigquery.Client``, or ``None`` when the credentials could not be
        loaded or the client could not be built (the error is printed).
    """
    try:
        credentials = service_account.Credentials.from_service_account_file(bq_credentials, scopes=scopes)
        # Honour the caller's project; the argument used to be accepted but ignored.
        client = bigquery.Client(credentials=credentials, project=project_id or credentials.project_id)
        return client
    except Exception as e:
        print("Big query conection wrong: " + str(e))
        return None

# Build the client used by the following cells (None if the connection failed).
client = bq_client(scopes, bq_credentials, project_id)

In [28]:
# REFERENCE THE TABLE
def bq_table_ref(client,dataset_id,table_id):
    """Return ``client.dataset(dataset_id).table(table_id)``.

    Any exception raised while building the reference is printed and
    ``None`` is returned instead.
    """
    try:
        dataset_handle = client.dataset(dataset_id)
        reference = dataset_handle.table(table_id)
    except Exception as error:
        print("Error in Bigquery table reference: " + str(error))
        return None
    return reference

# Table reference passed to the create / insert / truncate helpers below.
table = bq_table_ref(client,dataset_id,table_id)

In [12]:
# CSV to a pandas DataFrame
def bq_csv_to_pandas(csv_path):
    """Read a comma-separated file into a pandas DataFrame.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` when the file could not be read
        (the error is printed).
    """
    try:
        return pd.read_csv(csv_path, sep=",")
    except Exception as e:
        # Separate the prefix from the exception text; they used to run together.
        print("Error importing the csv file: " + str(e))
        return None

# DataFrame with the CSV contents; it feeds the schema and insert helpers below.
bq_csv_df = bq_csv_to_pandas(csv_path)

##### a) SQL

In [29]:
# Run after building the query:
def bq_query_execution(client, query, dataset_id, table_id, table):
    """Run a SQL statement and return a short description of what it did.

    Args:
        client: Object exposing ``query(sql)``; the returned job's
            ``result()`` is called before reporting.
        query: SQL text to execute.
        dataset_id: Unused; kept for backward compatibility.
        table_id: Unused; kept for backward compatibility.
        table: Object exposing ``project``, ``dataset_id`` and ``table_id``,
            used to name the affected table in the message.

    Returns:
        A message for CREATE TABLE / INSERT / TRUNCATE TABLE statements,
        ``None`` for any other statement or when execution failed (the
        error is printed).
    """
    # Checked in this order; the first keyword found in the query wins.
    outcomes = (
        ("CREATE TABLE", "Created table {}.{}.{}"),
        ("INSERT", "Values inserted into {}.{}.{}"),
        ("TRUNCATE TABLE", "TRUNCATE {}.{}.{}"),
    )
    try:
        client.query(query).result()

        # SQL keywords are case-insensitive, so match on an upper-cased copy;
        # otherwise a lower-case statement succeeds but reports nothing.
        statement = query.upper()
        for keyword, message in outcomes:
            if keyword in statement:
                return message.format(table.project, table.dataset_id, table.table_id)
        return None

    except Exception as e:
        print(e)
        return None

###### 1.A crear la tabla si no existe

In [129]:
# BUILD THE SCHEMA FROM THE CSV

def bq_schema_for_query(bq_csv_df):
    """Build the column list of a CREATE TABLE statement from a DataFrame.

    Each column is mapped from its pandas dtype to a BigQuery type name:
    string -> STRING, boolean -> BOOL, float -> FLOAT64, other numeric ->
    INT64, anything else -> STRING.

    Args:
        bq_csv_df: DataFrame whose columns define the schema.

    Returns:
        A string such as ``"cars STRING, mpg FLOAT64"``, or ``None`` when the
        schema could not be inferred (the error is printed).
    """
    try:
        column_defs = []
        for column in bq_csv_df.columns:
            series = bq_csv_df[column]
            if pd.api.types.is_string_dtype(series):
                bq_type = 'STRING'
            elif pd.api.types.is_bool_dtype(series):
                # Must come before the numeric test: pandas treats bool as numeric,
                # which used to turn boolean columns into INT64.
                bq_type = 'BOOL'
            elif pd.api.types.is_numeric_dtype(series):
                bq_type = 'FLOAT64' if pd.api.types.is_float_dtype(series) else 'INT64'
            else:
                bq_type = 'STRING'

            # Drop the same characters the previous dict-repr clean-up removed from names.
            clean_name = re.sub(r"['{}\[\]:]", "", str(column))
            column_defs.append("{} {}".format(clean_name, bq_type))

        return ", ".join(column_defs)

    except Exception as e:
        print(e)
        return None

# NOTE: this rebinds the name `bq_schema_for_query` from the function to the schema string it returned.
bq_schema_for_query = bq_schema_for_query(bq_csv_df)

In [130]:
# Display the generated schema string.
bq_schema_for_query

'cars STRING, mpg FLOAT64, cyl INT64, disp FLOAT64, hp INT64, drat FLOAT64, wt FLOAT64, qsec FLOAT64, vs INT64, am INT64, gear INT64, carb INT64'

In [140]:
def bq_schema_for_query2(bq_csv_df, uppercase=False):
    """Build a ``"name type, name type"`` column list straight from pandas dtypes.

    Object-dtype columns are reported as ``string``; every other column uses
    the text of its pandas dtype (for example ``float64`` or ``int64``).

    Args:
        bq_csv_df: DataFrame whose columns define the schema.
        uppercase: When true, type names are upper-cased (``STRING``,
            ``FLOAT64``). Defaults to the original lower-case output.

    Returns:
        The column list as a string, or ``None`` when it could not be built
        (the error is printed).
    """
    try:
        column_defs = []
        for name, dtype in bq_csv_df.dtypes.items():
            type_name = "string" if dtype == object else str(dtype)
            if uppercase:
                type_name = type_name.upper()
            # Only punctuation is stripped from names; the old clean-up also deleted
            # the literal text "dtype", mangling columns such as "dtype_id".
            clean_name = re.sub(r"['{}\[\]:()]", "", str(name))
            column_defs.append("{} {}".format(clean_name, type_name))
        return ", ".join(column_defs)

    except Exception as e:
        print(e)
        return None

# NOTE: this rebinds the name `bq_schema_for_query2` from the function to the string it returned.
bq_schema_for_query2 = bq_schema_for_query2(bq_csv_df)

In [141]:
# Display the dtype-based schema string.
bq_schema_for_query2

'cars string, mpg float64, cyl int64, disp float64, hp int64, drat float64, wt float64, qsec float64, vs int64, am int64, gear int64, carb int64'

In [142]:
# Create the table if it does not exist --> https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language
def bq_query_create_table(dataset_id,table_id,bq_schema_for_query):
    """Return a ``CREATE TABLE IF NOT EXISTS`` statement for ``dataset_id.table_id``.

    ``bq_schema_for_query`` is inserted verbatim between the parentheses as
    the column definition list.
    """
    qualified_name = f"{dataset_id}.{table_id}"
    return f"CREATE TABLE IF NOT EXISTS `{qualified_name}`({bq_schema_for_query});"

    # Example of clauses for a partitioned table (not used):
    #PARTITION BY fecha
    #OPTIONS(
    #    partition_expiration_days=100,
    #    description="a table partitioned by fecha");

# NOTE: this rebinds the name `bq_query_create_table` from the function to the SQL string it returned.
bq_query_create_table = bq_query_create_table(dataset_id,table_id,bq_schema_for_query)

In [143]:
# Run the CREATE TABLE statement built above.
bq_query_execution(client, bq_query_create_table, dataset_id, table_id, table)

'Created table curso-bigquery-mide-403114.chicago_taxi_tips.cars_upload_test'

###### 2.A INSERTAR datos del csv en la tabla debajo de los ya existentes

In [144]:
def _bq_sql_literal(value):
    """Render one cell value as a SQL literal (NULL, TRUE/FALSE, number or quoted string)."""
    import numbers  # local import: only this helper needs it

    try:
        if pd.isna(value):
            return "NULL"
    except (TypeError, ValueError):
        pass  # non-scalar values have no single "is missing" answer; treat as text below

    if isinstance(value, bool):
        return "TRUE" if value else "FALSE"
    if isinstance(value, numbers.Integral):
        return str(int(value))
    if isinstance(value, numbers.Real):
        return repr(float(value))

    # Backslash-escape the characters that would otherwise end or break the literal.
    escaped = (
        str(value)
        .replace("\\", "\\\\")
        .replace("'", "\\'")
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return "'" + escaped + "'"


def bq_query_insert_values(bq_csv_df,dataset_id,table_id):
    """Build an ``INSERT ... VALUES`` statement holding every row of a DataFrame.

    Values are rendered one by one instead of scrubbing the Python repr of the
    rows, so brackets inside the data are kept, missing values become NULL,
    quotes are escaped and single-column frames produce ``(x)``, not ``(x,)``.

    NOTE(review): the statement is still assembled as text. For data from an
    untrusted source prefer a load job or query parameters.

    Args:
        bq_csv_df: DataFrame whose columns and rows are inserted.
        dataset_id: Dataset part of the target table name.
        table_id: Table part of the target table name.

    Returns:
        The SQL text of the INSERT statement.

    Raises:
        ValueError: If the DataFrame has no rows (an empty VALUES list is
            not valid SQL).
    """
    if len(bq_csv_df) == 0:
        raise ValueError("bq_csv_df has no rows to insert")

    # Column list; names lose the same punctuation the previous clean-up removed.
    columns_for_query = ", ".join(
        re.sub(r"['{}\[\]:]", "", str(column)) for column in bq_csv_df.columns
    )

    # One parenthesised tuple of literals per row.
    values_for_query = ", ".join(
        "(" + ", ".join(_bq_sql_literal(cell) for cell in row) + ")"
        for row in bq_csv_df.values.tolist()
    )

    query = """INSERT `{}.{}`({}) VALUES {}""".format(dataset_id,table_id,columns_for_query,values_for_query)
    return query

# NOTE: this rebinds the name `bq_query_insert_values` from the function to the SQL string it returned.
bq_query_insert_values = bq_query_insert_values(bq_csv_df,dataset_id,table_id)

In [145]:
# Run the INSERT statement built above.
bq_query_execution(client, bq_query_insert_values, dataset_id, table_id, table)

403 Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. DML queries are not allowed in the free tier. Set up a billing account to remove this restriction.

Location: europe-southwest1
Job ID: ca04df61-88a1-4b02-81c9-55d1f7811c6e



###### * Si queremos eliminar los datos antes de insertarlos tendríamos que hacer un TRUNCATE

In [33]:
def bq_query_truncate_table(dataset_id,table_id):
    """Return the ``TRUNCATE TABLE`` statement for ``dataset_id.table_id``."""
    qualified_name = f"{dataset_id}.{table_id}"
    return f"TRUNCATE TABLE `{qualified_name}`"

# NOTE: this rebinds the name `bq_query_truncate_table` from the function to the SQL string it returned.
bq_query_truncate_table = bq_query_truncate_table(dataset_id,table_id)

In [35]:
# Run the TRUNCATE statement built above.
bq_query_execution(client, bq_query_truncate_table, dataset_id, table_id, table)

403 Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. DML queries are not allowed in the free tier. Set up a billing account to remove this restriction.

Location: europe-southwest1
Job ID: 0deab718-48c9-4e07-80ea-0d89f40c7861



##### b) python
https://cloud.google.com/bigquery/docs/tables?hl=es-419
https://github.com/googleapis/python-bigquery/blob/35627d145a41d57768f19d4392ef235928e00f72/samples/create_table_range_partitioned.py

###### 1.A crear la tabla a mano si no existe

In [37]:
def bq_python_create_table_static(client, table, dataset_id, table_id):
    """Create the cars table with a hand-written schema unless it already exists.

    ``client.get_table(table)`` is used as the existence probe. If it raises
    an error whose text contains "Not found", the table is created and a
    confirmation string is returned. Any other error is printed. When the
    table exists, a notice is printed and nothing is returned.
    """
    # (name, BigQuery type, mode) of every column in the hand-written schema.
    column_specs = (
        ("cars", "STRING", "NULLABLE"),
        ("mpg", "FLOAT", "REQUIRED"),
        ("cyl", "INTEGER", "NULLABLE"),
        ("disp", "FLOAT", "NULLABLE"),
        ("hp", "INTEGER", "NULLABLE"),
        ("drat", "FLOAT", "NULLABLE"),
        ("wt", "FLOAT", "NULLABLE"),
        ("qsec", "FLOAT", "NULLABLE"),
        ("vs", "INTEGER", "NULLABLE"),
        ("am", "INTEGER", "NULLABLE"),
        ("gear", "INTEGER", "NULLABLE"),
        ("carb", "INTEGER", "NULLABLE"),
    )

    try:
        client.get_table(table)
    except Exception as lookup_error:
        if "Not found" not in str(lookup_error):
            print(f"Error: {lookup_error}")
            return None

        print(f"La tabla {table_id} no existe en el conjunto de datos {dataset_id}.")
        # Create the table since it does not exist yet.
        schema = [
            bigquery.SchemaField(column_name, column_type, mode=column_mode)
            for column_name, column_type, column_mode in column_specs
        ]
        new_table = bigquery.Table(table, schema=schema)

        # To create a partitioned table instead:
        #new_table.time_partitioning = bigquery.TimePartitioning(
        #    type_=bigquery.TimePartitioningType.DAY,
        #    field="date",  # name of column to use for partitioning
        #    expiration_ms=1000 * 60 * 60 * 24 * 90,
        #)  # 90 days

        created = client.create_table(new_table)
        return "Created table {}.{}.{}".format(created.project, created.dataset_id, created.table_id)
    else:
        print(f"La tabla {table_id} ya existe en el conjunto de datos {dataset_id}.")
        return None
            
# Create the table with the hand-written schema (prints a notice instead if it already exists).
bq_python_create_table_static(client, table, dataset_id, table_id)

La tabla cars_upload_test no existe en el conjunto de datos chicago_taxi_tips.


'Created table curso-bigquery-mide-403114.chicago_taxi_tips.cars_upload_test'

###### 1.B crear la tabla leyendo el schema del csv

In [39]:
def _bq_infer_field_types(bq_csv_df):
    """Map every DataFrame column name to a BigQuery field type name."""
    field_types = {}
    for column in bq_csv_df.columns:
        series = bq_csv_df[column]
        if pd.api.types.is_string_dtype(series):
            field_types[column] = 'STRING'
        elif pd.api.types.is_bool_dtype(series):
            # Checked before the numeric test: pandas counts bool as numeric.
            field_types[column] = 'BOOLEAN'
        elif pd.api.types.is_numeric_dtype(series):
            field_types[column] = 'FLOAT' if pd.api.types.is_float_dtype(series) else 'INTEGER'
        else:
            field_types[column] = 'STRING'  # fallback; adjust as needed
    return field_types


def bq_python_create_table_dynamic(client,bq_csv_df, table, dataset_id, table_id):
    """Create a table whose schema is inferred from a DataFrame, unless it exists.

    Args:
        client: BigQuery client (``get_table`` / ``create_table`` are used).
        bq_csv_df: DataFrame whose columns define the schema; every field is
            created as NULLABLE.
        table: Table reference to look up and, if missing, create.
        dataset_id: Dataset name, used only in the printed messages.
        table_id: Table name, used only in the printed messages.

    Returns:
        A confirmation string when the table was created, otherwise ``None``
        (table already exists, schema inference failed, or the lookup failed
        for a reason other than "Not found" — each case is printed).
    """
    # Infer the schema first; a broken schema must not lead to creating a table.
    try:
        field_types = _bq_infer_field_types(bq_csv_df)
    except Exception as e:
        print(e)
        return None

    try:
        client.get_table(table)
        print(f"La tabla {table_id} ya existe en el conjunto de datos {dataset_id}.")
        return None
    except Exception as e:
        if "Not found" not in str(e):
            # Previously swallowed silently; report it like the static variant does.
            print(f"Error: {e}")
            return None

        print(f"La tabla {table_id} no existe en el conjunto de datos {dataset_id}.")

        # Turn the inferred mapping into the list of fields BigQuery expects.
        schema = [
            bigquery.SchemaField(column_name, field_type, mode="NULLABLE")
            for column_name, field_type in field_types.items()
        ]
        new_table = bigquery.Table(table, schema=schema)

        # To create a partitioned table instead:
        #new_table.time_partitioning = bigquery.TimePartitioning(
        #    type_=bigquery.TimePartitioningType.DAY,
        #    field="date",  # name of column to use for partitioning
        #    expiration_ms=1000 * 60 * 60 * 24 * 90,
        #)  # 90 days

        created = client.create_table(new_table)
        return "Created table {}.{}.{}".format(created.project, created.dataset_id, created.table_id)

# Create the table with the schema inferred from the CSV DataFrame (skipped if it already exists).
bq_python_create_table_dynamic(client,bq_csv_df, table, dataset_id, table_id)

La tabla cars_upload_test no existe en el conjunto de datos chicago_taxi_tips.


'Created table curso-bigquery-mide-403114.chicago_taxi_tips.cars_upload_test'

###### 2.A INSERTAR datos del csv en la tabla debajo de los ya existentes

In [146]:
def bq_python_insert_values(client, bq_csv_df, table, dataset_id, table_id):
    """Load a DataFrame into an existing table through a load job.

    Args:
        client: BigQuery client (``get_table`` / ``load_table_from_dataframe``).
        bq_csv_df: DataFrame to load.
        table: Reference of the destination table.
        dataset_id: Dataset name, used only in the printed messages.
        table_id: Table name, used in the messages and the returned text.

    Returns:
        A confirmation string on success, otherwise ``None`` (the reason is
        printed).
    """
    # Step 1: look the table up. Only a "Not found" error means it is missing.
    try:
        table = client.get_table(table)
    except Exception as e:
        if "Not found" in str(e):
            print(f"La tabla {table_id} NO existe en el conjunto de datos {dataset_id}, hay que crearla")
        else:
            print(f"Error: {e}")
        return None

    print(f"La tabla {table_id} existe en el conjunto de datos {dataset_id}.")

    # Step 2: load the data. A failure here used to be reported as "table does
    # not exist"; report the real error instead.
    try:
        job = client.load_table_from_dataframe(bq_csv_df, table)
        job.result()  # Wait for the load to complete
    except Exception as e:
        print(f"Error: {e}")
        return None

    return(f"Datos del CSV cargados en la tabla {table_id} de BigQuery.")

# Load the CSV DataFrame into the table with the default load-job settings.
bq_python_insert_values(client, bq_csv_df, table, dataset_id, table_id)

La tabla cars_upload_test existe en el conjunto de datos chicago_taxi_tips.


'Datos del CSV cargados en la tabla cars_upload_test de BigQuery.'

###### 2.B REEMPLAZAR datos ya existentes en la tabla por los del csv

In [42]:
def bq_python_truncate_table(client, bq_csv_df, table, dataset_id, table_id):
    """Replace the contents of an existing table with a DataFrame.

    The load job is configured with ``write_disposition="WRITE_TRUNCATE"``.

    Args:
        client: BigQuery client (``get_table`` / ``load_table_from_dataframe``).
        bq_csv_df: DataFrame whose rows replace the table contents.
        table: Reference of the destination table.
        dataset_id: Dataset name, used only in the printed messages.
        table_id: Table name, used in the messages and the returned text.

    Returns:
        A confirmation string on success, otherwise ``None`` (the reason is
        printed).
    """
    # Step 1: look the table up. Only a "Not found" error means it is missing.
    try:
        table = client.get_table(table)
    except Exception as e:
        if "Not found" in str(e):
            print(f"La tabla {table_id} NO existe en el conjunto de datos {dataset_id}, hay que crearla")
        else:
            print(f"Error: {e}")
        return None

    print(f"La tabla {table_id} existe en el conjunto de datos {dataset_id}.")

    # Step 2: truncate-and-load. A failure here used to be reported as "table
    # does not exist"; report the real error instead.
    try:
        # Truncate
        job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")

        # Insert the data
        job = client.load_table_from_dataframe(bq_csv_df, table, job_config=job_config)
        job.result()  # Wait for the load to complete
    except Exception as e:
        print(f"Error: {e}")
        return None

    return(f"Datos del CSV reemplazaron los existentes en la tabla {table_id} de BigQuery.")

# Replace the table contents with the CSV DataFrame (WRITE_TRUNCATE load job).
bq_python_truncate_table(client, bq_csv_df, table, dataset_id, table_id)

La tabla cars_upload_test existe en el conjunto de datos chicago_taxi_tips.


'Datos del CSV reemplazaron los existentes en la tabla cars_upload_test de BigQuery.'

##### c) pandas_gbq -> requiere autenticación; la descartamos

In [None]:
project_id = 'curso-bigquery-mide-403114'
dataset_id = 'chicago_taxi_tips'
table_id = 'cars_upload_test'

# Relies on `bq_credentials` and `scopes` from the configuration cell above.
try:
    credentials = service_account.Credentials.from_service_account_file(bq_credentials, scopes=scopes)
    client = bigquery.Client(credentials=credentials, project=credentials.project_id)
except Exception as e:
    print("Big query conection wrong: " + str(e))

# Path to the CSV file on the local machine
csv_path = './files/cars.csv'  # replace with your real path

# Build a reference to the target table
try:
    table_ref = client.dataset(dataset_id).table(table_id)
except Exception as e:
    print("Error in Bigquery table reference: " + str(e))

# Check whether the table already exists in the dataset
try:
    client.get_table(table_ref)
    print(f"La tabla {table_id} ya existe en el conjunto de datos {dataset_id}.")
except Exception as e:
    if "Not found" in str(e):
        print(f"La tabla {table_id} no existe en el conjunto de datos {dataset_id}.")

        # Load the CSV into a pandas DataFrame
        df = pd.read_csv(csv_path, sep=",", index_col=False)

        # Load the DataFrame into BigQuery. The service-account credentials are passed
        # explicitly so pandas_gbq reuses them instead of asking for its own authentication.
        to_gbq(
            df,
            f'{project_id}.{dataset_id}.{table_id}',
            project_id=project_id,
            if_exists='replace',
            credentials=credentials,
        )

        print(f"La tabla {table_id} ha sido creada y los datos del CSV han sido cargados en BigQuery.")
    else:
        print(f"Error: {e}")