In [1]:
import os

from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled SparkSession programmatically.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)
# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
# List the databases registered in the session catalog.
spark.catalog.listDatabases()

[Database(name='default', description='Default Hive database', locationUri='file:/home/jovyan/work/jupyter/spark-warehouse')]

In [3]:
# Keep the name of the session's current database and display it.
currentDB = spark.catalog.currentDatabase()
currentDB

'default'

In [4]:
# Create the s8a database; IF NOT EXISTS makes the statement safe to re-run.
spark.sql("create database if not exists s8a")

DataFrame[]

In [5]:
# List the databases again to confirm that s8a now appears.
spark.catalog.listDatabases()

[Database(name='default', description='Default Hive database', locationUri='file:/home/jovyan/work/jupyter/spark-warehouse'),
 Database(name='s8a', description='', locationUri='file:/home/jovyan/work/jupyter/spark-warehouse/s8a.db')]

In [6]:
# Make s8a the session's current database.
# Plain `USE <name>` is the form given in the Spark SQL reference; the
# `USE DATABASE <name>` variant may not be accepted by older Spark parsers.
spark.sql("USE s8a")

DataFrame[]

In [7]:
# Check which database is current after the switch.
spark.catalog.currentDatabase()

's8a'

In [8]:
# Load the MySQL table `retail_db.customers` through Spark's JDBC data source.
# Credentials are read from the environment (MYSQL_USER / MYSQL_PASSWORD) so
# they can be overridden without editing the notebook; the fallbacks are the
# values used by the course environment.
# The port is given in the JDBC URL, which is where the driver expects it;
# Spark's JDBC source has no dedicated `port` option.
jdbcDF = (
    spark.read
    .format("jdbc")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("url", "jdbc:mysql://iabd-mysql:3306")
    .option("dbtable", "retail_db.customers")
    .option("user", os.environ.get("MYSQL_USER", "iabd"))
    .option("password", os.environ.get("MYSQL_PASSWORD", "iabd"))
    .load()
)

In [9]:
# List the tables and views visible from the current database
# (empty in the recorded run: nothing has been registered yet).
spark.catalog.listTables()

[]

In [10]:
# Register the JDBC DataFrame as a temporary view named "clientes"
# (replacing any existing temporary view with that name).
jdbcDF.createOrReplaceTempView("clientes")

In [11]:
# List tables again: the temporary view "clientes" is now reported.
spark.catalog.listTables()

[Table(name='clientes', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [12]:
# Persist the DataFrame as the table "clientes" in the current database.
# Save mode "errorIfExists": the write is expected to fail if the table is
# already present, so this cell is not re-runnable against an existing warehouse.
(
    jdbcDF.write
    .mode("errorIfExists")
    .saveAsTable("clientes")
)

In [13]:
# List tables again: in the recorded run both the managed table s8a.clientes
# and the temporary view "clientes" are reported.
spark.catalog.listTables()

[Table(name='clientes', database='s8a', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='clientes', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [15]:
# Show column names, data types and comments of the persisted table s8a.clientes.
spark.sql("describe table s8a.clientes").show()

+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|      customer_id|      int|   null|
|   customer_fname|   string|   null|
|   customer_lname|   string|   null|
|   customer_email|   string|   null|
|customer_password|   string|   null|
|  customer_street|   string|   null|
|    customer_city|   string|   null|
|   customer_state|   string|   null|
| customer_zipcode|   string|   null|
+-----------------+---------+-------+



In [19]:
# Persist the same data as the Parquet-backed table "clientesp";
# save mode "overwrite" replaces any existing contents.
(
    jdbcDF.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("clientesp")
)

In [20]:
# Persist the same data as the JSON-backed table "clientesj".
# Save mode "append": re-running this cell adds the rows again.
(
    jdbcDF.write
    .format("json")
    .mode("append")
    .saveAsTable("clientesj")
)

In [18]:
# Load "clientes" by its unqualified name and print its schema.
# NOTE(review): a temporary view and the managed table s8a.clientes share this
# name; the unqualified lookup presumably resolves to the temporary view first.
# Use "s8a.clientes" if the persisted table is intended — TODO confirm.
df_clientes = spark.table("clientes")
df_clientes.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_password: string (nullable = true)
 |-- customer_street: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_zipcode: string (nullable = true)



In [32]:
# Keep the catalog's table listing in a variable and display it.
tablas = spark.catalog.listTables()
tablas

[Table(name='clientes', database='s8a', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='clientesj', database='s8a', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='clientesp', database='s8a', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='clientes', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [35]:
# Same listing through SQL; show() prints the result and returns None,
# so no trailing semicolon is needed to suppress output.
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|      s8a| clientes|      false|
|      s8a|clientesj|      false|
|      s8a|clientesp|      false|
|         | clientes|       true|
+---------+---------+-----------+



In [43]:
# Attach table-level properties to clientesp: a 'comment' describing the data
# and an 'active' flag (stored as the string 'true').
spark.sql("""
ALTER TABLE clientesp
SET TBLPROPERTIES (
  'comment' = 'Datos de clientes de retail_db',
  'active' = 'true'
)
""")

DataFrame[]

In [44]:
# List the tables through SQL again after setting the table properties;
# show() prints the result and returns None, so no trailing semicolon is needed.
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|      s8a| clientes|      false|
|      s8a|clientesj|      false|
|      s8a|clientesp|      false|
|         | clientes|       true|
+---------+---------+-----------+



In [45]:
# List tables through the catalog API; in the recorded run the description of
# clientesp matches the text of the 'comment' table property set earlier.
spark.catalog.listTables()

[Table(name='clientes', database='s8a', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='clientesj', database='s8a', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='clientesp', database='s8a', description='Datos de clientes de retail_db', tableType='MANAGED', isTemporary=False),
 Table(name='clientes', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [52]:
# Try to attach a comment to the customer_id column of clientesp.
# NOTE(review): in the recorded run this statement raised AnalysisException
# (HiveException: "Unable to alter table ... incompatible with the existing
# columns"), so the comment was not applied and a Run All stops here.
# The root cause is not visible in this notebook — TODO investigate.
spark.sql("ALTER TABLE clientesp ALTER COLUMN customer_id COMMENT 'identificador del cliente. PK';")

AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to alter table. The following columns have types incompatible with the existing columns in their respective positions :
col

In [50]:
# Try to attach a comment to the customer_fname column of clientesp.
# NOTE(review): in the recorded run this raised the same AnalysisException
# (HiveException: "Unable to alter table ...") as the customer_id attempt,
# so the comment was not applied — TODO investigate.
spark.sql("""
ALTER TABLE clientesp
    ALTER COLUMN customer_fname
        COMMENT 'Nombre del cliente';
""")

AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to alter table. The following columns have types incompatible with the existing columns in their respective positions :
col

In [51]:
# Inspect the columns of s8a.clientesp; in the recorded run every column
# comment is still null.
spark.sql("describe table s8a.clientesp").show()

+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|      customer_id|      int|   null|
|   customer_fname|   string|   null|
|   customer_lname|   string|   null|
|   customer_email|   string|   null|
|customer_password|   string|   null|
|  customer_street|   string|   null|
|    customer_city|   string|   null|
|   customer_state|   string|   null|
| customer_zipcode|   string|   null|
+-----------------+---------+-------+



In [56]:
# List the column metadata of clientesp through the catalog API.
spark.catalog.listColumns("clientesp")

[Column(name='customer_id', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='customer_fname', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='customer_lname', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='customer_email', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='customer_password', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='customer_street', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='customer_city', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='customer_state', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='customer_zipcode', description=None, 