In [0]:
# Source folder (a /Volumes path) and the DBFS folder its contents are copied to.
spider2_volumes_path = "/Volumes/doan_spider/default/spider2-sqlite"
dbfs_target_path = "dbfs:/doan/spider2"

# Record the path and name of every entry found in the source folder.
files = [
  dict(path=entry.path, name=entry.name)
  for entry in dbutils.fs.ls(spider2_volumes_path)
]
files

In [0]:
# Copy each listed entry into the DBFS target folder, keeping its name.
for source in files:
  destination = f"{dbfs_target_path}/{source['name']}"
  dbutils.fs.cp(source['path'], destination)

In [0]:
# List what now sits in the DBFS target folder.
dbfs_spider_files = dbutils.fs.ls(dbfs_target_path)

# Preview, for the first two entries, the JDBC URL built from the listed path
# exactly as dbutils reports it (no rewriting of the path is done here).
for sqlite_file in dbfs_spider_files[:2]:
  jdbc_url_prefix = "jdbc:sqlite:"
  file_path = sqlite_file.path
  # Everything before the first ".sqlite" in the file name.
  schema_name = sqlite_file.name.partition('.sqlite')[0]
  print(jdbc_url_prefix + "/" + file_path)



In [0]:
# Same preview as before, but with the leading "dbfs:/" of each path rewritten
# to "dbfs/" before the JDBC URL is assembled.
for sqlite_file in dbfs_spider_files[:2]:
  jdbc_url_prefix = "jdbc:sqlite:"
  file_path = sqlite_file.path.replace('dbfs:/', 'dbfs/')
  # Everything before the first ".sqlite" in the file name.
  schema_name = sqlite_file.name.partition('.sqlite')[0]
  print(jdbc_url_prefix + "/" + file_path)

In [0]:
def table_exists(catalog, schema, table):
  """Report whether `table` is listed in `catalog.schema`.

  Runs ``SHOW TABLES IN <catalog>.<schema> LIKE '<table>'`` through the
  notebook-global ``spark`` session and checks whether any row comes back.

  Args:
    catalog: Catalog name, interpolated into the query as-is.
    schema: Schema name, interpolated into the query as-is.
    table: Table name, used as the LIKE pattern.

  Returns:
    True when the query returns at least one row. False when it returns no
    rows, or when running the query raises (best effort: the failure is
    printed together with its cause instead of being propagated).
  """
  query = f"SHOW TABLES IN {catalog}.{schema} LIKE '{table}'"
  try:
    result = spark.sql(query).collect()
  except Exception as e:
    # Keep the best-effort contract, but surface why the check failed so a
    # broken lookup can be told apart from a table that is simply absent.
    print(f"table check failed for {table}: {e}")
    return False
  return len(result) > 0
  
catalog = "doan_spider"
dbfs_spider_files = dbutils.fs.ls(dbfs_target_path)

# One dict per failure, with the keys "schema", "table" and "error".
failed_tables = []
# Counts failed table writes only (schema-level failures are not counted).
failed_count = 0

for dbfs_file in dbfs_spider_files:

  # Derive the JDBC URL and the target schema name for this file.
  jdbc_url_prefix = "jdbc:sqlite:"
  file_path = dbfs_file.path.replace('dbfs:/', 'dbfs/')
  schema_name = dbfs_file.name.split('.sqlite')[0].replace('-', '_')
  jdbc_url = f"{jdbc_url_prefix}/{file_path}"

  try:
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema_name}")

    # Distinct tbl_name values from sqlite_master; the same tbl_name can
    # appear on several rows, hence dropDuplicates.
    sqlite_tables = [
      t.tbl_name
      for t in (
          spark.read.format("jdbc")
          .option("url", jdbc_url)
          .option("dbtable", "sqlite_master")  # or any valid SQL sub-query in parentheses
          .option("driver", "org.sqlite.JDBC")
          .load().select("tbl_name").dropDuplicates()
      ).collect()
    ]
  except Exception as e:
    print(f"Failed schema creation: {schema_name}")
    failed_tables.append({
        "schema": schema_name,
        "table": "none - failed schema creation",
        "error":str(e)
      })
    # Nothing can be copied without a table listing for this file. Moving on
    # also avoids a NameError on the first file and, on later files, avoids
    # re-using the previous file's table list against this file and schema.
    continue

  for table in sqlite_tables:
    if table_exists(catalog, schema_name, table):
      print(f"Table: {catalog}.{schema_name}.{table} exists. Skipping. \n")
      continue

    try:
      df = (
          spark.read.format("jdbc")
          .option("url", jdbc_url)
          .option("dbtable", table)  # or any valid SQL sub-query in parentheses
          .option("driver", "org.sqlite.JDBC")
          .load()
      )

      df.write.mode("overwrite").saveAsTable(f"{catalog}.{schema_name}.{table}")
    except Exception as e:
      # Best effort: log the failure and carry on with the next table.
      failed_tables.append({
        "schema": schema_name,
        "table": table,
        "error":str(e)
      })
      failed_count = failed_count + 1
      print(f"Failed to write table: {catalog}.{schema_name}.{table}")
      print(f"Failed table writes: {failed_count}")
      print("\n\n")


In [0]:
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StringType, StructField, StructType

# Explicit schema: createDataFrame cannot infer column types from an empty
# list, which is exactly what `failed_tables` is when nothing failed.
# Fields are listed in sorted-key order, intended to match the column order
# that inference gives for dict rows (TODO confirm against the existing table).
failed_tables_schema = StructType([
  StructField("error", StringType()),
  StructField("schema", StringType()),
  StructField("table", StringType()),
])

# One row per recorded failure, stamped with the time this cell ran.
failed_questions = (
  spark.createDataFrame(failed_tables, schema=failed_tables_schema)
  .withColumn("process_time", current_timestamp())
)

In [0]:
# Append rather than rely on the default save mode, which raises once the
# table exists and would break every re-run of the notebook. Each run's rows
# stay distinguishable through their process_time value.
failed_questions.write.mode("append").saveAsTable("doan_spider.default.failed_ingestion_tables")

In [0]:
# Render the saved failure log table.
failure_log_table = "doan_spider.default.failed_ingestion_tables"
display(spark.table(failure_log_table))