In [0]:
%run "./Mini-Kurs-ETL-Spark/3. Python/3. Pobierz Dane"

In [0]:
# Peek at the raw actors CSV (at most the first 1000 bytes) before parsing it,
# to check the header row, the delimiter and the quoting style.
_actors_csv_head = dbutils.fs.head("dbfs:/FileStore/tables/Files/actors.csv", 1000)
print(_actors_csv_head)

[Truncated to first 1000 bytes]
imdb_title_id,ordering,imdb_name_id,category,job,characters
"tt0000009","1","nm0063086","actress",,"[Miss Geraldine Holbrook (Miss Jerry)]"
"tt0000009","2","nm0183823","actor",,"[Mr. Hamilton]"
"tt0000009","3","nm1309758","actor",,"[Chauncey Depew - the Director of the New York Central Railroad]"
"tt0000009","4","nm0085156","director",,
"tt0000574","1","nm0846887","actress",,"[Kate Kelly]"
"tt0000574","2","nm0846894","actor",,"[School Master]"
"tt0000574","3","nm3002376","actor",,"[Steve Hart]"
"tt0000574","4","nm0170118","actress",,
"tt0000574","5","nm0846879","director",,
"tt0000574","6","nm0317210","producer","producer",
"tt0000574","7","nm0425854","producer","producer",
"tt0000574","8","nm0846911","producer","producer",
"tt0000574","9","nm2421834","composer",,
"tt0000574","10","nm0675239","cinematographer",,
"tt0001892","1","nm0003425","actress",,"[Stella]"
"tt0001892","2","nm0699637","actor",,"[Grev Johan Waldberg]"
"tt0001892","3","nm0375839","acto

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Explicit schema for actors.csv: every column is a nullable string except
# `ordering`, which is parsed as an integer.
schema = (
    StructType()
    .add("imdb_title_id", StringType(), True)
    .add("ordering", IntegerType(), True)
    .add("imdb_name_id", StringType(), True)
    .add("category", StringType(), True)
    .add("job", StringType(), True)
    .add("characters", StringType(), True)
)

# Read the file with its first line treated as the header and the schema above
# applied, so no schema inference is requested.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("dbfs:/FileStore/tables/Files/actors.csv")
)

# Show a sample of rows, then the resulting column types.
df.show()
df.printSchema()


+-------------+--------+------------+---------------+----------+--------------------+
|imdb_title_id|ordering|imdb_name_id|       category|       job|          characters|
+-------------+--------+------------+---------------+----------+--------------------+
|    tt0000009|       1|   nm0063086|        actress|      null|[Miss Geraldine H...|
|    tt0000009|       2|   nm0183823|          actor|      null|      [Mr. Hamilton]|
|    tt0000009|       3|   nm1309758|          actor|      null|[Chauncey Depew -...|
|    tt0000009|       4|   nm0085156|       director|      null|                null|
|    tt0000574|       1|   nm0846887|        actress|      null|        [Kate Kelly]|
|    tt0000574|       2|   nm0846894|          actor|      null|     [School Master]|
|    tt0000574|       3|   nm3002376|          actor|      null|        [Steve Hart]|
|    tt0000574|       4|   nm0170118|        actress|      null|                null|
|    tt0000574|       5|   nm0846879|       director| 

In [0]:
%fs head /FileStore/tables/bad_data.csv

In [0]:
# Location of the deliberately dirty sample file used by the read-mode demos.
file_path = "/FileStore/tables/bad_data.csv"

# Typed schema for the dirty file; all columns are nullable.
# `salary` is kept as a string, so only `id` and `age` are parsed as integers.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("salary", StringType()),
    ]
])

# PERMISSIVE mode: rows are kept even when a value does not fit the schema
# (in the captured output such values appear as null).
df_permissive = (
    spark.read
    .options(mode="PERMISSIVE", header="true")
    .schema(schema)
    .csv(file_path)
)

print("PERMISSIVE MODE:")
df_permissive.show()


PERMISSIVE MODE:
+----+-------+----+------+
|  id|   name| age|salary|
+----+-------+----+------+
|   1|  Alice|  30|  5000|
|   2|    Bob|null|  6000|
|   3|Charlie|null|  7000|
|   4|  David|  40|  null|
|   5|    Eve|  35|   abc|
|   6|  Frank|null|  null|
|null|   null|null|  null|
+----+-------+----+------+



In [0]:
# DROPMALFORMED mode: rows Spark considers malformed are dropped from the result.
# No schema is passed here, so values are not checked against the typed schema
# used for the PERMISSIVE read.
df_dropmalformed = (
    spark.read
    .options(header="true", mode="DROPMALFORMED")
    .csv(file_path)
)
print("DROPMALFORMED MODE:")
df_dropmalformed.show()

DROPMALFORMED MODE:
+---+-------+-----------+------+
| id|   name|        age|salary|
+---+-------+-----------+------+
|  1|  Alice|         30|  5000|
|  2|    Bob|twenty-five|  6000|
|  3|Charlie|       null|  7000|
|  4|  David|         40|  null|
|  5|    Eve|         35|   abc|
+---+-------+-----------+------+



In [0]:
# FAILFAST mode: the read aborts on the first malformed row. In the captured run
# the error was raised by show(), after the heading was printed, not by the
# read call. The exception is caught on purpose so the notebook can display
# the failure and keep running.
try:
    df_failfast = (
        spark.read
        .options(header="true", mode="FAILFAST")
        .csv(file_path)
    )
    print("FAILFAST MODE:")
    df_failfast.show()
except Exception as read_error:
    print(f"FAILFAST MODE: Odczyt zatrzymany z powodu błędu: {read_error}")

FAILFAST MODE:
FAILFAST MODE: Odczyt zatrzymany z powodu błędu: An error occurred while calling o546.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 9.0 failed 1 times, most recent failure: Lost task 0.0 in stage 9.0 (TID 9) (ip-10-172-168-131.us-west-2.compute.internal executor driver): com.databricks.sql.io.FileReadException: Error while reading file dbfs:/FileStore/tables/bad_data.csv.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.logFileNameAndThrow(FileScanRDD.scala:704)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.getNext(FileScanRDD.scala:673)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.$anonfun$hasNext$1(FileScanRDD.scala:493)
	at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)
	at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler

In [0]:
# Read the dirty file with the `badRecordsPath` option pointing at a DBFS folder.
# NOTE(review): presumably a Databricks-specific option that diverts unparsable
# rows to that folder instead of failing the read - confirm against the docs.
df_with_bad_records = (
    spark.read
    .options(header="true", badRecordsPath="dbfs:/FileStore/bad_records")
    .csv(file_path)
)

df_with_bad_records.show()

+---+-------+-----------+------+
| id|   name|        age|salary|
+---+-------+-----------+------+
|  1|  Alice|         30|  5000|
|  2|    Bob|twenty-five|  6000|
|  3|Charlie|       null|  7000|
|  4|  David|         40|  null|
|  5|    Eve|         35|   abc|
+---+-------+-----------+------+



In [0]:
%fs ls /FileStore/bad_records/20250316T085357/bad_records/

path,name,size,modificationTime
dbfs:/FileStore/bad_records/20250316T085357/bad_records/part-00000-a971e165-9a77-4da2-87b8-a1be16cfced8,part-00000-a971e165-9a77-4da2-87b8-a1be16cfced8,362,1742115239000


In [0]:
# Target directory for the Parquet copy of the cleaned data.
# Despite the name, the write below uses neither partitionBy nor bucketBy.
parquet_path = "dbfs:/FileStore/output_partitioned_bucketed.parquet/"

# Persist the DataFrame as Parquet, replacing any previous output.
# The chain must end with save(): configuring format/mode/path only builds a
# DataFrameWriter and writes nothing, so the read below would otherwise rely on
# files left behind by an earlier run (or fail on a clean workspace).
df_with_bad_records.write \
    .format("parquet") \
    .mode("overwrite") \
    .save(parquet_path)

# Read the freshly written files back to verify the round trip.
df_parquet = spark.read.parquet(parquet_path)

print("Dane odczytane z pliku PARQUET:")
df_parquet.show(truncate=False)


Dane odczytane z pliku PARQUET:
+---+-------+-----------+------+
|id |name   |age        |salary|
+---+-------+-----------+------+
|1  |Alice  |30         |5000  |
|2  |Bob    |twenty-five|6000  |
|3  |Charlie|null       |7000  |
|4  |David  |40         |null  |
|5  |Eve    |35         |abc   |
+---+-------+-----------+------+



In [0]:
%fs ls dbfs:/FileStore/output_partitioned_bucketed.parquet/

path,name,size,modificationTime
dbfs:/FileStore/output_partitioned_bucketed.parquet/_SUCCESS,_SUCCESS,0,1742116283000
dbfs:/FileStore/output_partitioned_bucketed.parquet/_committed_7998666796950465210,_committed_7998666796950465210,123,1742116283000
dbfs:/FileStore/output_partitioned_bucketed.parquet/_started_7998666796950465210,_started_7998666796950465210,0,1742116281000
dbfs:/FileStore/output_partitioned_bucketed.parquet/part-00000-tid-7998666796950465210-b18139a0-f00c-4e4e-a165-15af7edd9c9e-29-1-c000.snappy.parquet,part-00000-tid-7998666796950465210-b18139a0-f00c-4e4e-a165-15af7edd9c9e-29-1-c000.snappy.parquet,1330,1742116283000
