# Creating a DataFrameReader

In [0]:
# Show the type of `spark`. The name is not defined in this notebook;
# presumably it is the SparkSession injected by the runtime — confirm.
type(spark)

In [0]:
# Keep a reference to the reader returned by `spark.read` and show its type.
# NOTE(review): options passed through this one reader object may persist
# across later reads that reuse `dfr` — confirm before sharing it between cells.
dfr = spark.read
type(dfr)

# Create DataFrame from different sources

## Create DataFrame from CSV

In [0]:
# Read the CSV through a fresh `spark.read` instead of the shared `dfr`.
# A DataFrameReader keeps every option it has been given, so a reused reader
# would carry earlier settings (for example the `sep="|"` used for the
# delimited file) into this read when cells are re-run out of order.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.csv",
    header=True,       # first line holds the column names
    inferSchema=True,  # let Spark derive column types from the data
)
df.printSchema()
df.display()

## Create DataFrame from Delimited File

In [0]:
# A pipe-delimited file is read with the CSV reader plus a custom separator.
# A fresh `spark.read` is used rather than the shared `dfr`, so that
# `sep="|"` is not left behind on a reader object that other cells reuse.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.dat",
    header=True,       # first line holds the column names
    inferSchema=True,  # let Spark derive column types from the data
    sep="|",           # field delimiter used by the .dat file
)
df.printSchema()
df.display()

## Create DataFrame from JSON File

In [0]:
# Load users_001.json with the JSON reader's default options.
df = spark.read.json(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.json"
)
df.display()

In [0]:
# multiLine lets one JSON record span several lines
# (presumably users_002.json is not one-record-per-line — confirm).
df = spark.read.option("multiLine", True).json(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_002.json"
)
df.display()

In [0]:
# Same multi-line JSON read, applied to users_003.json.
df = spark.read.option("multiLine", True).json(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_003.json"
)
df.display()

In [0]:
# Same multi-line JSON read, applied to users_004.json.
df = spark.read.option("multiLine", True).json(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_004.json"
)
df.display()

# Custom Schema

In [0]:
from pyspark.sql.types import *

# Explicit schema for the users file: column names and types are declared
# up front instead of being inferred from the data.
USER_SCHEMA = (
    StructType()
    .add("id", IntegerType())
    .add("age", IntegerType())
    .add("gender", StringType())
    .add("designation", StringType())
    .add("salary", IntegerType())
)

# Apply the schema to the reader, treat the first line as a header, then load.
df = spark.read.schema(USER_SCHEMA).option("header", True).csv(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.csv"
)
df.printSchema()
df.display()

# Handling Bad Records

## PERMISSIVE

In [0]:
# PERMISSIVE is the JSON reader's default mode: malformed records are kept
# rather than dropped or raised on. It is spelled out here so this cell
# names its mode the same way the DROPMALFORMED and FAILFAST cells do.
spark.read.json(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/access_logs.json",
    mode="PERMISSIVE",
).display()

## DROPMALFORMED

In [0]:
# DROPMALFORMED: read the same log file, discarding records that fail to parse.
spark.read.json(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/access_logs.json",
    mode="DROPMALFORMED",
).display()

## FAILFAST

In [0]:
# FAILFAST: read the same log file, raising as soon as a malformed record
# is encountered (so this cell is expected to error on bad input).
spark.read.json(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/access_logs.json",
    mode="FAILFAST",
).display()

# DataFrameWriter API

## Convert CSV into JSON

In [0]:
# Load the users CSV, then write it back out as JSON.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.csv",
    header=True,
    inferSchema=True
)
# A notebook cell only echoes its final expression, so the writer's type
# has to be printed explicitly to be visible here.
print(type(df.write))
# No save mode is given, so the writer's default applies; the write fails
# if the target path already exists (e.g. when this cell is re-run).
df.write.save(
    format="JSON",
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/output_json",
)

In [0]:
# Read the JSON output directory back in to check what was written.
spark.read.json(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/output_json"
).show()

## Output Modes

### ErrorIfExists

In [0]:
# ErrorIfExists is the writer's default mode; it is named explicitly so this
# cell reads like the Overwrite / Append / Ignore cells. Because the same
# path was already written earlier in the notebook, this write is expected
# to raise.
df.write.save(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/output_json",
    format="JSON",
    mode="errorifexists",
)

### Overwrite

In [0]:
# Overwrite: replace whatever already exists at the target path.
df.write.save(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/output_json",
    format="JSON",
    mode="overwrite",
)

### Append

In [0]:
# Append: add this DataFrame's rows to the data already at the target path.
df.write.save(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/output_json",
    format="JSON",
    mode="append",
)

### Ignore

In [0]:
# Ignore: skip the write without raising when the target path already has data.
df.write.save(
    "/Volumes/quickstart_catalog/quickstart_schema/sandbox/output_json",
    format="JSON",
    mode="ignore",
)