## Testing the notebook

In [0]:
# Smoke test: shows that the notebook can execute a Python cell.
welcome_message = "Welcome to learn Databricks - with Fun"
print(welcome_message)

In [0]:
# Display the class of `spark`. The name is not defined in this notebook;
# presumably it is the SparkSession injected by the Databricks runtime - confirm.
type(spark)

## Create a DataFrameReader

In [0]:
# Hold on to the session's DataFrameReader so its concrete class can be inspected.
# NOTE: a DataFrameReader remembers every option and schema set through it, whereas
# each access to `spark.read` returns a new reader. Prefer a fresh `spark.read` per load.
dfr = spark.read
reader_class = type(dfr)
print(reader_class)

## Create DataFrames from different sources

### Create a DataFrame from a CSV file

In [0]:
# Read the users CSV file, taking column names from the header row and letting
# Spark infer the column types.
# A fresh `spark.read` is used instead of the shared `dfr`: a DataFrameReader keeps
# every option and schema set through it, so re-running this cell after another cell
# has set e.g. `sep="|"` or a schema on `dfr` would silently change how the file is parsed.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.csv",
    header=True,
    inferSchema=True,
)
df.display()

#### Create a DataFrame from a delimited file

In [0]:
# Read the pipe-delimited users file with the CSV reader by overriding the separator.
# A fresh `spark.read` is used instead of the shared `dfr`: options set on a
# DataFrameReader stay on it, so passing `sep="|"` through `dfr` would also apply the
# pipe separator to every later `dfr.csv(...)` call in the notebook.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.dat",
    inferSchema=True,
    header=True,
    sep="|",
)
df.printSchema()
df.display()

### Create a DataFrame from a JSON file

In [0]:
# Read the users JSON file with the reader's default options.
# A fresh `spark.read` is used instead of the shared `dfr`: options and schemas set
# on a DataFrameReader stay on it, so reading through `dfr` would pick up whatever
# other cells have set on it (for example `multiLine=True` or an explicit schema).
df = spark.read.json(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.json"
)
df.printSchema()
df.display()

In [0]:
# Read users_002.json with multi-line parsing enabled.
# `display()` returns None, so the DataFrame is bound to `df` first and displayed in a
# separate statement; chaining the call would leave `df` set to None.
# A fresh `spark.read` keeps `multiLine=True` from sticking to the shared `dfr` reader
# and affecting later reads made through it.
df = spark.read.json(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_002.json",
    multiLine=True,
)
df.display()

In [0]:
# Multi-line JSON file containing two records.
# `display()` returns None, so the DataFrame is bound to `df` first and displayed in a
# separate statement; chaining the call would leave `df` set to None.
# A fresh `spark.read` keeps `multiLine=True` from sticking to the shared `dfr` reader
# and affecting later reads made through it.
df = spark.read.json(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_003.json",
    multiLine=True,
)
df.display()

In [0]:
# Multi-line JSON file containing two records wrapped in an array.
# `display()` returns None, so the DataFrame is bound to `df` first and displayed in a
# separate statement; chaining the call would leave `df` set to None.
# A fresh `spark.read` keeps `multiLine=True` from sticking to the shared `dfr` reader
# and affecting later reads made through it.
df = spark.read.json(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_004.json",
    multiLine=True,
)
df.display()

In [0]:
# Import only the type classes this cell uses instead of `import *`, so it is clear
# where each name comes from and the notebook namespace is not flooded.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Explicit schema for the users file: supplying it replaces schema inference.
USER_SCHEMA = StructType(
    [
        StructField("id", IntegerType()),
        StructField("age", IntegerType()),
        StructField("gen", StringType()),
        StructField("designation", StringType()),
        StructField("salary", IntegerType()),
    ]
)

# A fresh `spark.read` is used instead of the shared `dfr`: a DataFrameReader keeps the
# options and schema set through it. Reading through `dfr` would inherit `sep="|"` and
# `multiLine=True` from earlier cells (mis-parsing this comma-separated file) and would
# leave USER_SCHEMA attached to `dfr` for every later read.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.csv",
    header=True,
    schema=USER_SCHEMA,
)
df.printSchema()
df.display()

In [0]:
# Import only the type classes this cell uses instead of `import *`, so it is clear
# where each name comes from and the notebook namespace is not flooded.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Explicit schema for the users file: supplying it replaces schema inference.
USER_SCHEMA = StructType(
    [
        StructField("id", IntegerType()),
        StructField("age", IntegerType()),
        StructField("gen", StringType()),
        StructField("designation", StringType()),
        StructField("salary", IntegerType()),
    ]
)

# `spark.read` returns a new reader, so only the options given here apply to this load.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.csv",
    header=True,
    schema=USER_SCHEMA,
)
df.printSchema()
df.display()

### Handling Bad Records

In [0]:
# PERMISSIVE mode: malformed records are kept, with the fields that cannot be parsed
# set to null.
# A fresh `spark.read` is used instead of the shared `dfr`: a DataFrameReader keeps the
# options and schema set through it, so reading through `dfr` would apply the users
# schema and `multiLine=True` from earlier cells to this unrelated access-log file.
df = spark.read.json(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/access_logs.json",
    mode="PERMISSIVE",
)
df.display()

In [0]:
# DROPMALFORMED mode: records that cannot be parsed are dropped from the result.
# A fresh `spark.read` is used instead of the shared `dfr`: a DataFrameReader keeps the
# options and schema set through it, so reading through `dfr` would apply the users
# schema and `multiLine=True` from earlier cells to this unrelated access-log file.
df = spark.read.json(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/access_logs.json",
    mode="DROPMALFORMED",
)
df.display()

In [0]:
# FAILFAST mode: Spark raises as soon as it meets a record it cannot parse.
# The failure is what this cell demonstrates, so it is caught and summarised instead of
# aborting a "Run All" of the notebook.
# A fresh `spark.read` is used instead of the shared `dfr`: options set on a
# DataFrameReader stay on it, so `mode="FAILFAST"` (and the schema / `multiLine` set by
# earlier cells) would otherwise carry over between reads made through `dfr`.
try:
    df = spark.read.json(
        path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/access_logs.json",
        mode="FAILFAST",
    )
    df.display()
except Exception as exc:
    # Spark error text can be very long; the first line is enough to show the cause.
    first_line = str(exc).strip().splitlines()[:1]
    print(f"FAILFAST aborted the read - {type(exc).__name__}: {''.join(first_line)}")

### Convert CSV to JSON using different modes

In [0]:
# OVERWRITE mode: any data already present at the output path is replaced.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.csv",
    inferSchema=True,
    header=True,
)
print(type(df.write))

output_json_path = "/Volumes/quickstart_catalog/quickstart_schema/sandbox/output_json/"
df.write.save(
    format="JSON",
    path=output_json_path,
    mode="overwrite",
)

# Read the result back with a fresh `spark.read` rather than the shared `dfr`: a
# DataFrameReader keeps the options and schema set through it, so `dfr` would re-apply
# the schema, `mode` and `multiLine` settings left on it by earlier cells.
spark.read.json(path=output_json_path).display()

In [0]:
# APPEND mode: the rows are added to whatever already exists at the output path, so
# every run of this cell adds another copy of the data.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.csv",
    inferSchema=True,
    header=True,
)
print(type(df.write))

output_json_path = "/Volumes/quickstart_catalog/quickstart_schema/sandbox/output_json/"
df.write.save(
    format="JSON",
    path=output_json_path,
    mode="append",
)

# Read the result back with a fresh `spark.read` rather than the shared `dfr`: a
# DataFrameReader keeps the options and schema set through it, so `dfr` would re-apply
# the schema, `mode` and `multiLine` settings left on it by earlier cells.
spark.read.json(path=output_json_path).display()

In [0]:
# IGNORE mode: if data already exists at the output path the write is silently skipped.
df = spark.read.csv(
    path="/Volumes/quickstart_catalog/quickstart_schema/sandbox/dataset/sample_dataset/users_001.csv",
    inferSchema=True,
    header=True,
)
print(type(df.write))

output_json_path = "/Volumes/quickstart_catalog/quickstart_schema/sandbox/output_json/"
df.write.save(
    format="JSON",
    path=output_json_path,
    mode="ignore",
)

# Read the result back with a fresh `spark.read` rather than the shared `dfr`: a
# DataFrameReader keeps the options and schema set through it, so `dfr` would re-apply
# the schema, `mode` and `multiLine` settings left on it by earlier cells.
spark.read.json(path=output_json_path).display()