# Prerequisite

## Create Delta Table

In [0]:
%sql
-- Select the catalog that holds quickstart_schema for the statements below.
USE CATALOG dwlabdatabrickstest96_2249171740111810;

-- CREATE OR REPLACE (instead of a plain CREATE) lets the notebook be re-run
-- from the top: a plain CREATE TABLE fails once the table already exists.
-- Replacing also resets the column list to the ten base columns below.
CREATE OR REPLACE TABLE quickstart_schema.users (
  id INT,
  name STRING,
  dob DATE,
  email STRING,
  gender STRING,
  country STRING,
  region STRING,
  city STRING,
  asset INT,
  marital_status STRING
)

In [0]:
# Base load: read the first users file (column names from the header row,
# column types inferred from the data) ...
df = spark.read.csv(
    "/Volumes/dwlabdatabrickstest96_2249171740111810/quickstart_schema/sandbox/dataset/user_dataset/users_001.csv",
    inferSchema=True,
    header=True,
)

# ... and replace whatever the users table currently contains with it.
df.write.mode("overwrite").saveAsTable("quickstart_schema.users")

## Read Delta Table

In [0]:
# Load the users table as a DataFrame and render it in the notebook.
# NOTE(review): delta_Table is reused by later cells; a DataFrame keeps the
# schema it was resolved with, so it may not show columns added afterwards.
delta_Table = spark.read.table("quickstart_schema.users")
delta_Table.display()

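## Read another file for schema enforcement
> This write is expected to throw an error: the file name indicates it carries a new `education` column that the table schema does not have.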

In [0]:
# Schema-enforcement demo: append a file whose columns do not match the table.
# This cell is expected to raise (see the section heading).
#
# The save mode is set exactly once. Passing mode="overwrite" to saveAsTable()
# as well would silently replace the "append" chosen via .mode(), because
# saveAsTable(mode=...) calls .mode() again internally.
spark.read.csv(
    path="/Volumes/dwlabdatabrickstest96_2249171740111810/quickstart_schema/sandbox/dataset/user_dataset/users_006_new_column_education.csv",
    header=True,
    inferSchema=True
).write.mode("append").saveAsTable("quickstart_schema.users")


## Solution - Schema Evolution
1. Additional column
2. Missing column
3. Different data types

In [0]:
# Schema-evolution demo: mergeSchema=true lets the append add the source's
# extra column(s) to the table schema instead of rejecting the write.
#
# The save mode is set exactly once. Passing mode="overwrite" to saveAsTable()
# as well would override "append" and replace the existing rows rather than
# add to them.
spark.read.csv(
    path="/Volumes/dwlabdatabrickstest96_2249171740111810/quickstart_schema/sandbox/dataset/user_dataset/users_006_new_column_education.csv",
    header=True,
    inferSchema=True
).write.option("mergeSchema", "true").mode("append").saveAsTable("quickstart_schema.users")

# Re-read the table rather than reusing a DataFrame created before the write:
# an existing DataFrame keeps the schema it was resolved with and would not
# show a newly added column.
spark.read.table("quickstart_schema.users").display()

In [0]:
%sql 
-- List the current columns and data types of the users table.
DESCRIBE TABLE quickstart_schema.users;

In [0]:
%sql
-- Show up to four rows with id above 2501.
-- NOTE(review): presumably these ids belong to the file appended above; confirm against the dataset.
SELECT * FROM quickstart_schema.users WHERE id>2501 LIMIT 4

## Source with Fewer Columns - Allowed

In [0]:
# Read a source file that has fewer columns than the users table.
df_less = spark.read.csv(
    path="/Volumes/dwlabdatabrickstest96_2249171740111810/quickstart_schema/sandbox/dataset/user_dataset/users_012_less_columns.csv",
    header=True,
    inferSchema=True
)

# Append it. The save mode is set exactly once: passing mode="overwrite" to
# saveAsTable() as well would override "append" and replace the table's rows.
df_less.write.mode("append").saveAsTable("quickstart_schema.users")

# Re-read the table so the output reflects its current state rather than a
# DataFrame that was created before this write.
spark.read.table("quickstart_schema.users").display()

In [0]:
%sql
-- Show up to four rows with id above 5501.
-- NOTE(review): presumably these ids belong to the fewer-columns file appended above; confirm against the dataset.
SELECT * FROM quickstart_schema.users WHERE id>5501 LIMIT 4

## Source with Different Column Names - Fails

In [0]:
# Read a source file whose header names differ from the table's column names.
df_ColumnheaderNotMatching = spark.read.csv(
    path="/Volumes/dwlabdatabrickstest96_2249171740111810/quickstart_schema/sandbox/dataset/user_dataset/users_013_less_columns_with_different_column_names.csv",
    header=True,
    inferSchema=True
)

# Expected to raise (see the section heading): the source column names do not
# exist in the table.
# The save mode is set exactly once: passing mode="overwrite" to saveAsTable()
# as well would override the "append" chosen via .mode().
df_ColumnheaderNotMatching.write.mode("append").saveAsTable("quickstart_schema.users")



In [0]:
# Print the source DataFrame's schema to see which column names it carries.
df_ColumnheaderNotMatching.printSchema()

### Solution 01 - Custom Schema

In [0]:
# NOTE(review): wildcard import kept because other cells may rely on names it
# brings in; prefer importing StructType, StructField, IntegerType, StringType
# and DateType explicitly once that is confirmed safe.
from pyspark.sql.types import *

# Explicit schema using the table's column names. It includes `education`,
# the column carried by the users_006_new_column_education.csv file.
USER_SCHEMA = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("dob", DateType()),
    StructField("email", StringType()),
    StructField("gender", StringType()),
    StructField("country", StringType()),
    StructField("region", StringType()),
    StructField("city", StringType()),
    StructField("asset", IntegerType()),
    StructField("marital_status", StringType()),
    StructField("education", StringType()),
])

# Read with the explicit schema only. inferSchema is intentionally not passed:
# when a schema is supplied, Spark uses it and does not infer one, so setting
# both was redundant and misleading.
# NOTE(review): with a user-supplied schema the CSV columns are matched by
# position, not by header name - verify the file's column order lines up with
# the leading fields of USER_SCHEMA.
df_ColumnheaderNotMatching = spark.read.csv(
    path="/Volumes/dwlabdatabrickstest96_2249171740111810/quickstart_schema/sandbox/dataset/user_dataset/users_013_less_columns_with_different_column_names.csv",
    header=True,
    schema=USER_SCHEMA
)
df_ColumnheaderNotMatching.printSchema()

# The DataFrame now carries the table's column names, so a plain append works.
df_ColumnheaderNotMatching.write.mode("append").saveAsTable("quickstart_schema.users")

In [0]:
# Re-read the table before displaying it: a DataFrame created earlier in the
# notebook keeps the schema it was resolved with, so it may not reflect
# columns added to the table since then.
spark.read.table("quickstart_schema.users").display()

### Solution 02 - Adding Aliases

In [0]:
from pyspark.sql.functions import col

# Load the file as-is: names come from its header row, types are inferred.
df_diffColName = spark.read.csv(
    "/Volumes/dwlabdatabrickstest96_2249171740111810/quickstart_schema/sandbox/dataset/user_dataset/users_013_less_columns_with_different_column_names.csv",
    inferSchema=True,
    header=True,
)
df_diffColName.printSchema()

# Keep only the two source columns, renamed to the table's column names,
# and append the result to the users table.
df_diffColName.select(
    col("user_id").alias("id"),
    col("full_name").alias("name"),
).write.saveAsTable("quickstart_schema.users", mode="append")

In [0]:
%sql
-- Show every row with id above 5501 (no LIMIT).
-- NOTE(review): presumably these are the rows appended by the two solutions above; confirm against the dataset.
SELECT * FROM quickstart_schema.users WHERE id>5501 

## Source with Different Data Type

### Fail

In [0]:
# Read a source file whose name indicates a data-type mismatch on `dob`.
df_diffDataType = spark.read.csv(
    path="/Volumes/dwlabdatabrickstest96_2249171740111810/quickstart_schema/sandbox/dataset/user_dataset/users_013_datatype_mismatch_dob.csv",
    header=True,
    inferSchema=True
)

# Expected to raise (see the "Fail" heading), even with mergeSchema enabled.
# NOTE(review): presumably because mergeSchema does not reconcile a
# conflicting type on an existing column - confirm against the Delta docs.
# The save mode is set exactly once: passing mode="overwrite" to saveAsTable()
# as well would override the "append" chosen via .mode().
df_diffDataType.write.option("mergeSchema", "true").mode("append").saveAsTable("quickstart_schema.users")

# Only reached if the write succeeds. Re-read the table so the output reflects
# its current state rather than a DataFrame created before the write.
spark.read.table("quickstart_schema.users").display()