# Schema Enforcement

* In Spark, schema enforcement means you explicitly define the structure and data types of your DataFrame instead of relying on `inferSchema=True`

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session used by every cell below (getOrCreate reuses an
# already-running session if there is one).
# NOTE(review): the app name says "RDD_Operations" although this notebook is
# about schema enforcement - consider renaming it.
spark = (
    SparkSession.builder
    .appName("RDD_Operations")
    .getOrCreate()
)

In [3]:
# Baseline: read the CSV and let Spark infer the column types on its own.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')        # first line of the file holds the column names
    .option('inferschema', 'true')   # ask Spark to guess each column's data type
    .load('/content/customers.csv')
)

With `inferSchema` enabled, Spark has to scan the whole file to work out the column types, and the types it infers can be wrong or inconsistent.

In [12]:
# Print the column names and the data types Spark inferred for them.
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



In [7]:
# Count every row in the DataFrame.
# (To count only active customers instead: df.filter(df.is_active==True).count())
df.count()

2612731

In [8]:
# Preview the first 10 rows.
df.show(10)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|  Kolkata|  Karnataka|  India|       2023-12-21|    false|
|          1|Customer_1|Ahmedabad|  Telangana|  India|       2023-03-07|     true|
|          2|Customer_2|Bangalore|  Karnataka|  India|       2023-01-14|    false|
|          3|Customer_3|  Chennai|  Karnataka|  India|       2023-11-16|    false|
|          4|Customer_4|    Delhi|  Telangana|  India|       2023-05-23|     true|
|          5|Customer_5|Hyderabad|Maharashtra|  India|       2023-07-10|    false|
|          6|Customer_6|     Pune|West Bengal|  India|       2023-03-04|     true|
|          7|Customer_7|Hyderabad|West Bengal|  India|       2023-08-11|    false|
|          8|Customer_8|  Chennai|West Bengal|  India|       2023-10-21|    false|
|   

In [9]:
# List the column names; the explicit schemas defined below use the same names.
df.columns

['customer_id',
 'name',
 'city',
 'state',
 'country',
 'registration_date',
 'is_active']

### How to define schema in Spark

You use `StructType` and `StructField` from `pyspark.sql.types`.

## Struct Type

In [16]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType , BooleanType , FloatType


In [24]:
# Explicit schema: one StructField per CSV column, in file order.
# Each field is declared with nullable=False.
# NOTE(review): the printSchema() output recorded in this notebook still reports
# "nullable = true" for every column, so the CSV reader does not appear to
# enforce this flag - confirm against the Spark documentation.
# registration_date is declared as a plain string here; the DDL schema later in
# the notebook declares the same column as DATE.
schema = StructType([
    StructField(name="customer_id", dataType=IntegerType(), nullable=False),
    StructField(name="name", dataType=StringType(), nullable=False),
    StructField(name="city", dataType=StringType(), nullable=False),
    StructField(name="state", dataType=StringType(), nullable=False),
    StructField(name="country", dataType=StringType(), nullable=False),
    StructField(name="registration_date", dataType=StringType(), nullable=False),
    StructField(name="is_active", dataType=BooleanType(), nullable=False),
])

In [30]:
# Read the same CSV again, this time supplying the StructType schema
# instead of asking Spark to infer the types.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')   # first line of the file holds the column names
    .schema(schema)
    .load('/content/customers.csv')
)

In [18]:
# Preview the first 3 rows read with the explicit schema.
df.show(3)

+-----------+----------+---------+---------+-------+-----------------+---------+
|customer_id|      name|     city|    state|country|registration_date|is_active|
+-----------+----------+---------+---------+-------+-----------------+---------+
|          0|Customer_0|  Kolkata|Karnataka|  India|       2023-12-21|    false|
|          1|Customer_1|Ahmedabad|Telangana|  India|       2023-03-07|     true|
|          2|Customer_2|Bangalore|Karnataka|  India|       2023-01-14|    false|
+-----------+----------+---------+---------+-------+-----------------+---------+
only showing top 3 rows



In [29]:
# Check that the declared types were applied: registration_date should now be
# reported as string (as declared in the StructType), not the inferred date.
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- is_active: boolean (nullable = true)



## DDL Schema

* It is a string format used to describe a schema, similar to SQL table definitions.

In [36]:
# The same columns written as a DDL string: comma-separated "name TYPE" pairs.
# registration_date is declared as DATE here.
ddl_schema = """
customer_id INT,
name STRING,
city STRING,
state STRING,
country STRING,
registration_date DATE,
is_active BOOLEAN
"""


In [37]:
# Read the CSV once more, passing the DDL string as the schema.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')   # first line of the file holds the column names
    .schema(ddl_schema)
    .load('/content/customers.csv')
)

In [38]:
# Check that the DDL types were applied: registration_date should be reported as date.
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)

