<a href="https://colab.research.google.com/github/arulrajgopal-zerotoone/zero_to_one_spark/blob/main/apache_spark/05_schema_control.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install pyspark

from pyspark.sql import SparkSession

#create spark session
spark= SparkSession.builder.appName('mysparksession').getOrCreate()

#create spark context
sc = spark.sparkContext

In [0]:
from pyspark.sql.functions import lit

# Read a file with schema inference (inferSchema)

In [0]:
# Read the pipe-delimited file with no header row, asking Spark to infer
# each column's type from the data.
user_sch_inf_df = (
    spark.read
    .format("csv")
    .option('Header', False)
    .option('InferSchema', True)
    .option('sep', '|')
    .load('user.user')
)

In [0]:
# Print the schema of the DataFrame that was read with InferSchema enabled.
user_sch_inf_df.printSchema()

# Read a file without a schema

In [0]:
# Read the pipe-delimited, header-less file without supplying or
# inferring a schema.
user_df = (
    spark.read
    .format("csv")
    .option('Header', False)
    .option('sep', '|')
    .load('user.user')
)

In [0]:
# Print the schema of user_df.
user_df.printSchema()

# Column renaming

In [0]:
# Preview the first 3 rows of user_df.
user_df.show(3)

In [0]:
# Map each positional column name (_c0, _c1, ...) to a descriptive name,
# in column order.
col_mapping = {
    f'_c{position}': new_name
    for position, new_name in enumerate(
        ('user_id', 'age', 'gender', 'occupation', 'zip_code')
    )
}

In [0]:
def col_renaming(df, col_map):
    """Return ``df`` projected onto the renamed columns listed in ``col_map``.

    Args:
        df: Object exposing ``selectExpr`` (a Spark DataFrame in this notebook).
        col_map: Mapping of existing column name -> new column name. The
            mapping's iteration order determines the output column order.

    Returns:
        The result of ``df.selectExpr`` called with one ``"<old> as <new>"``
        expression per mapping entry; only the columns named in ``col_map``
        are passed to the projection.
    """
    # Deliberately not named `list`, which would shadow the builtin.
    rename_exprs = [f"{old} as {new}" for old, new in col_map.items()]
    return df.selectExpr(*rename_exprs)

In [0]:
# Rename the columns of user_df according to col_mapping, then display the result.
col_renamed_df = col_renaming(user_df, col_mapping)
col_renamed_df.show()

# Apply a schema

In [0]:
# Target SQL type for each renamed column, in column order.
# NOTE(review): zip codes commonly carry leading zeros or letters, which an
# INT cast would not preserve — confirm INT is right for this data.
schema = dict(
    user_id='INT',
    age='INT',
    gender='STRING',
    occupation='STRING',
    zip_code='INT',
)

In [0]:
def date_type_conversion(df, schema):
    """Cast the columns of ``df`` to the SQL types given in ``schema``.

    The function name (``date_`` rather than ``data_``) is kept as-is so
    existing callers keep working.

    Args:
        df: Object exposing ``selectExpr`` (a Spark DataFrame in this notebook).
        schema: Mapping of column name -> SQL type name (e.g. ``'INT'``).
            The mapping's iteration order determines the output column order.

    Returns:
        The result of a single ``df.selectExpr`` call that receives a list
        with one ``cast(<column> as <type>) <column>`` expression per entry,
        so every cast column keeps its original name. Only columns named in
        ``schema`` are passed to the projection. An empty ``schema`` results
        in ``selectExpr`` being called with an empty list rather than raising
        ``UnboundLocalError``.
    """
    cast_exprs = [
        f"cast({column} as {datatype}) {column}"
        for column, datatype in schema.items()
    ]
    # Project exactly once, after every expression has been collected;
    # doing it inside the loop rebuilt a throwaway projection per column
    # and left the result undefined when the schema was empty.
    return df.selectExpr(cast_exprs)

In [0]:
# Cast the renamed columns according to `schema`, then print the schema
# before (col_renamed_df) and after (data_type_converted_df) the cast.
data_type_converted_df = date_type_conversion(col_renamed_df, schema)
col_renamed_df.printSchema()
data_type_converted_df.printSchema()

# Apply a schema using StructType

In [0]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [0]:
# Explicit schema: one StructField per column, each created with its
# third argument (nullable) set to True.
struct_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("user_id", IntegerType()),
        ("age", IntegerType()),
        ("gender", StringType()),
        ("occupation", StringType()),
        ("zip_code", IntegerType()),
    )
])

In [0]:
# Read the pipe-delimited, header-less file again, this time applying
# struct_schema at read time. Note that this rebinds user_df.
user_df = (
    spark.read
    .format('csv')
    .option('Header', False)
    .schema(struct_schema)
    .option('sep', '|')
    .load('user.user')
)

In [0]:
# Print the schema of user_df.
user_df.printSchema()

In [0]:
# Display rows of user_df using show()'s default arguments.
user_df.show()

# Other ways to cast

In [0]:
# Print the schema of the renamed DataFrame and preview its first 2 rows.
col_renamed_df.printSchema()
col_renamed_df.show(2)

In [0]:
# Variant 1: Column.cast on an attribute-style column reference.
col_renamed_df.select(col_renamed_df.age.cast("int")).printSchema()
# Variant 2: SQL cast expression through selectExpr, aliased to age_int.
col_renamed_df.selectExpr('cast(age as INT) as age_int').printSchema()
# Variant 3: add a new_age column from a None literal.
# NOTE(review): no cast is applied to lit(None) here, so this is not a cast
# example as written; presumably lit(None).cast(...) was intended — confirm.
col_renamed_df.withColumn('new_age',lit(None)).printSchema()