<a href="https://colab.research.google.com/github/arulrajgopal-zerotoone/zero_to_one_spark/blob/main/apache_spark/05_schema_control.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

from pyspark.sql import SparkSession

# Build the notebook's SparkSession, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('mysparksession')
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=cb10e2a141445b311ea1f36bfbc215c9166f5da30b2a36215434fbf357d08c74
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


# Read a file with inferSchema

In [16]:
# Load the pipe-delimited, header-less file with schema inference enabled,
# so Spark picks a type for each positional column (_c0, _c1, ...).
user_sch_inf_df = (
    spark.read
    .format("csv")
    .option('Header', False)
    .option('InferSchema', True)
    .option('sep', '|')
    .load('user.user')
)

In [17]:
# Show the column types Spark inferred from the data.
user_sch_inf_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)



# Read a file without a schema

In [23]:
# Load the same pipe-delimited file without inference or an explicit schema;
# every column is read as a string.
user_df = (
    spark.read
    .format("csv")
    .option('Header', False)
    .option('sep', '|')
    .load('user.user')
)

In [24]:
# Show the schema of the un-inferred read for comparison with the inferred one.
user_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)



# Column renaming

In [44]:
# Preview the first three rows; columns still carry the positional names _c0.._c4.
user_df.show(3)

+---+---+---+----------+-----+
|_c0|_c1|_c2|       _c3|  _c4|
+---+---+---+----------+-----+
|  1| 24|  M|technician|85711|
|  2| 53|  F|     other|94043|
|  3| 23|  M|    writer|32067|
+---+---+---+----------+-----+
only showing top 3 rows



In [34]:
# Positional CSV column name -> descriptive column name.
col_mapping = dict(
    _c0='user_id',
    _c1='age',
    _c2='gender',
    _c3='occupation',
    _c4='zip_code',
)

In [45]:
def col_renaming(df, col_map):
  """Return a DataFrame whose columns are renamed according to ``col_map``.

  Args:
    df: DataFrame-like object exposing ``selectExpr``.
    col_map: dict mapping existing column name -> new column name. Iteration
      order of the dict determines the column order of the result.

  Returns:
    The result of ``df.selectExpr`` called with one ``"<old> as <new>"``
    expression per mapping entry. Columns absent from ``col_map`` are not
    selected.
  """
  # Avoid naming the accumulator `list`, which would shadow the built-in.
  rename_exprs = [f"{old_name} as {new_name}" for old_name, new_name in col_map.items()]

  return df.selectExpr(*rename_exprs)

In [46]:
# Apply the rename mapping to the all-string DataFrame and preview the result.
col_renamed_df = col_renaming(user_df, col_mapping)
col_renamed_df.show()

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      1| 24|     M|   technician|   85711|
|      2| 53|     F|        other|   94043|
|      3| 23|     M|       writer|   32067|
|      4| 24|     M|   technician|   43537|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|      9| 29|     M|      student|   01002|
|     10| 53|     M|       lawyer|   90703|
|     11| 39|     F|        other|   30329|
|     12| 28|     F|        other|   06405|
|     13| 47|     M|     educator|   29206|
|     14| 45|     M|    scientist|   55106|
|     15| 49|     F|     educator|   97301|
|     16| 21|     M|entertainment|   10309|
|     17| 30|     M|   programmer|   06355|
|     18| 35|     F|        other|   37212|
|     19| 40|     M|    librarian|   02138|
|     20| 42|     F|    homemake

# Apply schema

In [50]:
# Target SQL type for each renamed column, consumed by the cast helper below.
schema = {
    'user_id': 'INT',
    'age': 'INT',
    'gender': 'STRING',
    'occupation': 'STRING',
    # NOTE: casting zip codes to INT drops leading zeros (e.g. '05201' becomes 5201).
    'zip_code': 'INT',
}

In [51]:
def date_type_conversion(df, schema):
  """Cast the columns of ``df`` to the SQL types named in ``schema``.

  Args:
    df: DataFrame-like object exposing ``selectExpr``.
    schema: dict mapping column name -> SQL type name (e.g. ``'INT'``).
      Iteration order of the dict determines the column order of the result.

  Returns:
    The result of a single ``df.selectExpr`` call with one
    ``"cast(<col> as <type>) <col>"`` expression per schema entry. Columns
    absent from ``schema`` are not selected; an empty ``schema`` delegates to
    ``selectExpr`` with no expressions instead of failing on an unbound local.
  """
  cast_exprs = [
      f"cast({column} as {datatype}) {column}"
      for column, datatype in schema.items()
  ]

  # Project once, after all expressions are collected, rather than building a
  # throwaway DataFrame on every loop iteration.
  return df.selectExpr(*cast_exprs)

In [52]:
# Cast the renamed columns, then print both schemas to compare the types
# before (all strings) and after the conversion.
data_type_converted_df = date_type_conversion(col_renamed_df, schema)
col_renamed_df.printSchema()
data_type_converted_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: integer (nullable = true)



# Apply schema using StructType

In [None]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import *

In [None]:
# Explicit flat schema: three nullable string columns plus a nullable integer salary.
# NOTE: this rebinds `schema`, which an earlier cell used for the dict of SQL type names.
schema = StructType([
    StructField('name', StringType(), True),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

# Sample rows as (name, dob, gender, salary) tuples, in schema field order.
dataDF = [
    ('james', '1991-04-01', 'M', 3000),
    ('Michel', '2000-05-19', 'M', 4000),
    ('Robert', '1978-09-05', 'M', 4000),
    ('Maria', '1967-12-01', 'F', 4000),
    ('Jen', '1980-02-17', 'F', -1),
]

df = spark.createDataFrame(dataDF, schema)
df.printSchema()
df.show(truncate=False)

In [None]:
# Sample rows as ((firstname, middlename, lastname), dob, gender, salary);
# the first element is itself a tuple and maps onto the nested `name` struct.
dataDF = [(('James','','Smith'),'1991-04-01','M',3000),
  (('Michael','Rose',''),'2000-05-19','M',4000),
  (('Robert','','Williams'),'1978-09-05','M',4000),
  (('Maria','Anne','Jones'),'1967-12-01','F',4000),
  (('Jen','Mary','Brown'),'1980-02-17','F',-1)
]

# Nested schema: `name` is a struct of three strings.
structureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         # This position holds dates of birth (e.g. '1991-04-01'), so the field
         # is named 'dob' (as in the flat example) rather than 'id'.
         StructField('dob', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

df2 = spark.createDataFrame(dataDF,structureSchema)
df2.printSchema()
df2.show(truncate=False)

In [None]:



# Casting
from pyspark.sql.types import  StructType,  StructField, StringType, IntegerType
from pyspark.sql.functions import lit

# Every column is declared as a string (sr_no is non-nullable), so `age` has
# to be cast before it can be treated as a number.
df_schema = StructType(fields=[StructField("sr_no", StringType(), False),
                                StructField("name", StringType(), True),
                                StructField("age", StringType(), True),
                                StructField("fav_sport", StringType(), True)])

# Named `rows` rather than `list` so the built-in `list` type is not shadowed
# for the rest of the kernel session.
rows = [
  (1, 'Arul',23,'football'),
  (2,'Sekar',34,'cricket'),
  (3,'Vinoth',33,'chess'),
  (4,'Ravi',30,'tennis')]

df = spark.createDataFrame(rows, df_schema)


df.show()
df.printSchema()
#direct casting via the Column API (result keeps only the `age` column)
df_1= df.select(df.age.cast("int"))
#casting with a SQL expression, without alias
df_2 = df.selectExpr('cast(age as INT)')
#casting with a SQL expression, with alias
df_3 = df.selectExpr('cast(age as INT) as new_age')
#new column holding a null literal; no cast is applied here, so compare its
#printed schema with df_5, which casts the null explicitly
df_4 = df.withColumn('new_age',lit(None))
#null column cast to INT using selectExpr (result keeps only `new_age`)
df_5 = df.selectExpr('cast(null as INT) as new_age')


# Show the data and schema of each variant for side-by-side comparison.
df_1.show()
df_1.printSchema()
df_2.show()
df_2.printSchema()
df_3.show()
df_3.printSchema()
df_4.show()
df_4.printSchema()
df_5.show()
df_5.printSchema()