# <font color=Blue>Nested Struct Column</font>

In [1]:
import findspark
# Run findspark before the `import pyspark` in the next cell.
# NOTE(review): presumably needed so the local Spark installation is
# discoverable from this kernel — confirm for your environment.
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

# Build the SparkSession used by every later cell: local master "local[*]",
# application name "Nested Struct Column".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Nested Struct Column")
    .getOrCreate()
)

24/01/18 23:01:44 WARN Utils: Your hostname, ajith-Lenovo-G50-80 resolves to a loopback address: 127.0.1.1; using 192.168.1.36 instead (on interface wlp3s0)
24/01/18 23:01:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/01/18 23:01:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Bare expression: as the last expression in the cell, the notebook displays
# the SparkSession object as the cell output.
spark

We often need to work with nested struct columns, which can be defined using **StructType**. In the example below, the **full_name** column has the data type StructType, i.e. it is a struct nested inside the row schema.

In [13]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Each row is (full_name, id, gender, salary); full_name is itself a
# (first_name, middle_name, last_name) tuple that maps onto the nested struct.
Struct_data = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# Top-level schema: one nested struct column followed by three flat columns.
Struct_schema = StructType([
    StructField(
        "full_name",
        # The three name parts are all nullable strings, in this order.
        StructType([
            StructField(part, StringType(), True)
            for part in ("first_name", "middle_name", "last_name")
        ]),
    ),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

df = spark.createDataFrame(Struct_data, Struct_schema)

# Print the schema tree to confirm full_name is reported as a struct.
df.printSchema()


root
 |-- full_name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [14]:
# Render the DataFrame as a text table; the nested full_name struct is
# printed as a single {first, middle, last} cell per row.
df.show()

                                                                                

+--------------------+-----+------+------+
|           full_name|   id|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3100|
|   {Michael, Rose, }|40288|     M|  4300|
|{Robert, , Williams}|42114|     M|  1400|
|{Maria, Anne, Jones}|39192|     F|  5500|
|  {Jen, Mary, Brown}|     |     F|    -1|
+--------------------+-----+------+------+



### Adding and Changing Columns

This can be done using the **struct()** function.

In [17]:
from pyspark.sql.functions import col, struct

# Pack id, gender and salary into a new struct column "Other_Info" via
# withColumn(). Only id is renamed (to emp_id) with alias(); gender and salary
# keep their existing names inside the struct.
df2 = df.withColumn(
    "Other_Info",
    struct(
        col("id").alias("emp_id"),
        col("gender"),
        col("salary"),
    ),
)

df2.printSchema()

df2.show()

root
 |-- full_name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- Other_Info: struct (nullable = false)
 |    |-- emp_id: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)

+--------------------+-----+------+------+----------------+
|           full_name|   id|gender|salary|      Other_Info|
+--------------------+-----+------+------+----------------+
|    {James, , Smith}|36636|     M|  3100|{36636, M, 3100}|
|   {Michael, Rose, }|40288|     M|  4300|{40288, M, 4300}|
|{Robert, , Williams}|42114|     M|  1400|{42114, M, 1400}|
|{Maria, Anne, Jones}|39192|     F|  5500|{39192, F, 5500}|
|  {Jen, Mary, Brown}|     |     F|    -1|       {, F, -1}|
+--------------------+-----+------+------+----------------+

##### Original Credit: https://sparkbyexamples.com/pyspark/pyspark-structtype-and-structfield/