In [1]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the session for this notebook via getOrCreate(), registered under
# the application name 'SCHEMAS'.
spark = (
    SparkSession.builder
    .appName('SCHEMAS')
    .getOrCreate()
)


#### df.show(n = number of rows to show (default=20), truncate=False (default=True -->> displays only the first 20 characters of each column value), vertical=True -->> prints each row vertically, one column per line)

In [16]:
# Flat sample records, one tuple per person in the order given by `columns`.
# Missing values are represented by empty strings; `id` is supplied as a string.
data = [
    ("James", "", "Smith", "030696", "M", 60000),
    ("Michael", "Rose", "", "240298", "M", 70000),
    ("Robert", "", "Williams", "141294", "", 400000),
    ("Maria", "Anne", "Jones", "230992", "F", 500000),
    ("Jen", "Mary", "Brown", "", "F", 0),
    ("Tom", "Jerry", "", "230998", "M", 1000000),
]

columns = ["first_name", "middle_name", "last_name", "id", "gender", "salary"]

# Only column names are passed as the schema, so the column types are
# inferred by Spark from the data itself.
df = spark.createDataFrame(data, columns)

# Default show(): prints up to 20 rows and truncates column values to
# 20 characters.
df.show()
df.printSchema()

+----------+-----------+---------+------+------+-------+
|first_name|middle_name|last_name|    id|gender| salary|
+----------+-----------+---------+------+------+-------+
|     James|           |    Smith|030696|     M|  60000|
|   Michael|       Rose|         |240298|     M|  70000|
|    Robert|           | Williams|141294|      | 400000|
|     Maria|       Anne|    Jones|230992|     F| 500000|
|       Jen|       Mary|    Brown|      |     F|      0|
|       Tom|      Jerry|         |230998|     M|1000000|
+----------+-----------+---------+------+------+-------+

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [18]:
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, ArrayType, MapType

# Each record nests the three name parts in an inner tuple so that it maps
# onto the 'name' struct column; id, gender and salary stay top-level.
structureData = [
    (("James", "", "Smith"), "030696", "M", 60000),
    (("Michael", "Rose", ""), "240298", "M", 70000),
    (("Robert", "", "Williams"), "141294", "", 400000),
    (("Maria", "Anne", "Jones"), "230992", "F", 500000),
    (("Jen", "Mary", "Brown"), "", "F", 0),
    (("Tom", "", "Jerry"), "230998", "M", 1000000),
]

# Explicit schema: a nested 'name' struct followed by three scalar columns.
# Every field is nullable.
structureSchema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('id', StringType(), True)
    .add('gender', StringType(), True)
    .add('salary', IntegerType(), True)
)

df2 = spark.createDataFrame(structureData, structureSchema)
df2.printSchema()
# truncate=False prints the full struct values instead of cutting them short.
df2.show(truncate=False)


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+------+------+-------+
|name                |id    |gender|salary |
+--------------------+------+------+-------+
|{James, , Smith}    |030696|M     |60000  |
|{Michael, Rose, }   |240298|M     |70000  |
|{Robert, , Williams}|141294|      |400000 |
|{Maria, Anne, Jones}|230992|F     |500000 |
|{Jen, Mary, Brown}  |      |F     |0      |
|{Tom, , Jerry}      |230998|M     |1000000|
+--------------------+------+------+-------+



In [28]:
from pyspark.sql.functions import col,struct,when

# Salary is compared as an integer to derive a grade:
#   below 200000 -> "Low", below 500000 -> "Medium", anything else -> "High".
salary_as_int = col("salary").cast(IntegerType())
salary_grade = (
    when(salary_as_int < 200000, "Low")
    .when(salary_as_int < 500000, "Medium")
    .otherwise("High")
)

# Pack id (renamed to "identifier"), gender, salary and the derived
# "Salary_Grade" field into a single "OtherInfo" struct column.
other_info = struct(
    col("id").alias("identifier"),
    col("gender").alias("gender"),
    col("salary").alias("salary"),
    salary_grade.alias("Salary_Grade"),
)

# The original top-level columns are dropped once they live inside the struct.
updatedDF = df2.withColumn("OtherInfo", other_info).drop("id", "gender", "salary")

updatedDF.printSchema()
updatedDF.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)

+--------------------+--------------------------+
|name                |OtherInfo                 |
+--------------------+--------------------------+
|{James, , Smith}    |{030696, M, 60000, Low}   |
|{Michael, Rose, }   |{240298, M, 70000, Low}   |
|{Robert, , Williams}|{141294, , 400000, Medium}|
|{Maria, Anne, Jones}|{230992, F, 500000, High} |
|{Jen, Mary, Brown}  |{, F, 0, Low}             |
|{Tom, , Jerry}      |{230998, M, 1000000, High}|
+--------------------+--------------------------+



In [32]:
# Print only the first 3 rows, laid out vertically (one field per line),
# truncating any value longer than 25 characters.
df.show(
    n=3,
    truncate=25,
    vertical=True,
)

-RECORD 0---------------
 first_name  | James    
 middle_name |          
 last_name   | Smith    
 id          | 030696   
 gender      | M        
 salary      | 60000    
-RECORD 1---------------
 first_name  | Michael  
 middle_name | Rose     
 last_name   |          
 id          | 240298   
 gender      | M        
 salary      | 70000    
-RECORD 2---------------
 first_name  | Robert   
 middle_name |          
 last_name   | Williams 
 id          | 141294   
 gender      |          
 salary      | 400000   
only showing top 3 rows



In [36]:
# Records combining three complex types: a name tuple (struct), a list of
# hobbies (array) and a dict of string properties (map). One middle name is
# None to show how a null struct field is displayed.
structureData = [
    (
        ("John", "A", "Doe"),
        ["reading", "traveling"],
        {"age": "30", "city": "New York"},
    ),
    (
        ("Jane", "B", "Smith"),
        ["coding", "photography"],
        {"age": "25", "city": "San Francisco"},
    ),
    (
        ("Bob", None, "Johnson"),
        ["hiking", "painting"],
        {"age": "35", "city": "Los Angeles"},
    ),
    (
        ("Alice", "C", "Williams"),
        ["swimming", "writing"],
        {"age": "28", "city": "Chicago"},
    ),
]

# Schema with a nested struct, an array of strings and a string-to-string map.
arrayStructureSchema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('hobbies', ArrayType(StringType()), True)
    .add('properties', MapType(StringType(), StringType()), True)
)

df3 = spark.createDataFrame(structureData, arrayStructureSchema)
df3.show(truncate=False)

+--------------------+---------------------+----------------------------------+
|name                |hobbies              |properties                        |
+--------------------+---------------------+----------------------------------+
|{John, A, Doe}      |[reading, traveling] |{city -> New York, age -> 30}     |
|{Jane, B, Smith}    |[coding, photography]|{city -> San Francisco, age -> 25}|
|{Bob, NULL, Johnson}|[hiking, painting]   |{city -> Los Angeles, age -> 35}  |
|{Alice, C, Williams}|[swimming, writing]  |{city -> Chicago, age -> 28}      |
+--------------------+---------------------+----------------------------------+



### Schema using a DDL (Data Definition Language) string: 'column_name TYPE [NOT NULL], another_column_name TYPE, ...' (append NOT NULL to a column that must not contain nulls)

In [74]:
# DDL-style schema string: comma-separated "name TYPE" pairs; NOT NULL marks
# the pages column as non-nullable.
schema = (
    'author STRING, '
    'title STRING, '
    'pages INT NOT NULL'
)

In [80]:
# Two sample books as (author, title, pages) rows, in the column order of
# the DDL schema string.
sample_data = [
    ("Bindu", "Way in the Light", 200),
    ("Anjali", "Way in the Dark", 300),
]

# NOTE: this rebinds `df`, replacing the people DataFrame created earlier in
# the notebook with the books DataFrame.
df = spark.createDataFrame(sample_data, schema)
df.show()

+------+----------------+-----+
|author|           title|pages|
+------+----------------+-----+
| Bindu|Way in the Light|  200|
|Anjali| Way in the Dark|  300|
+------+----------------+-----+

