In [3]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second Spark application.
spark = (
    SparkSession.builder
    .appName("ex")
    .getOrCreate()
)

# One tuple per row: (id, name, skills), where skills is a Python list.
data = [
    (1, 'mah', ['dotnet', 'azure']),
    (2, 'ma', ['aws', 'azure']),
]
schema = ['id', 'name', 'skills']

# Only column names are supplied, so Spark infers the column types from the
# data (the Python list becomes an array<string> column).
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()

# NOTE(review): in this environment display() printed only the DataFrame repr
# (column names and types, no rows); df.show() would print the rows themselves.
display(df)

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



DataFrame[id: bigint, name: string, skills: array<string>]

In [6]:
# explode: produce one output row per element of an array column.

from pyspark.sql.functions import explode

# The original `skills` array column is kept as-is; the new `skill` column
# holds a single element of that array on each generated row.
df1 = df.withColumn('skill', explode(df['skills']))

# Compare the schemas: df1 is df plus the extra string column `skill`.
df.printSchema()
df1.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- skill: string (nullable = true)



In [12]:
# split -> turn a delimited string column into an array column

from pyspark.sql import SparkSession
from pyspark.sql.functions import split

spark = SparkSession.builder.appName("ex").getOrCreate()

# skills is deliberately a plain comma-separated string here, not a Python list.
data = [(1,'mah','dotnet,azure') ,(2,'ma','aws ,azure') ]

schema = ['id','name','skills']
df=spark.createDataFrame(data,schema)

# split() interprets its second argument as a regular expression. Splitting on
# a bare ',' would leave the stray space of 'aws ,azure' inside the element
# ('aws '); consuming optional whitespace on both sides of the comma yields
# clean tokens. Whitespace at the very start/end of the string is not removed.
df1 = df.withColumn('skillss', split(df.skills, r'\s*,\s*'))

df.printSchema()     # skills is still a string column
df1.printSchema()    # skillss is an array<string> column

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: string (nullable = true)

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: string (nullable = true)
 |-- skillss: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [17]:
# array(): combine several columns into a single array column.
from pyspark.sql import SparkSession
# NOTE(review): the wildcard import is kept so the notebook namespace is
# unchanged, but it likely shadows Python builtins such as sum/min/max with
# Spark column functions; this cell itself only needs `array` and `col`.
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("ex").getOrCreate()

data = [
    (1, 'maheer', '.net', 'azere'),
    (2, 'dd', 'aws', 'si'),
]
schema = ['id', 'name', '1stskill', '2ndskill']
df = spark.createDataFrame(data, schema)
df.printSchema()

# Attribute access (df.1stskill) cannot be used for these columns: a name that
# starts with a digit is not a valid Python identifier, so it is a syntax
# error. Refer to such columns by string instead, e.g. col('1stskill').
skill_columns = [col(name) for name in ('1stskill', '2ndskill')]
df1 = df.withColumn('skill', array(*skill_columns))
df1.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- 1stskill: string (nullable = true)
 |-- 2ndskill: string (nullable = true)

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- 1stskill: string (nullable = true)
 |-- 2ndskill: string (nullable = true)
 |-- skill: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [22]:
# array_contains(): test whether an array column holds a given value.
from pyspark.sql import SparkSession
# NOTE(review): wildcard import kept to leave the notebook namespace unchanged;
# only `array_contains` is needed by this cell.
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("ex").getOrCreate()

data = [
    (1, 'mah', ['.net', 'azure']),
    (2, 'a', ['.ne', 'ae']),
]
schema = ['id', 'name', 'skill']

df = spark.createDataFrame(data, schema)
df.printSchema()

# The new column is boolean: whether the row's `skill` array contains '.net'.
# NOTE(review): the column name contains a dot, so later references to it
# presumably need backtick quoting (col("`Has.net Skill`")) - confirm.
has_dotnet = array_contains(df['skill'], '.net')
df1 = df.withColumn("Has.net Skill", has_dotnet)

df1.printSchema()


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skill: array (nullable = true)
 |    |-- element: string (containsNull = true)

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skill: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Has.net Skill: boolean (nullable = true)

