In [21]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session, running in local mode
# on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("demo")
    # Resolve unqualified paths against the local filesystem.
    .config('spark.hadoop.fs.defaultFS', 'file:///')
    # NOTE(review): presumably a placeholder, since the master is local — confirm it is needed.
    .config('spark.hadoop.yarn.resourcemanager.address', 'local')
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Use 4 shuffle partitions for this session's SQL/DataFrame shuffles.
spark.conf.set("spark.sql.shuffle.partitions", "4")

# Last expression of the cell: display the session summary.
spark


In [22]:
# Small in-memory sample: one (first_name, age) tuple per person.
df = spark.createDataFrame(
    data=[("sue", 32), ("li", 3), ("bob", 75), ("heo", 13)],
    schema=["first_name", "age"],
)


In [23]:
from pyspark.sql.functions import col, when

# Derive a "life_stage" label from age:
#   age <= 12             -> "child"
#   13 <= age <= 19       -> "teenager"   (between() is inclusive on both ends)
#   anything else         -> "adult"
# NOTE(review): a null age would match neither branch and fall through to "adult" — confirm intended.
df1 = df.withColumn(
    "life_stage",
    (
        when(col("age") <= 12, "child")
        .when(col("age").between(13, 19), "teenager")
        .otherwise("adult")
    ),
)


In [24]:
# Print df1's schema as a tree to check the column types after adding "life_stage".
df1.printSchema()


root
 |-- first_name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- life_stage: string (nullable = false)



In [25]:
# Write `df` as CSV with a header row, replacing any previous output at this path.
# Spark writes a directory of part files, so "Person.csv" ends up being a folder.
# NOTE(review): this saves `df`, not `df1` (the frame with "life_stage") — confirm intended.
(
    df.write
    .format("csv")
    .option("header", "true")
    .mode('overwrite')
    .save("Person.csv")
)


In [26]:
# Persist `df` to disk in Parquet format, replacing any previous output at this path.
output_path = "Person.parquet"
df.write.parquet(output_path, mode="overwrite")


In [27]:
# Save `df` as the table "some_people".
# The save mode is set to "overwrite" (matching the CSV and Parquet writes in this
# notebook) so that re-running this cell replaces the existing table instead of
# raising because the table already exists (the default mode is "errorifexists").
df.write.mode("overwrite").saveAsTable("some_people")


In [28]:
# Shut down the Spark session; cells using `spark` after this point need the session re-created.
spark.stop()
