# PySpark Tutorial

## PySpark Session

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("tutorial").getOrCreate()

In [3]:
data = [
    ("Agus Richard", 26, "Richard", "Male"),
    ("Damara Astiningtyas", 23, "Ara", "Female")
]
column = ["Name", "Age", "Nickname", "Gender"]

df = spark.createDataFrame(data, column)
df.show()

                                                                                

+-------------------+---+--------+------+
|               Name|Age|Nickname|Gender|
+-------------------+---+--------+------+
|       Agus Richard| 26| Richard|  Male|
|Damara Astiningtyas| 23|     Ara|Female|
+-------------------+---+--------+------+


In [4]:
df.createOrReplaceTempView("sample_table")
df2 = spark.sql("SELECT Name FROM sample_table")
df2.show()

+-------------------+
|               Name|
+-------------------+
|       Agus Richard|
|Damara Astiningtyas|
+-------------------+


In [5]:
spark.table("sample_table").write.saveAsTable("sample_hive_table")

SparkRuntimeException: [LOCATION_ALREADY_EXISTS] Cannot name the managed table as `spark_catalog`.`default`.`sample_hive_table`, as its associated location 'file:/Users/agusrichard/Documents/personal/workbook/data-engineering-workbook/pyspark-tutorial/playground/spark-warehouse/sample_hive_table' already exists. Please pick a different table name, or remove the existing location first.

In [None]:
df3 = spark.sql("SELECT * FROM sample_hive_table")
df3.show()

In [None]:
dbs = spark.catalog.listDatabases()
print(dbs)

In [None]:
tables = spark.catalog.listTables()
for table in tables:
    print(table.name)

## PySpark RDD

In [None]:
spark = SparkSession.builder.appName("pyspark-rdd").getOrCreate()

In [None]:
data = list(range(100))
print(data)

In [None]:
rdd = spark.sparkContext.parallelize(data, 10)

In [None]:
rdd.getNumPartitions()

In [None]:
rdd = spark.sparkContext.textFile("./test.txt")

In [None]:
rdd2 = rdd.flatMap(lambda line: line.split(" "))
rdd3 = rdd2.map(lambda x: (x, 1))
rdd4 = rdd3.map(lambda x: (x[1], x[0])).sortByKey()
print(rdd4.collect())

In [None]:
rdd4.saveAsTextFile("result_test")
# somethihgn

## Repartition and Coalesce

In [9]:
spark = SparkSession.builder.appName("repartition-coalesce").master("local[4]").getOrCreate()

24/01/30 16:44:03 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [10]:
rdd = spark.sparkContext.parallelize(list(range(1000)))
print("N partitions: ", rdd.getNumPartitions())

N partitions:  8


In [11]:
rdd2 = rdd.repartition(4)
print("N partitions: ", rdd2.getNumPartitions())

N partitions:  4


In [13]:
rdd2.saveAsTextFile("./tmp/repartition")

                                                                                

## Broadcast and Accumulator Variables

In [15]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)

data = [("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  ]

rdd = spark.sparkContext.parallelize(data)

def state_convert(code):
    return broadcastStates.value[code]

result = rdd.map(lambda x: (x[0],x[1],x[2],state_convert(x[3]))).collect()
print(result)

24/01/30 17:04:09 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


[('James', 'Smith', 'USA', 'California'), ('Michael', 'Rose', 'USA', 'New York'), ('Robert', 'Williams', 'USA', 'California'), ('Maria', 'Jones', 'USA', 'Florida')]


In [16]:
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("accumulator").getOrCreate()

accum=spark.sparkContext.accumulator(0)
rdd=spark.sparkContext.parallelize([1,2,3,4,5])
rdd.foreach(lambda x:accum.add(x))
print(accum.value)

accuSum=spark.sparkContext.accumulator(0)
def countFun(x):
    global accuSum
    accuSum+=x
rdd.foreach(countFun)
print(accuSum.value)

accumCount=spark.sparkContext.accumulator(0)
rdd2=spark.sparkContext.parallelize([1,2,3,4,5])
rdd2.foreach(lambda x:accumCount.add(1))
print(accumCount.value)

24/01/30 17:04:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


15


                                                                                

15
5
