# PySpark Basic example

## Spark session

In [1]:
import pyspark
from pyspark.sql import SparkSession

spark1 = SparkSession.builder.master("local[1]") \
.appName("Spark Example 1") \
.getOrCreate()

# Sử dụng local[x]khi chạy ở chế độ Standalone
# x phải là một giá trị nguyên và phải lớn hơn 0; 
# điều này thể hiện số lượng phân vùng cần tạo khi sử dụng RDD, DataFrame và Bộ dữ liệu. 
# Lý tưởng nhất, giá trị x phải là số lõi CPU bạn có

spark1

In [2]:
spark2 = SparkSession.newSession
spark2

<function pyspark.sql.session.SparkSession.newSession(self) -> 'SparkSession'>

In [3]:
spark3 = SparkSession.builder.getOrCreate
print(spark3)

<bound method SparkSession.Builder.getOrCreate of <pyspark.sql.session.SparkSession.Builder object at 0x7f447ff02110>>


### Spark Session with Config

In [5]:
spark1.stop()
spark = SparkSession.builder \
      .master("local[1]") \
      .appName("Spark Examples 4") \
      .config("spark.some.config.option", "config-value") \
      .getOrCreate()

spark


### Create SparkSession with Hive Enable

In [6]:

# Enabling Hive to use in Spark
spark.stop()
spark = SparkSession.builder \
      .master("local[1]") \
      .appName("Spark with Hive") \
      .config("spark.sql.warehouse.dir", "<path>/spark-warehouse") \
      .enableHiveSupport() \
      .getOrCreate()

spark

### Create PySpark dataframe

In [8]:
# Create DataFrame
df = spark.createDataFrame(
    [("Scala", 25000), ("Spark", 35000), ("PHP", 21000)])
df.show()

+-----+-----+
|   _1|   _2|
+-----+-----+
|Scala|25000|
|Spark|35000|
|  PHP|21000|
+-----+-----+



### Working with Spark SQL

In [9]:

# Spark SQL
df.createOrReplaceTempView("sample_table")
df2 = spark.sql("SELECT _1,_2 FROM sample_table")
df2.show()


+-----+-----+
|   _1|   _2|
+-----+-----+
|Scala|25000|
|Spark|35000|
|  PHP|21000|
+-----+-----+



### Create Hive table

In [10]:

# Create Hive table & query it.  
spark.table("sample_table").write.saveAsTable("sample_hive_table")
df3 = spark.sql("SELECT _1,_2 FROM sample_hive_table")
df3.show()


+-----+-----+
|   _1|   _2|
+-----+-----+
|Scala|25000|
|Spark|35000|
|  PHP|21000|
+-----+-----+



###  Working with Catalogs

In [11]:
dbs = spark.catalog.listDatabases()
dbs

[Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='file:/home/jovyan/pyspark-tutorial/%3Cpath%3E/spark-warehouse')]

In [12]:
tbls = spark.catalog.listTables()
tbls

[Table(name='sample_hive_table', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='sample_table', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

## PySpark Accumulator (Bộ tích luỹ) with Example

`PySpark Accumulator` là một biến dùng chung được sử dụng với RDD và DataFrame để thực hiện các phép tính tổng và bộ đếm tương tự như bộ đếm Map-reduce

In [15]:
# stop previous spark
spark.stop()

In [16]:
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("accumulator").getOrCreate()

accum=spark.sparkContext.accumulator(0)
rdd=spark.sparkContext.parallelize([1,2,3,4,5])
rdd.foreach(lambda x:accum.add(x))
print(accum.value)
print("-------------")
accuSum=spark.sparkContext.accumulator(0)
def countFun(x):
    global accuSum
    accuSum+=x
rdd.foreach(countFun)
print(accuSum.value)
print("-------------")
accumCount=spark.sparkContext.accumulator(0)
rdd2=spark.sparkContext.parallelize([1,2,3,4,5])
rdd2.foreach(lambda x:accumCount.add(1))
print(accumCount.value)


15
-------------
15
-------------
5


## PySpark `Repartition()` vs `Coalesce()`

- `Repartition()`: Phân vùng lại, được sử dụng để tăng hoặc giảm phân vùng RDD/DataFrame
- `Coalesce()`: Kết hợp thành một nhóm, được sử dụng để chỉ giảm số lượng phân vùng theo cách hiệu quả.


### Trong RDD

In [19]:

# Create spark session with local[1]
rdd = spark.sparkContext.parallelize(range(0,20))
print("From local[1] : "+str(rdd.getNumPartitions()))

# Use parallelize with 6 partitions
rdd1 = spark.sparkContext.parallelize(range(0,25), 2)
print("parallelize : "+str(rdd1.getNumPartitions()))

# rddFromFile = spark.sparkContext.textFile("src/main/resources/test.txt",10)
# print("TextFile : "+str(rddFromFile.getNumPartitions()))


From local[1] : 1
parallelize : 2


In [20]:
rdd1.saveAsTextFile("/tmp/partition")

In [21]:
!ls /tmp/partition

part-00000  part-00001	_SUCCESS


In [22]:
!cat /tmp/partition/part-00000

0
1
2
3
4
5
6
7
8
9
10
11


In [23]:
!cat /tmp/partition/part-00001

12
13
14
15
16
17
18
19
20
21
22
23
24


In [24]:
# Using repartition
rdd2 = rdd1.repartition(4)
print("Repartition size : "+str(rdd2.getNumPartitions()))
rdd2.saveAsTextFile("/tmp/re-partition")

Repartition size : 4


In [25]:
!ls /tmp/re-partition

part-00000  part-00001	part-00002  part-00003	_SUCCESS


In [26]:
# Using coalesce()
rdd3 = rdd2.coalesce(3)
print("Repartition size : "+str(rdd3.getNumPartitions()))
rdd3.saveAsTextFile("/tmp/coalesce")
!ls /tmp/coalesce

Repartition size : 3
part-00000  part-00001	part-00002  _SUCCESS


In [28]:
!cat /tmp/coalesce/part-00000

0
1
2
3
4
5
6
7
8
9
12
13
14
15
16
17
18
19
20
21


In [29]:
spark.stop()
!rm -rf /tmp/

rm: cannot remove '/tmp/hsperfdata_root': Operation not permitted


In [30]:
!ls /tmp

hsperfdata_root


In [31]:
# Complete Example
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com') \
        .master("local[5]").getOrCreate()

df = spark.range(0,20)
print(df.rdd.getNumPartitions())

spark.conf.set("spark.sql.shuffle.partitions", "500")

rdd = spark.sparkContext.parallelize(range(0,20))
print("From local[5]"+str(rdd.getNumPartitions()))

rdd1 = spark.sparkContext.parallelize(range(0,25), 6)
print("parallelize : "+str(rdd1.getNumPartitions()))

"""rddFromFile = spark.sparkContext.textFile("src/main/resources/test.txt",10)
print("TextFile : "+str(rddFromFile.getNumPartitions())) """

rdd1.saveAsTextFile("/tmp/partition2")

rdd2 = rdd1.repartition(4)
print("Repartition size : "+str(rdd2.getNumPartitions()))
rdd2.saveAsTextFile("/tmp/re-partition2")

rdd3 = rdd1.coalesce(4)
print("Repartition size : "+str(rdd3.getNumPartitions()))
rdd3.saveAsTextFile("/tmp/coalesce2")

5
From local[5]5
parallelize : 6
Repartition size : 4
Repartition size : 4


### Trong Dataframe

In [34]:
spark.stop()

In [41]:

# DataFrame example
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com') \
        .master("local[5]").getOrCreate()

df=spark.range(0,20)
print(df.rdd.getNumPartitions())

df.write.mode("overwrite").csv("/tmp/partition")


5


In [42]:
!ls /tmp/partition

part-00000-7d5b9a09-31d9-49e0-bdda-42663ef87627-c000.csv
part-00001-7d5b9a09-31d9-49e0-bdda-42663ef87627-c000.csv
part-00002-7d5b9a09-31d9-49e0-bdda-42663ef87627-c000.csv
part-00003-7d5b9a09-31d9-49e0-bdda-42663ef87627-c000.csv
part-00004-7d5b9a09-31d9-49e0-bdda-42663ef87627-c000.csv
_SUCCESS


In [43]:
!cat /tmp/partition/part-00004-7d5b9a09-31d9-49e0-bdda-42663ef87627-c000.csv

16
17
18
19


In [44]:
!cat /tmp/partition/part-00003-7d5b9a09-31d9-49e0-bdda-42663ef87627-c000.csv

12
13
14
15


## PySpark Broadcast Variables