# PySpark Tutorial

## PySpark Session

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("tutorial").getOrCreate()

In [3]:
data = [
    ("Agus Richard", 26, "Richard", "Male"),
    ("Damara Astiningtyas", 23, "Ara", "Female")
]
column = ["Name", "Age", "Nickname", "Gender"]

df = spark.createDataFrame(data, column)
df.show()

                                                                                

+-------------------+---+--------+------+
|               Name|Age|Nickname|Gender|
+-------------------+---+--------+------+
|       Agus Richard| 26| Richard|  Male|
|Damara Astiningtyas| 23|     Ara|Female|
+-------------------+---+--------+------+


In [4]:
df.createOrReplaceTempView("sample_table")
df2 = spark.sql("SELECT Name FROM sample_table")
df2.show()

+-------------------+
|               Name|
+-------------------+
|       Agus Richard|
|Damara Astiningtyas|
+-------------------+


In [5]:
spark.table("sample_table").write.saveAsTable("sample_hive_table")

                                                                                

In [6]:
df3 = spark.sql("SELECT * FROM sample_hive_table")
df3.show()

+-------------------+---+--------+------+
|               Name|Age|Nickname|Gender|
+-------------------+---+--------+------+
|Damara Astiningtyas| 23|     Ara|Female|
|       Agus Richard| 26| Richard|  Male|
+-------------------+---+--------+------+


In [7]:
dbs = spark.catalog.listDatabases()
print(dbs)

[Database(name='default', catalog='spark_catalog', description='default database', locationUri='file:/Users/agusrichard/Documents/personal/workbook/data-engineering-workbook/pyspark-tutorial/playground/spark-warehouse')]


In [9]:
tables = spark.catalog.listTables()
for table in tables:
    print(table.name)

sample_hive_table
sample_table


## PySpark RDD

In [10]:
spark = SparkSession.builder.appName("pyspark-rdd").getOrCreate()

24/01/28 18:59:32 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [12]:
data = list(range(100))
print(data)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [14]:
rdd = spark.sparkContext.parallelize(data, 10)

In [15]:
rdd.getNumPartitions()

10

In [16]:
rdd = spark.sparkContext.textFile("./test.txt")

In [17]:
rdd2 = rdd.flatMap(lambda line: line.split(" "))
rdd3 = rdd2.map(lambda x: (x, 1))
rdd4 = rdd3.map(lambda x: (x[1], x[0])).sortByKey()
print(rdd4.collect())

[Stage 20:>                                                         (0 + 2) / 2]

[(1, 'Project'), (1, 'Gutenberg’s'), (1, 'Alice’s'), (1, 'Adventures'), (1, 'in'), (1, 'Wonderland'), (1, 'by'), (1, 'Lewis'), (1, 'Carroll'), (1, 'This'), (1, 'eBook'), (1, 'is'), (1, 'for'), (1, 'the'), (1, 'use'), (1, 'of'), (1, 'anyone'), (1, 'anywhere'), (1, 'at'), (1, 'no'), (1, 'cost'), (1, 'and'), (1, 'with'), (1, 'Alice’s'), (1, 'Adventures'), (1, 'in'), (1, 'Wonderland'), (1, 'by'), (1, 'Lewis'), (1, 'Carroll'), (1, 'This'), (1, 'eBook'), (1, 'is'), (1, 'for'), (1, 'the'), (1, 'use'), (1, 'of'), (1, 'anyone'), (1, 'anywhere'), (1, 'at'), (1, 'no'), (1, 'cost'), (1, 'and'), (1, 'with'), (1, 'This'), (1, 'eBook'), (1, 'is'), (1, 'for'), (1, 'the'), (1, 'use'), (1, 'of'), (1, 'anyone'), (1, 'anywhere'), (1, 'at'), (1, 'no'), (1, 'cost'), (1, 'and'), (1, 'with'), (1, 'Project'), (1, 'Gutenberg’s'), (1, 'Alice’s'), (1, 'Adventures'), (1, 'in'), (1, 'Wonderland'), (1, 'by'), (1, 'Lewis'), (1, 'Carroll'), (1, 'This'), (1, 'eBook'), (1, 'is'), (1, 'for'), (1, 'the'), (1, 'use'), (1, 

                                                                                

In [19]:
rdd4.saveAsTextFile("result_test")
# somethihgn