<a href="https://colab.research.google.com/github/cantaruttim/Learning_PySpark/blob/main/Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



# ***Resilient Distributed Datasets - RDD***

In [2]:
import pyspark
from pyspark import SparkContext, SparkConf

In [3]:
conf = SparkConf().setAppName('appName').setMaster('local')
sc = SparkContext(conf=conf)

In [4]:
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data) # distData is a parallelize distributed dataset

In [16]:
distData

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:287

In [9]:
distData.reduce(lambda a, b: a + b) ## 1 + 2 = 3 + 3 = 6 + 4 = 10 + 5 = 15

15

In [20]:
novo = sc.parallelize(data, 10) # 10 partitions

In [21]:
novo.collect() # collect() used to visualized data information.

[1, 2, 3, 4, 5]

In [11]:
distFile = sc.textFile("data.txt")

In [13]:
rdd = sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x))
rdd.saveAsSequenceFile("path/to/file")
sorted(sc.sequenceFile("path/to/file").collect())

[(1, 'a'), (2, 'aa'), (3, 'aaa')]

In [22]:
lines = sc.textFile("Pentateuco.txt")
lineLengths = lines.map(lambda s: len(s)) # as the result of a map transformation.
                                          # lineLengths is not immediately computed, due to laziness.
totalLength = lineLengths.reduce(lambda a, b: a + b) # which is an action

In [27]:
print(lineLengths.collect())

[31, 0, 33, 52, 0, 34, 33, 0, 53, 38, 32, 0, 17, 11, 16, 18, 18, 0, 47, 34, 52, 54, 0, 39, 0, 60, 57, 62, 5, 0, 28, 0, 56, 20, 0, 0, 31, 0, 15, 0, 47, 10, 34, 49, 50, 0, 14, 0, 14, 15, 0, 0, 27, 0, 0, 31, 0, 2, 9, 1, 67, 1, 37, 1, 48, 1, 56, 1, 39, 0, 20, 0, 2, 0, 0, 0, 0, 10, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [29]:
totalLength

1567

# **Spark SQL**

In [32]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [34]:
spark

In [40]:
df = spark.read.csv("Treino.csv", sep = ';', header=True)
df.show()

+-------+------+---------------+----+
|  Aluno|Equipe|Tempo de estudo|Nota|
+-------+------+---------------+----+
|Aluno A|  Alfa|         1 hora|   2|
|Aluno B|  Beta|         1 hora|   8|
|Aluno C|  Gama|         1 hora|   2|
|Aluno D|  Alfa|        3 horas|   4|
|Aluno E|  Beta|        3 horas|   6|
|Aluno F|  Gama|        3 horas|  10|
|Aluno G|  Alfa|        5 horas|  10|
|Aluno H|  Beta|        5 horas|   8|
|Aluno I|  Gama|        5 horas|  10|
|Aluno J|  Alfa|        7 horas|   8|
|Aluno K|  Beta|        7 horas|   9|
|Aluno L|  Gama|        7 horas|   8|
+-------+------+---------------+----+



In [41]:
df.printSchema()

root
 |-- Aluno: string (nullable = true)
 |-- Equipe: string (nullable = true)
 |-- Tempo de estudo: string (nullable = true)
 |-- Nota: string (nullable = true)



In [45]:
df.select('Nota', 'Tempo de estudo')\
  .orderBy('Equipe')\
  .show()

+----+---------------+
|Nota|Tempo de estudo|
+----+---------------+
|   2|         1 hora|
|   4|        3 horas|
|  10|        5 horas|
|   8|        7 horas|
|   8|         1 hora|
|   6|        3 horas|
|   8|        5 horas|
|   9|        7 horas|
|   2|         1 hora|
|  10|        3 horas|
|  10|        5 horas|
|   8|        7 horas|
+----+---------------+

