# Instalação do PySpark

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=e365819ae7208efd13e2a931efdda9f6b42f472b20341b189899c5e129e67166
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [5]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [6]:
# Entry point for the RDD API used in the rest of the notebook:
# reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()

In [7]:
# Obtain (or create) the SparkSession, registering the application under the
# name 'PySpark Dataframe'.
spark = (
    SparkSession.builder
    .appName('PySpark Dataframe')
    .getOrCreate()
)

# Transformations

## `map()`

In [8]:
# Distribute a small local list as an RDD.
data = [1,2,3,4,5]
myRDD = sc.parallelize(data)
# map() applies the lambda to every element, here doubling each value.
newRDD = myRDD.map(lambda x: x*2)

# collect() brings the elements of the RDD back as a Python list so they can be displayed
print(newRDD.collect())

[2, 4, 6, 8, 10]


## `filter()`

In [10]:
data = [1,2,3,4,5,6,7,8,9,10]
myRDD = sc.parallelize(data)
# filter() keeps only the elements for which the predicate is true (the even numbers here).
newRDD = myRDD.filter(lambda x: x%2 == 0)

print(newRDD.collect())

[2, 4, 6, 8, 10]


## `distinct()`

In [11]:
data = [1,1,1,2,2,2,3,3,3,3]
myRDD = sc.parallelize(data)
# distinct() removes duplicate elements.
# Note that the recorded result ([2, 1, 3]) does not follow the input order.
newRDD = myRDD.distinct()

print(newRDD.collect())

[2, 1, 3]


In [12]:
# Number of elements in newRDD (the distinct() result built in the cell above).
print(newRDD.count())

3


In [13]:
# Printing an RDD shows a short description of the RDD object, not its elements;
# use collect() or take() to see the data.
print(newRDD)

PythonRDD[11] at collect at <ipython-input-11-b946c00e7997>:5


In [14]:
# Inspect the concrete Python class of the RDD produced by the transformation.
print(type(newRDD))

<class 'pyspark.rdd.PipelinedRDD'>


## `groupByKey()`

Agrupa os valores de um RDD de pares (chave, valor) de acordo com a chave.

In [15]:
# RDD of (key, value) pairs.
myRDD = sc.parallelize([
    ('a', 1),
    ('a', 2),
    ('a', 3),
    ('b', 1),
])

# Group the values under their key, then turn each group into a plain list
# so that the collected result is readable.
resultList = (
    myRDD
    .groupByKey()
    .mapValues(list)
)
resultList.collect()

[('b', [1]), ('a', [1, 2, 3])]

## `reduceByKey()`

In [16]:
from operator import add
# RDD of (key, value) pairs.
myRDD = sc.parallelize([
    ('a', 1),
    ('a', 2),
    ('a', 3),
    ('b', 1),
])

# Combine all values that share a key using operator.add, i.e. sum them per key.
newRDD = myRDD.reduceByKey(add)
newRDD.collect()

[('b', 1), ('a', 6)]

## `sortByKey()` - equivalente ao `ORDER BY` do SQL

In [17]:
myRDD = sc.parallelize([('c',1), ('d',2), ('a',3), ('b',4)])
# sorts the (key, value) pairs by key; with no arguments the recorded result is in ascending key order

newRDD = myRDD.sortByKey()
newRDD.collect()

[('a', 3), ('b', 4), ('c', 1), ('d', 2)]

## `union()`

In [18]:
myRDD1 = sc.parallelize([1,2,3,4])
myRDD2 = sc.parallelize([3,4,5,6,7])
# union of myRDD1 and myRDD2; elements present in both (3 and 4) are kept twice,
# as the recorded output shows — chain distinct() if duplicates must be removed
newRDD = myRDD1.union(myRDD2)
newRDD.collect()

[1, 2, 3, 4, 3, 4, 5, 6, 7]

# Actions

## `count()`

In [19]:
data = ['Scala', 'Python', 'Java', 'R']
myRDD = sc.parallelize(data)
# count() returns the number of elements in the RDD (4 here)
myRDD.count()

4

## `reduce()`

In [20]:
data = [1,2,3,4,5]
myRDD = sc.parallelize(data)
# reduce() combines the elements two at a time with the given function;
# with multiplication it returns the product of all the elements (120 here)
myRDD.reduce(lambda x, y: x*y)

120

## `foreach()`

In [21]:
def fun(x):
  """Print a single RDD element."""
  print(x)
data = ['Scala', 'Python', 'Java', 'R']
myRDD = sc.parallelize(data)
# foreach() applies fun to every element of the RDD; nothing is returned to the notebook.
# NOTE(review): no output was recorded for this cell — presumably the prints go to the
# worker processes' stdout rather than to the notebook; confirm.
myRDD.foreach(fun)

## `countByValue()`

In [22]:
data = ['Python', 'Scala', 'Python', 'R', 'Python', 'Java', 'R']
myRDD = sc.parallelize(data)
# countByValue() maps each distinct element to its number of occurrences;
# items() exposes that mapping as a view of (value, count) pairs
myRDD.countByValue().items()

dict_items([('Python', 3), ('Scala', 1), ('R', 2), ('Java', 1)])

## `countByKey()`

In [25]:
data = [('a', 1), ('b', 1), ('c', 1), ('a', 1)]
myRDD = sc.parallelize(data)
# countByKey() counts how many pairs exist for each key (the values themselves are ignored);
# items() exposes the result as a view of (key, count) pairs
myRDD.countByKey().items()

dict_items([('a', 2), ('b', 1), ('c', 1)])

## `take(n)` - SELECT TOP ou LIMIT do SQL

In [26]:
data = [2, 5, 3, 8, 4]
myRDD = sc.parallelize(data)
# take(3) returns the first 3 elements of the RDD, in their existing order
myRDD.take(3)

[2, 5, 3]

## `top()`

In [28]:
data = [2, 5, 3, 8, 4]
myRDD = sc.parallelize(data)
# top(3) returns the 3 largest elements, sorted in descending order (not the first 3)
myRDD.top(3)

[8, 5, 4]