#1 Installing PySpark

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=0d59fb6582d262b7676a943bbf064101a1783292a56861af5129c89b88f24d52
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [2]:
!pip install findspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [3]:
# Driver-side entry point for the RDD API used in every cell below.
# getOrCreate() reuses an existing SparkContext instead of failing on re-run.
sc = SparkContext.getOrCreate()

In [4]:
# Build (or reuse) the SparkSession for this notebook; the application is
# registered under the name "PySpark Dataframe".
spark = (
    SparkSession.builder
    .appName("PySpark Dataframe")
    .getOrCreate()
)

#2 Transformations

#2.1 map()

In [6]:
# map(): build a new RDD by applying the function to every element.
# Here each number is doubled; collect() brings the results back to the driver.
data = [1, 2, 3, 4, 5]
myRDD = sc.parallelize(data)
newRDD = myRDD.map(lambda value: value * 2)
print(newRDD.collect())

[2, 4, 6, 8, 10]


#2.2 filter()

In [8]:
# filter(): keep only the elements for which the predicate is true.
# Here that means the even numbers.
data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
myRDD = sc.parallelize(data)
newRDD = myRDD.filter(lambda value: value % 2 == 0)
print(newRDD.collect())

[2, 4, 6, 8]


#2.3 distinct()

In [9]:
# distinct(): drop duplicate elements.
# The surviving values are not returned in input order (see the output below).
data = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 5]
myRDD = sc.parallelize(data)
newRDD = myRDD.distinct()
print(newRDD.collect())

[2, 4, 1, 3, 5]


In [10]:
# Same input as the previous cell, but count the unique values instead of
# listing them.
data = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 5]
myRDD = sc.parallelize(data)
newRDD = myRDD.distinct()
print(newRDD.count())

5


In [11]:
# A transformation returns another RDD object rather than a Python list;
# print its type to show that.
data = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 5]
myRDD = sc.parallelize(data)
newRDD = myRDD.distinct()
print(type(newRDD))

<class 'pyspark.rdd.PipelinedRDD'>


#2.4 groupByKey()

In [12]:
# groupByKey(): gather all values that share a key.
# mapValues(list) turns each group into a plain list so it displays readably.
myRDD = sc.parallelize(
    [("a", 1), ("a", 2), ("a", 3), ("b", 1)]
)

resultList = (
    myRDD
    .groupByKey()
    .mapValues(list)
)
resultList.collect()

[('b', [1]), ('a', [1, 2, 3])]

#2.5 reduceByKey()

In [14]:
from operator import add

# reduceByKey(): combine the values of each key with the given function.
# With `add`, this yields the per-key sum.
myRDD = sc.parallelize(
    [("a", 1), ("a", 2), ("a", 3), ("b", 1)]
)

newRDD = myRDD.reduceByKey(add)
newRDD.collect()

[('b', 1), ('a', 6)]

#2.6 sortByKey()

In [15]:
# sortByKey(): return the (key, value) pairs ordered by key, ascending by default.
myRDD = sc.parallelize([("a", 1), ("d", 2), ("c", 3), ("b", 1)])

newRDD = myRDD.sortByKey()
newRDD.collect()

[('a', 1), ('b', 1), ('c', 3), ('d', 2)]

#2.7 union()

In [16]:
# union(): concatenate two RDDs.
# Elements present in both (3 and 4 here) are kept twice; no de-duplication.
myRDD1 = sc.parallelize([1, 2, 3, 4])
myRDD2 = sc.parallelize([3, 4, 5, 6, 7])

newRDD = myRDD1.union(myRDD2)
newRDD.collect()

[1, 2, 3, 4, 3, 4, 5, 6, 7]

#3 Actions

#3.1 count()

In [17]:
# count(): action that returns the number of elements in the RDD.
data = [
    "scala",
    "Python",
    "Java",
    "R",
]
myRDD = sc.parallelize(data)
myRDD.count()

4

In [18]:
# count() includes duplicates: every element is counted, not just unique ones.
data = [1, 2, 3, 3, 4, 5, 5, 6, 7]
myRDD = sc.parallelize(data)
myRDD.count()

9

#3.2 reduce()

In [20]:
# reduce(): fold the whole RDD into one value with a two-argument function.
# Multiplying pairwise gives the product of all elements.
data = [1, 2, 3, 4, 5]
myRDD = sc.parallelize(data)
myRDD.reduce(lambda left, right: left * right)

120

#3.3 foreach()

In [21]:
def fun(x):
    """Print a single RDD element."""
    print(x)


# foreach(): run `fun` once per element purely for its side effect; it returns
# nothing.
# NOTE(review): this cell shows no output in the notebook. Presumably the
# print() calls run in the Spark worker process, so the text lands on the
# worker's stdout rather than in the cell -- confirm.
data = ["scala", "Python", "Java", "R"]
myRDD = sc.parallelize(data)
myRDD.foreach(fun)

#3.4 countByValue()

In [24]:
# countByValue(): tally how many times each distinct element occurs.
# The result is a dict-like mapping; .items() shows the (value, count) pairs.
data = [
    "scala", "Python", "scala", "R",
    "Python", "Java", "Python", "R",
    "scala", "Python", "R",
]
myRDD = sc.parallelize(data)
myRDD.countByValue().items()

dict_items([('scala', 3), ('Python', 4), ('R', 3), ('Java', 1)])

#3.5 countByKey()

In [26]:
# countByKey(): count how many pairs exist for each key.
# Key "b" appears twice, so it is reported with a count of 2.
data = [
    ("a", 1),
    ("d", 1),
    ("c", 1),
    ("b", 1),
    ("b", 1),
]
myRDD = sc.parallelize(data)
myRDD.countByKey().items()

dict_items([('a', 1), ('d', 1), ('c', 1), ('b', 2)])

#3.6 take(n)

In [27]:
# take(n): return the first n elements of the RDD as a Python list.
data = [1, 2, 3, 3, 4, 5, 5, 6, 7]
myRDD = sc.parallelize(data)
myRDD.take(3)

[1, 2, 3]

#3.7 top(n)

In [28]:
# top(n): return the n largest elements, in descending order.
data = [1, 2, 3, 3, 4, 5, 5, 6, 7]
myRDD = sc.parallelize(data)
myRDD.top(3)

[7, 6, 5]