In [40]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession used by every later cell.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    # NOTE(review): this key/value pair looks like a placeholder setting -- confirm it is needed.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Reference only -- contents of teste.csv (this cell does not create the file;
# it must already exist at the path given to the reader below):
#
#   1,12.0,"teste"
#   1,13.0,"oi"
#   1,2.0,"nada"
#   2,0.001,""
#   3,100.1,"ok"

# Read the CSV with an explicit schema; header="False" means the first line is data,
# so the column names come from the schema string.
df = (
    spark.read
    .format("csv")
    .schema("num INT, value DOUBLE, msg STRING")
    .options(sep=",", header="False")
    .load("teste.csv")
)

# também é possível utilizar inferSchema=True (ou inferSchema='true') em vez de passar um schema explícito

In [41]:
# Print the rows of df as a text table.
# Note: in the recorded output the empty "" msg field of the num=2 row appears as null.
df.show()

+---+-----+-----+
|num|value|  msg|
+---+-----+-----+
|  1| 12.0|teste|
|  1| 13.0|   oi|
|  1|  2.0| nada|
|  2|0.001| null|
|  3|100.1|   ok|
+---+-----+-----+



In [42]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- num: integer (nullable = true)
 |-- value: double (nullable = true)
 |-- msg: string (nullable = true)



In [45]:
# Count how many rows share each distinct `num` value and print the result.
# The recorded output is not sorted by `num`.
(
    df.groupBy("num")
    .count()
    .show()
)

+---+-----+
|num|count|
+---+-----+
|  1|    3|
|  3|    1|
|  2|    1|
+---+-----+



In [46]:
# Keep only the rows whose `num` is greater than 1 and print them.
(
    df.where(df["num"] > 1)
    .show()
)

+---+-----+----+
|num|value| msg|
+---+-----+----+
|  2|0.001|null|
|  3|100.1|  ok|
+---+-----+----+



# Utilização de SQL

In [51]:
# Register df as the temporary view "dados" so it can be queried through spark.sql.
df.createOrReplaceTempView("dados") # view created

In [52]:
# Total of the `value` column over the "dados" view, returned as a single column named `soma`.
sqlDF = spark.sql(
    "SELECT sum(value) as soma FROM dados"
)
sqlDF.show()

+-------+
|   soma|
+-------+
|127.101|
+-------+



In [53]:
# Mean of the `num` column over the "dados" view, returned as a single column named `media`.
sqlDF = spark.sql(
    "SELECT avg(num) as media FROM dados"
)
sqlDF.show()

+-----+
|media|
+-----+
|  1.6|
+-----+



In [54]:
# Stop the Spark session now that the examples are finished.
spark.stop()