In [15]:
from pyspark.sql import SparkSession

# Create (or reuse, if one already exists) a Spark session that runs locally
# with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName('PySpark DataFrame #2')
    .getOrCreate()
)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Explicit schema applied when reading the orders CSV; all three columns are nullable.
# NOTE(review): FloatType is single precision, and the aggregates shown in this
# notebook carry visible rounding noise (e.g. 97.61000061035156). Consider
# DoubleType or DecimalType for monetary amounts -- confirm before changing.
schema = StructType([
    StructField("cust_id", StringType(), True),
    StructField("item_id", StringType(), True),
    StructField("amount_spent", FloatType(), True),
])

In [3]:
# Load the orders file using the predefined schema rather than inferred types,
# then print the resulting column layout.
df = spark.read.csv("customer-orders.csv", schema=schema)
df.printSchema()

root
 |-- cust_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- amount_spent: float (nullable = true)



In [4]:
# Total amount spent per customer. The aggregate column keeps Spark's
# auto-generated name, "sum(amount_spent)".
df_ca = (
    df.groupBy("cust_id")
    .sum("amount_spent")
)
df_ca.show()

+-------+------------------+
|cust_id| sum(amount_spent)|
+-------+------------------+
|     51| 4975.219970226288|
|      7| 4755.070008277893|
|     15| 5413.510010659695|
|     54| 6065.390002984554|
|     11| 5152.289969373494|
|     29|5032.5300433933735|
|     69| 5123.010002791882|
|     42| 5696.840004444122|
|     73| 6206.199985742569|
|     87| 5206.400022745132|
|     64| 5288.690012812614|
|      3| 4659.629958629608|
|     30| 4990.720004022121|
|     34|5330.8000039458275|
|     59| 5642.890004396439|
|      8|5517.2399980425835|
|     22| 5019.449993014336|
|     28|  5000.71000123024|
|     85|  5503.42998456955|
|     35|  5155.41999566555|
+-------+------------------+
only showing top 20 rows



In [6]:
# Same per-customer total, with the auto-generated aggregate column renamed
# to the shorter "sum".
df_ca = (
    df.groupBy("cust_id")
    .sum("amount_spent")
    .withColumnRenamed("sum(amount_spent)", "sum")
)
df_ca.show(10)

+-------+------------------+
|cust_id|               sum|
+-------+------------------+
|     51| 4975.219970226288|
|      7| 4755.070008277893|
|     15| 5413.510010659695|
|     54| 6065.390002984554|
|     11| 5152.289969373494|
|     29|5032.5300433933735|
|     69| 5123.010002791882|
|     42| 5696.840004444122|
|     73| 6206.199985742569|
|     87| 5206.400022745132|
+-------+------------------+
only showing top 10 rows



In [7]:
import pyspark.sql.functions as f
# Third variant: name the aggregate column directly via agg() + alias(),
# so no rename step is needed afterwards.
df_ca = df.groupBy("cust_id").agg(
    f.sum("amount_spent").alias("sum")
)
df_ca.show(5)

+-------+-----------------+
|cust_id|              sum|
+-------+-----------------+
|     51|4975.219970226288|
|      7|4755.070008277893|
|     15|5413.510010659695|
|     54|6065.390002984554|
|     11|5152.289969373494|
+-------+-----------------+
only showing top 5 rows



In [None]:
# Compute several aggregates per customer in a single agg() call.
# collect() brings every grouped row back to the driver as a list of Row objects.
(
    df.groupBy("cust_id")
    .agg(
        f.sum("amount_spent").alias("sum"),
        f.max("amount_spent").alias("max"),
        f.avg("amount_spent").alias("avg"),
    )
    .collect()
)

## SparkSQL로 처리하기

In [13]:
# Register the DataFrame as a temporary view so it can be queried by name in SQL.
df.createOrReplaceTempView("customer_orders")

# SQL version of the per-customer sum / max / avg aggregation.
# "GROUP BY 1" groups by the first SELECT column (cust_id); head(5) returns
# the first five result rows as Row objects.
(
    spark.sql("""SELECT cust_id, SUM(amount_spent) sum, MAX(amount_spent) max, AVG(amount_spent) avg
FROM customer_orders
GROUP BY 1
""")
    .head(5)
)

[Row(cust_id='51', sum=4975.219970226288, max=97.61000061035156, avg=48.77666637476753),
 Row(cust_id='7', sum=4755.070008277893, max=98.5999984741211, avg=50.58585115189248),
 Row(cust_id='15', sum=5413.510010659695, max=99.56999969482422, avg=52.05298087172783),
 Row(cust_id='54', sum=6065.390002984554, max=99.2300033569336, avg=49.31211384540288),
 Row(cust_id='11', sum=5152.289969373494, max=99.11000061035156, avg=47.70638860531013)]

In [14]:
# List the tables and views known to this session's catalog; the temporary
# view "customer_orders" registered earlier in the notebook shows up here.
spark.catalog.listTables()

[Table(name='customer_orders', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]