In [1]:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row

from pyspark.sql.functions import when, udf, col, regexp_replace
from pyspark.sql.types import DoubleType,IntegerType, StringType
import pyspark.sql.functions as f

In [2]:
# Create the local SparkContext, or reuse the one that is already running.
# Constructing SparkContext('local') directly raises ValueError when this cell
# is re-executed in a live kernel, because only one context may be active.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))

# SQL entry point used below to read the CSV file.
# NOTE(review): SQLContext is deprecated in Spark 3.x in favour of SparkSession;
# consider migrating if the installed Spark version allows it.
sqlCtx = SQLContext(sc)

In [5]:
# Load the grade sheet: the first CSV row supplies the column names and
# the column types are inferred from the data.
df = (
    sqlCtx.read
    .options(header=True, inferSchema=True)
    .csv('data/grade.csv')
)
# Display the DataFrame's repr (column names and types) as the cell output.
df

DataFrame[학년: string, 과목: string, 결과: string, 중간: int, 기말: int]

In [6]:
# Print the rows of the DataFrame as a text table.
df.show()

+-----+----+-----+----+----+
| 학년|과목| 결과|중간|기말|
+-----+----+-----+----+----+
|1학년|국어| 좋음|  80|  90|
|1학년|국어| 나쁨|  50|  40|
|1학년|국어| 나쁨|  20|  50|
|1학년|수학| 좋음|  83|  95|
|1학년|수학| 좋음|  93|  86|
|2학년|국어| 나쁨|  44|  65|
|2학년|국어| 좋음|  95|  98|
|2학년|수학| 좋음|  96|  99|
|2학년|수학| 나쁨|  57|  69|
+-----+----+-----+----+----+



In [11]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- 학년: string (nullable = true)
 |-- 과목: string (nullable = true)
 |-- 결과: string (nullable = true)
 |-- 중간: integer (nullable = true)
 |-- 기말: integer (nullable = true)



In [10]:
# Maximum of every numeric column per grade.
# groupBy also accepts a Column (df['학년']) in place of the column name.
(
    df.groupBy('학년')
    .max()
    .show()
)

+-----+---------+---------+
| 학년|max(중간)|max(기말)|
+-----+---------+---------+
|2학년|       96|       99|
|1학년|       93|       95|
+-----+---------+---------+



In [12]:
# Sum of every numeric column per grade.
(
    df.groupBy('학년')
    .sum()
    .show()
)

+-----+---------+---------+
| 학년|sum(중간)|sum(기말)|
+-----+---------+---------+
|2학년|      292|      331|
|1학년|      326|      361|
+-----+---------+---------+



In [13]:
# Average of every numeric column per grade.
(
    df.groupBy('학년')
    .avg()
    .show()
)

+-----+---------+---------+
| 학년|avg(중간)|avg(기말)|
+-----+---------+---------+
|2학년|     73.0|    82.75|
|1학년|     65.2|     72.2|
+-----+---------+---------+



In [15]:
# Average of the '기말' column only, per grade, with rows sorted by grade.
(
    df.groupBy('학년')
    .avg('기말')
    .orderBy('학년')
    .show()
)

+-----+---------+
| 학년|avg(기말)|
+-----+---------+
|1학년|     72.2|
|2학년|    82.75|
+-----+---------+



In [17]:
# Average of every numeric column per (grade, subject) pair.
# Rows are sorted by grade only, so the order of subjects within a grade
# is not guaranteed.
(
    df.groupBy('학년', '과목')
    .avg()
    .orderBy('학년')
    .show()
)

+-----+----+---------+---------+
| 학년|과목|avg(중간)|avg(기말)|
+-----+----+---------+---------+
|1학년|국어|     50.0|     60.0|
|1학년|수학|     88.0|     90.5|
|2학년|수학|     76.5|     84.0|
|2학년|국어|     69.5|     81.5|
+-----+----+---------+---------+



In [18]:
# Sum of the '기말' column per grade, written with a column function
# instead of a {column: aggregate} dict; the result column is still sum(기말).
(
    df.groupBy('학년')
    .agg(f.sum('기말'))
    .show()
)

+-----+---------+
| 학년|sum(기말)|
+-----+---------+
|2학년|      331|
|1학년|      361|
+-----+---------+



In [20]:
# Several aggregates of the '중간' column per grade, each given an explicit
# output column name through alias().
(
    df.groupBy('학년')
    .agg(
        f.sum('중간').alias('중간총합'),
        f.avg('중간').alias('중간평균'),
    )
    .show()
)

+-----+--------+--------+
| 학년|중간총합|중간평균|
+-----+--------+--------+
|2학년|     292|    73.0|
|1학년|     326|    65.2|
+-----+--------+--------+



In [21]:
# Replace missing '중간' values with 10.
# The data has no missing values, so the table printed is unchanged.
(
    df.na
    .fill({'중간': 10})
    .show()
)

+-----+----+-----+----+----+
| 학년|과목| 결과|중간|기말|
+-----+----+-----+----+----+
|1학년|국어| 좋음|  80|  90|
|1학년|국어| 나쁨|  50|  40|
|1학년|국어| 나쁨|  20|  50|
|1학년|수학| 좋음|  83|  95|
|1학년|수학| 좋음|  93|  86|
|2학년|국어| 나쁨|  44|  65|
|2학년|국어| 좋음|  95|  98|
|2학년|수학| 좋음|  96|  99|
|2학년|수학| 나쁨|  57|  69|
+-----+----+-----+----+----+



In [22]:
# Stop the SparkContext now that the analysis is finished.
sc.stop()