## Pandas로 데이터프레임 처리

In [None]:
import pandas as pd

# Supplying `names` makes pandas treat the file as header-less; only the first
# four columns are loaded and they receive the labels below.
pd_df = pd.read_csv(
    "1800.csv",
    header=None,
    usecols=[0, 1, 2, 3],
    names=["stationID", "date", "measure_type", "temperature"],
)
pd_df.head()

In [None]:
# Boolean-mask selection: keep the rows whose measure_type equals "TMIN".
pd_minTemps = pd_df.loc[pd_df["measure_type"] == "TMIN"]
pd_minTemps.head()

In [None]:
# Keep only the two columns needed for the aggregation.
pd_stationTemps = pd_minTemps[["stationID", "temperature"]]
# GroupBy.min() does not take a column name: its first positional parameter is
# `numeric_only`, so min("temperature") only worked because the string is truthy.
# Select the column explicitly; the result is still a DataFrame indexed by stationID.
pd_minTempsByStation = pd_stationTemps.groupby(["stationID"])[["temperature"]].min()
pd_minTempsByStation.head()

# PySpark로 데이터 처리

In [10]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Session settings: the application name and the master URL.
conf = (
    SparkConf()
    .set("spark.app.name", "PySpark DataFrame #1")
    .set("spark.master", "local[2]")
)

# Get (or create) the SparkSession configured with the settings above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

### **데이터읽기**

##### 스키마 정보없이 데이터 읽기

In [None]:
# No schema or column names are supplied; printSchema() shows what Spark assigns.
df = spark.read.load("1800.csv", format="csv")  # same as spark.read.csv("1800.csv")
df.printSchema()

##### toDF로 컬럼명 입력해서 데이터 읽기

In [None]:
# toDF() renames all eight columns positionally: the first four get meaningful
# names, the remaining four get placeholder names.
df = (
    spark.read.format("csv")
    .load("1800.csv")
    .toDF(
        "stationID", "date", "measure_type", "temperature",
        "_c4", "_c5", "_c6", "_c7",
    )
)
df.printSchema()

##### inferSchema로 Spark가 유추해서 컬럼타입읽기

In [None]:
# Same read as before, but with the inferSchema option switched on so that
# printSchema() reports the column types Spark infers from the data.
df = spark.read.load(
    "1800.csv",
    format="csv",
    inferSchema="true",
).toDF(
    "stationID", "date", "measure_type", "temperature",
    "_c4", "_c5", "_c6", "_c7",
)
df.printSchema()

##### StructField로 명시적으로 스키마 지정해서 읽기

In [None]:
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema: four named, typed, nullable fields.
# StructField(name, data type, nullable)
schema = StructType(
    [
        StructField("stationID", StringType(), True),
        StructField("date", IntegerType(), True),
        StructField("measure_type", StringType(), True),
        StructField("temperature", FloatType(), True),
    ]
)
# Alternative spelling: spark.read.schema(schema).format("csv").load("1800.csv")
df = spark.read.csv("1800.csv", schema=schema)
df.printSchema()

### **measure_type이 TMIN인 값만 추출**

##### 데이터프레임의 filter메소드 사용

In [None]:
# filter() with a Column-expression predicate; count() reports how many rows match.
minTemps = df.filter(df["measure_type"] == "TMIN")
minTemps.count()

##### where메소드의 Column expression으로 필터링

In [None]:
# where() given the same Column-expression predicate; count() reports the matches.
minTemps = df.where(df["measure_type"] == "TMIN")
minTemps.count()

##### where메소드의 SQL expression으로 필터링

In [None]:
# The same "measure_type is TMIN" predicate, this time passed to where() as a
# SQL expression string instead of a Column expression.
minTemps = df.where("measure_type = 'TMIN'")
minTemps.count()

### **StationID에 Groupby를 적용해서 최저온도 확인하기**

In [29]:
# show()는 최대 20개까지 보여줌
minTmepsByStation = minTemps.groupBy("stationID").min("temperature")
minTmepsByStation.show()

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+



##### 데이터프레임처럼 컬럼 선택하기

In [30]:
# Index the Spark DataFrame with a list of column names (pandas-style) to keep
# only those two columns, then display the first five rows.
stationTemps = minTemps[["stationID", "temperature"]]
stationTemps.show(5)

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|     -148.0|
|EZE00100082|     -135.0|
|ITE00100554|     -125.0|
|EZE00100082|     -130.0|
|ITE00100554|      -46.0|
+-----------+-----------+
only showing top 5 rows



##### select 메소드로 컬럼 선택하기

In [31]:
# Project the same two columns with select(), passing Column objects
# rather than column-name strings, then display the first five rows.
stationTemps = minTemps.select(minTemps.stationID, minTemps.temperature)
stationTemps.show(5)

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|     -148.0|
|EZE00100082|     -135.0|
|ITE00100554|     -125.0|
|EZE00100082|     -130.0|
|ITE00100554|      -46.0|
+-----------+-----------+
only showing top 5 rows



##### collect 메서드로 python list형태로 가져오기

In [33]:
# collect() returns the aggregated rows as a Python list of Row objects.
results = minTmepsByStation.collect()
for result in results:
    # Use ".2f" for two decimal places. The previous "{:2f}" only set a minimum
    # field width of 2, so values were printed with the default six decimals.
    # NOTE(review): the value is printed with an "F" suffix although no unit
    # conversion is performed anywhere in this notebook; confirm the unit.
    print(f"{result[0]}\t{result[1]:.2f}F")

ITE00100554	-148.000000F
EZE00100082	-135.000000F


## SparkSQL로 처리해보기

In [34]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("station1800")
result = spark.sql("""SELECT stationID, MIN(temperature)
FROM station1800
WHERE measure_type = "TMIN"
GROUP BY 1""").collect()

# Each element is a pyspark.sql.Row: one DataFrame record whose fields are named.
# Iterate over this cell's own `result`; the loop previously read `results`,
# the list left over from the earlier collect() cell, so the SQL query's
# output was never actually displayed.
for r in result:
    print(r)

Row(stationID='ITE00100554', min(temperature)=-148.0)
Row(stationID='EZE00100082', min(temperature)=-135.0)
