In [1]:
from pyspark.sql import SparkSession

ss = SparkSession.builder.appName('Spark_SQL').getOrCreate()
ss

24/12/10 14:49:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


| **특징**              | **`ss.read.csv()`**                     | **`ss.read.format('csv').load()`**            |
|-----------------------|------------------------------------------|-----------------------------------------------|
| **목적**              | CSV 파일 읽기에 특화                    | 다양한 파일 형식을 처리하기 위한 일반적 방식 |
| **가독성**            | 간결하고 직관적                         | 다소 장황                                    |
| **유연성**            | CSV 파일에 한정                         | 다양한 데이터 형식을 지원                     |
| **옵션 설정 방법**     | 메서드 인자로 직접 전달                  | `.option()` 또는 `load()` 인자로 설정         |
| **다른 파일 형식 확장**| 불가능                                  | 가능                                         |


In [24]:
emp_df = ss.read.csv('data/emp.csv', header = True)
# 큰 차이없음
# emp_df2 = ss.read.format('csv').load('data/emp.csv', header = True)

dept_df = ss.read.csv('data/dept.csv', header = True)

In [25]:
emp_df.printSchema(), dept_df.printSchema()

root
 |-- empno: string (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: string (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: string (nullable = true)
 |-- comm: string (nullable = true)
 |-- deptno: string (nullable = true)

root
 |-- deptno: string (nullable = true)
 |-- dname: string (nullable = true)
 |-- loc: string (nullable = true)



(None, None)

In [27]:
emp_df.createOrReplaceTempView('emp_tmp')
dept_df.createOrReplaceTempView('dept_tmp')

In [51]:
# 뷰에서 조인
ss.sql('''
        select
            A.*
            , B.*
        from emp_tmp as A
        
        inner join dept_tmp as B
        on A.deptno = B.deptno
        ''')\
.createOrReplaceTempView('join_view')


In [56]:
ss.sql('''
select *
from join_view
where upper(loc) = upper('new york')
''').show()

+-----+------+---------+----+----------+----+----+------+------+----------+--------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|deptno|     dname|     loc|
+-----+------+---------+----+----------+----+----+------+------+----------+--------+
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|null|    10|    10|ACCOUNTING|NEW YORK|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000|null|    10|    10|ACCOUNTING|NEW YORK|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|null|    10|    10|ACCOUNTING|NEW YORK|
+-----+------+---------+----+----------+----+----+------+------+----------+--------+



In [69]:
# 부서 위치가 NEW YORK인 직원 목록

ss.sql('''
select * from emp_tmp
where deptno = (
    select deptno from dept_tmp
    where loc = 'NEW YORK'
    )
''').show()

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000|null|    10|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|null|    10|
+-----+------+---------+----+----------+----+----+------+



In [73]:
# case 1 - 조인 사용
ss.sql('''
select *
from join_view
where upper(loc) = upper('new york')
''').explain()


# case 2 - 서브 쿼리 사용
# 서브 쿼리가 이 상황에서는 효율적임 왜냐, 컬럼을 이미 좁혀서 가져왔기 때문
ss.sql('''
select * from emp_tmp
where deptno = (
    select deptno from dept_tmp
    where loc = 'NEW YORK'
    )
''').explain()

== Physical Plan ==
*(2) BroadcastHashJoin [deptno#433], [deptno#458], Inner, BuildRight, false
:- *(2) Filter isnotnull(deptno#433)
:  +- FileScan csv [empno#426,ename#427,job#428,mgr#429,hiredate#430,sal#431,comm#432,deptno#433] Batched: false, DataFilters: [isnotnull(deptno#433)], Format: CSV, Location: InMemoryFileIndex[file:/home/lab17/git/src/data/emp.csv], PartitionFilters: [], PushedFilters: [IsNotNull(deptno)], ReadSchema: struct<empno:string,ename:string,job:string,mgr:string,hiredate:string,sal:string,comm:string,dep...
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false]),false), [id=#859]
   +- *(1) Filter ((isnotnull(loc#460) AND (upper(loc#460) = NEW YORK)) AND isnotnull(deptno#458))
      +- FileScan csv [deptno#458,dname#459,loc#460] Batched: false, DataFilters: [isnotnull(loc#460), (upper(loc#460) = NEW YORK), isnotnull(deptno#458)], Format: CSV, Location: InMemoryFileIndex[file:/home/lab17/git/src/data/dept.csv], PartitionFilters: [], PushedF

In [74]:
ss.stop()