In [1]:
from pyspark.sql import  SQLContext, Row
from pyspark import SparkContext,SparkConf
import pandas as pd
from pyspark.sql.functions import regexp_extract, regexp_replace, when,udf,col

In [2]:
# Reuse the already-running SparkContext when there is one, so this cell can be
# re-run without "Cannot run multiple SparkContexts at once"; otherwise start a
# new context in local mode.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))
# SQLContext bound to that SparkContext; the cells below use it to build DataFrames.
sqlCtx = SQLContext(sc)

In [3]:
# Employee rows as (name, deptid); deptid 5 has no row in `dept`.
emp = [
    ('홍길동', 1),
    ('이순신', 2),
    ('임꺽정', 3),
    ('김철수', 3),
    ('김철수1', 5),
]
# Department rows as (deptname, deptid); deptid 4 has no row in `emp`.
dept = [
    ('개발', 1),
    ('연구', 2),
    ('영업', 3),
    ('기획', 4),
]
# Build one Spark DataFrame per list, naming the columns explicitly.
empA = sqlCtx.createDataFrame(emp, schema=['name', 'deptid'])
deptB = sqlCtx.createDataFrame(dept, schema=['deptname', 'deptid'])

In [4]:
# Print the employee DataFrame (columns: name, deptid).
empA.show()

+-------+------+
|   name|deptid|
+-------+------+
| 홍길동|     1|
| 이순신|     2|
| 임꺽정|     3|
| 김철수|     3|
|김철수1|     5|
+-------+------+



In [5]:
# Print the department DataFrame (columns: deptname, deptid).
deptB.show()

+--------+------+
|deptname|deptid|
+--------+------+
|    개발|     1|
|    연구|     2|
|    영업|     3|
|    기획|     4|
+--------+------+



In [6]:
# Inner join on deptid (spelled out here; 'inner' is also the default `how`).
empA.join(deptB, on='deptid', how='inner').show()

+------+------+--------+
|deptid|  name|deptname|
+------+------+--------+
|     1|홍길동|    개발|
|     3|임꺽정|    영업|
|     3|김철수|    영업|
|     2|이순신|    연구|
+------+------+--------+



In [7]:
# Left join: every empA row is printed; deptname is null where no department matches.
empA.join(deptB, on='deptid', how='left').show()

+------+-------+--------+
|deptid|   name|deptname|
+------+-------+--------+
|     5|김철수1|    null|
|     1| 홍길동|    개발|
|     3| 임꺽정|    영업|
|     3| 김철수|    영업|
|     2| 이순신|    연구|
+------+-------+--------+



In [8]:
# Right join: every deptB row is printed; name is null where no employee matches.
empA.join(deptB, on='deptid', how='right').show()

+------+------+--------+
|deptid|  name|deptname|
+------+------+--------+
|     1|홍길동|    개발|
|     3|임꺽정|    영업|
|     3|김철수|    영업|
|     2|이순신|    연구|
|     4|  null|    기획|
+------+------+--------+



In [9]:
# Full outer join: rows from both sides are kept, with nulls on the unmatched side.
empA.join(deptB, on='deptid', how='full').show()

+------+-------+--------+
|deptid|   name|deptname|
+------+-------+--------+
|     5|김철수1|    null|
|     1| 홍길동|    개발|
|     3| 임꺽정|    영업|
|     3| 김철수|    영업|
|     2| 이순신|    연구|
|     4|   null|    기획|
+------+-------+--------+



In [10]:
# Exercise: turn both lists into Spark DataFrames, then try
# inner, left, right and full joins on them.
# Rows are (name, id).
testA = list(zip('ABCD', range(1, 5)))
# Rows are (name, myid).
testB = list(zip('EACF', range(1, 5)))

In [11]:
# Wrap the two exercise lists in Spark DataFrames; both share a `name` column.
testAA = sqlCtx.createDataFrame(testA, schema=['name', 'id'])
testBB = sqlCtx.createDataFrame(testB, schema=['name', 'myid'])
# Print each frame in turn.
for frame in (testAA, testBB):
    frame.show()

+----+---+
|name| id|
+----+---+
|   A|  1|
|   B|  2|
|   C|  3|
|   D|  4|
+----+---+

+----+----+
|name|myid|
+----+----+
|   E|   1|
|   A|   2|
|   C|   3|
|   F|   4|
+----+----+



In [12]:
# Inner join on name: only names present in both frames are printed.
testAA.join(testBB, on='name', how='inner').show()

+----+---+----+
|name| id|myid|
+----+---+----+
|   C|  3|   3|
|   A|  1|   2|
+----+---+----+



In [13]:
# Left join on name: all testAA rows are kept; myid is null where testBB has no match.
testAA.join(testBB, on='name', how='left').show()

+----+---+----+
|name| id|myid|
+----+---+----+
|   B|  2|null|
|   D|  4|null|
|   C|  3|   3|
|   A|  1|   2|
+----+---+----+



In [16]:
# Right join on name: all testBB rows are kept; id is null where testAA has no match.
testAA.join(testBB, on='name', how='right').show()

+----+----+----+
|name|  id|myid|
+----+----+----+
|   F|null|   4|
|   E|null|   1|
|   C|   3|   3|
|   A|   1|   2|
+----+----+----+



In [17]:
# Full outer join on name: rows from both frames are kept, with nulls on the unmatched side.
testAA.join(testBB, on='name', how='full').show()

+----+----+----+
|name|  id|myid|
+----+----+----+
|   F|null|   4|
|   E|null|   1|
|   B|   2|null|
|   D|   4|null|
|   C|   3|   3|
|   A|   1|   2|
+----+----+----+



In [18]:
# Keep the full outer join of testAA and testBB in `jdf` so the following
# cells can work on its null values.
jdf = testAA.join(testBB, on='name', how='full')
jdf.show()

+----+----+----+
|name|  id|myid|
+----+----+----+
|   F|null|   4|
|   E|null|   1|
|   B|   2|null|
|   D|   4|null|
|   C|   3|   3|
|   A|   1|   2|
+----+----+----+



In [19]:
# Replace nulls in the id column with 4; nulls in myid are left as they are.
jdf.na.fill({'id': 4}).show()

+----+---+----+
|name| id|myid|
+----+---+----+
|   F|  4|   4|
|   E|  4|   1|
|   B|  2|null|
|   D|  4|null|
|   C|  3|   3|
|   A|  1|   2|
+----+---+----+



In [20]:
# Drop every row whose myid is null; nulls in other columns do not matter.
jdf.na.drop(subset=['myid']).show()

+----+----+----+
|name|  id|myid|
+----+----+----+
|   F|null|   4|
|   E|null|   1|
|   C|   3|   3|
|   A|   1|   2|
+----+----+----+



In [21]:
# Append testBB's rows beneath testAA's. The printed result keeps testAA's
# column names, so testBB's myid values appear under the id column.
testAA.unionAll(testBB).show()

+----+---+
|name| id|
+----+---+
|   A|  1|
|   B|  2|
|   C|  3|
|   D|  4|
|   E|  1|
|   A|  2|
|   C|  3|
|   F|  4|
+----+---+



In [None]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()