# Spark Context

In [147]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [32]:
# Configuration for a Spark application named 'spark_context' with master 'local'.
conf = SparkConf().setMaster('local').setAppName('spark_context')
# getOrCreate reuses an already-running context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is executed again.
sc = SparkContext.getOrCreate(conf=conf)
sc  # last expression of the cell: shows the context summary in the notebook

### 관객수가 500만 이상인 영화 가져오기  
방법 1) join > filter > 영화정보 꺼내기  
방법 2) 필터 > join > 영화정보 꺼내기  

In [6]:
# Pair RDD keyed by movie id; each value is a (title, company) tuple.
movies_rdd = sc.parallelize([
    (1, ("어벤져스", "마블")),
    (2, ("슈퍼맨", "DC")),
    (3, ("배트맨", "DC")),
    (4, ("겨울왕국", "디즈니")),
    (5, ("아이언맨", "마블"))
])


# Pair RDD keyed by the same movie ids; each value is an
# (attendance, country) tuple.
attendances_rdd = sc.parallelize([
    (1, (13934592, "KR")),
    (2, (2182227,"KR")),
    (3, (4226242, "KR")),
    (4, (10303058, "KR")),
    (5, (4300365, "KR"))
])

In [17]:
# 1. Join first, then filter.
# Joining on the movie id yields records shaped like
# (id, ((title, company), (attendance, country))).
movies_att = movies_rdd.join(attendances_rdd)

# x[1][1][0] is the attendance. The task asks for "5 million or more", so the
# bound must be inclusive (>=); a strict > would drop a movie with exactly
# 5,000,000 viewers.
movies_att.filter(
    lambda x : x[1][1][0] >= 5000000
).collect()

[(4, (('겨울왕국', '디즈니'), (10303058, 'KR'))),
 (1, (('어벤져스', '마블'), (13934592, 'KR')))]

In [18]:
# 2. Filter first, then join.
# x[1][0] is the attendance of an (id, (attendance, country)) record. The task
# asks for "5 million or more", so the bound must be inclusive (>=); a strict >
# would drop a movie with exactly 5,000,000 viewers.
filtered_rdd = attendances_rdd.filter(
    lambda x : x[1][0] >= 5000000
)

# Only the surviving attendance records take part in the join, so each result
# is shaped like (id, ((attendance, country), (title, company))).
filtered_rdd.join(movies_rdd).collect()

[(4, ((10303058, 'KR'), ('겨울왕국', '디즈니'))),
 (1, ((13934592, 'KR'), ('어벤져스', '마블')))]

In [33]:
# Stop the SparkContext now that the RDD examples are finished.
sc.stop()

# Spark Session

In [148]:
# Obtain (or reuse) a SparkSession whose application name is 'Spark_SQL'.
ss = (
    SparkSession.builder
    .appName('Spark_SQL')
    .getOrCreate()
)
ss  # last expression of the cell: shows the session summary in the notebook

In [149]:
# Raw movie rows. Field order matches movie_schema:
# (idx, title, company, movie_year, movie_month, movie_date).
movies = [
    (1, "어벤져스", "마블", 2012, 4, 26),
    (2, "슈퍼맨", "DC", 2013, 6, 13),
    (3, "배트맨", "DC", 2008, 8, 6),
    (4, "겨울왕국", "디즈니", 2014, 1, 16),
    (5, "아이언맨", "마블", 2008, 4, 30)
]

In [38]:
# Column names only; the column types are inferred by Spark from the data.
movie_schema = ['idx', 'title', 'company', 'movie_year', 'movie_month', 'movie_date']

In [158]:
# Create the DataFrame from the movie rows, naming columns via movie_schema.
df = ss.createDataFrame(movies, schema=movie_schema)
# Inspect the (column name, inferred type) pairs.
df.dtypes

[('idx', 'bigint'),
 ('title', 'string'),
 ('company', 'string'),
 ('movie_year', 'bigint'),
 ('movie_month', 'bigint'),
 ('movie_date', 'bigint')]

In [46]:
# Print the DataFrame rows as a text table.
df.show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  2|  슈퍼맨|     DC|      2013|          6|        13|
|  3|  배트맨|     DC|      2008|          8|         6|
|  4|겨울왕국| 디즈니|      2014|          1|        16|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



In [48]:
# Project only the title column.
df.select('title').show()

+--------+
|   title|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



In [50]:
# Filter with a SQL expression string: rows whose title equals '어벤져스'.
df.where("title = '어벤져스'").show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
+---+--------+-------+----------+-----------+----------+



In [52]:
# Keep movies released in 2010 or later, using a Column expression.
df.filter(df['movie_year'] >= 2010).show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  2|  슈퍼맨|     DC|      2013|          6|        13|
|  4|겨울왕국| 디즈니|      2014|          1|        16|
+---+--------+-------+----------+-----------+----------+



### SQL을 사용하기 위해서는 View에 등록해야함

In [159]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView('movies')  # 'movies' is the table name used in SQL

In [160]:
# Query that selects every column and row of the movies view.
query1 = '''
        select * from movies
        '''

In [161]:
# Run the SQL text through the session and print the resulting DataFrame.
ss.sql(query1).show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  2|  슈퍼맨|     DC|      2013|          6|        13|
|  3|  배트맨|     DC|      2008|          8|         6|
|  4|겨울왕국| 디즈니|      2014|          1|        16|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



### 2010년 이후 개봉한 영화 조회

In [66]:
# Movies released in 2010 or later (inclusive lower bound).
ss.sql('''
select *
from movies
where movie_year >= 2010
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  2|  슈퍼맨|     DC|      2013|          6|        13|
|  4|겨울왕국| 디즈니|      2014|          1|        16|
+---+--------+-------+----------+-----------+----------+



### 회사가 마블 추출

In [68]:
# Movies whose company contains '마블' anywhere (substring match via LIKE).
ss.sql('''
select *
from movies
where company like '%마블%'
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



In [69]:
# Movies whose company is exactly '마블' (equality instead of LIKE).
ss.sql('''
select *
from movies
where company = '마블'
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



### 제목이 `맨`으로 끝나는 영화 (`~맨`)

In [70]:
# Movies whose title ends with '맨' ('%' matches any prefix).
ss.sql('''
select *
from movies
where title like '%맨'
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  2|  슈퍼맨|     DC|      2013|          6|        13|
|  3|  배트맨|     DC|      2008|          8|         6|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



### `~이~` 들어간 영화

In [71]:
# Movies whose title contains '이' anywhere.
ss.sql('''
select *
from movies
where title like '%이%'
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



### 개봉월이 4월 ~ 8월 사이임

In [73]:
# Movies released from April through August; the output shows both endpoints
# (months 4 and 8) are included by BETWEEN.
ss.sql('''
select *
from movies
where movie_month between 4 and 8
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  2|  슈퍼맨|     DC|      2013|          6|        13|
|  3|  배트맨|     DC|      2008|          8|         6|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



### 컬럼이 여러개인 상황
- and / or

### 제목이 `맨`으로 끝나고, 개봉연도가 2010년 이하인 영화

In [84]:
# Combine two conditions with AND: title ends with '맨' and the release year
# is 2010 or earlier.
ss.sql('''
select *
from movies
where title like '%맨' and movie_year <= 2010
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  3|  배트맨|     DC|      2008|          8|         6|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



### 회사가 마블, dc인 영화

In [83]:
# Movies from 마블 or DC, combined with OR. upper() normalises the stored value
# so the DC comparison ignores letter case.
ss.sql('''
select *
from movies
where company = '마블'
    or upper(company) = 'DC'
'''
).show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  2|  슈퍼맨|     DC|      2013|          6|        13|
|  3|  배트맨|     DC|      2008|          8|         6|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



In [85]:
# Same selection written with IN; here the stored company value is compared
# as-is against each listed literal (no upper() normalisation).
ss.sql('''
select *
from movies
where company in ('마블', 'DC')
'''
).show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  2|  슈퍼맨|     DC|      2013|          6|        13|
|  3|  배트맨|     DC|      2008|          8|         6|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



### 회사가 "마"로 시작하거나, "니"로 끝나는 영화

In [97]:
# Movies whose company starts with '마' or ends with '니'.
ss.sql('''
select *
from movies
where company like '마%'
    or company like '%니'
    
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  4|겨울왕국| 디즈니|      2014|          1|        16|
|  5|아이언맨|   마블|      2008|          4|        30|
+---+--------+-------+----------+-----------+----------+



### 회사가 "마"로 시작하거나, "니"로 끝나는 영화 중, 2010년 이후로 개봉한 영화

In [96]:
# Company starts with '마' or ends with '니', restricted to releases in 2010 or
# later. The parentheses group the OR so the year condition applies to both
# alternatives.
ss.sql('''
select *
from movies
where (company like '마%'
    or company like '%니')
    and movie_year >= 2010
    
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  1|어벤져스|   마블|      2012|          4|        26|
|  4|겨울왕국| 디즈니|      2014|          1|        16|
+---+--------+-------+----------+-----------+----------+



### 개봉 연도 오름차순
ASC : ascending 오름차순, default  
DESC : descending 내림차순

In [88]:
# Sort by release date: year first, then month, then day, all ascending
# (no ASC/DESC given, so the default order applies).
ss.sql('''
select *
from movies
order by movie_year, movie_month, movie_date
''').show()

+---+--------+-------+----------+-----------+----------+
|idx|   title|company|movie_year|movie_month|movie_date|
+---+--------+-------+----------+-----------+----------+
|  5|아이언맨|   마블|      2008|          4|        30|
|  3|  배트맨|     DC|      2008|          8|         6|
|  1|어벤져스|   마블|      2012|          4|        26|
|  2|  슈퍼맨|     DC|      2013|          6|        13|
|  4|겨울왕국| 디즈니|      2014|          1|        16|
+---+--------+-------+----------+-----------+----------+



### 개봉 영화가 2개 이상인 회사

In [100]:
# Count titles per company and keep only companies with at least two movies.
# HAVING filters on the aggregated alias cnt; the result is sorted by company.
ss.sql('''
select
    company
    , count(title) as cnt
from movies
group by company
having cnt >= 2
order by company
''').show()



+-------+---+
|company|cnt|
+-------+---+
|     DC|  2|
|   마블|  2|
+-------+---+



                                                                                

count(*) : NULL을 포함한 전체 행 개수  
count(컬럼) : 해당 컬럼 값이 NULL인 행은 제외 (아래 예시에서는 count(title))  
mean(컬럼), avg(컬럼) : 평균 (두 함수는 같은 결과)  
sum(컬럼) : 합계  

In [115]:
# Whole-table aggregates. The backtick-quoted alias lets the column name
# contain spaces and parentheses; mean() and avg() are both shown so their
# results can be compared side by side.
ss.sql('''
select
    count(*) as `행 개수(NULL 포함)`
    , count(title)
    , mean(movie_year)
    , avg(movie_year)
    , sum(movie_month)
from movies
''').show()

+------------------+------------+----------------+---------------+----------------+
|행 개수(NULL 포함)|count(title)|mean(movie_year)|avg(movie_year)|sum(movie_month)|
+------------------+------------+----------------+---------------+----------------+
|                 5|           5|          2011.0|         2011.0|              23|
+------------------+------------+----------------+---------------+----------------+



In [150]:
# Attendance rows: (movie id, attendance, country). The attendance values are
# written as Python floats (trailing '.').
attendances = [
    (1, 13934592., "KR"),
    (2, 2182227.,"KR"),
    (3, 4226242., "KR"),
    (4, 10303058., "KR"),
    (5, 4300365., "KR")
]

In [131]:
# Explicit schema for the attendance rows: every field is declared nullable.
# NOTE(review): the variable name looks like a typo of "att_schema"; it is
# kept because other cells refer to it by this name.
# NOTE(review): FloatType is single precision; presumably fine for these
# values, but confirm it is wide enough for larger attendance counts.
att_schma = StructType([
    StructField('id', IntegerType(), True),
    StructField('att', FloatType(), True),
    StructField('theater_country', StringType(), True),
])

In [153]:
# Build the attendance DataFrame with the explicit schema (no type inference).
att_df = ss.createDataFrame(attendances, schema=att_schma)
# Inspect the (column name, type) pairs.
att_df.dtypes

[('id', 'int'), ('att', 'float'), ('theater_country', 'string')]

In [137]:
# Print the schema tree first, then the rows as a text table.
att_df.printSchema()
att_df.show()

root
 |-- id: integer (nullable = true)
 |-- att: float (nullable = true)
 |-- theater_country: string (nullable = true)

+---+-----------+---------------+
| id|        att|theater_country|
+---+-----------+---------------+
|  1|1.3934592E7|             KR|
|  2|  2182227.0|             KR|
|  3|  4226242.0|             KR|
|  4|1.0303058E7|             KR|
|  5|  4300365.0|             KR|
+---+-----------+---------------+



In [154]:
# Register the attendance DataFrame as the temporary view 'att_table'.
att_df.createOrReplaceTempView('att_table')

# Check the view by selecting everything from it.
ss.sql('''
select *
from att_table
''').show()

+---+-----------+---------------+
| id|        att|theater_country|
+---+-----------+---------------+
|  1|1.3934592E7|             KR|
|  2|  2182227.0|             KR|
|  3|  4226242.0|             KR|
|  4|1.0303058E7|             KR|
|  5|  4300365.0|             KR|
+---+-----------+---------------+



In [162]:
# Inner-join movies (alias A) with att_table (alias B) on the movie id, keeping
# all movie columns plus att and theater_country, sorted by movie id.
ss.sql('''
select
    A.*
    , B.att
    , B.theater_country
from movies as A
inner join att_table as B
on A.idx = B.id

order by A.idx

''').show()



+---+--------+-------+----------+-----------+----------+-----------+---------------+
|idx|   title|company|movie_year|movie_month|movie_date|        att|theater_country|
+---+--------+-------+----------+-----------+----------+-----------+---------------+
|  1|어벤져스|   마블|      2012|          4|        26|1.3934592E7|             KR|
|  2|  슈퍼맨|     DC|      2013|          6|        13|  2182227.0|             KR|
|  3|  배트맨|     DC|      2008|          8|         6|  4226242.0|             KR|
|  4|겨울왕국| 디즈니|      2014|          1|        16|1.0303058E7|             KR|
|  5|아이언맨|   마블|      2008|          4|        30|  4300365.0|             KR|
+---+--------+-------+----------+-----------+----------+-----------+---------------+



                                                                                

In [164]:
# Keep the movies/att_table inner join as a DataFrame instead of only showing it.
movies_views = ss.sql('''
                        select
                            A.*
                            , B.att
                            , B.theater_country
                        from movies as A
                        inner join att_table as B
                        on A.idx = B.id
                        
                        order by A.idx
                        
                        ''')

# Register the joined result as its own temporary view for later SQL queries.
movies_views.createOrReplaceTempView('movies_views')

In [166]:
# Query the joined view registered as 'movies_views'.
ss.sql('''
select *
from movies_views
''').show()



+---+--------+-------+----------+-----------+----------+-----------+---------------+
|idx|   title|company|movie_year|movie_month|movie_date|        att|theater_country|
+---+--------+-------+----------+-----------+----------+-----------+---------------+
|  1|어벤져스|   마블|      2012|          4|        26|1.3934592E7|             KR|
|  2|  슈퍼맨|     DC|      2013|          6|        13|  2182227.0|             KR|
|  3|  배트맨|     DC|      2008|          8|         6|  4226242.0|             KR|
|  4|겨울왕국| 디즈니|      2014|          1|        16|1.0303058E7|             KR|
|  5|아이언맨|   마블|      2008|          4|        30|  4300365.0|             KR|
+---+--------+-------+----------+-----------+----------+-----------+---------------+



                                                                                

In [167]:
# Stop the SparkSession at the end of the notebook.
ss.stop()