In [73]:
#singLeton pattern object builder
from pyspark.sql import SparkSession
ss = SparkSession.builder.appName('FirstSparkSessionApp').getOrCreate()
ss

In [74]:
myRange = ss.range(1,100)
myRange #분산 객체임

DataFrame[id: bigint]

In [75]:
myRange.show(10)

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
+---+
only showing top 10 rows



In [76]:
data = [
    ('Brook', 20),
    ('Denny', 31),
    ('Jules', 30),
]

In [77]:
df = ss.createDataFrame(data)
df

DataFrame[_1: string, _2: bigint]

In [78]:
df.show()

+-----+---+
|   _1| _2|
+-----+---+
|Brook| 20|
|Denny| 31|
|Jules| 30|
+-----+---+



In [79]:
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)



In [80]:
#people이라는 임시 View 생성
df.createOrReplaceTempView('people')

In [81]:
res = ss.sql('select _1, _2 from people where _2 = 20')
res.show()

+-----+---+
|   _1| _2|
+-----+---+
|Brook| 20|
+-----+---+



### StructType으로 구조 정의 (p.52)

In [98]:
from pyspark.sql.types import *
schema = StructType(
    [
        StructField('Author', StringType(), False) #마지막 인자는 nullable = True or False
        , StructField('Age', IntegerType(), False) #마지막 인자는 nullable = True or False    
    ]
)

In [104]:
a_df = ss.createDataFrame(schema = schema, data= data)
print(a_df, a_df.show())

+------+---+
|Author|Age|
+------+---+
| Brook| 20|
| Denny| 31|
| Jules| 30|
+------+---+

DataFrame[Author: string, Age: int] None


### DDL로 정의하기 (p.52)

In [112]:
#create our data
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
       [2, "Brooke","Wenig","https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
       [3, "Denny", "Lee", "https://tinyurl.3","6/7/2019",7659, ["web", "twitter", "FB", "LinkedIn"]],
       [4, "Tathagata", "Das","https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
       [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
       [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]]
      ]

In [158]:
schema_ddl = """ \
    `ID` Int NOT NUll, \
    `First` String, \
    `Last` String, \
    `Url` String, \
    `Published` String, \
    `Hits` Int, \
    `Campagins` Array<String> \
"""
# '말고 ` 써야함
# SQL DDL 대소문자 상관없음
# nullable은 Not NUll

In [188]:
b_df = ss.createDataFrame(schema = schema_ddl, data = data)

In [189]:
b_df.printSchema()

root
 |-- ID: integer (nullable = false)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campagins: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [161]:
b_df.select('ID').show()

+---+
| ID|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
+---+



In [169]:
from pyspark.sql.functions import *
#expr(sql 표현식 불러와서 씀)
b_df.select(expr('Hits')*2).show(2) #projecttion

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows



In [172]:
b_df.select(
        expr(
            ('ID')
        ) * 2
    ) \
.show()

+--------+
|(ID * 2)|
+--------+
|       2|
|       4|
|       6|
|       8|
|      10|
|      12|
+--------+



In [177]:
b_df.select(expr('*')).show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| ID|    First|   Last|              Url|Published| Hits|           Campagins|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [180]:
b_df.select(expr('Hits') * 2).show()
b_df.select(col('Hits') * 2).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [184]:
b_df.withColumn('Big Hitters',
                (expr('Hits > 1000'))) \
                .show()                   

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| ID|    First|   Last|              Url|Published| Hits|           Campagins|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|       true|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|       true|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|       true|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [192]:
b_df \
.withColumn('AuthorId', (concat
                                 (
                                    expr('First')
                                    , expr('Last')
                                    , expr('id')
                                )
                        )
           ) \
.select(col('AuthorID')) \
.show(5)

+-------------+
|     AuthorID|
+-------------+
|  JulesDamji1|
| BrookeWenig2|
|    DennyLee3|
|TathagataDas4|
|MateiZaharia5|
+-------------+
only showing top 5 rows



In [205]:
#expr, col, select 다 똑같음
print('expr')
b_df.select(expr
            ('Hits')
           )\
.show()
print(f"{'*'*100}")

print('col')
b_df.select(col
            ('Hits')
           )\
.show()

print(f"{'*'*100}")

print('그냥')
b_df.select('Hits')\
.show()

expr
+-----+
| Hits|
+-----+
| 4535|
| 8908|
| 7659|
|10568|
|40578|
|25568|
+-----+

****************************************************************************************************
col
+-----+
| Hits|
+-----+
| 4535|
| 8908|
| 7659|
|10568|
|40578|
|25568|
+-----+

****************************************************************************************************
그냥
+-----+
| Hits|
+-----+
| 4535|
| 8908|
| 7659|
|10568|
|40578|
|25568|
+-----+



In [206]:
ss.stop()
# 두 개의 세션은 활성화할 수 없어 하나는 꺼줘야함

# RDD => DataFrame 나중에 다시 해보기

In [64]:
from pyspark import SparkConf, SparkContext
import os
conf = SparkConf().setMaster('local').setAppName('restaurant')
ss_df = SparkContext(conf = conf).getOrCreate()
ss_df

In [65]:
direc = os.path.join(os.getcwd(), 'data')
file_name = 'restaurant_reviews.csv'
file_path = os.path.join(direc, file_name)
file_path

'/home/lab17/git/src/data/restaurant_reviews.csv'

In [66]:
data = ss_df.textFile('file:///' + file_path.replace('\\','/'))
type(data)

pyspark.rdd.RDD

In [70]:
data.toDF()

TypeError: Can not infer schema for type: <class 'str'>

In [55]:
ss_df.stop()

In [56]:
ss = SparkSession.builder.appName('SparkSession').getOrCreate()
ss

In [57]:
ss.to(data)

24/12/06 14:12:18 ERROR Utils: Exception encountered
org.apache.spark.SparkException: Failed to get broadcast_4_piece0 of broadcast_4
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBlocks$1(TorrentBroadcast.scala:197)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at org.apache.spark.broadcast.TorrentBroadcast.readBlocks(TorrentBroadcast.scala:169)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$4(TorrentBroadcast.scala:253)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$2(TorrentBroadcast.scala:231)
	at org.apache.spark.util.KeyLock.withLock(KeyLock.scala:64)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$1(TorrentBroadcast.scala:226)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1405)
	at org.apache.spark.broadcast.TorrentBroadcast.read

Py4JJavaError: An error occurred while calling o207.partitions.
: java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_4_piece0 of broadcast_4
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1412)
	at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:226)
	at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:103)
	at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
	at org.apache.spark.rdd.HadoopRDD.getJobConf(HadoopRDD.scala:145)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:201)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.api.java.JavaRDDLike.partitions(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.JavaRDDLike.partitions$(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Failed to get broadcast_4_piece0 of broadcast_4
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBlocks$1(TorrentBroadcast.scala:197)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at org.apache.spark.broadcast.TorrentBroadcast.readBlocks(TorrentBroadcast.scala:169)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$4(TorrentBroadcast.scala:253)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$2(TorrentBroadcast.scala:231)
	at org.apache.spark.util.KeyLock.withLock(KeyLock.scala:64)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$1(TorrentBroadcast.scala:226)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1405)
	... 26 more


TypeError: Can not infer schema for type: <class 'str'>

In [63]:
ss.stop()

In [68]:
type(data)

pyspark.rdd.RDD

In [None]:
ss.stop()