In [2]:
# Smoke test: confirm the kernel executes cells.
print("hello")

hello


In [3]:
# Shell escape: show which Python executable is first on the PATH.
!which python

/opt/conda/bin/python


In [4]:
import os

# Show where Spark is installed; evaluates to None when SPARK_HOME is not set.
os.getenv('SPARK_HOME')

'/usr/local/spark'

In [5]:
import pyspark
from pyspark.sql import SparkSession

In [7]:
# Create the SparkSession for this notebook, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName('pyspark_ex')
    .getOrCreate()
)

In [8]:
# Bare expression: display the SparkSession object as this cell's output.
spark

# DataFrame build

In [9]:
# Rows as (Name, Value) tuples.
data = [
    ('Alice', 1),
    ('Bob', 2),
    ('Charlie', 3),
]

# Build a Spark DataFrame (a distributed object) -- this is NOT a pandas DataFrame.
data1 = spark.createDataFrame(data, schema=['Name', 'Value'])
data1

DataFrame[Name: string, Value: bigint]

In [10]:
# Integer indexing on a Spark DataFrame selects a column by position and
# yields a Column object (here 'Name'); it does not return the first row.
data1[0]

Column<'Name'>

In [12]:
# Print the DataFrame's rows as a text table.
data1.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



## DataFrame filter

In [19]:
# Keep only the rows whose Name column equals 'Alice'.
data1.filter(data1['Name'] == 'Alice').show()

+-----+-----+
| Name|Value|
+-----+-----+
|Alice|    1|
+-----+-----+



In [21]:
# Keep only the rows whose Value column equals 2.
data1.filter(data1['Value'] == 2).show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



## Spark SQL

In [22]:
# Expose the DataFrame to SQL under the temporary view name "students".
data1.createOrReplaceTempView("students")

# Query the view with plain SQL and print the result.
spark.sql("SELECT * FROM students").show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [23]:
# SQL-side filtering: only rows with Value greater than 2.
spark.sql("SELECT * FROM students WHERE Value > 2").show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



# RDD build

In [15]:
# Distribute a local Python list as an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize(
    [1, 2, 3, 4, 5]
)

# Bring every element back to the driver as a Python list.
rdd.collect()

[1, 2, 3, 4, 5]

In [18]:
# Square every element of the RDD.
sq_rdd = rdd.map(lambda value: value ** 2)

# Fetch only the first three results.
sq_rdd.take(3)

[1, 4, 9]

# Machine Learning Library

In [24]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [25]:
# Rows as (Name, Age) tuples.
data_ = [
    ('Alice', 25),
    ('Bob', 30),
    ('Charlie', 33),
]

# Spark DataFrame used as input for the ML example.
data2 = spark.createDataFrame(data_, schema=['Name', 'Age'])
data2

DataFrame[Name: string, Age: bigint]

In [32]:
# Pack the 'Age' column into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=['Age'], outputCol='features')
vector_df = assembler.transform(data2)

# NOTE: 'Age' is used both as the only feature and as the label, so this is an
# API walkthrough rather than a meaningful model.
lr = LinearRegression(featuresCol='features', labelCol='Age')
model = lr.fit(vector_df)

# Apply the fitted model to the same rows and print the result,
# which includes a 'prediction' column.
pred = model.transform(vector_df)
pred.show()

+-------+---+--------+-----------------+
|   Name|Age|features|       prediction|
+-------+---+--------+-----------------+
|  Alice| 25|  [25.0]|24.99999999999993|
|    Bob| 30|  [30.0]|30.00000000000001|
|Charlie| 33|  [33.0]|33.00000000000006|
+-------+---+--------+-----------------+



In [33]:
# Print the fitted coefficient vector (one entry per input feature).
print(model.coefficients)

[1.000000000000016]


In [34]:
# Stop the SparkSession. Any later cell that uses `spark` needs an active
# session again (e.g. via SparkSession.builder.getOrCreate()).
spark.stop()

# Streaming

In [None]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

In [None]:
# The session was stopped by the earlier spark.stop() call, so obtain an active
# one first; getOrCreate() returns the running session if there still is one.
spark = SparkSession.builder.appName('pyspark_ex').getOrCreate()

# `lines` is the streaming DataFrame read from the source.
# The chain is wrapped in parentheses because a backslash continuation
# followed by a trailing comment is a SyntaxError.
lines = (
    spark.readStream
    .format("socket")             # streaming source: TCP socket
    .option('host', 'localhost')  # host to receive data from
    .option('port', 9999)         # port to receive data from
    .load()                       # create the streaming DataFrame
)

# Turn each received line into one row per word:
#   lines.value        : one line of text received from the socket (string)
#   split(..., ' ')    : split the line on spaces -> array of words
#   explode(...)       : expand each array element into its own row
#   .alias('word')     : name the resulting column 'word'
words = lines.select(explode(split(lines.value, ' ')).alias('word'))