In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Pyspark DataFrame #5")
    .getOrCreate()
)

In [None]:
# Snowflake connection settings, read from environment variables
# (load_dotenv() is called first so values from a .env file are picked up).
from dotenv import load_dotenv
import os
load_dotenv()

user = os.environ.get("SNOWFLAKE_USER")
password = os.environ.get("SNOWFLAKE_PASS")
account = os.environ.get("SNOWFLAKE_ACCOUNT")

# Fail fast: an unset variable comes back as None and would otherwise be
# formatted into the JDBC URL below as the literal text "None".
_missing_vars = [
    var_name
    for var_name, var_value in (
        ("SNOWFLAKE_USER", user),
        ("SNOWFLAKE_PASS", password),
        ("SNOWFLAKE_ACCOUNT", account),
    )
    if not var_value
]
if _missing_vars:
    raise RuntimeError(
        "Missing Snowflake settings: "
        + ", ".join(_missing_vars)
        + " (export them or add them to .env)"
    )

# The warehouse name is fixed to COMPUTE_WH in the connection URL.
jdbc_url = f"jdbc:snowflake://{account}.snowflakecomputing.com/?warehouse=COMPUTE_WH"

In [8]:
# Read dev.raw_data.user_session_channel through the Snowflake JDBC driver.
df_user_session_channel = (
    spark.read.format("jdbc")
    .options(
        url=jdbc_url,
        dbtable="dev.raw_data.user_session_channel",
        user=user,
        password=password,
        driver="net.snowflake.client.jdbc.SnowflakeDriver",
    )
    .load()
)

# Print the column names and types Spark derived for the loaded table.
df_user_session_channel.printSchema()

root
 |-- USERID: long (nullable = true)
 |-- SESSIONID: string (nullable = true)
 |-- CHANNEL: string (nullable = true)



In [None]:
# Read dev.raw_data.session_timestamp through the Snowflake JDBC driver.
df_session_timestamp = (
    spark.read.format("jdbc")
    .options(
        url=jdbc_url,
        dbtable="dev.raw_data.session_timestamp",
        user=user,
        password=password,
        driver="net.snowflake.client.jdbc.SnowflakeDriver",
    )
    .load()
)

# Print the column names and types Spark derived for the loaded table.
df_session_timestamp.printSchema()

root
 |-- SESSIONID: string (nullable = true)
 |-- TS: timestamp (nullable = true)



In [None]:
# Check how many partitions the user_session_channel DataFrame has
df_user_session_channel.rdd.getNumPartitions()

1

In [11]:
# Check how many partitions the session_timestamp DataFrame has
df_session_timestamp.rdd.getNumPartitions()

1

## DataFrame으로 처리하기

In [15]:
# Inner-join the two tables on their session id. Because the join condition is
# an expression (not a column name), both SESSIONID columns are kept in the result.
join_expr = df_user_session_channel["SESSIONID"] == df_session_timestamp["SESSIONID"]
session_df = df_user_session_channel.join(
    other=df_session_timestamp,
    on=join_expr,
    how="inner",
)
session_df.printSchema()

root
 |-- USERID: long (nullable = true)
 |-- SESSIONID: string (nullable = true)
 |-- CHANNEL: string (nullable = true)
 |-- SESSIONID: string (nullable = true)
 |-- TS: timestamp (nullable = true)



In [16]:
# Preview the first 5 joined rows
session_df.show(5)

+------+--------------------+--------+--------------------+--------------------+
|USERID|           SESSIONID| CHANNEL|           SESSIONID|                  TS|
+------+--------------------+--------+--------------------+--------------------+
|   371|5c3a3b139a11689e0...|   Naver|5c3a3b139a11689e0...|2019-05-06 21:33:...|
|   654|4afa19649ae378da3...|   Naver|4afa19649ae378da3...|2019-05-10 16:23:...|
|   768|87efe7b5fa21d969f...|Facebook|87efe7b5fa21d969f...| 2019-05-25 23:18:00|
|   264|0765933456f074d2c...| Youtube|0765933456f074d2c...|2019-05-05 17:55:...|
|  1027|a4a1108bbcc329a70...| Organic|a4a1108bbcc329a70...|2019-05-26 14:19:...|
+------+--------------------+--------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Demonstration: the joined result has two SESSIONID columns, so selecting
# "sessionid" by name is ambiguous and Spark raises an error.
# The error is caught and printed so a full "Run All" is not aborted here;
# session_df keeps its previous value when the select fails.
from pyspark.sql.utils import AnalysisException

try:
    session_df = df_user_session_channel.join(df_session_timestamp, join_expr, "inner").select(
        "userid", "sessionid", "channel", "ts"
    )
except AnalysisException as err:
    print(f"Expected failure (ambiguous column reference): {err}")

In [22]:
# Same join, but SESSIONID is taken explicitly from the channel table so the
# column reference is no longer ambiguous.
session_df = (
    df_user_session_channel
    .join(other=df_session_timestamp, on=join_expr, how="inner")
    .select("userid", df_user_session_channel["SESSIONID"], "channel", "ts")
)

In [23]:
# Number of sessions per channel, largest count first.
channel_count_df = (
    session_df
    .groupBy("channel")
    .count()
    .orderBy("count", ascending=False)
)
channel_count_df.show()

+---------+-----+
|  channel|count|
+---------+-----+
|  Youtube|17091|
|   Google|16982|
|    Naver|16921|
|  Organic|16904|
|Instagram|16831|
| Facebook|16791|
+---------+-----+



In [24]:
## Compute MAU (monthly active users): distinct userid per yyyy-MM month of ts
from pyspark.sql.functions import date_format, asc, countDistinct

(
    session_df
    .withColumn("month", date_format("ts", "yyyy-MM"))
    .groupBy("month")
    .agg(countDistinct("userid").alias("mau"))
    .sort(asc("month"))
    .show()
)

+-------+---+
|  month|mau|
+-------+---+
|2019-05|281|
|2019-06|459|
|2019-07|623|
|2019-08|662|
|2019-09|639|
|2019-10|763|
|2019-11|721|
+-------+---+



## SparkSQL로 처리하기

In [25]:
# Register both DataFrames as temporary views so they can be referenced by
# name in spark.sql() queries.
df_user_session_channel.createOrReplaceTempView("user_session_channel")
df_session_timestamp.createOrReplaceTempView("session_timestamp")

In [28]:
# Unique users per channel via Spark SQL: join the session_timestamp and
# user_session_channel views on session id, count distinct userId per channel
# (GROUP BY 1), and sort by channel name (ORDER BY 1).
channel_count_df = spark.sql("""
    SELECT channel, count(distinct userId) uniqueUsers
    FROM session_timestamp st
    JOIN user_session_channel usc ON st.sessionID = usc.sessionID
    GROUP BY 1
    ORDER BY 1""")
channel_count_df.show()

+---------+-----------+
|  channel|uniqueUsers|
+---------+-----------+
| Facebook|        889|
|   Google|        893|
|Instagram|        895|
|    Naver|        882|
|  Organic|        895|
|  Youtube|        889|
+---------+-----------+



In [29]:
# MAU via Spark SQL: the first 7 characters of ts are used as the month key,
# distinct userid values are counted per month, and months are sorted newest first.
mau_df = spark.sql("""
SELECT
    LEFT(A.ts, 7) AS month,
    COUNT(DISTINCT B.userid) AS mau
FROM session_timestamp A
JOIN user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1
ORDER BY 1 DESC""")
# collect() returns the result as a list of Row objects.
mau_df.collect()

[Row(month='2019-11', mau=721),
 Row(month='2019-10', mau=763),
 Row(month='2019-09', mau=639),
 Row(month='2019-08', mau=662),
 Row(month='2019-07', mau=623),
 Row(month='2019-06', mau=459),
 Row(month='2019-05', mau=281)]