In [None]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a local Spark session named
# "Python Spark Grouping" on master local[2], with driver and executor
# memory both configured to 2g.
spark = (
    SparkSession.builder
    .appName("Python Spark Grouping")
    .master("local[2]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [2]:
# Snowflake connection settings
from dotenv import load_dotenv
import os

# Location of the .env file that holds the Snowflake credentials. The original
# machine-specific path stays the default so existing runs behave the same;
# set SNOWFLAKE_ENV_FILE to point at a different file on another machine.
env_file = os.environ.get(
    "SNOWFLAKE_ENV_FILE", "C:/Users/Yeojun/airflow/airflow-dag/.env"
)
load_dotenv(env_file)

user = os.environ.get("SNOWFLAKE_USER")
password = os.environ.get("SNOWFLAKE_PASS")
account = os.environ.get("SNOWFLAKE_ACCOUNT")

# Fail fast with a clear message. Without this check a missing variable would
# be formatted into the URL as the text "None" and only surface later as an
# opaque JDBC connection error.
missing_settings = [
    name
    for name, value in (
        ("SNOWFLAKE_USER", user),
        ("SNOWFLAKE_PASS", password),
        ("SNOWFLAKE_ACCOUNT", account),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing Snowflake settings: "
        + ", ".join(missing_settings)
        + f" (checked the process environment and {env_file})"
    )

# JDBC URL for the account; the warehouse is fixed to COMPUTE_WH.
jdbc_url = f"jdbc:snowflake://{account}.snowflakecomputing.com/?warehouse=COMPUTE_WH"

In [3]:
# Fully qualified class name of the Snowflake JDBC driver used for every read.
SNOWFLAKE_JDBC_DRIVER = "net.snowflake.client.jdbc.SnowflakeDriver"


def read_snowflake_table(table_name):
    """Load one Snowflake table into a Spark DataFrame over JDBC.

    Uses the notebook-level ``spark`` session together with the ``jdbc_url``,
    ``user`` and ``password`` values defined in the connection-settings cell.

    Args:
        table_name: Table identifier passed as the JDBC ``dbtable`` option,
            e.g. ``"dev.raw_data.session_timestamp"``.

    Returns:
        The DataFrame produced by ``spark.read...load()`` for that table.
    """
    return (
        spark.read
        .format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", table_name)
        .option("user", user)
        .option("password", password)
        .option("driver", SNOWFLAKE_JDBC_DRIVER)
        .load()
    )


# The three raw tables used by the analysis below; all are read the same way.
df_user_session_channel = read_snowflake_table("dev.raw_data.user_session_channel")
df_session_timestamp = read_snowflake_table("dev.raw_data.session_timestamp")
df_session_transaction = read_snowflake_table("dev.raw_data.session_transaction")

In [4]:
# Register each loaded DataFrame as a temp view so the Spark SQL cells below
# can refer to it by name.
for view_name, source_df in (
    ("user_session_channel", df_user_session_channel),
    ("session_timestamp", df_session_timestamp),
    ("session_transaction", df_session_transaction),
):
    source_df.createOrReplaceTempView(view_name)

In [5]:
# Preview the first 5 rows of the session_timestamp temp view.
session_timestamp_view = spark.table("session_timestamp")
session_timestamp_view.show(5)

+--------------------+--------------------+
|           SESSIONID|                  TS|
+--------------------+--------------------+
|7cdace91c487558e2...|2019-05-01 00:13:...|
|94f192dee566b018e...|2019-05-01 00:49:...|
|7ed2d3454c5eea711...|2019-05-01 10:18:...|
|f1daf122cde863010...|2019-05-01 13:10:...|
|fd0efcca272f704a7...|2019-05-01 13:45:...|
+--------------------+--------------------+
only showing top 5 rows



In [9]:
# Preview the first 5 rows of the user/session/channel DataFrame.
df_user_session_channel.show(n=5)

+------+--------------------+--------+
|USERID|           SESSIONID| CHANNEL|
+------+--------------------+--------+
|   184|c41dd99a69df04044...|   Naver|
|    80|fdc0eb412a84fa549...| Organic|
|   251|0a54b19a13b6712dc...|Facebook|
|   264|a914ecef9c12ffdb9...|  Google|
|   744|05ae14d7ae387b933...|Facebook|
+------+--------------------+--------+
only showing top 5 rows



### JOIN key가 정말 하나씩만 존재하는가?

In [None]:
# Uniqueness check for the join key: show the sessionid with the highest row
# count in user_session_channel. A count of 1 means no sessionid is duplicated.
usc_duplicate_check_sql = """SELECT sessionid, COUNT(1) count
FROM user_session_channel
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1"""
spark.sql(usc_duplicate_check_sql).show()

+--------------------+-----+
|           sessionid|count|
+--------------------+-----+
|2e907f44e0a961631...|    1|
+--------------------+-----+



In [12]:
# Uniqueness check for the join key: show the sessionid with the highest row
# count in session_timestamp. A count of 1 means no sessionid is duplicated.
timestamp_duplicate_check_sql = """SELECT sessionid, COUNT(1) count
FROM session_timestamp
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1"""
spark.sql(timestamp_duplicate_check_sql).show()

+--------------------+-----+
|           sessionid|count|
+--------------------+-----+
|32bbf7b2bc4ed14eb...|    1|
+--------------------+-----+



In [11]:
# Uniqueness check for the join key: show the sessionid with the highest row
# count in session_transaction. A count of 1 means no sessionid is duplicated.
transaction_duplicate_check_sql = """SELECT sessionid, COUNT(1) count
FROM session_transaction
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1"""
spark.sql(transaction_duplicate_check_sql).show()

+--------------------+-----+
|           sessionid|count|
+--------------------+-----+
|532ff98823e7d1433...|    1|
+--------------------+-----+



### 월별 채널별 총 방문자 계산

In [13]:
# Distinct visitors per (year-month, channel).
# LEFT(sti.ts, 7) keeps the first 7 characters of the timestamp, i.e. "YYYY-MM".
# The LEFT JOIN keeps sessions that have no row in session_timestamp.
monthly_visitors_sql = """
    SELECT 
        LEFT(sti.ts, 7) year_month,
        usc.channel channel,
        COUNT(DISTINCT userid) total_visitors
    FROM user_session_channel usc
    LEFT JOIN session_timestamp sti ON usc.sessionid = sti.sessionid
    GROUP BY 1 ,2
    ORDER BY 1, 2"""

mon_channel_rev_df = spark.sql(monthly_visitors_sql)
mon_channel_rev_df.show()


+----------+---------+--------------+
|year_month|  channel|total_visitors|
+----------+---------+--------------+
|   2019-05| Facebook|           247|
|   2019-05|   Google|           253|
|   2019-05|Instagram|           234|
|   2019-05|    Naver|           237|
|   2019-05|  Organic|           238|
|   2019-05|  Youtube|           244|
|   2019-06| Facebook|           414|
|   2019-06|   Google|           412|
|   2019-06|Instagram|           410|
|   2019-06|    Naver|           398|
|   2019-06|  Organic|           416|
|   2019-06|  Youtube|           400|
|   2019-07| Facebook|           558|
|   2019-07|   Google|           556|
|   2019-07|Instagram|           567|
|   2019-07|    Naver|           553|
|   2019-07|  Organic|           557|
|   2019-07|  Youtube|           564|
|   2019-08| Facebook|           611|
|   2019-08|   Google|           610|
+----------+---------+--------------+
only showing top 20 rows



### 월별 채널별 총 방문자와 구매 방문자 계산

In [14]:
# Per (year-month, channel): distinct visitors, plus distinct visitors that
# have a non-NULL amount after the LEFT JOIN to session_transaction.
# Sessions without a transaction row get a NULL amount and are therefore
# excluded from paid_visitors.
monthly_paid_visitors_sql = """
    SELECT 
        LEFT(sti.ts, 7) year_month,
        usc.channel channel,
        COUNT(DISTINCT userid) total_visitors,
        COUNT(DISTINCT CASE WHEN amount is not NULL THEN userid END) paid_visitors
    FROM user_session_channel usc
    LEFT JOIN session_timestamp sti ON usc.sessionid = sti.sessionid
    LEFT JOIN session_transaction str ON usc.sessionid = str.sessionid
    GROUP BY 1 ,2
    ORDER BY 1, 2"""

mon_channel_rev_df = spark.sql(monthly_paid_visitors_sql)
mon_channel_rev_df.show()


+----------+---------+--------------+-------------+
|year_month|  channel|total_visitors|paid_visitors|
+----------+---------+--------------+-------------+
|   2019-05| Facebook|           247|           14|
|   2019-05|   Google|           253|           10|
|   2019-05|Instagram|           234|           11|
|   2019-05|    Naver|           237|           11|
|   2019-05|  Organic|           238|           17|
|   2019-05|  Youtube|           244|           10|
|   2019-06| Facebook|           414|           22|
|   2019-06|   Google|           412|           13|
|   2019-06|Instagram|           410|           21|
|   2019-06|    Naver|           398|           15|
|   2019-06|  Organic|           416|           14|
|   2019-06|  Youtube|           400|           17|
|   2019-07| Facebook|           558|           32|
|   2019-07|   Google|           556|           21|
|   2019-07|Instagram|           567|           25|
|   2019-07|    Naver|           553|           19|
|   2019-07|

### 월별 채널별 총 매출액 (리펀드 포함), 총 방문자, 매출 발생 방문자, 전환율 계산

In [15]:
# Monthly, per-channel revenue summary. Output columns:
#   month          - first 7 characters of the session timestamp ("YYYY-MM")
#   channel        - acquisition channel of the session
#   uniqueUsers    - distinct users with a session in that month/channel
#   paidUsers      - distinct users with a transaction whose amount >= 0
#   grossRevenue   - sum of all transaction amounts (refunded ones included)
#   netRevenue     - sum of amounts for transactions not flagged as refunded
#   conversionRate - paidUsers as a percentage of uniqueUsers, rounded to 2 places
#
# Every column is qualified with its table alias so the three-way join stays
# unambiguous if a same-named column is ever added to another table.
# The statement has no trailing ';': spark.sql() takes a single statement and
# parsers that do not accept a terminator reject it.
# NOTE(review): `amount >= 0` also counts zero-amount transactions as paid;
# confirm whether `> 0` is the intended definition of a paying user.
mon_channel_rev_df = spark.sql("""
SELECT LEFT(t.ts, 7) month,
    usc.channel,
    COUNT(DISTINCT usc.userid) uniqueUsers,
    COUNT(DISTINCT (CASE WHEN st.amount >= 0 THEN usc.userid END)) paidUsers,
    SUM(st.amount) grossRevenue,
    SUM(CASE WHEN st.refunded is not True THEN st.amount END) netRevenue,
    ROUND(COUNT(DISTINCT CASE WHEN st.amount >= 0 THEN usc.userid END)*100
        / COUNT(DISTINCT usc.userid), 2) conversionRate
FROM user_session_channel usc
LEFT JOIN session_timestamp t ON t.sessionid = usc.sessionid
LEFT JOIN session_transaction st ON st.sessionid = usc.sessionid
GROUP BY 1, 2
ORDER BY 1, 2
""")

mon_channel_rev_df.show()


+-------+---------+-----------+---------+------------+----------+--------------+
|  month|  channel|uniqueUsers|paidUsers|grossRevenue|netRevenue|conversionRate|
+-------+---------+-----------+---------+------------+----------+--------------+
|2019-05| Facebook|        247|       14|        1199|       997|          5.67|
|2019-05|   Google|        253|       10|         580|       580|          3.95|
|2019-05|Instagram|        234|       11|         959|       770|           4.7|
|2019-05|    Naver|        237|       11|         867|       844|          4.64|
|2019-05|  Organic|        238|       17|        1846|      1571|          7.14|
|2019-05|  Youtube|        244|       10|         529|       529|           4.1|
|2019-06| Facebook|        414|       22|        1578|      1578|          5.31|
|2019-06|   Google|        412|       13|         947|       947|          3.16|
|2019-06|Instagram|        410|       21|        1462|      1418|          5.12|
|2019-06|    Naver|        3