In [None]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) the Spark session for this notebook.
# It runs in local mode with master "local[2]" and sets the driver and
# executor memory options to 2g each.
spark = (
    SparkSession.builder
    .appName("Python Spark Ranking")
    .master("local[2]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [10]:
# Snowflake connection settings
from dotenv import load_dotenv
import os

# Credentials are kept out of the notebook: they are loaded from a local .env
# file into the process environment and read back from there.
load_dotenv("C:/Users/Yeojun/airflow/airflow-dag/.env")

user = os.environ.get("SNOWFLAKE_USER")
password = os.environ.get("SNOWFLAKE_PASS")
account = os.environ.get("SNOWFLAKE_ACCOUNT")

# Fail fast with a clear message. Otherwise an unset variable becomes None,
# the URL below turns into "jdbc:snowflake://None.snowflakecomputing.com/...",
# and the failure only surfaces later as an obscure JDBC connection error.
_missing_vars = [
    var_name
    for var_name, var_value in (
        ("SNOWFLAKE_USER", user),
        ("SNOWFLAKE_PASS", password),
        ("SNOWFLAKE_ACCOUNT", account),
    )
    if not var_value
]
if _missing_vars:
    raise RuntimeError(
        "Missing Snowflake settings: " + ", ".join(_missing_vars)
        + ". Check that the .env file exists and defines them."
    )

jdbc_url = f"jdbc:snowflake://{account}.snowflakecomputing.com/?warehouse=COMPUTE_WH"

In [None]:
SNOWFLAKE_JDBC_DRIVER = "net.snowflake.client.jdbc.SnowflakeDriver"


def read_snowflake_table(table_name):
    """Load one Snowflake table into a Spark DataFrame via the JDBC source.

    Uses the notebook-level ``spark`` session and the ``jdbc_url``, ``user``
    and ``password`` connection settings, looked up at call time.

    Args:
        table_name: Table identifier passed as the JDBC ``dbtable`` option,
            e.g. ``"dev.raw_data.session_timestamp"``.

    Returns:
        The DataFrame produced by ``spark.read ... .load()``.
    """
    return (
        spark.read
        .format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", table_name)
        .option("user", user)
        .option("password", password)
        .option("driver", SNOWFLAKE_JDBC_DRIVER)
        .load()
    )


# The three raw tables share identical connection options, so they all go
# through the same helper instead of repeating the reader chain.
df_user_session_channel = read_snowflake_table("dev.raw_data.user_session_channel")
df_session_timestamp = read_snowflake_table("dev.raw_data.session_timestamp")
df_session_transaction = read_snowflake_table("dev.raw_data.session_transaction")

In [15]:
# Register each loaded DataFrame as a temporary view so that it can be
# referenced by table name from spark.sql(...) queries.
for view_name, source_df in (
    ("user_session_channel", df_user_session_channel),
    ("session_timestamp", df_session_timestamp),
    ("session_transaction", df_session_transaction),
):
    source_df.createOrReplaceTempView(view_name)

### 총 매출이 가장 많은 사용자 10명 찾기

In [18]:
# Top 10 users by total transaction amount.
#   revenue     - sum of all transaction amounts for the user
#   net_revenue - sum of the amounts whose `refunded` flag is False
# `ELSE 0` keeps net_revenue at 0 (instead of NULL) for a user whose
# transactions were all refunded. The extra sort key on userid makes the
# LIMIT deterministic when several users have the same revenue.
top_rev_user_df = spark.sql("""
SELECT userid,
SUM(str.amount) revenue,
SUM(CASE WHEN str.refunded = False THEN str.amount ELSE 0 END) net_revenue
FROM user_session_channel usc
JOIN session_transaction str ON usc.sessionid = str.sessionid
GROUP BY 1
ORDER BY 2 DESC, 1
LIMIT 10""")

In [19]:
# Print the ranking. The query is limited to 10 rows, so the 20-row display
# cap (spelled out here; it is also the default) shows the full result.
top_rev_user_df.show(n=20, truncate=True)

+------+-------+-----------+
|userid|revenue|net_revenue|
+------+-------+-----------+
|   989|    743|        743|
|   772|    556|        556|
|  1615|    506|        506|
|   654|    488|        488|
|  1651|    463|        463|
|   973|    438|        438|
|   262|    422|        422|
|  1099|    421|        343|
|  2682|    414|        414|
|   891|    412|        412|
+------+-------+-----------+



In [20]:
# Same top-10 ranking, expressed with a window function: RANK() orders users
# by their summed transaction amount, highest first.
# RANK() assigns the same rank to users with equal totals, so sorting by rank
# alone would let LIMIT pick arbitrarily among ties; userid is added as a
# secondary sort key to make the selected rows and their order deterministic.
top_rev_user_df2 = spark.sql("""
SELECT 
    userid,
    SUM(st.amount) total_amount,
    RANK() OVER (ORDER BY SUM(st.amount) DESC) rank
FROM user_session_channel usc
JOIN session_transaction st ON usc.sessionid = st.sessionid
GROUP BY 1
ORDER BY rank, userid
LIMIT 10""")

In [21]:
# Print the window-function ranking. The query is limited to 10 rows, so the
# 20-row display cap (spelled out here; it is also the default) shows it all.
top_rev_user_df2.show(n=20, truncate=True)

+------+------------+----+
|userid|total_amount|rank|
+------+------------+----+
|   989|         743|   1|
|   772|         556|   2|
|  1615|         506|   3|
|   654|         488|   4|
|  1651|         463|   5|
|   973|         438|   6|
|   262|         422|   7|
|  1099|         421|   8|
|  2682|         414|   9|
|   891|         412|  10|
+------+------------+----+

