# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
  
# Reuse the active SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext; later cells use it to read catalog tables.
glueContext = GlueContext(sc)
# SparkSession handle; later cells use it to run spark.sql(...) queries.
spark = glueContext.spark_session
# Glue Job object bound to this context (not referenced by the other visible cells).
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.0 
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 3.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 5
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::117760591469:role/AWSGlueServiceRoleDefault
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: c65bb277-f296-4551-ad3a-cc5b7bdec319
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.0
--enable-glue-datacatalog true
Waiting for session c65bb

In [64]:
# Define parameters
# Timestamp suffix so that every run writes to its own S3 prefix.
# Only digits and '-' are used: ':' is on S3's list of key characters that may
# need special handling and is a common source of trouble for Hadoop-style
# path tooling, so it is kept out of the output prefix.
now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
bucket = 'jrde-upload-from-glue'
folder = 'query_results-' + now

# Glue Data Catalog database and the table names read from it
db = 'from_rds'
aisles_table = 'ecommerce_public_aisles'
departments_table = 'ecommerce_public_departments'
orders_table = 'ecommerce_public_orders'
products_table = 'ecommerce_public_products'
order_product_table = 'ecommerce_public_order_product'




In [6]:
# Load every raw catalog table as a Spark DataFrame
def _catalog_table_as_df(table_name):
    """Read `table_name` from the `db` catalog database and return it as a Spark DataFrame."""
    dynamic_frame = glueContext.create_dynamic_frame.from_catalog(
        database=db,
        table_name=table_name,
    )
    return dynamic_frame.toDF()


aisles_df = _catalog_table_as_df(aisles_table)
departments_df = _catalog_table_as_df(departments_table)
orders_df = _catalog_table_as_df(orders_table)
products_df = _catalog_table_as_df(products_table)
order_product_df = _catalog_table_as_df(order_product_table)




In [22]:
# Register the temp views that the SQL queries read from.
# 'order_products_prior': orders whose eval_set is 'prior', left-joined to
# their order/product rows on order_id.
prior_orders = orders_df.where(orders_df.eval_set == 'prior')
joined_order_product = prior_orders.join(order_product_df, on='order_id', how='left')
joined_order_product.createOrReplaceTempView('order_products_prior')

# 'orders_view': the unfiltered orders table.
orders_df.createOrReplaceTempView('orders_view')




In [23]:
# Query 1: per-user order statistics computed from orders_view
# (highest order_number, plus sum and average of day_since_prior_order).
query1_sql = (
    'select user_id, \n'
    'max(order_number) max_order_number, \n'
    'sum(day_since_prior_order) sum_day_since_prior_order, \n'
    'avg(day_since_prior_order) avg_day_since_prior_order \n'
    'from orders_view \n'
    'group by user_id;'
)
table1 = spark.sql(query1_sql)
# table1.show()  # uncomment to preview the result

+-------+----------------+-------------------------+-------------------------+
|user_id|max_order_number|sum_day_since_prior_order|avg_day_since_prior_order|
+-------+----------------+-------------------------+-------------------------+
| 188592|              45|                    349.0|        7.931818181818182|
| 203309|               4|                     54.0|                     18.0|
|  41061|              55|                    331.0|         6.12962962962963|
| 202186|              18|                    102.0|                      6.0|
| 157678|               9|                     51.0|                    6.375|
| 163889|              30|                    265.0|        9.137931034482758|
| 118728|              74|                    359.0|        4.917808219178082|
| 181991|              51|                    364.0|                     7.28|
| 109349|              62|                    342.0|         5.60655737704918|
|  81279|              14|                    152.0|

In [35]:
# Query 2: per-user product counts and reorder ratio from order_products_prior
#   total_products_groupby_userid - number of non-NULL product_id rows for the user
#   distinct_products             - number of distinct products for the user
#   user_reorder_ratio            - reordered rows / rows with order_number > 1
# COUNT() counts every non-NULL value, so the CASE must yield NULL (no ELSE
# branch) for rows with order_number <= 1; an `else 0` branch would make
# every row count and turn the denominator into the plain row count.
table2 = spark.sql('select user_id, \n'
                       'count(product_id) total_products_groupby_userid, \n'
                       'count(distinct(product_id)) distinct_products, \n'
                       'sum(reordered) / count(case when order_number > 1 then 1 end) user_reorder_ratio \n'
                'from order_products_prior \n'
                'group by user_id;')
# table2.show()  # uncomment to preview the result




In [27]:
# Query 3: per (user_id, product_id) aggregates from order_products_prior
# NOTE(review): the column aliased total_products_groupby_userid_productid is
# computed as sum(order_number), not a row count - confirm this is intended.
query3_sql = (
    'select user_id, \n'
    'product_id, \n'
    'sum(order_number) total_products_groupby_userid_productid, \n'
    'min(order_number) min_ordernum_groupby_userid_productid, \n'
    'max(order_number) max_ordernum_groupby_userid_productid, \n'
    'avg(add_to_cart_order) avg_addtocartorder_groupby_userid_productid \n'
    'from order_products_prior \n'
    'group by user_id, product_id;'
)
table3 = spark.sql(query3_sql)
# table3.show()  # uncomment to preview the result

+-------+----------+---------------------------------------+-------------------------------------+-------------------------------------+-------------------------------------------+
|user_id|product_id|total_products_groupby_userid_productid|min_ordernum_groupby_userid_productid|max_ordernum_groupby_userid_productid|avg_addtocartorder_groupby_userid_productid|
+-------+----------+---------------------------------------+-------------------------------------+-------------------------------------+-------------------------------------------+
| 156122|     12962|                                    161|                                   23|                                   42|                                       17.2|
| 135442|      1529|                                      9|                                    4|                                    5|                                        8.0|
| 135442|      3464|                                    127|                                   

In [39]:
# Query 4: per-product aggregates from order_products_prior
# The CTE ranks each row within its (user_id, product_id) group by
# order_number; the outer query then counts, per product, the rows with
# rank 1 and rank 2, alongside the total row count and the sum of `reordered`.
query4_sql = (
    'with temp as ( \n'
    'select user_id, \n'
    '       order_number, \n'
    '       product_id, \n'
    '       reordered, \n'
    '       rank() over(partition by user_id, product_id order by order_number) product_seq_time \n'
    'from order_products_prior \n'
    ') \n'
    'select product_id, \n'
    'count(product_id) total_products_groupby_productid, \n'
    'sum(reordered) total_reordered, \n'
    'sum(case when product_seq_time = 1 then 1 else 0 end) product_seq_time_1, \n'
    'sum(case when product_seq_time = 2 then 1 else 0 end) product_seq_time_2 \n'
    'from temp \n'
    'group by product_id;'
)
table4 = spark.sql(query4_sql)
# table4.show()  # uncomment to preview the result




In [40]:
# Assemble the final table with successive left joins, starting from table1:
#   table2 and table3 are attached on user_id, then table4 on product_id.
derived_table = (
    table1
    .join(table2, on='user_id', how='left')
    .join(table3, on='user_id', how='left')
    .join(table4, on='product_id', how='left')
)
# derived_table.show()  # uncomment to preview the result




In [51]:
# Write the derived table to S3 as CSV with a header row. The data is first
# coalesced to one partition, and anything already at the target prefix is overwritten.
output_path = f's3://{bucket}/{folder}/'
csv_writer = derived_table.coalesce(1).write.option('header', 'true').mode('overwrite')
csv_writer.csv(output_path)


