# Module 3 - Data Integration and Aggregation.ipynb

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
.appName('OlistData') \
.getOrCreate()

25/05/19 03:35:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# DEFINING PATH

hdfs_path = "/data/olist/"

# Data loading 

customers_df = spark.read.csv(hdfs_path + "olist_customers_dataset.csv", header = True, inferSchema = True)
geolocation_df = spark.read.csv(hdfs_path + "olist_geolocation_dataset.csv", header = True, inferSchema = True)
order_items_df = spark.read.csv(hdfs_path + "olist_order_items_dataset.csv", header = True, inferSchema = True)
payments_df = spark.read.csv(hdfs_path + "olist_order_payments_dataset.csv", header = True, inferSchema = True)
reviews_df = spark.read.csv(hdfs_path + "olist_order_reviews_dataset.csv", header = True, inferSchema = True)
orders_df = spark.read.csv(hdfs_path + "olist_orders_dataset.csv", header = True, inferSchema = True)
products_df = spark.read.csv(hdfs_path + "olist_products_dataset.csv", header = True, inferSchema = True)
sellers_df = spark.read.csv(hdfs_path + "olist_sellers_dataset.csv", header = True, inferSchema = True)
category_translation_df = spark.read.csv(hdfs_path + "product_category_name_translation.csv", header = True, inferSchema = True)

                                                                                

In [3]:
customers_df.cache()
order_items_df.cache()
orders_df.cache()

DataFrame[order_id: string, customer_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp]

In [8]:
orders_items_joined_df = orders_df.join(order_items_df,'order_id','inner')

orders_items_products_joined_df = orders_items_joined_df.join(products_df,'product_id','inner')

orders_items_products_sellers_joined_df = orders_items_products_joined_df.join(sellers_df,'seller_id','inner')

full_orders_df = orders_items_products_sellers_joined_df.join(customers_df,'customer_id','inner')

In [10]:
# Geoloction Data

full_orders_df = full_orders_df \
    .join(geolocation_df, customers_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix , 'left')

In [11]:
full_orders_df = full_orders_df.join(reviews_df, 'order_id', 'left')

In [12]:
full_orders_df = full_orders_df.join(payments_df, 'order_id', 'left')

In [13]:
# full_orders_df conatains all the significant data from all the datasets which can be used for the orders analysis
    
full_orders_df.cache()

DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

# Total Revenues per Seller

In [15]:
full_orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [16]:
from pyspark.sql.functions import *

In [27]:
seller_revenue_df = full_orders_df.groupBy("seller_id").agg(sum(col("price") + col("freight_value")).alias("total_revenue"))

seller_revenue_df.show()



+--------------------+--------------------+
|           seller_id|       total_revenue|
+--------------------+--------------------+
|7a67c85e85bb2ce85...| 2.317876284999997E7|
|9d213f303afae4983...|   5510.160000000009|
|d2374cbcbb3ca4ab1...|   4949060.059999982|
|1835b56ce799e6a4d...|   7631999.199999983|
|d650b663c3b5f6fb3...|          2358343.84|
|aded58c8142dedc54...|   84221.19000000044|
|cca3071e3e9bb7d12...|1.2451246459999999E7|
|2b3e4a2a3ea8e0193...|           2202727.3|
|dc4a0fc896dc34b0d...|   5110040.860000005|
|92eb0f42c21942b65...|  1574163.7700000051|
|f3b80352b986ab4d1...|  1735357.1700000013|
|e9779976487b77c6d...|   7744218.899999996|
|2a5b78b41cd05baea...|  4649139.3000000045|
|d94a40fd42351c259...|   2650991.170000001|
|f63d8f293af3a6454...|   382841.4100000047|
|d71d863e5ef30d94e...|  1710338.4099999983|
|5f2684dab12e59f83...|   1393462.500000002|
|28f10b1c5e5abb9d4...|  102040.75000000009|
|4e922959ae960d389...|   7964395.799999994|
|31561f325664a8a7a...|          

                                                                                

# Total Orders per customer 

In [28]:
total_order_df = full_orders_df.groupBy('customer_id').agg(countDistinct('order_id').alias('total_orders'))

total_order_df.show()



+--------------------+------------+
|         customer_id|total_orders|
+--------------------+------------+
|8f8b7a582620f3717...|           1|
|d84f2788580b9d5e4...|           1|
|35c922a335b8a9141...|           1|
|333d057b8000f250b...|           1|
|9a5d3906304996948...|           1|
|d9f57c5a009cd22a4...|           1|
|8f4758b55faa41da5...|           1|
|2d8ac47c29ec811ab...|           1|
|9e805b75d4ed5080a...|           1|
|40e7a663ffaa7aa2f...|           1|
|136f8c475ac7abd11...|           1|
|7b1d4d024295cc2b1...|           1|
|3a832bedadec4eeb6...|           1|
|52f7baf30ea546558...|           1|
|ac9b518157bd24e32...|           1|
|2aec499f94f5e8278...|           1|
|ca85f1de72b224e6e...|           1|
|d927892e658a6af91...|           1|
|82c91e8e62d9d79bf...|           1|
|d0615859a639a94c1...|           1|
+--------------------+------------+
only showing top 20 rows



                                                                                

# Average Review Score 

In [None]:
# First converting review_score to 'INT'

avg_review_score = full_orders_df.withColumn("review_score_int", col("review_score").cast("int")) \
    .groupBy("seller_id") \
    .agg(avg("review_score_int").alias("avg_review_score"))

avg_review_score.show()



# Most Sold Products (Top 10)

In [39]:
# To find the most sold products by number of units

most_sold_product = full_orders_df.groupBy('product_id') \
    .agg(count('*').alias('total_items_sold')) \
    .orderBy(desc('total_items_sold')).limit(10)

most_sold_product.show()



+--------------------+----------------+
|          product_id|total_items_sold|
+--------------------+----------------+
|aca2eb7d00ea1a7b8...|           86740|
|422879e10f4668299...|           81110|
|99a4788cb24856965...|           78775|
|389d119b48cf3043d...|           60248|
|d1c427060a0f73f6b...|           59274|
|368c6c730842d7801...|           58358|
|53759a2ecddad2bb8...|           52654|
|53b36df67ebb7c415...|           52105|
|154e7e31ebfa09220...|           42700|
|3dd2a17168ec895c7...|           40787|
+--------------------+----------------+



                                                                                

# Top Customers by Spending

In [41]:
# There are 2 'customer_id' & 'customer_unique_id'
# I'm calculating on 'customer_id'

top_customers_by_spending = full_orders_df.groupBy("customer_id") \
    .agg(sum("payment_value").alias("total_spent")) \
    .orderBy(desc("total_spent")).limit(10)

top_customers_by_spending.show()



+--------------------+--------------------+
|         customer_id|         total_spent|
+--------------------+--------------------+
|1ff773612ab8934db...| 1.756825199999893E7|
|05455dfa7cd02f13d...|1.3282083359999327E7|
|ec5b2ba62e5743423...|1.0388528640000112E7|
|0c792d32a3251b4f6...|   8254681.600000529|
|78fc46047c4a639e8...|   7488519.999999339|
|1617b1357756262bf...|   7433259.520000033|
|1dbc055ccab23ed89...|   7216273.400000708|
|d5f2b3f597c7ccafb...|   6800018.119998923|
|dd3f1762eb601f41c...|  6746388.4800006235|
|10de381f8a8d23fff...|   5184499.500000076|
+--------------------+--------------------+



                                                                                

In [None]:
spark.stop()