In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
print(pyspark.__version__)

3.5.5


In [3]:
spark = SparkSession.builder \
    .appName("ecom_joins") \
    .master("local[*]") \
    .getOrCreate()

25/05/02 08:58:42 WARN Utils: Your hostname, Ameys-Mac-mini.local resolves to a loopback address: 127.0.0.1; using 192.168.1.7 instead (on interface en1)
25/05/02 08:58:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/02 08:58:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
customers_df = spark.read.csv("customers.csv", header=True, inferSchema=True)
orders_df = spark.read.csv("orders.csv", header=True, inferSchema=True)
products_df = spark.read.csv("products.csv", header=True, inferSchema=True)

## Data Exploration

### Print the schemas and show the first 5 records from each DataFrame.

In [5]:
customers_df.show(5)

+-----------+---------------+--------------------+--------------------+
|customer_id|  customer_name|               email|             country|
+-----------+---------------+--------------------+--------------------+
|          1|Kenneth Schmidt|elijahjensen@hotm...|               Gabon|
|          2|Sandy Hernandez|  billy63@peters.biz|Bosnia and Herzeg...|
|          3|  Emily Rosales|medinacarrie@gmai...|Lao People's Demo...|
|          4|  Belinda Banks|joshuavalenzuela@...|             Morocco|
|          5|      Guy Gibbs|edwardssamantha@g...|Heard Island and ...|
+-----------+---------------+--------------------+--------------------+
only showing top 5 rows



In [6]:
orders_df.show(5)

+--------+-----------+----------+--------+----------+
|order_id|customer_id|product_id|quantity|order_date|
+--------+-----------+----------+--------+----------+
|       1|         23|         2|       1|2024-05-26|
|       2|          5|         3|       5|2024-07-05|
|       3|         27|         9|       5|2024-11-19|
|       4|         10|         5|       3|2024-06-07|
|       5|         10|         3|       4|2025-02-16|
+--------+-----------+----------+--------+----------+
only showing top 5 rows



In [49]:
products_df.show()

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         1|       Mouse| 1973|
|         2|     Monitor|  243|
|         3|  Headphones| 1646|
|         4|       Mouse| 1979|
|         5|      Laptop|  304|
|         6|      Camera|  570|
|         7|       Mouse|  283|
|         8|      Tablet|  340|
|         9|     Monitor|  931|
|        10|       Mouse| 1476|
+----------+------------+-----+



In [8]:
customers_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- country: string (nullable = true)



In [9]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- order_date: date (nullable = true)



In [10]:
products_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: integer (nullable = true)



### How many unique customers have placed at least one order?

In [11]:
customers_df.createOrReplaceTempView("customers")
orders_df.createOrReplaceTempView("orders")
products_df.createOrReplaceTempView("products")

In [12]:
spark.sql(
    """
    SELECT
        count(distinct customer_id) as unique_customers
    FROM orders
    """
).show()

+----------------+
|unique_customers|
+----------------+
|              28|
+----------------+



In [13]:
unique_users = orders_df.select("customer_id").distinct().count()
print(unique_users)

28


In [14]:
orders_df.select(countDistinct("customer_id").alias("unique_customers")).show()

+----------------+
|unique_customers|
+----------------+
|              28|
+----------------+



### Find how many orders each customer has placed.

In [15]:
spark.sql(
    """
    SELECT
        customer_id,
        count(distinct order_id) as order_count
    FROM orders
    GROUP BY 1
    """
).show()

+-----------+-----------+
|customer_id|order_count|
+-----------+-----------+
|         28|          1|
|         27|          2|
|         26|          1|
|         12|          3|
|         22|          1|
|          1|          2|
|         13|          1|
|         16|          3|
|          6|          2|
|          3|          2|
|         20|          1|
|          5|          3|
|         19|          3|
|         17|          1|
|          9|          3|
|          4|          2|
|          8|          2|
|         23|          4|
|          7|          7|
|         10|          3|
+-----------+-----------+
only showing top 20 rows



In [16]:
orders_df.select("customer_id", "order_id") \
    .distinct() \
    .groupBy("customer_id") \
    .agg((count("order_id")).alias("order_count")) \
    .distinct() \
    .show()

+-----------+-----------+
|customer_id|order_count|
+-----------+-----------+
|         28|          1|
|         27|          2|
|         26|          1|
|         12|          3|
|         22|          1|
|          1|          2|
|         13|          1|
|         16|          3|
|          6|          2|
|          3|          2|
|         20|          1|
|          5|          3|
|         19|          3|
|         17|          1|
|          9|          3|
|          4|          2|
|          8|          2|
|         23|          4|
|          7|          7|
|         10|          3|
+-----------+-----------+
only showing top 20 rows



### List all customers who haven't placed any orders.

In [17]:
spark.sql(
    """
    SELECT
        c.customer_id,
        c.customer_name
    FROM customers c
        LEFT JOIN orders o
        USING (customer_id)
    WHERE o.order_id IS NULL
    """
).show()

+-----------+----------------+
|customer_id|   customer_name|
+-----------+----------------+
|         11|   Ashley Cannon|
|         15|Dr. Craig Mathis|
+-----------+----------------+



In [18]:
customers_df.join(
    orders_df,
    on=["customer_id"],
    how="left"
).select(
    "customer_id",
    "customer_name"
).filter(
    orders_df.order_id.isNull()
).show()

+-----------+----------------+
|customer_id|   customer_name|
+-----------+----------------+
|         11|   Ashley Cannon|
|         15|Dr. Craig Mathis|
+-----------+----------------+



## Joins Focus

### Perform an inner join between orders and customers. Show order_id, customer_name, and order_date.

In [19]:
spark.sql(
    """
    SELECT
        o.order_id,
        c.customer_name,
        o.order_date
    FROM orders o
    INNER JOIN customers c
    USING(customer_id)
    LIMIT 5
    """
).show()

+--------+----------------+----------+
|order_id|   customer_name|order_date|
+--------+----------------+----------+
|       1|  Erica Martinez|2024-05-26|
|       2|       Guy Gibbs|2024-07-05|
|       3| Robert Mitchell|2024-11-19|
|       4|Samuel Gillespie|2024-06-07|
|       5|Samuel Gillespie|2025-02-16|
+--------+----------------+----------+



In [20]:
orders_df.join(
    customers_df,
    on=["customer_id"],
    how="inner"
).select(
    "order_id",
    "customer_name",
    "order_date"
).show(5)

+--------+----------------+----------+
|order_id|   customer_name|order_date|
+--------+----------------+----------+
|       1|  Erica Martinez|2024-05-26|
|       2|       Guy Gibbs|2024-07-05|
|       3| Robert Mitchell|2024-11-19|
|       4|Samuel Gillespie|2024-06-07|
|       5|Samuel Gillespie|2025-02-16|
+--------+----------------+----------+
only showing top 5 rows



### Perform a left join between customers and orders to find customers without orders.

In [21]:
spark.sql(
    """
    SELECT
        c.customer_id,
        c.customer_name,
        o.order_id
    FROM customers c
    LEFT JOIN orders o
    USING(customer_id)
    WHERE o.order_id IS NULL
    """
).show()

+-----------+----------------+--------+
|customer_id|   customer_name|order_id|
+-----------+----------------+--------+
|         11|   Ashley Cannon|    NULL|
|         15|Dr. Craig Mathis|    NULL|
+-----------+----------------+--------+



In [26]:
customers_df.join(
    orders_df,
    on=["customer_id"],
    how="left"
).select(
    "customer_id",
    "customer_name",
    "order_id"
).filter(
    orders_df.order_id.isNull()
).show()

+-----------+----------------+--------+
|customer_id|   customer_name|order_id|
+-----------+----------------+--------+
|         11|   Ashley Cannon|    NULL|
|         15|Dr. Craig Mathis|    NULL|
+-----------+----------------+--------+



### Join orders, customers, and products to create a full order summary showing:

- customer_name, product_name, quantity, order_date, price
- Expected Output: Full flattened table.

In [27]:
spark.sql(
    """
    SELECT
        c.customer_name,
        p.product_name,
        o.quantity,
        o.order_date,
        p.price
    FROM customers c
    INNER JOIN orders o
    USING(customer_id)
    INNER JOIN products p
    USING(product_id)
    """
).show(5)

+----------------+------------+--------+----------+-----+
|   customer_name|product_name|quantity|order_date|price|
+----------------+------------+--------+----------+-----+
|  Erica Martinez|     Monitor|       1|2024-05-26|  243|
|       Guy Gibbs|  Headphones|       5|2024-07-05| 1646|
| Robert Mitchell|     Monitor|       5|2024-11-19|  931|
|Samuel Gillespie|      Laptop|       3|2024-06-07|  304|
|Samuel Gillespie|  Headphones|       4|2025-02-16| 1646|
+----------------+------------+--------+----------+-----+
only showing top 5 rows



In [28]:
customers_df.join(
    orders_df,
    on=["customer_id"],
    how="inner"
).join(
    products_df,
    on=["product_id"],
    how="inner"
).select(
    "customer_name",
    "product_name",
    "quantity",
    "order_date",
    "price"
).show(5)

+----------------+------------+--------+----------+-----+
|   customer_name|product_name|quantity|order_date|price|
+----------------+------------+--------+----------+-----+
|  Erica Martinez|     Monitor|       1|2024-05-26|  243|
|       Guy Gibbs|  Headphones|       5|2024-07-05| 1646|
| Robert Mitchell|     Monitor|       5|2024-11-19|  931|
|Samuel Gillespie|      Laptop|       3|2024-06-07|  304|
|Samuel Gillespie|  Headphones|       4|2025-02-16| 1646|
+----------------+------------+--------+----------+-----+
only showing top 5 rows



### Find the total amount spent per order (quantity * price).

In [29]:
spark.sql(
    """
    SELECT
        o.order_id,
        o.quantity * p.price AS totoal_amount
    FROM orders o
    INNER JOIN products p
    USING(product_id)
    """
).show(5)

+--------+-------------+
|order_id|totoal_amount|
+--------+-------------+
|       1|          243|
|       2|         8230|
|       3|         4655|
|       4|          912|
|       5|         6584|
+--------+-------------+
only showing top 5 rows



In [30]:
orders_df.join(
    products_df,
    on=["product_id"],
    how="inner"
).select(
    "order_id",
    (orders_df.quantity * products_df.price).alias("total_amount")
).show(5)

+--------+------------+
|order_id|total_amount|
+--------+------------+
|       1|         243|
|       2|        8230|
|       3|        4655|
|       4|         912|
|       5|        6584|
+--------+------------+
only showing top 5 rows



### Find the top 5 customers who spent the most money overall.

In [33]:
spark.sql(
    """
    SELECT
        c.customer_name,
        o.quantity * p.price AS total_spent
    FROM customers c
    INNER JOIN orders o
    USING(customer_id)
    INNER JOIN products p
    USING(product_id)
    ORDER BY total_spent DESC
    LIMIT 5
    """
).show()

+-----------------+-----------+
|    customer_name|total_spent|
+-----------------+-----------+
|Kristine Anderson|       9895|
|     Jeffrey Reed|       9865|
|        Guy Gibbs|       9865|
|        Guy Gibbs|       8230|
|    Belinda Banks|       7916|
+-----------------+-----------+



In [36]:
customers_df.join(
    orders_df,
    on=["customer_id"],
    how="inner"
).join(
    products_df,
    on=["product_id"],
    how="inner"
).select(
    "customer_name",
    (orders_df.quantity * products_df.price).alias("total_spent")
).orderBy(
    "total_spent", ascending=False
).show(5)

+-----------------+-----------+
|    customer_name|total_spent|
+-----------------+-----------+
|Kristine Anderson|       9895|
|        Guy Gibbs|       9865|
|     Jeffrey Reed|       9865|
|        Guy Gibbs|       8230|
| Kaylee Fernandez|       7916|
+-----------------+-----------+
only showing top 5 rows



### List all orders for products costing more than $1000.

In [37]:
spark.sql(
    """
    SELECT
        o.order_id,
        p.product_name,
        p.price,
        c.customer_name
    FROM orders o
    INNER JOIN products p
    USING(product_id)
    INNER JOIN customers c
    USING(customer_id)
    WHERE p.price > 1000
    """
).show()

+--------+------------+-----+------------------+
|order_id|product_name|price|     customer_name|
+--------+------------+-----+------------------+
|       2|  Headphones| 1646|         Guy Gibbs|
|       5|  Headphones| 1646|  Samuel Gillespie|
|       7|       Mouse| 1476|        Brandi Cox|
|      12|       Mouse| 1973|        Karen Cole|
|      13|       Mouse| 1476|    Nicholas Riley|
|      15|       Mouse| 1476|      Jeffrey Reed|
|      17|       Mouse| 1979|  Kaylee Fernandez|
|      22|       Mouse| 1979| Kristine Anderson|
|      23|  Headphones| 1646|     Emily Rosales|
|      28|       Mouse| 1973|         Guy Gibbs|
|      32|  Headphones| 1646|    Joshua Elliott|
|      34|       Mouse| 1979|     David Simpson|
|      39|       Mouse| 1973|   Sandy Hernandez|
|      40|       Mouse| 1979|     Belinda Banks|
|      45|  Headphones| 1646|    Erica Martinez|
|      51|       Mouse| 1973|   Kenneth Schmidt|
|      52|       Mouse| 1476|        Karen Cole|
|      54|       Mou

In [38]:
orders_df.join(
    products_df,
    on=["product_id"],
    how="inner"
).join(
    customers_df,
    on=["customer_id"],
    how="inner"
).select(
    "order_id",
    "product_name",
    "price",
    "customer_name"
).filter(
    products_df.price > 1000
).show()

+--------+------------+-----+------------------+
|order_id|product_name|price|     customer_name|
+--------+------------+-----+------------------+
|       2|  Headphones| 1646|         Guy Gibbs|
|       5|  Headphones| 1646|  Samuel Gillespie|
|       7|       Mouse| 1476|        Brandi Cox|
|      12|       Mouse| 1973|        Karen Cole|
|      13|       Mouse| 1476|    Nicholas Riley|
|      15|       Mouse| 1476|      Jeffrey Reed|
|      17|       Mouse| 1979|  Kaylee Fernandez|
|      22|       Mouse| 1979| Kristine Anderson|
|      23|  Headphones| 1646|     Emily Rosales|
|      28|       Mouse| 1973|         Guy Gibbs|
|      32|  Headphones| 1646|    Joshua Elliott|
|      34|       Mouse| 1979|     David Simpson|
|      39|       Mouse| 1973|   Sandy Hernandez|
|      40|       Mouse| 1979|     Belinda Banks|
|      45|  Headphones| 1646|    Erica Martinez|
|      51|       Mouse| 1973|   Kenneth Schmidt|
|      52|       Mouse| 1476|        Karen Cole|
|      54|       Mou

### Perform a full outer join between customers and orders. What are the results?

In [39]:
spark.sql(
    """
    SELECT
        c.customer_name,
        o.*
    FROM customers c
    FULL OUTER JOIN orders o
    USING(customer_id)
    """
).show()

+----------------+-----------+--------+----------+--------+----------+
|   customer_name|customer_id|order_id|product_id|quantity|order_date|
+----------------+-----------+--------+----------+--------+----------+
| Kenneth Schmidt|          1|      51|         1|       2|2024-10-14|
| Kenneth Schmidt|          1|      58|         8|       3|2024-05-20|
| Sandy Hernandez|          2|      39|         1|       4|2024-09-27|
| Sandy Hernandez|          2|      53|         7|       5|2024-06-24|
|   Emily Rosales|          3|      23|         3|       2|2024-05-13|
|   Emily Rosales|          3|      46|         6|       3|2024-07-01|
|   Belinda Banks|          4|      24|         7|       5|2025-04-04|
|   Belinda Banks|          4|      40|         4|       4|2024-06-15|
|       Guy Gibbs|          5|       2|         3|       5|2024-07-05|
|       Guy Gibbs|          5|      28|         1|       5|2024-12-03|
|       Guy Gibbs|          5|      47|         9|       3|2024-10-13|
|   Da

In [41]:
customers_df.join(
    orders_df,
    on=["customer_id"],
    how="fullouter"
).show()

+-----------+----------------+--------------------+--------------------+--------+----------+--------+----------+
|customer_id|   customer_name|               email|             country|order_id|product_id|quantity|order_date|
+-----------+----------------+--------------------+--------------------+--------+----------+--------+----------+
|          1| Kenneth Schmidt|elijahjensen@hotm...|               Gabon|      51|         1|       2|2024-10-14|
|          1| Kenneth Schmidt|elijahjensen@hotm...|               Gabon|      58|         8|       3|2024-05-20|
|          2| Sandy Hernandez|  billy63@peters.biz|Bosnia and Herzeg...|      39|         1|       4|2024-09-27|
|          2| Sandy Hernandez|  billy63@peters.biz|Bosnia and Herzeg...|      53|         7|       5|2024-06-24|
|          3|   Emily Rosales|medinacarrie@gmai...|Lao People's Demo...|      23|         3|       2|2024-05-13|
|          3|   Emily Rosales|medinacarrie@gmai...|Lao People's Demo...|      46|         6|    

### Find the number of different products each customer has ordered.

In [45]:
spark.sql(
    """
    SELECT
        c.customer_name,
        count(o.order_id) as order_count
    FROM customers c
    LEFT JOIN orders o
    USING(customer_id)
    GROUP BY 1
    ORDER BY 2 DESC
    """
).show()

+------------------+-----------+
|     customer_name|order_count|
+------------------+-----------+
|  Gregory Lawrence|          7|
|    Erica Martinez|          4|
|Danielle Robertson|          3|
|      Jeffrey Reed|          3|
|         Guy Gibbs|          3|
|      Tammy Carter|          3|
|  Samuel Gillespie|          3|
|        Karen Cole|          3|
|    Scott Fletcher|          2|
| Kristine Anderson|          2|
|     David Simpson|          2|
|       Alicia Dunn|          2|
|   Kenneth Schmidt|          2|
|   Sandy Hernandez|          2|
|     Belinda Banks|          2|
|     Emily Rosales|          2|
|       Bryan Lopez|          2|
|   Robert Mitchell|          2|
|     Joshua Harris|          2|
|      Michael King|          1|
+------------------+-----------+
only showing top 20 rows



In [46]:
customers_df.join(
    orders_df,
    on=["customer_id"],
    how="left"
).groupBy(
    "customer_name"
).agg(
    count("order_id").alias("order_count")
).orderBy(
    "order_count", ascending=False
).show()

+------------------+-----------+
|     customer_name|order_count|
+------------------+-----------+
|  Gregory Lawrence|          7|
|    Erica Martinez|          4|
|Danielle Robertson|          3|
|      Jeffrey Reed|          3|
|         Guy Gibbs|          3|
|      Tammy Carter|          3|
|  Samuel Gillespie|          3|
|        Karen Cole|          3|
|    Scott Fletcher|          2|
| Kristine Anderson|          2|
|     David Simpson|          2|
|       Alicia Dunn|          2|
|   Kenneth Schmidt|          2|
|   Sandy Hernandez|          2|
|     Belinda Banks|          2|
|     Emily Rosales|          2|
|       Bryan Lopez|          2|
|   Robert Mitchell|          2|
|     Joshua Harris|          2|
|      Michael King|          1|
+------------------+-----------+
only showing top 20 rows



### Using SQL, calculate the total revenue generated for each product.

In [50]:
spark.sql(
    """
    SELECT
        p.product_id,
        p.product_name,
        SUM(o.quantity * p.price) AS total_amount
    FROM orders o
    INNER JOIN products p
    USING(product_id)
    GROUP BY 1, 2
    ORDER BY 3 DESC
    """
).show()

+----------+------------+------------+
|product_id|product_name|total_amount|
+----------+------------+------------+
|         1|       Mouse|       47352|
|         4|       Mouse|       31664|
|         3|  Headphones|       29628|
|        10|       Mouse|       19188|
|         6|      Camera|       12540|
|         9|     Monitor|       12103|
|         8|      Tablet|        7820|
|         5|      Laptop|        5776|
|         7|       Mouse|        5094|
|         2|     Monitor|        4860|
+----------+------------+------------+



### Which country has the highest number of customers who made orders?

In [51]:
spark.sql(
    """
    SELECT
        c.country,
        COUNT(customer_id) AS customer_count
    FROM customers c
    GROUP BY 1
    ORDER BY 2 DESC
    """
).show()

+--------------------+--------------+
|             country|customer_count|
+--------------------+--------------+
|Libyan Arab Jamah...|             2|
|Heard Island and ...|             1|
|Turks and Caicos ...|             1|
|British Virgin Is...|             1|
|             Reunion|             1|
|          San Marino|             1|
|               Ghana|             1|
|        Sierra Leone|             1|
|          Tajikistan|             1|
|               Chile|             1|
|               Gabon|             1|
|Central African R...|             1|
|       Liechtenstein|             1|
|             Morocco|             1|
|             Georgia|             1|
|   Wallis and Futuna|             1|
|             Mayotte|             1|
|            Ethiopia|             1|
|      Czech Republic|             1|
|               Kenya|             1|
+--------------------+--------------+
only showing top 20 rows



In [52]:
customers_df.groupBy(
    "country"
).agg(
    count("customer_id").alias("customer_count")
).orderBy(
    "customer_count", ascending=False
).show()

+--------------------+--------------+
|             country|customer_count|
+--------------------+--------------+
|Libyan Arab Jamah...|             2|
|Heard Island and ...|             1|
|Turks and Caicos ...|             1|
|British Virgin Is...|             1|
|             Reunion|             1|
|          San Marino|             1|
|               Ghana|             1|
|        Sierra Leone|             1|
|          Tajikistan|             1|
|               Chile|             1|
|               Gabon|             1|
|Central African R...|             1|
|       Liechtenstein|             1|
|             Morocco|             1|
|             Georgia|             1|
|   Wallis and Futuna|             1|
|             Mayotte|             1|
|            Ethiopia|             1|
|      Czech Republic|             1|
|               Kenya|             1|
+--------------------+--------------+
only showing top 20 rows



### Create a ranking of customers based on total amount spent using ROW_NUMBER() window function.

In [55]:
spark.sql(
    """
    SELECT
        row_number() over(order by p.price * o.quantity DESC) AS rank,
        c.customer_name,
        p.price * o.quantity AS total_spent
    FROM customers c
    INNER JOIN orders o
    USING(customer_id)
    INNER JOIN products p
    USING(product_id)
    ORDER BY 3 DESC
    """
).show()

+----+------------------+-----------+
|rank|     customer_name|total_spent|
+----+------------------+-----------+
|   1| Kristine Anderson|       9895|
|   2|         Guy Gibbs|       9865|
|   3|      Jeffrey Reed|       9865|
|   4|         Guy Gibbs|       8230|
|   5|  Kaylee Fernandez|       7916|
|   6|     Belinda Banks|       7916|
|   7|        Karen Cole|       7892|
|   8|   Sandy Hernandez|       7892|
|   9| Kristine Anderson|       7892|
|  10|      Jeffrey Reed|       7380|
|  11|  Samuel Gillespie|       6584|
|  12|Danielle Robertson|       6584|
|  13|     David Simpson|       5937|
|  14|        Karen Cole|       5904|
|  15|   Robert Mitchell|       4655|
|  16|    Nicholas Riley|       4428|
|  17|   Kenneth Schmidt|       3946|
|  18|     Emily Rosales|       3292|
|  19|    Joshua Elliott|       3292|
|  20|  Gregory Lawrence|       2793|
+----+------------------+-----------+
only showing top 20 rows



25/05/02 09:54:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/02 09:54:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/02 09:54:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/02 09:54:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/02 09:54:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/02 09:54:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/02 0

25/05/02 13:05:45 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 235789 ms exceeds timeout 120000 ms
25/05/02 13:05:45 WARN SparkContext: Killing executors is not supported by current scheduler.
25/05/02 13:05:51 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$