In [1]:
import os

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F

# Build (or reuse) the notebook's Spark session: master 'local[1]', app name 'bbi'.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('bbi')
    .getOrCreate()
)

24/01/11 11:45:35 WARN Utils: Your hostname, Tech-Buddy resolves to a loopback address: 127.0.1.1; using 172.17.35.137 instead (on interface eth0)
24/01/11 11:45:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/11 11:45:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql import functions as F

# Names of every attribute of pyspark.sql.functions that is callable.
all_functions = [attr for attr in dir(F) if callable(getattr(F, attr))]

# Drop private names (leading underscore). Everything in all_functions has
# already passed the callable() check, so it is not repeated here.
# NOTE(review): this keeps EVERY public callable in the module (functions and
# classes alike), not only aggregate functions. The variable names and the
# printed label are left as they were so later cells and the saved output
# still line up -- consider renaming when the notebook is next re-run.
aggregate_functions = [name for name in all_functions if not name.startswith("_")]

# Plain copy of the filtered names; only its length is reported below.
li = list(aggregate_functions)

print("Aggregate Functions in PySpark:")
print(len(li))

Aggregate Functions in PySpark:
451


In [3]:
# Column names for the sample DataFrame
columns = ["id", "category", "value"]

# Five sample rows, one tuple per row in the column order above
data = [
    (1, "A", 10), (2, "B", 20), (3, "A", 30),
    (4, "B", 40), (5, "A", 50),
]

# Build the DataFrame from the rows and display it
df = spark.createDataFrame(data, schema=columns)
df.show()

                                                                                

+---+--------+-----+
| id|category|value|
+---+--------+-----+
|  1|       A|   10|
|  2|       B|   20|
|  3|       A|   30|
|  4|       B|   40|
|  5|       A|   50|
+---+--------+-----+



In [4]:
# 1. Sum -- total of the "value" column, returned as a one-row DataFrame
result_sum = df.select(
    F.sum("value").alias("total_value")
)
print("\n1. Sum:")
result_sum.show()


1. Sum:
+-----------+
|total_value|
+-----------+
|        150|
+-----------+



In [5]:
# 2. Average -- arithmetic mean of the "value" column
result_avg = df.select(
    F.avg("value").alias("average_value")
)
print("\n2. Average:")
result_avg.show()


2. Average:
+-------------+
|average_value|
+-------------+
|         30.0|
+-------------+



In [6]:
# 3. Maximum -- largest entry in the "value" column
result_max = df.select(
    F.max("value").alias("max_value")
)
print("\n3. Maximum:")
result_max.show()



3. Maximum:
+---------+
|max_value|
+---------+
|       50|
+---------+



In [7]:
# 4. Minimum -- smallest entry in the "value" column
result_min = df.select(
    F.min("value").alias("min_value")
)
print("\n4. Minimum:")
result_min.show()


4. Minimum:
+---------+
|min_value|
+---------+
|       10|
+---------+



In [8]:
# 5. Count -- number of rows in the DataFrame
result_count = df.select(
    F.count("*").alias("row_count")
)
print("\n5. Count:")
result_count.show()


5. Count:
+---------+
|row_count|
+---------+
|        5|
+---------+



In [9]:
# 6. CountDistinct -- how many different values appear in "category"
result_count_distinct = df.select(
    F.countDistinct("category").alias("distinct_categories")
)
print("\n6. CountDistinct:")
result_count_distinct.show()


6. CountDistinct:
+-------------------+
|distinct_categories|
+-------------------+
|                  2|
+-------------------+



In [28]:
# 7. First -- the "value" of the first row the aggregate sees.
# NOTE(review): no ordering is applied before first(), so the result
# presumably depends on the row order Spark happens to use -- confirm
# against the PySpark docs if a deterministic answer is needed.
df.show()  # show the input rows alongside the result for comparison
result_first = df.select(
    F.first("value").alias("first_value")
)
print("\n7. First:")
result_first.show()

+---+--------+-----+
| id|category|value|
+---+--------+-----+
|  1|       A|   10|
|  2|       B|   20|
|  3|       A|   30|
|  4|       B|   40|
|  5|       A|   50|
+---+--------+-----+


7. First:
+-----------+
|first_value|
+-----------+
|         10|
+-----------+



In [11]:
# 8. Last -- the "value" of the last row the aggregate sees.
# NOTE(review): no ordering is applied before last(), so the result
# presumably depends on the row order Spark happens to use -- confirm
# against the PySpark docs if a deterministic answer is needed.
result_last = df.select(
    F.last("value").alias("last_value")
)
print("\n8. Last:")
result_last.show()


8. Last:
+----------+
|last_value|
+----------+
|        50|
+----------+



In [12]:
# 9. Mean -- mean of the "value" column (same figure as the Average cell's saved output)
result_mean = df.select(
    F.mean("value").alias("mean_value")
)
print("\n9. Mean:")
result_mean.show()



9. Mean:
+----------+
|mean_value|
+----------+
|      30.0|
+----------+



In [13]:
# 10. Standard Deviation of the "value" column.
# For the sample rows the saved output (~15.81) is the n-1 (sample) form.
result_stddev = df.select(
    F.stddev("value").alias("stddev_value")
)
print("\n10. Standard Deviation:")
result_stddev.show()


10. Standard Deviation:
+------------------+
|      stddev_value|
+------------------+
|15.811388300841896|
+------------------+



In [14]:
# 11. Variance of the "value" column.
# For the sample rows the saved output (250.0) is the n-1 (sample) form.
result_variance = df.select(
    F.variance("value").alias("variance_value")
)
print("\n11. Variance:")
result_variance.show()


11. Variance:
+--------------+
|variance_value|
+--------------+
|         250.0|
+--------------+



In [15]:
# 12. Correlation -- correlation coefficient between the "id" and "value" columns
result_corr = df.select(
    F.corr("id", "value").alias("correlation")
)
print("\n12. Correlation:")
result_corr.show()


12. Correlation:
+-----------+
|correlation|
+-----------+
|        1.0|
+-----------+



In [16]:
# 13. Covariance -- population covariance (covar_pop) of "id" and "value".
# For the sample rows this is the divide-by-n figure (20.0), unlike the
# n-1 based results saved for the stddev/variance cells.
result_covar = df.select(
    F.covar_pop("id", "value").alias("covariance")
)
print("\n13. Covariance:")
result_covar.show()


13. Covariance:
+----------+
|covariance|
+----------+
|      20.0|
+----------+



In [17]:
# 14. Collect List -- gather every "value" of a category into one array column
result_collect_list = (
    df.groupBy("category")
    .agg(F.collect_list("value").alias("values_list"))
)
print("\n14. Collect List:")
result_collect_list.show(truncate=False)


14. Collect List:
+--------+------------+
|category|values_list |
+--------+------------+
|B       |[20, 40]    |
|A       |[10, 30, 50]|
+--------+------------+



In [18]:
# 15. Collect Set -- gather the distinct "value" entries of each category into an array
result_collect_set = (
    df.groupBy("category")
    .agg(F.collect_set("value").alias("values_set"))
)
print("\n15. Collect Set:")
result_collect_set.show(truncate=False)


15. Collect Set:
+--------+------------+
|category|values_set  |
+--------+------------+
|B       |[20, 40]    |
|A       |[30, 50, 10]|
+--------+------------+



In [19]:

# 16. Skewness of the "value" column
result_skewness = df.select(
    F.skewness("value").alias("skewness_value")
)
print("\n16. Skewness:")
result_skewness.show()


16. Skewness:
+--------------+
|skewness_value|
+--------------+
|           0.0|
+--------------+



In [20]:
# 17. Kurtosis of the "value" column
result_kurtosis = df.select(
    F.kurtosis("value").alias("kurtosis_value")
)
print("\n17. Kurtosis:")
result_kurtosis.show()


17. Kurtosis:
+--------------+
|kurtosis_value|
+--------------+
|          -1.3|
+--------------+



In [21]:
# 18. Approximate Quantile -- quartiles (25%, 50%, 75%) of "value".
# Unlike the cells above, this returns a plain Python list, not a DataFrame.
result_quantile = df.stat.approxQuantile(
    "value",
    probabilities=[0.25, 0.5, 0.75],
    relativeError=0.01,
)
print("\n18. Approximate Quantile:")
print(result_quantile)


18. Approximate Quantile:
[20.0, 30.0, 40.0]


In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

# When a session is already running, getOrCreate() hands that one back and the
# appName given here is not applied (see the "Using an existing Spark session"
# warning in the saved output). The call is kept so the cell also works when
# it is the first one executed.
spark = SparkSession.builder.appName("partitionby_dense_rank_example").getOrCreate()

# Sample rows: (id, category, value)
data = [
    (1, "A", 100),
    (2, "A", 150),
    (3, "B", 200),
    (4, "B", 180),
    (5, "A", 120),
    (6, "B", 160),
]

# Build the DataFrame (this rebinds the notebook-wide `data` and `df`)
columns = ["id", "category", "value"]
df = spark.createDataFrame(data, columns)

# One window per category, rows ordered by ascending value.
# partitionBy is called on the Window class itself; no Window() instance is needed.
window_spec = Window.partitionBy("category").orderBy("value")

# dense_rank numbers the rows of each category 1, 2, 3, ... in that order
result = df.withColumn("dense_rank", dense_rank().over(window_spec))

# Show the result
result.show()

24/01/11 00:58:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+---+--------+-----+----------+
| id|category|value|dense_rank|
+---+--------+-----+----------+
|  1|       A|  100|         1|
|  5|       A|  120|         2|
|  2|       A|  150|         3|
|  6|       B|  160|         1|
|  4|       B|  180|         2|
|  3|       B|  200|         3|
+---+--------+-----+----------+



In [32]:
# Two sorted views of the current `df`: ascending and descending by "value".
# NOTE(review): the output saved with this cell lists id 2 under category B and
# id 3 under category A, which does not match the `data` literal of the
# preceding dense_rank example -- the cell appears to have been run against a
# different `df`; re-run the notebook top to bottom to confirm.
result_asc = df.orderBy("value")
result_desc = df.orderBy(F.col("value").desc())

# Ascending view
print("Ordered DataFrame (Ascending):")
result_asc.show()

# Descending view
print("\nOrdered DataFrame (Descending):")
result_desc.show()


Ordered DataFrame (Ascending):
+---+--------+-----+
| id|category|value|
+---+--------+-----+
|  1|       A|  100|
|  5|       A|  120|
|  2|       B|  150|
|  6|       B|  160|
|  4|       B|  180|
|  3|       A|  200|
+---+--------+-----+


Ordered DataFrame (Descending):
+---+--------+-----+
| id|category|value|
+---+--------+-----+
|  3|       A|  200|
|  4|       B|  180|
|  6|       B|  160|
|  2|       B|  150|
|  5|       A|  120|
|  1|       A|  100|
+---+--------+-----+



In [33]:
# Window per category, ordered by id, with a RANGE frame: for each row the
# frame holds the rows of the same category whose id lies within -1..+1 of the
# current row's id (a distance in id values, not a number of rows).
# partitionBy is called on the Window class itself; no Window() instance is needed.
window_spec = Window.partitionBy("category").orderBy("id").rangeBetween(-1, 1)

# Sum "value" over that frame. F.sum is spelled out so the cell does not rely
# on a star import having replaced Python's builtin sum().
result = df.withColumn("sum_value", F.sum("value").over(window_spec))

# Show the result
result.show()

+---+--------+-----+---------+
| id|category|value|sum_value|
+---+--------+-----+---------+
|  1|       A|  100|      100|
|  3|       A|  200|      200|
|  5|       A|  120|      120|
|  2|       B|  150|      150|
|  4|       B|  180|      180|
|  6|       B|  160|      160|
+---+--------+-----+---------+



In [38]:
# Window per category, ordered by id, with a ROWS frame: for each row the
# frame holds the previous row, the row itself and the next row of the same
# category (a number of rows, regardless of how far apart the ids are).
# partitionBy is called on the Window class itself; no Window() instance is needed.
window_spec = Window.partitionBy("category").orderBy("id").rowsBetween(-1, 1)

# Sum "value" over that frame. F.sum is spelled out so the cell does not rely
# on a star import having replaced Python's builtin sum().
result = df.withColumn("sum_value", F.sum("value").over(window_spec))

# Show the result
result.show()

+---+--------+-----+---------+
| id|category|value|sum_value|
+---+--------+-----+---------+
|  1|       A|  100|      300|
|  3|       A|  200|      420|
|  5|       A|  120|      320|
|  2|       B|  150|      330|
|  4|       B|  180|      490|
|  6|       B|  160|      340|
+---+--------+-----+---------+



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# Create a Spark session (an already-running session is reused by getOrCreate)
spark = SparkSession.builder.appName("rank_example").getOrCreate()

# Sample sales data: (product_id, category, sales_amount).
# Products 6 and 8 share the highest Clothing amount (400), so they tie for
# rank 1 and rank() skips rank 2 in that category.
data = [
    (1, "Electronics", 500),
    (2, "Electronics", 700),
    (3, "Electronics", 600),
    (4, "Clothing", 300),
    (5, "Clothing", 250),
    (6, "Clothing", 400),
    (7, "Clothing", 350),
    (8, "Clothing", 400),  # ties with product 6 for the top Clothing amount
]

# Create a DataFrame
columns = ["product_id", "category", "sales_amount"]
df = spark.createDataFrame(data, columns)

# One window per category, highest sales first.
# partitionBy is called on the Window class itself; no Window() instance is needed.
window_spec = Window.partitionBy("category").orderBy(col("sales_amount").desc())

# Assign a rank to every product within its category and show all of them
df_ranked = df.withColumn("rank", rank().over(window_spec))
df_ranked.show()

# Keep only the rows ranked 4th. Because tied rows share a rank and the next
# rank is skipped (1, 1, 3, 4, ...), rank 4 is the fourth row in sorted order;
# Electronics has only three products, so it contributes no row here.
rank_4_details = df_ranked.filter(col("rank") == 4)

# Show the result
rank_4_details.show()


24/01/11 21:51:58 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+----------+-----------+------------+----+
|product_id|   category|sales_amount|rank|
+----------+-----------+------------+----+
|         6|   Clothing|         400|   1|
|         8|   Clothing|         400|   1|
|         7|   Clothing|         350|   3|
|         4|   Clothing|         300|   4|
|         5|   Clothing|         250|   5|
|         2|Electronics|         700|   1|
|         3|Electronics|         600|   2|
|         1|Electronics|         500|   3|
+----------+-----------+------------+----+

+----------+--------+------------+----+
|product_id|category|sales_amount|rank|
+----------+--------+------------+----+
|         4|Clothing|         300|   4|
+----------+--------+------------+----+

