<a href="https://colab.research.google.com/github/abelsare348/codes/blob/pyspark/Pyspark/window_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=912a289d25ead764fb2ede8457707ff34d2257c4e3dada6d85044eb8e8529fd7
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [79]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [80]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,FloatType

In [81]:
# Create (or reuse) the Spark session for this notebook, using a local master.
spark = (
    SparkSession.builder
    .appName("spark_app")
    .master("local")
    .getOrCreate()
)

In [82]:
# Explicit schema for the employee CSV; every column is nullable.
# salary is declared as a string here and converted to integer after loading.
Employee_Schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", StringType(), True)
    .add("dept", StringType(), True)
)

In [83]:
# Load the employee CSV with the explicit schema; the file's first row is a header.
Employee_df = spark.read.csv(
    "/content/Employee1.csv",
    schema=Employee_Schema,
    header=True,
)

In [84]:
# Drop the one-character prefix from the raw salary string and convert the
# remainder to an integer. The length comes from each row's value (F.length)
# rather than from the Python string "salary", whose len() is always 6 and
# would silently truncate any salary longer than five digits.
# NOTE(review): assumes every raw salary starts with exactly one non-numeric
# character (e.g. a currency symbol) — confirm against the CSV.
salary_without_prefix = F.col("salary").substr(F.lit(2), F.length("salary") - 1)
Employee_df = Employee_df.withColumn("salary", salary_without_prefix.cast("integer"))

In [85]:
# Print the schema to confirm that salary is an integer column after the cast.
Employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- dept: string (nullable = true)



In [86]:
# Explicit schema for the employee address CSV; every column is nullable.
Employee_Address_Schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("email", StringType(), True)
    .add("ip_address", StringType(), True)
)

In [87]:
# Load the address CSV with the explicit schema (first row is a header),
# then preview the first 10 rows.
Employee_Address_df = spark.read.csv(
    "/content/Employee_Address.csv",
    schema=Employee_Address_Schema,
    header=True,
)
Employee_Address_df.show(10)


+---+--------------------+---------------+
| id|               email|     ip_address|
+---+--------------------+---------------+
|  1|swinterburn0@dail...|   34.78.27.108|
|  2|aarnaldo1@eepurl.com| 190.52.131.147|
|  3|    dbuzek2@1688.com| 152.198.169.97|
|  4| sduffyn3@tumblr.com| 10.233.235.146|
|  5|   mmeeke4@jigsy.com|    59.27.18.36|
|  6|prosenblath5@live...|   12.66.142.55|
|  7|   bbuddle6@yale.edu| 58.191.128.163|
|  8|slindeberg7@topli...|  180.250.88.89|
|  9|ltwiddell8@zdnet.com|212.122.136.101|
| 10|  fgiberd9@prweb.com|   18.6.137.101|
+---+--------------------+---------------+
only showing top 10 rows



In [88]:
# Print the column names and types of the address DataFrame.
Employee_Address_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- email: string (nullable = true)
 |-- ip_address: string (nullable = true)



In [89]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

**Average Function**

In [90]:
# An empty partitionBy() places every row in one window, so each row is given
# the average salary across all employees.
# NOTE(review): the output column name is misspelled ("Avergae"); left unchanged here.
whole_table_window = Window.partitionBy()
overall_avg_salary = F.avg("salary").over(whole_table_window)
Employee_df.withColumn("Avergae_salary", overall_avg_salary).show(5)

+---+----------+---------+------+------+--------------------+--------------+
| id|first_name|last_name|gender|salary|                dept|Avergae_salary|
+---+----------+---------+------+------+--------------------+--------------+
|  1| Marmaduke| Akenhead|  Male| 36478|            Training|      35499.17|
|  2|    Corbin|    Prahm|  Male| 34566|Business Development|      35499.17|
|  3|   Halette|   Yersin|Female| 43506|Research and Deve...|      35499.17|
|  4|     Essie|  Quixley|Female| 20111|           Marketing|      35499.17|
|  5|  Brandice|   Boyles|Female| 28720|            Training|      35499.17|
+---+----------+---------+------+------+--------------------+--------------+
only showing top 5 rows



**Count Function**

In [91]:
# Count the rows in each department and repeat that count on every row of it.
# NOTE(review): the column is named "Gender_based_Count" but the window is
# partitioned by dept, not gender — confirm which was intended.
per_dept_window = Window.partitionBy("dept")
rows_in_dept = F.count("*").over(per_dept_window)
Employee_df.withColumn("Gender_based_Count", rows_in_dept).show(55)

+---+----------+----------+-----------+------+--------------------+------------------+
| id|first_name| last_name|     gender|salary|                dept|Gender_based_Count|
+---+----------+----------+-----------+------+--------------------+------------------+
| 11|   Thornie|  Stidever|       Male| 38823|          Accounting|                10|
| 12|     Dawna|    Ambage|    Agender| 37374|          Accounting|                10|
| 13|   Hoebart|   Sillito|       Male| 29249|          Accounting|                10|
| 14|   Micaela|Josifovitz|     Female| 26098|          Accounting|                10|
| 32|    Nancee|   Sibbert|     Female| 22281|          Accounting|                10|
| 69|     Sheba|    Edwins|     Female| 34164|          Accounting|                10|
| 84|     Avrit|    Marrow|     Female| 48311|          Accounting|                10|
| 85|   Ardelia|    Bayley|     Female| 47814|          Accounting|                10|
| 91|       Ray|    Collop|     Female| 389

**Sum Functions**

In [93]:
# Because the window is ordered by salary, the sum is cumulative: each row
# shows the running total of salaries within its department, lowest first.
dept_salary_ascending = Window.partitionBy("dept").orderBy("salary")
running_salary_total = F.sum("salary").over(dept_salary_ascending)
Employee_df.withColumn("total_money_spent_on_salaries", running_salary_total).show()

+---+----------+----------+-------+------+--------------------+-----------------------------+
| id|first_name| last_name| gender|salary|                dept|total_money_spent_on_salaries|
+---+----------+----------+-------+------+--------------------+-----------------------------+
| 32|    Nancee|   Sibbert| Female| 22281|          Accounting|                        22281|
| 14|   Micaela|Josifovitz| Female| 26098|          Accounting|                        48379|
| 13|   Hoebart|   Sillito|   Male| 29249|          Accounting|                        77628|
| 69|     Sheba|    Edwins| Female| 34164|          Accounting|                       111792|
| 12|     Dawna|    Ambage|Agender| 37374|          Accounting|                       149166|
| 11|   Thornie|  Stidever|   Male| 38823|          Accounting|                       187989|
| 91|       Ray|    Collop| Female| 38947|          Accounting|                       226936|
| 85|   Ardelia|    Bayley| Female| 47814|          Accounti

**Row_Number Function**

In [102]:
# Number the employees within each department, highest salary first.
row_number_window = Window.partitionBy("dept").orderBy(F.col("salary").desc())
Employee_df.withColumn("Row_Number", F.row_number().over(row_number_window)).show()

+---+----------+----------+-------+------+--------------------+----------+
| id|first_name| last_name| gender|salary|                dept|Row_Number|
+---+----------+----------+-------+------+--------------------+----------+
| 92|    Mellie|   O'Brien| Female| 48672|          Accounting|         1|
| 84|     Avrit|    Marrow| Female| 48311|          Accounting|         2|
| 85|   Ardelia|    Bayley| Female| 47814|          Accounting|         3|
| 91|       Ray|    Collop| Female| 38947|          Accounting|         4|
| 11|   Thornie|  Stidever|   Male| 38823|          Accounting|         5|
| 12|     Dawna|    Ambage|Agender| 37374|          Accounting|         6|
| 69|     Sheba|    Edwins| Female| 34164|          Accounting|         7|
| 13|   Hoebart|   Sillito|   Male| 29249|          Accounting|         8|
| 14|   Micaela|Josifovitz| Female| 26098|          Accounting|         9|
| 32|    Nancee|   Sibbert| Female| 22281|          Accounting|        10|
| 60|    Kylila|  Redholl

**Rank**

In [104]:
# Rank the employees within each department by salary, highest first.
rank_window = Window.partitionBy("dept").orderBy(F.col("salary").desc())
Employee_df.withColumn("rank", F.rank().over(rank_window)).show()

+---+----------+----------+-------+------+--------------------+----+
| id|first_name| last_name| gender|salary|                dept|rank|
+---+----------+----------+-------+------+--------------------+----+
| 92|    Mellie|   O'Brien| Female| 48672|          Accounting|   1|
| 84|     Avrit|    Marrow| Female| 48311|          Accounting|   2|
| 85|   Ardelia|    Bayley| Female| 47814|          Accounting|   3|
| 91|       Ray|    Collop| Female| 38947|          Accounting|   4|
| 11|   Thornie|  Stidever|   Male| 38823|          Accounting|   5|
| 12|     Dawna|    Ambage|Agender| 37374|          Accounting|   6|
| 69|     Sheba|    Edwins| Female| 34164|          Accounting|   7|
| 13|   Hoebart|   Sillito|   Male| 29249|          Accounting|   8|
| 14|   Micaela|Josifovitz| Female| 26098|          Accounting|   9|
| 32|    Nancee|   Sibbert| Female| 22281|          Accounting|  10|
| 60|    Kylila|  Redholls| Female| 46474|Business Development|   1|
| 71|  Waldemar|   Akister|   Male

**Dense Rank**

In [107]:
# Dense-rank the employees within each department by salary, highest first.
dense_rank_window = Window.partitionBy("dept").orderBy(F.col("salary").desc())
Employee_df.withColumn("dense_rank", F.dense_rank().over(dense_rank_window)).show()

+---+----------+----------+-------+------+--------------------+----------+
| id|first_name| last_name| gender|salary|                dept|dense_rank|
+---+----------+----------+-------+------+--------------------+----------+
| 92|    Mellie|   O'Brien| Female| 48672|          Accounting|         1|
| 84|     Avrit|    Marrow| Female| 48311|          Accounting|         2|
| 85|   Ardelia|    Bayley| Female| 47814|          Accounting|         3|
| 91|       Ray|    Collop| Female| 38947|          Accounting|         4|
| 11|   Thornie|  Stidever|   Male| 38823|          Accounting|         5|
| 12|     Dawna|    Ambage|Agender| 37374|          Accounting|         6|
| 69|     Sheba|    Edwins| Female| 34164|          Accounting|         7|
| 13|   Hoebart|   Sillito|   Male| 29249|          Accounting|         8|
| 14|   Micaela|Josifovitz| Female| 26098|          Accounting|         9|
| 32|    Nancee|   Sibbert| Female| 22281|          Accounting|        10|
| 60|    Kylila|  Redholl

In [114]:
# Treat the whole table as one window ordered by id and split it into 4 buckets.
id_ordered_window = Window.partitionBy().orderBy(F.col("id"))
Employee_df.withColumn("Ntile", F.ntile(4).over(id_ordered_window)).show(100)

+---+----------+------------+-----------+------+--------------------+-----+
| id|first_name|   last_name|     gender|salary|                dept|Ntile|
+---+----------+------------+-----------+------+--------------------+-----+
|  1| Marmaduke|    Akenhead|       Male| 36478|            Training|    1|
|  2|    Corbin|       Prahm|       Male| 34566|Business Development|    1|
|  3|   Halette|      Yersin|     Female| 43506|Research and Deve...|    1|
|  4|     Essie|     Quixley|     Female| 20111|           Marketing|    1|
|  5|  Brandice|      Boyles|     Female| 28720|            Training|    1|
|  6|   Mellisa|       Jinks|     Female| 37349|         Engineering|    1|
|  7|     Berty|     Gulston|       Male| 48303|Research and Deve...|    1|
|  8|   Chantal|       Judge|     Female| 38240|         Engineering|    1|
|  9|   Jeniece|     Burnand|     Female| 22241|            Training|    1|
| 10|    Laurie|     Burcher|       Male| 42245|Business Development|    1|
| 11|   Thor

In [118]:
# With each department ordered by salary descending, the first value in the
# window is that department's top salary; attach it to every row and list
# the result by department.
top_salary_first_window = Window.partitionBy("dept").orderBy(F.col("salary").desc())
dept_top_salary = F.first_value("salary").over(top_salary_first_window)
(
    Employee_df
    .withColumn("first_value_of_department_top_salaried", dept_top_salary)
    .orderBy("dept")
    .show(100)
)

+---+----------+------------+-----------+------+--------------------+--------------------------------------+
| id|first_name|   last_name|     gender|salary|                dept|first_value_of_department_top_salaried|
+---+----------+------------+-----------+------+--------------------+--------------------------------------+
| 92|    Mellie|     O'Brien|     Female| 48672|          Accounting|                                 48672|
| 84|     Avrit|      Marrow|     Female| 48311|          Accounting|                                 48672|
| 85|   Ardelia|      Bayley|     Female| 47814|          Accounting|                                 48672|
| 91|       Ray|      Collop|     Female| 38947|          Accounting|                                 48672|
| 11|   Thornie|    Stidever|       Male| 38823|          Accounting|                                 48672|
| 12|     Dawna|      Ambage|    Agender| 37374|          Accounting|                                 48672|
| 69|     Sheba|   