<a href="https://colab.research.google.com/github/abelsare348/codes/blob/pyspark/Pyspark/limit_distinct_alias_filtering_between_like_casewhen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [5]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType

In [8]:
# Obtain the notebook's SparkSession (an existing one is reused if present),
# named "spark_app" and pointed at the "local" master.
spark = (
    SparkSession.builder
    .appName("spark_app")
    .master("local")
    .getOrCreate()
)

In [6]:
# Explicit schema for Employee.csv: every column is declared nullable.
Employee_Schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("first_name", StringType()),
        ("last_name", StringType()),
        ("gender", StringType()),
    )
])

In [18]:
# Load the employee CSV using the explicit schema, with the "header" option enabled.
Employee_df = (
    spark.read
    .schema(Employee_Schema)
    .option("header", True)
    .csv("/content/Employee.csv")
)

In [20]:
# Preview the first five employee rows.
Employee_df.show(n=5)

+---+----------+----------+------+
| id|first_name| last_name|gender|
+---+----------+----------+------+
|  1|     Moise|    Coveny|  Male|
|  2|  Claudius|Wickersham|  Male|
|  3|     Mateo|   Redmond|  Male|
|  4|    Finlay|     Salla|  Male|
|  5|   Rudyard| Brockbank|  Male|
+---+----------+----------+------+
only showing top 5 rows



In [21]:
# Print the column names, types and nullability of the employee DataFrame.
Employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)



In [22]:
# Explicit schema for Employee_Address.csv: every column is declared nullable.
Employee_Address_Schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("email", StringType()),
        ("ip_address", StringType()),
    )
])

In [24]:
# Load the address CSV using the explicit schema (with the "header" option
# enabled), then preview the first ten rows.
Employee_Address_df = (
    spark.read
    .schema(Employee_Address_Schema)
    .option("header", True)
    .csv("/content/Employee_Address.csv")
)
Employee_Address_df.show(n=10)


+---+--------------------+---------------+
| id|               email|     ip_address|
+---+--------------------+---------------+
|  1|swinterburn0@dail...|   34.78.27.108|
|  2|aarnaldo1@eepurl.com| 190.52.131.147|
|  3|    dbuzek2@1688.com| 152.198.169.97|
|  4| sduffyn3@tumblr.com| 10.233.235.146|
|  5|   mmeeke4@jigsy.com|    59.27.18.36|
|  6|prosenblath5@live...|   12.66.142.55|
|  7|   bbuddle6@yale.edu| 58.191.128.163|
|  8|slindeberg7@topli...|  180.250.88.89|
|  9|ltwiddell8@zdnet.com|212.122.136.101|
| 10|  fgiberd9@prweb.com|   18.6.137.101|
+---+--------------------+---------------+
only showing top 10 rows



In [25]:
# Print the column names, types and nullability of the address DataFrame.
Employee_Address_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- email: string (nullable = true)
 |-- ip_address: string (nullable = true)



Union Keyword

In [29]:
# Keep only the rows whose gender is 'Male' (where() is an alias of filter()).
male_df = Employee_df.where("gender='Male'")

In [31]:
# Keep only the rows whose gender is 'Female' (where() is an alias of filter()).
female_df = Employee_df.where("gender='Female'")

In [33]:
# Stack the male rows and the female rows into one DataFrame.
# Per the PySpark docs, union() matches columns by position and keeps duplicates.
unisex_df = male_df.union(female_df)

In [38]:
# Bring the first ten rows of the combined DataFrame back as a list of Row objects.
unisex_df.take(num=10)

[Row(id=1, first_name='Moise', last_name='Coveny', gender='Male'),
 Row(id=2, first_name='Claudius', last_name='Wickersham', gender='Male'),
 Row(id=3, first_name='Mateo', last_name='Redmond', gender='Male'),
 Row(id=4, first_name='Finlay', last_name='Salla', gender='Male'),
 Row(id=5, first_name='Rudyard', last_name='Brockbank', gender='Male'),
 Row(id=7, first_name='Zane', last_name='Cahillane', gender='Male'),
 Row(id=10, first_name='Ari', last_name='Larkworthy', gender='Male'),
 Row(id=12, first_name='Maynard', last_name='Ponten', gender='Male'),
 Row(id=13, first_name='Dom', last_name='Dallaway', gender='Male'),
 Row(id=14, first_name='Dene', last_name='Philliskirk', gender='Male')]

Limit Keyword

In [40]:
# Restrict the DataFrame to ten rows and display them.
(
    Employee_df
    .limit(10)
    .show()
)

+---+----------+----------+------+
| id|first_name| last_name|gender|
+---+----------+----------+------+
|  1|     Moise|    Coveny|  Male|
|  2|  Claudius|Wickersham|  Male|
|  3|     Mateo|   Redmond|  Male|
|  4|    Finlay|     Salla|  Male|
|  5|   Rudyard| Brockbank|  Male|
|  6|    Joanie|   Kembley|Female|
|  7|      Zane| Cahillane|  Male|
|  8|    Davida|  Doubrava|Female|
|  9|    Anjela|    Wackly|Female|
| 10|       Ari|Larkworthy|  Male|
+---+----------+----------+------+



Distinct Keyword

In [43]:
# Show five of the unique (gender, first_name) combinations.
(
    Employee_df
    .select("gender", "first_name")
    .distinct()
    .show(5)
)

+------+----------+
|gender|first_name|
+------+----------+
|  Male|     Tommy|
|Female|  Hilliary|
|Female|  Shaylynn|
|  Male|   Vincenz|
|  Male|     Olvan|
+------+----------+
only showing top 5 rows



In [None]:
# Alias keyword

In [45]:
from pyspark.sql import functions as F

In [47]:

# Project the two name columns under shorter display names and show five rows.
Employee_df.select(
    F.col("first_name").alias("FName"),
    F.col("last_name").alias("LName"),
).show(5)

+--------+----------+
|   FName|     LName|
+--------+----------+
|   Moise|    Coveny|
|Claudius|Wickersham|
|   Mateo|   Redmond|
|  Finlay|     Salla|
| Rudyard| Brockbank|
+--------+----------+
only showing top 5 rows



Filtering on multiple conditions

In [49]:
# Male employees whose id lies strictly between 10 and 20, written as one SQL
# predicate string (where() is an alias of filter()).
(
    Employee_df
    .where("gender='Male' and id<20 and id>10")
    .show()
)

+---+----------+-----------+------+
| id|first_name|  last_name|gender|
+---+----------+-----------+------+
| 12|   Maynard|     Ponten|  Male|
| 13|       Dom|   Dallaway|  Male|
| 14|      Dene|Philliskirk|  Male|
| 15|   Phillip|    Tillard|  Male|
| 16|     Tommy|   Blackway|  Male|
+---+----------+-----------+------+



Between Keyword

In [51]:
# Rows whose id falls in the range 1..50; show() prints its default number of rows.
# Per the PySpark docs, Column.between() includes both bounds.
(
    Employee_df
    .where(F.col("id").between(lowerBound=1, upperBound=50))
    .show()
)

+---+----------+-----------+------+
| id|first_name|  last_name|gender|
+---+----------+-----------+------+
|  1|     Moise|     Coveny|  Male|
|  2|  Claudius| Wickersham|  Male|
|  3|     Mateo|    Redmond|  Male|
|  4|    Finlay|      Salla|  Male|
|  5|   Rudyard|  Brockbank|  Male|
|  6|    Joanie|    Kembley|Female|
|  7|      Zane|  Cahillane|  Male|
|  8|    Davida|   Doubrava|Female|
|  9|    Anjela|     Wackly|Female|
| 10|       Ari| Larkworthy|  Male|
| 11|    Elmira| Goldthorpe|Female|
| 12|   Maynard|     Ponten|  Male|
| 13|       Dom|   Dallaway|  Male|
| 14|      Dene|Philliskirk|  Male|
| 15|   Phillip|    Tillard|  Male|
| 16|     Tommy|   Blackway|  Male|
| 17|      Wren|     Geator|Female|
| 18|  Caroline|    Daubney|Female|
| 19|Margarethe|    Tollett|Female|
| 20|      Sula|  Pitkethly|Female|
+---+----------+-----------+------+
only showing top 20 rows



In [54]:
# Show five of the unique email values.
(
    Employee_Address_df
    .select("email")
    .distinct()
    .show(5)
)

+--------------------+
|               email|
+--------------------+
|ageroldo1v@arizon...|
|llidden1i@newsvin...|
|cloines2r@domainm...|
|rtitt2l@amazon.co.jp|
|vfahrenbachy@mash...|
+--------------------+
only showing top 5 rows



Like Keyword

In [55]:
# Addresses whose email ends with "@amazon.co.jp" ("%" matches any leading text).
(
    Employee_Address_df
    .where(F.col("email").like("%@amazon.co.jp"))
    .show()
)

+---+--------------------+--------------+
| id|               email|    ip_address|
+---+--------------------+--------------+
| 94|rtitt2l@amazon.co.jp|160.219.21.134|
+---+--------------------+--------------+



Case When in PySpark

In [60]:
# Add a "Priority" column derived from id: 1-30 -> "High", 31-60 -> "Mid",
# anything else -> "Low"; then show five rows.
Employee_df.withColumn(
    "Priority",
    F.when(F.col("id").between(1, 30), "High")
    .when(F.col("id").between(31, 60), "Mid")
    .otherwise("Low"),
).show(5)

+---+----------+----------+------+--------+
| id|first_name| last_name|gender|Priority|
+---+----------+----------+------+--------+
|  1|     Moise|    Coveny|  Male|    High|
|  2|  Claudius|Wickersham|  Male|    High|
|  3|     Mateo|   Redmond|  Male|    High|
|  4|    Finlay|     Salla|  Male|    High|
|  5|   Rudyard| Brockbank|  Male|    High|
+---+----------+----------+------+--------+
only showing top 5 rows

