## Using SQL queries and where() method on spark dataframes

### Build SparkSession:

In [2]:
import findspark
findspark.init()

In [3]:
import pyspark
from pyspark.sql import SparkSession


In [4]:
spark = SparkSession.builder.getOrCreate()

### Read the json file:

In [5]:
json_Df = spark.read.json("DataFrames_sample.json")

### Display the schema:


In [6]:
json_Df.printSchema()

root
 |-- D: double (nullable = true)
 |-- H: double (nullable = true)
 |-- HDD: string (nullable = true)
 |-- Id: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- RAM: string (nullable = true)
 |-- ScreenSize: string (nullable = true)
 |-- W: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Year: long (nullable = true)



In [7]:
json_Df.show()

+----+----+---------+---+-----------+----+----------+-----+------+----+
|   D|   H|      HDD| Id|      Model| RAM|ScreenSize|    W|Weight|Year|
+----+----+---------+---+-----------+----+----------+-----+------+----+
|9.48|0.61|512GB SSD|  1|MacBook Pro|16GB|       15"|13.75|  4.02|2015|
|7.74|0.52|256GB SSD|  2|    MacBook| 8GB|       12"|11.04|  2.03|2016|
|8.94|0.68|128GB SSD|  3|MacBook Air| 8GB|     13.3"| 12.8|  2.96|2016|
| 8.0|20.3|  1TB SSD|  4|       iMac|64GB|       27"| 25.6|  20.8|2017|
+----+----+---------+---+-----------+----+----------+-----+------+----+



### Get all the data when "Model" equal "MacBook Pro":




In [8]:
json_Df.where("Model == 'MacBook Pro'").show()

+----+----+---------+---+-----------+----+----------+-----+------+----+
|   D|   H|      HDD| Id|      Model| RAM|ScreenSize|    W|Weight|Year|
+----+----+---------+---+-----------+----+----------+-----+------+----+
|9.48|0.61|512GB SSD|  1|MacBook Pro|16GB|       15"|13.75|  4.02|2015|
+----+----+---------+---+-----------+----+----------+-----+------+----+



### Create TempView:

In [9]:
json_Df.createOrReplaceTempView("json_Df_view")

### Display "RAM"column and count "RAM" column:

In [10]:
spark.sql(""" SELECT RAM
                From json_Df_view""" ).show()

+----+
| RAM|
+----+
|16GB|
| 8GB|
| 8GB|
|64GB|
+----+



In [11]:
count = spark.sql(""" SELECT count(RAM)
                From json_Df_view""" )
count.collect()[0][0]


4

### Get all columns when "Year" column equal "2015"  

In [12]:
spark.sql(""" SELECT *
              From json_Df_view
              WHERE Year == 2015""" ).show()

+----+----+---------+---+-----------+----+----------+-----+------+----+
|   D|   H|      HDD| Id|      Model| RAM|ScreenSize|    W|Weight|Year|
+----+----+---------+---+-----------+----+----------+-----+------+----+
|9.48|0.61|512GB SSD|  1|MacBook Pro|16GB|       15"|13.75|  4.02|2015|
+----+----+---------+---+-----------+----+----------+-----+------+----+



### Get all when "Model" start with "M":

In [13]:
spark.sql(""" SELECT *
              From json_Df_view
              WHERE Model LIKE 'M%' """ ).show()

+----+----+---------+---+-----------+----+----------+-----+------+----+
|   D|   H|      HDD| Id|      Model| RAM|ScreenSize|    W|Weight|Year|
+----+----+---------+---+-----------+----+----------+-----+------+----+
|9.48|0.61|512GB SSD|  1|MacBook Pro|16GB|       15"|13.75|  4.02|2015|
|7.74|0.52|256GB SSD|  2|    MacBook| 8GB|       12"|11.04|  2.03|2016|
|8.94|0.68|128GB SSD|  3|MacBook Air| 8GB|     13.3"| 12.8|  2.96|2016|
+----+----+---------+---+-----------+----+----------+-----+------+----+



### Get all data when "Model" column equal "MacBook Pro"

In [14]:
spark.sql(""" SELECT *
              From json_Df_view
              WHERE Model == 'MacBook Pro'""" ).show()

+----+----+---------+---+-----------+----+----------+-----+------+----+
|   D|   H|      HDD| Id|      Model| RAM|ScreenSize|    W|Weight|Year|
+----+----+---------+---+-----------+----+----------+-----+------+----+
|9.48|0.61|512GB SSD|  1|MacBook Pro|16GB|       15"|13.75|  4.02|2015|
+----+----+---------+---+-----------+----+----------+-----+------+----+



### Get all data with Multiple Conditions when "RAM" column equal "8GB" and "Model" column is "Macbook".

In [15]:
spark.sql("""
            SELECT * 
            FROM json_Df_view
            WHERE Model = 'MacBook' AND RAM = '8GB'
        """).show()

+----+----+---------+---+-------+---+----------+-----+------+----+
|   D|   H|      HDD| Id|  Model|RAM|ScreenSize|    W|Weight|Year|
+----+----+---------+---+-------+---+----------+-----+------+----+
|7.74|0.52|256GB SSD|  2|MacBook|8GB|       12"|11.04|  2.03|2016|
+----+----+---------+---+-------+---+----------+-----+------+----+



### Get all data with Multiple Conditions when "D" greater than or equal "8" and "Model" column is "iMac".

In [16]:
spark.sql("""
            SELECT * 
            FROM json_Df_view
            WHERE D >= 8 AND Model = 'iMac'
        """).show()

+---+----+-------+---+-----+----+----------+----+------+----+
|  D|   H|    HDD| Id|Model| RAM|ScreenSize|   W|Weight|Year|
+---+----+-------+---+-----+----+----------+----+------+----+
|8.0|20.3|1TB SSD|  4| iMac|64GB|       27"|25.6|  20.8|2017|
+---+----+-------+---+-----+----+----------+----+------+----+



## Task 2


### Read "test1" dataset:

In [17]:
test1_df = spark.read.csv("test1.csv", header=True , inferSchema=True)
test1_df.show(5)

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
+---------+---+----------+------+
only showing top 5 rows



### Display Salary of the people less than or equal to 20000

In [18]:
test1_df.where("Salary <= 20000").show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [19]:
test1_df.select("Salary").where("Salary <= 20000").show()

+------+
|Salary|
+------+
| 20000|
| 20000|
| 15000|
| 18000|
+------+



### Display Salary of the people less than or equal to 20000 and greater than or equal 15000

In [20]:
test1_df.where("Salary <= 20000 AND Salary<=15000").show()

+------+---+----------+------+
|  Name|age|Experience|Salary|
+------+---+----------+------+
|Harsha| 21|         1| 15000|
+------+---+----------+------+



In [21]:
test1_df.select("Salary").where("Salary <= 20000 AND Salary<=15000").show()

+------+
|Salary|
+------+
| 15000|
+------+



## Task 3 

### Read "test3" dataset:

In [22]:
test3_df = spark.read.csv("test3.csv", header=True , inferSchema=True)


### Display dataset

In [23]:
test3_df.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



### Display schema

In [24]:
test3_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- salary: integer (nullable = true)



In [25]:
import pyspark.sql.functions as F

### Group by "Name" column and using sum function on "Name" column

In [26]:
test3_df.groupBy("Name").agg(F.sum(F.col("Salary"))).show()

+---------+-----------+
|     Name|sum(Salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



### Group by "Name" column and using avg function on "Name" column

In [27]:
test3_df.groupBy("Name").agg(F.avg(F.col("Salary"))).show()

+---------+------------------+
|     Name|       avg(Salary)|
+---------+------------------+
|Sudhanshu|11666.666666666666|
|    Sunny|            6000.0|
|    Krish| 6333.333333333333|
|   Mahesh|            3500.0|
+---------+------------------+



### Group by "Departments" column and using sum function on "Departments" column

In [28]:
test3_df.groupBy("Departments").agg(F.sum(F.col("Salary"))).show()

+------------+-----------+
| Departments|sum(Salary)|
+------------+-----------+
|         IOT|      15000|
|    Big Data|      15000|
|Data Science|      43000|
+------------+-----------+



### Group by "Departments" column and using mean function on "Departments" column:

In [29]:
test3_df.groupBy("Departments").agg(F.avg(F.col("Salary"))).show()

+------------+-----------+
| Departments|avg(Salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



Group by "Departments" column and using count function on "Departments" column:

In [30]:
test3_df.groupBy("Departments").agg(F.count(F.col("Departments"))).show()

+------------+------------------+
| Departments|count(Departments)|
+------------+------------------+
|         IOT|                 2|
|    Big Data|                 4|
|Data Science|                 4|
+------------+------------------+



### Apply agg to using sum function get the total of salaries

In [31]:
test3_df.agg(F.sum("salary")).collect()[0][0]

73000