In [1]:

import findspark
# Runs before the pyspark import below; presumably this makes the local Spark
# installation importable from this kernel — confirm for your environment.
findspark.init()

# Import SparkSession, the entry point used for the Spark SQL queries in this notebook.

from pyspark.sql import SparkSession

# Examples


## Task 1 - Create a Spark session


In [2]:

spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

## Task 2 - Load a CSV file into a DataFrame


Load the mpg dataset from `mpg.csv` into a Spark DataFrame


In [3]:
# Read mpg.csv into a DataFrame: the first row supplies the column names
# (header) and the column types are inferred from the data (inferSchema).
mpg_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("mpg.csv")
)


## Task 3 - Create a temporary view


Create a temporary view named `mileage` from the DataFrame


In [4]:

# Register the DataFrame under the name "mileage" so that the spark.sql()
# queries in this notebook can refer to it in their FROM clause.
mpg_data.createOrReplaceTempView("mileage")


## Task 4 - Run a SQL query on the DataFrame


Select all cars whose mileage (MPG) is more than 40


In [5]:
# Keep only the rows of the "mileage" view whose MPG column is greater than 40.
# The query result is held in `results` and displayed in the next cell.
results = spark.sql("SELECT * FROM mileage WHERE MPG > 40")

In [6]:
# Print the rows held in `results` as a text table.
results.show()

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|43.1|        4|       90.0|        48|  1985|      21.5|  78|European|
|43.4|        4|       90.0|        48|  2335|      23.7|  80|European|
|41.5|        4|       98.0|        76|  2144|      14.7|  80|European|
|44.3|        4|       90.0|        48|  2085|      21.7|  80|European|
|40.8|        4|       85.0|        65|  2110|      19.2|  80|Japanese|
|44.6|        4|       91.0|        67|  1850|      13.8|  80|Japanese|
|46.6|        4|       86.0|        65|  2110|      17.9|  80|Japanese|
|44.0|        4|       97.0|        52|  2130|      24.6|  82|European|
+----+---------+-----------+----------+------+----------+----+--------+



## Task 5 - Analyze the dataset


List all the unique Origins


In [7]:
# DISTINCT collapses repeated Origin values so each origin is listed once.
spark.sql("SELECT distinct Origin FROM mileage").show()

+--------+
|  Origin|
+--------+
|European|
|Japanese|
|American|
+--------+



Show the count of Japanese cars


In [8]:
# Count the rows whose Origin is 'Japanese'. The count(*) column has no alias,
# so it is displayed under the header count(1).
spark.sql("SELECT count(*) FROM mileage where Origin ='Japanese' ").show()

+--------+
|count(1)|
+--------+
|      79|
+--------+



Count the number of cars with mileage greater than 40


In [9]:
# Same MPG > 40 filter as the earlier SELECT *, but returning only the row count.
spark.sql("SELECT count(*) FROM mileage where MPG > 40").show()

+--------+
|count(1)|
+--------+
|       8|
+--------+



List the number of cars made in each year


In [10]:
# One output row per Year, with the count of Year values in that group.
# NOTE(review): there is no ORDER BY, so the years are not listed in sorted
# order; add "ORDER BY Year" if a chronological listing is wanted.
spark.sql("SELECT Year, count(Year) FROM mileage group by Year").show()

+----+-----------+
|Year|count(Year)|
+----+-----------+
|  78|         36|
|  81|         28|
|  76|         34|
|  72|         28|
|  77|         28|
|  82|         30|
|  80|         27|
|  73|         40|
|  70|         29|
|  75|         30|
|  71|         27|
|  79|         29|
|  74|         26|
+----+-----------+



Print the maximum MPG


In [11]:
# Aggregate over the whole view: a single row holding the largest MPG value.
spark.sql("SELECT max(MPG) FROM mileage").show()

+--------+
|max(MPG)|
+--------+
|    46.6|
+--------+



Stop the Spark session


In [12]:
# Shut down the Spark session now that the queries above have been run.
spark.stop()

<!--
|Date (YYYY-MM-DD)|Version|Changed By|Change Description|
|-|-|-|-|
|2023-05-04|0.1|Ramesh Sannareddy|Initial Version Created|
-->
