# SparkSQL

### Initialize Spark Engine

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession, functions
spark = SparkSession.builder.appName('SparkSQL').getOrCreate()
#spark

In [3]:
from pyspark.sql.functions import to_date, to_timestamp, to_number, cast
from pyspark.sql.functions  import col, lit

### Import data from file

In [4]:
df_utilization = spark.read \
    .json('utilization.json')\
    .withColumn('event_datetime',to_timestamp(col('event_datetime'),'MM/dd/yyyy HH:mm:ss'))

df_utilization.show(5, truncate=False)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|event_datetime     |free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|0.57           |2019-03-05 08:06:14|0.51       |100      |47           |
|0.47           |2019-03-05 08:11:14|0.62       |100      |43           |
|0.56           |2019-03-05 08:16:14|0.57       |100      |62           |
|0.57           |2019-03-05 08:21:14|0.56       |100      |50           |
|0.35           |2019-03-05 08:26:14|0.46       |100      |43           |
+---------------+-------------------+-----------+---------+-------------+
only showing top 5 rows



In [5]:
df_utilization.count()

500000

### Create a temporary table for the dataframe in Spark

In [6]:
df_utilization.createOrReplaceTempView('utilization')

In [7]:
df_sql = spark.sql(
    '''
    SELECT 
    * 
    FROM utilization
    LIMIT 5
    '''
    )
df_sql.show(3)
df_sql.count()


+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|2019-03-05 08:06:14|       0.51|      100|           47|
|           0.47|2019-03-05 08:11:14|       0.62|      100|           43|
|           0.56|2019-03-05 08:16:14|       0.57|      100|           62|
+---------------+-------------------+-----------+---------+-------------+
only showing top 3 rows



5

In [8]:
df_sql = spark.sql(
    '''
    SELECT 
        cpu_utilization as cpu_util,
        event_datetime as event_time
    FROM utilization
    '''
    )
df_sql.show(3)
df_sql.count()

+--------+-------------------+
|cpu_util|         event_time|
+--------+-------------------+
|    0.57|2019-03-05 08:06:14|
|    0.47|2019-03-05 08:11:14|
|    0.56|2019-03-05 08:16:14|
+--------+-------------------+
only showing top 3 rows



500000

### Filtering Dataframe with SQL

In [9]:
df_sql = spark.sql(
    '''
    SELECT 
    *
    FROM utilization
    WHERE server_id IN (100,120) AND session_count < 70
    ORDER BY session_count DESC, free_memory ASC
    ''' 
)
df_sql.show(3)
df_sql.count()


+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.48|2019-03-09 17:21:48|       0.25|      120|           69|
|           0.49|2019-03-15 04:36:48|       0.25|      120|           69|
|           0.65|2019-03-10 12:01:48|       0.25|      120|           69|
+---------------+-------------------+-----------+---------+-------------+
only showing top 3 rows



16299

### Aggregrating Dataframe with SQL

In [10]:
df_sql = spark.sql(
'''
    SELECT 
        server_id,
        count(*)  count,
        round(avg(free_memory),3) avg,
        avg(cpu_utilization) cpu,
        std(session_count) std_session
    FROM utilization
    GROUP BY server_id
    ORDER BY count(*) DESC
''' 
)
df_sql.show()
df_sql.count()

+---------+-----+-----+-------------------+------------------+
|server_id|count|  avg|                cpu|       std_session|
+---------+-----+-----+-------------------+------------------+
|      103|10000|0.239| 0.7614389999999999|10.127371965435678|
|      100|10000|0.531|  0.467506000000003|10.198979446894468|
|      101|10000|0.203| 0.7985559999999872|10.044359210780733|
|      102|10000|0.244| 0.7583949999999904|10.133512315440532|
|      107|10000|0.349| 0.6505060000000022|10.143694072819049|
|      104|10000|0.285| 0.7108530000000015|10.086652928337022|
|      106|10000|0.575| 0.4220220000000024|10.163578156441464|
|      105|10000|0.509|0.49256400000000206| 10.12206970631569|
|      110|10000|0.444| 0.5537749999999991|10.114235813278661|
|      108|10000|0.255| 0.7476360000000036|10.085062512874618|
|      109|10000|0.438| 0.5630629999999914|10.089432276613316|
|      112|10000|0.286| 0.7153870000000067|10.116853899967271|
|      113|10000|0.216| 0.7833319999999914|10.122367206

50

### Joining Dataframes with SQL

In [16]:
spark.read.csv('server_name.csv', header=True).printSchema()
df_utilization.printSchema()

root
 |-- server_id: string (nullable = true)
 |-- server_name: string (nullable = true)

root
 |-- cpu_utilization: double (nullable = true)
 |-- event_datetime: timestamp (nullable = true)
 |-- free_memory: double (nullable = true)
 |-- server_id: long (nullable = true)
 |-- session_count: long (nullable = true)



In [17]:
df_svrs = spark.read.csv('server_name.csv', header=True)

In [18]:
df_svrs.createOrReplaceTempView('servers')

In [21]:
spark.sql(
'''
    SELECT 
        u.*,
        v.*
    FROM utilizations u
    INNER JOIN servers v
    ON convert(u.server_id , VARCHAR) = v.server_id
'''
).show()

AnalysisException: [UNRESOLVED_ROUTINE] Cannot resolve function `convert` on search path [`system`.`builtin`, `system`.`session`, `spark_catalog`.`default`].; line 7 pos 7