-- Notepad to myself --

# Working with Spark DataFrames

### Basic DataFrame Operations

In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below (reuses an existing
# session if one is already active, otherwise creates one).
spark = SparkSession.builder.getOrCreate()

In [2]:
# Directory prefix (relative to the notebook's working directory) that all
# input and output file names below are appended to.
data_path = 'data/'

In [4]:
# Load the location temperature readings. The first row supplies the column
# names and Spark infers each column's type from the data.
file_path = f"{data_path}location_temp.csv"
df1 = spark.read.csv(
    path=file_path,
    header=True,
    inferSchema=True,
)

In [9]:
# Show the inferred column names, types and nullability of df1.
df1.printSchema()

root
 |-- event_date: string (nullable = true)
 |-- location_id: string (nullable = true)
 |-- temp_celcius: integer (nullable = true)



In [6]:
# Peek at the first five rows without truncating long cell values.
df1.show(n=5, truncate=False)

+-------------------+-----------+------------+
|event_date         |location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|loc0       |29          |
|03/04/2019 19:53:06|loc0       |27          |
|03/04/2019 19:58:06|loc0       |28          |
|03/04/2019 20:03:06|loc0       |30          |
|03/04/2019 20:08:06|loc0       |27          |
+-------------------+-----------+------------+
only showing top 5 rows



In [8]:
# Summary statistics (count, mean, stddev, min, max) for every column of df1.
# For the string columns, min/max are string comparisons and mean/stddev are null.
df1.describe().show()

+-------+-------------------+-----------+------------------+
|summary|         event_date|location_id|      temp_celcius|
+-------+-------------------+-----------+------------------+
|  count|             500000|     500000|            500000|
|   mean|               null|       null|         28.065484|
| stddev|               null|       null|3.8101229481235555|
|    min|03/04/2019 19:48:06|       loc0|                21|
|    max|03/08/2019 07:04:55|      loc99|                40|
+-------+-------------------+-----------+------------------+



In [10]:
# Load the server utilization readings. This CSV has no header row, so Spark
# assigns positional column names (_c0, _c1, ...) and infers the types.
# Note: despite its name, df2_no_header holds the file path, not a DataFrame.
df2_no_header = f"{data_path}utilization.csv"
df2 = spark.read.csv(
    path=df2_no_header,
    header=False,
    inferSchema=True,
)

In [12]:
# Peek at the first five rows; columns still carry Spark's default _cN names.
df2.show(n=5, truncate=False)

+-------------------+---+----+----+---+
|_c0                |_c1|_c2 |_c3 |_c4|
+-------------------+---+----+----+---+
|03/05/2019 08:06:14|100|0.57|0.51|47 |
|03/05/2019 08:11:14|100|0.47|0.62|43 |
|03/05/2019 08:16:14|100|0.56|0.57|62 |
|03/05/2019 08:21:14|100|0.57|0.56|50 |
|03/05/2019 08:26:14|100|0.35|0.46|43 |
+-------------------+---+----+----+---+
only showing top 5 rows



In [13]:
# Give the headerless CSV's positional columns (_c0 .. _c4) descriptive names.
df2 = (
    df2
    .withColumnRenamed("_c0", "event_datetime")
    .withColumnRenamed("_c1", "server_id")
    .withColumnRenamed("_c2", "cpu_utilization")
    .withColumnRenamed("_c3", "free_memory")
    .withColumnRenamed("_c4", "session_count")
)

In [14]:
# Confirm the renamed columns and the types Spark inferred for them.
df2.printSchema()

root
 |-- event_datetime: string (nullable = true)
 |-- server_id: integer (nullable = true)
 |-- cpu_utilization: double (nullable = true)
 |-- free_memory: double (nullable = true)
 |-- session_count: integer (nullable = true)



In [18]:
# Summary statistics (count, mean, stddev, min, max) for every column of df2.
# NOTE(review): the recorded output lists the columns alphabetically, which
# matches the JSON-loaded df2 from the "Write and Read" section rather than
# the CSV-loaded frame above -- this cell appears to have been executed out of
# order. Re-run top to bottom to refresh the output.
df2.describe().show()

+-------+-------------------+-------------------+------------------+------------------+------------------+
|summary|    cpu_utilization|     event_datetime|       free_memory|         server_id|     session_count|
+-------+-------------------+-------------------+------------------+------------------+------------------+
|  count|             500000|             500000|            500000|            500000|            500000|
|   mean| 0.6205177400000123|               null|0.3791280999999977|             124.5|          69.59616|
| stddev|0.15875173872912837|               null|0.1583093127837622|14.430884120553253|14.850676696352865|
|    min|               0.22|03/05/2019 08:06:14|               0.0|               100|                32|
|    max|                1.0|04/09/2019 01:22:46|              0.78|               149|               105|
+-------+-------------------+-------------------+------------------+------------------+------------------+



In [15]:
# Peek at the first five rows again, now with the descriptive column names.
df2.show(n=5, truncate=False)

+-------------------+---------+---------------+-----------+-------------+
|event_datetime     |server_id|cpu_utilization|free_memory|session_count|
+-------------------+---------+---------------+-----------+-------------+
|03/05/2019 08:06:14|100      |0.57           |0.51       |47           |
|03/05/2019 08:11:14|100      |0.47           |0.62       |43           |
|03/05/2019 08:16:14|100      |0.56           |0.57       |62           |
|03/05/2019 08:21:14|100      |0.57           |0.56       |50           |
|03/05/2019 08:26:14|100      |0.35           |0.46       |43           |
+-------------------+---------+---------------+-----------+-------------+
only showing top 5 rows



#### Write and Read JSON Files

In [None]:
# Write df1 as JSON under data_path.
# Spark's default save mode raises an error when the target path already
# exists, so this cell would fail on every run after the first. Overwrite the
# previous export instead so the notebook can be re-run from a fresh kernel.
df1_json = data_path + "location_temp.json"
df1.write.json(df1_json, mode="overwrite")

In [None]:
# Write df2 as JSON under data_path.
# Spark's default save mode raises an error when the target path already
# exists, so this cell would fail on every run after the first. Overwrite the
# previous export instead so the notebook can be re-run from a fresh kernel.
# NOTE(review): run this while df2 is still the CSV-loaded frame; Spark is
# expected to refuse overwriting a path that the frame being written is itself
# read from -- confirm before re-running this cell out of order.
df2_json = data_path + "utilization.json"
df2.write.json(df2_json, mode="overwrite")

In [16]:
# Read the utilization data back from its JSON export.
# This rebinds df2, so every later cell works on the JSON-loaded frame
# (whose columns come back in alphabetical order) instead of the CSV one.
df2_json_path = f"{data_path}utilization.json"
df2 = spark.read.json(path=df2_json_path)

In [17]:
# List the column names of the JSON-loaded frame; note the alphabetical order
# compared with the CSV-loaded version.
df2.columns

['cpu_utilization',
 'event_datetime',
 'free_memory',
 'server_id',
 'session_count']

#### Sampling

In [19]:
# Draw an approximately 10% sample of df2 without replacement.
# `fraction` is a per-row inclusion probability, so the resulting row count is
# only approximately 10% of the input. A fixed seed makes the draw repeatable
# across re-runs (for the same input data and partitioning), so the statistics
# shown below can be reproduced.
df2_sample = df2.sample(withReplacement=False, fraction=0.1, seed=42)

In [23]:
# Number of rows that ended up in the sample (about 10% of the 500000 input rows).
df2_sample.count()

49877

In [25]:
# Summary statistics of the sample, for comparison with the full df2 statistics above.
df2_sample.describe().show()

+-------+------------------+-------------------+-------------------+------------------+------------------+
|summary|   cpu_utilization|     event_datetime|        free_memory|         server_id|     session_count|
+-------+------------------+-------------------+-------------------+------------------+------------------+
|  count|             49877|              49877|              49877|             49877|             49877|
|   mean|0.6218393247388593|               null|0.37813541311626636|124.48850171421698| 69.68624817049943|
| stddev|0.1590928260944207|               null|0.15815794139331793|14.454386106764167|14.894331401576759|
|    min|              0.22|03/05/2019 08:06:43|                0.0|               100|                32|
|    max|               1.0|04/09/2019 01:22:09|               0.78|               149|               105|
+-------+------------------+-------------------+-------------------+------------------+------------------+



In [24]:
# First ten sampled rows ordered by event_datetime.
# NOTE(review): event_datetime appears to be held as text (MM/dd/yyyy ...), so
# this is a string ordering rather than a true chronological one -- confirm.
df2_sample.orderBy('event_datetime').show(n=10, truncate=False)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|event_datetime     |free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|0.71           |03/05/2019 08:06:43|0.61       |117      |60           |
|0.78           |03/05/2019 08:06:45|0.45       |118      |68           |
|0.51           |03/05/2019 08:06:46|0.49       |119      |53           |
|0.64           |03/05/2019 08:06:51|0.48       |122      |65           |
|0.52           |03/05/2019 08:07:06|0.49       |130      |77           |
|0.5            |03/05/2019 08:07:09|0.49       |132      |65           |
|0.39           |03/05/2019 08:07:39|0.63       |147      |49           |
|0.59           |03/05/2019 08:11:17|0.13       |102      |70           |
|0.62           |03/05/2019 08:12:09|0.63       |132      |59           |
|0.95           |03/05/2019 08:12:12|0.28       |133      |90           |
+---------------+-------------------+-----------+---------+-------------+
only showing top 10 rows

### Filtering Using the DataFrame API

In [26]:
# Reminder of what df1 looks like before filtering.
df1.show(n=5, truncate=False)

+-------------------+-----------+------------+
|event_date         |location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|loc0       |29          |
|03/04/2019 19:53:06|loc0       |27          |
|03/04/2019 19:58:06|loc0       |28          |
|03/04/2019 20:03:06|loc0       |30          |
|03/04/2019 20:08:06|loc0       |27          |
+-------------------+-----------+------------+
only showing top 5 rows



In [27]:
# Keep only the rows for location "loc1", using a Column expression as the predicate.
df1.filter(df1.location_id == "loc1").show(n=5, truncate=False)

+-------------------+-----------+------------+
|event_date         |location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|loc1       |31          |
|03/04/2019 19:53:06|loc1       |26          |
|03/04/2019 19:58:06|loc1       |31          |
|03/04/2019 20:03:06|loc1       |26          |
|03/04/2019 20:08:06|loc1       |28          |
+-------------------+-----------+------------+
only showing top 5 rows



In [29]:
# Alternative usage: the same predicate written as a SQL expression string.
df1.filter("location_id = 'loc1'").show(n=5, truncate=False)

+-------------------+-----------+------------+
|event_date         |location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|loc1       |31          |
|03/04/2019 19:53:06|loc1       |26          |
|03/04/2019 19:58:06|loc1       |31          |
|03/04/2019 20:03:06|loc1       |26          |
|03/04/2019 20:08:06|loc1       |28          |
+-------------------+-----------+------------+
only showing top 5 rows



In [28]:
# Count how many readings belong to location "loc1".
df1.where(df1.location_id == "loc1").count()

1000

### Aggregating Using the DataFrame API

In [30]:
# Number of readings per location (row order of the result is not defined).
df1.groupBy("location_id").count().show(n=5)

+-----------+-----+
|location_id|count|
+-----------+-----+
|      loc22| 1000|
|      loc31| 1000|
|      loc82| 1000|
|      loc90| 1000|
|     loc118| 1000|
+-----------+-----+
only showing top 5 rows



In [32]:
# Rows ordered by location_id; rows sharing a location_id come back in no particular order.
df1.sort("location_id").show(n=5, truncate=False)

+-------------------+-----------+------------+
|event_date         |location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 20:08:06|loc0       |27          |
|03/04/2019 19:58:06|loc0       |28          |
|03/04/2019 20:03:06|loc0       |30          |
|03/04/2019 19:48:06|loc0       |29          |
|03/04/2019 19:53:06|loc0       |27          |
+-------------------+-----------+------------+
only showing top 5 rows



In [34]:
# Readings per location, in descending order of count.
(
    df1.groupBy("location_id")
    .count()
    .sort('count', ascending=False)
    .show(n=5)
)

+-----------+-----+
|location_id|count|
+-----------+-----+
|      loc22| 1000|
|      loc31| 1000|
|      loc82| 1000|
|      loc90| 1000|
|     loc118| 1000|
+-----------+-----+
only showing top 5 rows



In [33]:
# Mean temperature per location, using the {column: aggregate} dictionary form of agg.
(
    df1.groupBy('location_id')
    .agg({'temp_celcius': 'mean'})
    .show(n=5)
)

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|      loc22|           28.251|
|      loc31|           25.196|
|      loc82|           27.355|
|      loc90|           23.216|
|     loc118|           24.219|
+-----------+-----------------+
only showing top 5 rows



In [35]:
# Maximum temperature per location, using the {column: aggregate} dictionary form of agg.
(
    df1.groupBy('location_id')
    .agg({'temp_celcius': 'max'})
    .show(n=5)
)

+-----------+-----------------+
|location_id|max(temp_celcius)|
+-----------+-----------------+
|      loc22|               35|
|      loc31|               32|
|      loc82|               34|
|      loc90|               30|
|     loc118|               31|
+-----------+-----------------+
only showing top 5 rows



In [43]:
# Hottest locations first: mean temperature per location, sorted descending.
# The dictionary form of agg names its result column 'avg(temp_celcius)'.
(
    df1.groupBy('location_id')
    .agg({'temp_celcius': 'mean'})
    .sort('avg(temp_celcius)', ascending=False)
    .show(n=5)
)

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|     loc435|           33.427|
|     loc438|           33.371|
|     loc112|           33.359|
|     loc306|           33.357|
|     loc426|           33.348|
+-----------+-----------------+
only showing top 5 rows



In [47]:
# Same ranking as the previous cell, but using the `mean` column function so
# the aggregate can be given a readable alias.
# NOTE(review): consider moving this import up to the notebook's first import cell.
from pyspark.sql.functions import mean

(
    df1.groupBy('location_id')
    .agg(mean('temp_celcius').alias('avg_temp'))
    .sort('avg_temp', ascending=False)
    .show(n=5)
)

+-----------+--------+
|location_id|avg_temp|
+-----------+--------+
|     loc435|  33.427|
|     loc438|  33.371|
|     loc112|  33.359|
|     loc306|  33.357|
|     loc426|  33.348|
+-----------+--------+
only showing top 5 rows



### Sampling Using the DataFrame API

In [48]:
# Approximately 10% sample of df1, drawn without replacement.
# `fraction` is a per-row inclusion probability, so the count is approximate.
# A fixed seed makes the draw repeatable across re-runs (for the same input
# data and partitioning), so the comparison cells below can be reproduced.
df_s1 = df1.sample(withReplacement=False, fraction=0.1, seed=42)
df_s1.count()

49884

In [49]:
# Approximately 0.1% sample of df1, drawn without replacement.
# `fraction` is a per-row inclusion probability, so the count is approximate.
# A fixed seed makes the draw repeatable across re-runs (for the same input
# data and partitioning), so the comparison cells below can be reproduced.
df_s2 = df1.sample(withReplacement=False, fraction=0.001, seed=42)
df_s2.count()

515

In [51]:
# Mean temperature per location in the ~10% sample.
# location_id is a string, so the ordering is lexicographic (loc0, loc1, loc10, ...).
(
    df_s1.groupBy("location_id")
    .agg({'temp_celcius': 'mean'})
    .sort("location_id")
    .show(n=5)
)

+-----------+------------------+
|location_id| avg(temp_celcius)|
+-----------+------------------+
|       loc0|29.009708737864077|
|       loc1| 28.15909090909091|
|      loc10|25.528301886792452|
|     loc100|27.128712871287128|
|     loc101|25.454545454545453|
+-----------+------------------+
only showing top 5 rows



In [52]:
# Mean temperature per location in the ~0.1% sample.
# With so few rows, some locations may be absent and each mean rests on only a
# handful of readings.
(
    df_s2.groupBy("location_id")
    .agg({'temp_celcius': 'mean'})
    .sort("location_id")
    .show(n=5)
)

+-----------+------------------+
|location_id| avg(temp_celcius)|
+-----------+------------------+
|       loc0|              29.0|
|       loc1|26.333333333333332|
|      loc10|              27.5|
|     loc101|              25.0|
|     loc107|              34.0|
+-----------+------------------+
only showing top 5 rows



In [53]:
# Original dataset: mean temperature per location over all rows, as the
# baseline the two sampled results above are compared against.
(
    df1.groupBy("location_id")
    .agg({'temp_celcius': 'mean'})
    .sort("location_id")
    .show(n=5)
)

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|       loc0|           29.176|
|       loc1|           28.246|
|      loc10|           25.337|
|     loc100|           27.297|
|     loc101|           25.317|
+-----------+-----------------+
only showing top 5 rows

