-- Notepad to myself --

# Exploratory Data Analysis

In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below: getOrCreate() hands back
# the already-active session when there is one and builds a new one otherwise.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
# Relative directory prefix for the input files; the trailing slash matters
# because file names are appended to it by plain string concatenation.
data_path = "data/"

In [3]:
# Load the utilization records from JSON into a Spark DataFrame.
df2_path = f"{data_path}utilization.json"
df2 = spark.read.json(path=df2_path)

In [None]:
# Alternative loader, currently disabled (this cell has never been executed):
# it would read "utilization.csv" with a header row and inferred column types
# and rebind `df2`. Presumably the CSV holds the same data as the JSON file
# loaded above -- TODO confirm before enabling.
#df2_csv_path = data_path + "utilization.csv"
#df2 = spark.read.csv(df2_csv_path, header=True, inferSchema=True)

In [5]:
# Peek at the first five records without truncating long cell values.
df2.show(n=5, truncate=False)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|event_datetime     |free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|0.57           |03/05/2019 08:06:14|0.51       |100      |47           |
|0.47           |03/05/2019 08:11:14|0.62       |100      |43           |
|0.56           |03/05/2019 08:16:14|0.57       |100      |62           |
|0.57           |03/05/2019 08:21:14|0.56       |100      |50           |
|0.35           |03/05/2019 08:26:14|0.46       |100      |43           |
+---------------+-------------------+-----------+---------+-------------+
only showing top 5 rows



In [6]:
# Register the DataFrame as the temporary view "utilization" so the
# spark.sql(...) queries further down can refer to it by name.
df2.createOrReplaceTempView(name="utilization")

In [7]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary = df2.describe()
summary.show()

+-------+-------------------+-------------------+------------------+------------------+------------------+
|summary|    cpu_utilization|     event_datetime|       free_memory|         server_id|     session_count|
+-------+-------------------+-------------------+------------------+------------------+------------------+
|  count|             500000|             500000|            500000|            500000|            500000|
|   mean| 0.6205177400000123|               null|0.3791280999999977|             124.5|          69.59616|
| stddev|0.15875173872912837|               null|0.1583093127837622|14.430884120553253|14.850676696352865|
|    min|               0.22|03/05/2019 08:06:14|               0.0|               100|                32|
|    max|                1.0|04/09/2019 01:22:46|              0.78|               149|               105|
+-------+-------------------+-------------------+------------------+------------------+------------------+



In [9]:
# Correlation coefficient between CPU utilization and free memory.
df2.corr("cpu_utilization", "free_memory")

-0.47047715730807443

In [10]:
# Correlation coefficient between session count and free memory.
df2.corr("session_count", "free_memory")

-0.5008320848876572

In [11]:
# Frequently occurring values of server_id and session_count.
frequent_items = df2.freqItems(["server_id", "session_count"])
frequent_items.show()

+--------------------+-----------------------+
| server_id_freqItems|session_count_freqItems|
+--------------------+-----------------------+
|[146, 137, 101, 1...|   [92, 101, 83, 104...|
+--------------------+-----------------------+



In [15]:
# CPU utilization summary over all rows of the view: min, mean, max and
# standard deviation (mean and stddev rounded to two decimals).
cpu_stats_sql = 'SELECT min(cpu_utilization) min_cpu, ROUND(mean(cpu_utilization),2) mean_cpu, \
                  max(cpu_utilization) max_cpu, ROUND(stddev(cpu_utilization),2) stddev_cpu \
          FROM utilization'
cpu_stats = spark.sql(cpu_stats_sql)
cpu_stats.show()

+-------+--------+-------+----------+
|min_cpu|mean_cpu|max_cpu|stddev_cpu|
+-------+--------+-------+----------+
|   0.22|    0.62|    1.0|      0.16|
+-------+--------+-------+----------+



In [22]:
# The same CPU summary, computed per server and listed by ascending server_id;
# only the first ten servers are displayed.
cpu_stats_by_server_sql = 'SELECT server_id, min(cpu_utilization) min_cpu, ROUND(mean(cpu_utilization),2) mean_cpu, \
                  max(cpu_utilization) max_cpu, ROUND(stddev(cpu_utilization),2) stddev_cpu \
           FROM utilization \
           GROUP BY server_id \
           ORDER BY server_id ASC'
cpu_stats_by_server = spark.sql(cpu_stats_by_server_sql)
cpu_stats_by_server.show(n=10)

+---------+-------+--------+-------+----------+
|server_id|min_cpu|mean_cpu|max_cpu|stddev_cpu|
+---------+-------+--------+-------+----------+
|      100|   0.27|    0.47|   0.67|      0.12|
|      101|    0.6|     0.8|    1.0|      0.12|
|      102|   0.56|    0.76|   0.96|      0.12|
|      103|   0.56|    0.76|   0.96|      0.12|
|      104|   0.51|    0.71|   0.91|      0.12|
|      105|   0.29|    0.49|   0.69|      0.12|
|      106|   0.22|    0.42|   0.62|      0.12|
|      107|   0.45|    0.65|   0.85|      0.12|
|      108|   0.55|    0.75|   0.95|      0.12|
|      109|   0.36|    0.56|   0.76|      0.12|
+---------+-------+--------+-------+----------+
only showing top 10 rows



#### Bucketizing

To calculate statistics on buckets (histograms): rather than looking at each server individually, we group values into ranges and count how frequently they occur in each range. For example, if we want to know how often CPU utilization falls in the range 0–9%, 10–19%, 20–29%, and so on up to 90–99%, we assign each CPU utilization measurement to its own bucket and count how many measurements fall into that bucket. With `FLOOR(cpu_utilization*100/10)`, as used in the queries below, bucket *k* covers utilization from *k*·10% up to (but not including) (*k*+1)·10%, so a reading of exactly 100% lands alone in bucket 10.

In [25]:
# Tag every reading with its bucket: the utilization fraction is scaled to a
# percentage, divided by 10 and floored (e.g. 0.57 -> 5).
bucket_per_row_sql = 'SELECT server_id, cpu_utilization, FLOOR(cpu_utilization*100/10) bucket \
          FROM utilization'
bucket_per_row = spark.sql(bucket_per_row_sql)
bucket_per_row.show(n=10)

+---------+---------------+------+
|server_id|cpu_utilization|bucket|
+---------+---------------+------+
|      100|           0.57|     5|
|      100|           0.47|     4|
|      100|           0.56|     5|
|      100|           0.57|     5|
|      100|           0.35|     3|
|      100|           0.41|     4|
|      100|           0.57|     5|
|      100|           0.41|     4|
|      100|           0.53|     5|
|      100|           0.51|     5|
+---------+---------------+------+
only showing top 10 rows



In [29]:
# Histogram of CPU utilization: number of readings per bucket, highest
# bucket first.
bucket_counts_sql = 'SELECT FLOOR(cpu_utilization*100/10) bucket, \
                  count(*) count \
          FROM utilization \
          GROUP BY bucket \
          ORDER BY bucket DESC'
bucket_counts = spark.sql(bucket_counts_sql)
bucket_counts.show()

+------+------+
|bucket| count|
+------+------+
|    10|    57|
|     9| 20207|
|     8| 56598|
|     7| 88242|
|     6|116725|
|     5|104910|
|     4| 68046|
|     3| 37029|
|     2|  8186|
+------+------+



### Time-series Analysis (window function)

In [38]:
# Attach each server's overall average CPU utilization to every one of its
# readings. The window is partitioned by server_id with no ordering or frame,
# so all rows of a server share the same average.
server_avg_sql = "SELECT event_datetime, server_id, cpu_utilization, \
                  avg(cpu_utilization) OVER (PARTITION BY server_id) avg_server_util \
          FROM utilization"
server_avg = spark.sql(server_avg_sql)
server_avg.show(n=10, truncate=False)

+-------------------+---------+---------------+------------------+
|event_datetime     |server_id|cpu_utilization|avg_server_util   |
+-------------------+---------+---------------+------------------+
|03/05/2019 08:06:31|110      |0.68           |0.5537749999999892|
|03/05/2019 08:11:31|110      |0.58           |0.5537749999999892|
|03/05/2019 08:16:31|110      |0.55           |0.5537749999999892|
|03/05/2019 08:21:31|110      |0.63           |0.5537749999999892|
|03/05/2019 08:26:31|110      |0.63           |0.5537749999999892|
|03/05/2019 08:31:31|110      |0.71           |0.5537749999999892|
|03/05/2019 08:36:31|110      |0.67           |0.5537749999999892|
|03/05/2019 08:41:31|110      |0.55           |0.5537749999999892|
|03/05/2019 08:46:31|110      |0.37           |0.5537749999999892|
|03/05/2019 08:51:31|110      |0.7            |0.5537749999999892|
+-------------------+---------+---------------+------------------+
only showing top 10 rows



In [41]:
# Per-reading deviation from the server's own average: both the per-server
# average and the difference (reading minus average) are rounded to two
# decimals.
server_delta_sql = "SELECT event_datetime, server_id, cpu_utilization, \
                  ROUND(avg(cpu_utilization) OVER (PARTITION BY server_id), 2) avg_server_util, \
                  ROUND(cpu_utilization - avg(cpu_utilization) OVER (PARTITION BY server_id), 2) delta_server_util \
          FROM utilization"
server_delta = spark.sql(server_delta_sql)
server_delta.show(n=10, truncate=False)

+-------------------+---------+---------------+---------------+-----------------+
|event_datetime     |server_id|cpu_utilization|avg_server_util|delta_server_util|
+-------------------+---------+---------------+---------------+-----------------+
|03/05/2019 08:06:31|110      |0.68           |0.55           |0.13             |
|03/05/2019 08:11:31|110      |0.58           |0.55           |0.03             |
|03/05/2019 08:16:31|110      |0.55           |0.55           |0.0              |
|03/05/2019 08:21:31|110      |0.63           |0.55           |0.08             |
|03/05/2019 08:26:31|110      |0.63           |0.55           |0.08             |
|03/05/2019 08:31:31|110      |0.71           |0.55           |0.16             |
|03/05/2019 08:36:31|110      |0.67           |0.55           |0.12             |
|03/05/2019 08:41:31|110      |0.55           |0.55           |0.0              |
|03/05/2019 08:46:31|110      |0.37           |0.55           |-0.18            |
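|03/05/2019 08:51:31|110      |0.7            |0.55           |0.15             |
+-------------------+---------+---------------+---------------+-----------------+
only showing top 10 rows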

In [42]:
# Centered moving average of CPU utilization per server: each row is averaged
# with the reading before and the reading after it (fewer rows at the edges of
# a partition, e.g. the first row only has a following neighbour).
#
# event_datetime is a string column holding 'MM/dd/yyyy HH:mm:ss' text, so it
# is parsed with to_timestamp() before ordering. Sorting the raw text is
# lexicographic (month first, year last) and would put readings in the wrong
# order as soon as the data spans more than one calendar year.
spark.sql("""
    SELECT event_datetime, server_id, cpu_utilization,
           avg(cpu_utilization) OVER (
               PARTITION BY server_id
               ORDER BY to_timestamp(event_datetime, 'MM/dd/yyyy HH:mm:ss')
               ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
           ) avg_server_util
    FROM utilization
""").show(10, truncate=False)

+-------------------+---------+---------------+------------------+
|event_datetime     |server_id|cpu_utilization|avg_server_util   |
+-------------------+---------+---------------+------------------+
|03/05/2019 08:06:31|110      |0.68           |0.63              |
|03/05/2019 08:11:31|110      |0.58           |0.6033333333333334|
|03/05/2019 08:16:31|110      |0.55           |0.5866666666666666|
|03/05/2019 08:21:31|110      |0.63           |0.6033333333333334|
|03/05/2019 08:26:31|110      |0.63           |0.6566666666666666|
|03/05/2019 08:31:31|110      |0.71           |0.6699999999999999|
|03/05/2019 08:36:31|110      |0.67           |0.6433333333333333|
|03/05/2019 08:41:31|110      |0.55           |0.5300000000000001|
|03/05/2019 08:46:31|110      |0.37           |0.54              |
|03/05/2019 08:51:31|110      |0.7            |0.58              |
+-------------------+---------+---------------+------------------+
only showing top 10 rows



In [43]:
# Hand-check the first moving-average row: it has no preceding reading, so
# the window holds only the current and the next value.
edge_window_total = 0.68 + 0.58
edge_window_total / 2

0.63

In [44]:
# Hand-check the second moving-average row: previous, current and next value.
full_window_total = 0.68 + 0.58 + 0.55
full_window_total / 3

0.6033333333333334