### Dependencies
___

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType

### Create Session
____

In [8]:
# Location of the CSV dataset that the cells below read.
csv_file = "/tmp/walking/dataset.csv"

# Reuse the active SparkSession if one exists; otherwise start a new one
# under this application name.
spark = (
    SparkSession.builder
    .appName('sampleApplication')
    .getOrCreate()
)



### Create DataFrame
______

In [9]:
# Load the CSV, taking column names from its header line. No schema or
# type inference is requested, so every column comes back as a string.
df = spark.read.option("header", True).csv(csv_file)
df.printSchema()

root
 |-- time: string (nullable = true)
 |-- avg_rss12: string (nullable = true)
 |-- var_rss12: string (nullable = true)
 |-- avg_rss13: string (nullable = true)
 |-- var_rss13: string (nullable = true)
 |-- avg_rss23: string (nullable = true)
 |-- var_rss23: string (nullable = true)



In [10]:
# Preview the first ten rows only.
df.show(n=10)

+----+---------+---------+---------+---------+---------+---------+
|time|avg_rss12|var_rss12|avg_rss13|var_rss13|avg_rss23|var_rss23|
+----+---------+---------+---------+---------+---------+---------+
|   0|    35.00|     3.67|    16.50|     3.77|    14.00|     1.63|
| 250|    28.50|     3.35|    17.50|     3.77|    12.25|     3.90|
| 500|    35.50|     2.87|    15.75|     2.86|    17.75|     5.07|
| 750|    29.75|    12.19|    16.25|     2.17|    20.75|     2.59|
|1000|    27.00|     2.12|    16.75|     0.83|    24.75|     1.64|
|1250|    36.00|     5.61|    12.25|     2.28|    19.25|     2.68|
|1500|    38.50|     4.33|    16.00|     5.70|    15.25|     2.38|
|1750|    35.25|     2.49|    16.00|     1.41|    15.00|     2.83|
|2000|    35.75|     7.05|    11.00|     4.42|    12.50|     8.08|
|2250|    36.00|     7.04|    15.00|     2.12|    16.75|     4.02|
+----+---------+---------+---------+---------+---------+---------+
only showing top 10 rows



In [11]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# NOTE(review): the columns are still strings at this point, so min/max are
# presumably compared as text rather than numerically - confirm before
# relying on them.
(
    df
    .describe()
    .show()
)

+-------+----------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|summary|            time|        avg_rss12|         var_rss12|         avg_rss13|         var_rss13|         avg_rss23|         var_rss23|
+-------+----------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|  count|            7200|             7200|              7200|              7200|              7200|              7200|              7200|
|   mean|         59875.0|34.43367638888891| 4.337393055555574|15.394559722222217|3.2036083333333623|16.025058333333345| 3.343612500000015|
| stddev|34643.3468469953|4.808602849860674|2.4434541249050907| 2.912519317367833| 1.624393703577679|3.1124831754792432|1.7012372763482624|
|    min|               0|            12.50|              0.00|             10.00|              0.00|             10.00|              0.00|
|    max|           

#### Schema Casting
_____

In [18]:
# Explicit column types for the dataset: `time` is read as an integer and
# every signal column as a 32-bit float, instead of the all-string default.
#
# The fields are declared nullable on purpose. Spark's CSV reader treats a
# user-supplied schema as nullable regardless of what is declared here
# (printSchema below reports `nullable = true` for every column), so
# declaring nullable=False would be silently ignored and only mislead the
# reader into thinking missing values are rejected.
data_schema = [StructField('time', IntegerType(), True),
               StructField('avg_rss12', FloatType(), True),
               StructField('var_rss12', FloatType(), True),
               StructField('avg_rss13', FloatType(), True),
               StructField('var_rss13', FloatType(), True),
               StructField('avg_rss23', FloatType(), True),
               StructField('var_rss23', FloatType(), True),
              ]

final_struct = StructType(fields=data_schema)

# Re-read the same file with the explicit schema so the columns are typed.
df = spark.read.csv(path=csv_file, header=True, schema=final_struct)
df.printSchema()

root
 |-- time: integer (nullable = true)
 |-- avg_rss12: float (nullable = true)
 |-- var_rss12: float (nullable = true)
 |-- avg_rss13: float (nullable = true)
 |-- var_rss13: float (nullable = true)
 |-- avg_rss23: float (nullable = true)
 |-- var_rss23: float (nullable = true)

