In [3]:
# Read a DataFrame through the MongoDB Spark connector data source.
# No reader options are set here, so the connection URI / collection
# presumably come from the Spark session configuration -- TODO confirm.
df = (
    sqlContext.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)

In [4]:
import numpy as np

In [63]:
from collections import OrderedDict
from datetime import datetime

from pyspark.sql.functions import array, col, udf
from pyspark.sql.types import DateType, DoubleType, IntegerType

# Lower time bound: the query further down keeps only rows whose `time`
# is strictly later than this instant.
ref_date = datetime.strptime("2000-1-1", "%Y-%m-%d")


def _convert_to_dt(i1, i2, i3):
    """Build a date from year (i1), month (i2) and day (i3) parts.

    Returns None when any part is null, so a null row does not fail the job
    with a ValueError from strptime.
    """
    if i1 is None or i2 is None or i3 is None:
        return None
    # .date() so the produced object matches the declared DateType exactly.
    return datetime.strptime("%s-%s-%s" % (i1, i2, i3), "%Y-%m-%d").date()


def _is_missing_data(element):
    """Return 1 when the value is null, 0 otherwise.

    The check is an explicit ``is None``: a truthiness test would also count
    a genuine 0 / 0.0 reading as missing.
    """
    return 1 if element is None else 0


def _to_date(item):
    """Format a datetime as 'YYYY-MM-DD'; a null input stays null."""
    if item is None:
        return None
    return datetime.strftime(item, "%Y-%m-%d")


udf_convert_to_dt = udf(_convert_to_dt, returnType=DateType())
udf_is_missing_data = udf(_is_missing_data, returnType=IntegerType())

# No returnType given, so this UDF uses udf's default return type.
udf_to_date = udf(_to_date)

from pyspark.sql.functions import array, udf, round
from pyspark.sql.types import FloatType

def a(xs1):
    """Placeholder function: ignores ``xs1`` and returns the constant 1.0.

    The value is a float on purpose. This function is wrapped as a Spark UDF
    with FloatType / DoubleType return types, and PySpark does not coerce a
    Python int to those types -- the resulting column would be null.
    """
    return 1.0

# Both UDFs wrap the same function `a`; they differ only in the Spark return
# type they declare (FloatType vs DoubleType).
median_udf = udf(a, returnType=FloatType())
mean = udf(a, returnType=DoubleType())

# Daily summary of the WL readings recorded after `ref_date`.
#
# For each calendar day the result holds:
#   * "Missing points": sum of the per-row flag produced by
#     udf_is_missing_data on WL1-MEAN;
#   * "WL-MEAN": average over the day of the per-row mean of the three
#     WLx-MEAN values, rounded to 4 decimals.
#
# Only the columns that feed those two figures are selected, so nothing is
# read or aggregated just to be discarded afterwards.
# NOTE: `round` here is pyspark.sql.functions.round (imported above), not the
# Python builtin.
data = (
    df
    # Filter on the source column before projecting.
    .filter(col("time") > ref_date)
    .select(
        col("time"),
        round(col("WL1.WL1-MEAN"), 4).alias("WL1-MEAN"),
        round(col("WL2.WL2-MEAN"), 4).alias("WL2-MEAN"),
        round(col("WL3.WL3-MEAN"), 4).alias("WL3-MEAN"),
    )
    .withColumn("missing", udf_is_missing_data("WL1-MEAN"))
    .withColumn("date", udf_to_date("time"))
    .withColumn(
        "WL-MEAN",
        round((col("WL1-MEAN") + col("WL2-MEAN") + col("WL3-MEAN")) / 3, 4),
    )
    .groupBy("date")
    # Dict-style agg names its outputs "sum(missing)" and "avg(WL-MEAN)".
    .agg({"missing": "sum", "WL-MEAN": "mean"})
    .select(
        "date",
        col("sum(missing)").alias("Missing points"),
        round(col("avg(WL-MEAN)"), 4).alias("WL-MEAN"),
    )
    .orderBy("date")
)


data.show(20)
    

+----------+--------------+-------+
|      date|Missing points|WL-MEAN|
+----------+--------------+-------+
|2016-05-23|             8|   null|
|2016-05-24|            69| 2.2767|
|2016-05-25|            88| 2.3139|
|2016-05-26|            96|   null|
|2016-05-27|            64| 0.3946|
|2016-05-28|             1| 0.2333|
|2016-05-29|             0| 0.2643|
|2016-05-30|             1| 0.6755|
|2016-05-31|             0| 0.4205|
|2016-06-01|             0| 0.3694|
|2016-06-02|             1| 0.2767|
|2016-06-03|             0| 0.3441|
|2016-06-04|             0| 0.2782|
|2016-06-05|             1| 0.2776|
|2016-06-06|            14| 0.4987|
|2016-06-07|             0| 0.4154|
|2016-06-08|            10| 0.4063|
|2016-06-09|             0| 0.3973|
|2016-06-10|             0| 0.4263|
|2016-06-11|             0| 0.2742|
+----------+--------------+-------+
only showing top 20 rows

