In [None]:
# Spark Integration library for Python
import pyspark

# Matplotlib and numpy for data visualization - the `notebook` backend shows interactive plots in the notebook
%pylab --no-import-all notebook

import pyarrow as pa

In [None]:
def to_num(string):
    """Convert ``string`` to a number, preferring ``int`` over ``float``.

    Args:
        string: The value to parse (the notebook passes tab-separated
            substrings of a text line).

    Returns:
        The parsed ``int`` if ``int(string)`` succeeds, otherwise the parsed
        ``float``, or ``None`` when both conversions raise ``ValueError``.
    """
    try:
        return int(string)
    except ValueError:
        pass
    try:
        return float(string)
    except ValueError:
        # Only a failed parse means "not a number". A bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit and hide real errors.
        return None

# Creating new resilient distributed dataset from file on hdfs
def get_raw_rdd_from_file(hdfs_file_name, hdfs_root_path='hdfs://192.168.178.19:9000/'):
    """Open a text file on HDFS as an RDD, without any parsing or cleaning.

    The location is ``hdfs_root_path`` immediately followed by
    ``hdfs_file_name`` (plain concatenation, no separator is inserted).
    The file is read through the notebook-level Spark context ``sc``.
    """
    location = hdfs_root_path + hdfs_file_name
    return sc.textFile(location, minPartitions=1)

def clean_rdd(rdd):
    """Drop rows that are incomplete or outside the accepted value ranges.

    Rows are addressed by position. A row is kept only when all of the
    following hold:

    * it contains no ``None`` (i.e. every field was parsed to a number),
    * ``x[0]`` is greater than the start index (see below),
    * ``x[1]`` is greater than 0,
    * ``x[3]``, ``x[4]`` and ``x[5]`` each lie in ``0..1023`` inclusive.

    The start index is ``x[0]`` of the first row whose ``x[0]`` is not
    ``None``; that row and every row with a smaller-or-equal index are
    therefore removed.

    Args:
        rdd: An RDD-like object (needs ``filter`` and ``take``) of
            6-element sequences.

    Returns:
        The filtered RDD. It is empty when ``rdd`` is empty or when no row
        has a usable ``x[0]``.
    """
    # The very first row may carry an unparsable index (None). Comparing
    # numbers against None raises TypeError on Python 3, so take the start
    # index from the first row that actually has one.
    with_index = rdd.filter(lambda row: row[0] is not None).take(1)
    if not with_index:
        # No row has a usable index, so no row could pass the checks below.
        return rdd.filter(lambda row: False)
    start_index = with_index[0][0]

    return rdd.filter(lambda x: None not in x and
                                x[0] > start_index and
                                x[1] > 0 and
                                0 <= x[3] <= 1023 and
                                0 <= x[4] <= 1023 and
                                0 <= x[5] <= 1023)

# Creating new resilient distributed dataset from file on hdfs, fitting lines into matching format
def get_rdd_from_file(hdfs_file_name, hdfs_root_path='hdfs://192.168.178.19:9000/'):
    """Load a tab-separated text file from HDFS as a cleaned RDD of tuples.

    Processing steps:
    1. Read the file at ``hdfs_root_path + hdfs_file_name`` via ``sc``.
    2. Split every line on tab characters.
    3. Keep only lines that yield exactly 6 fields.
    4. Convert each field with ``to_num`` and pack the row into a tuple.
    5. Pass the result through ``clean_rdd``.
    """
    lines = sc.textFile(hdfs_root_path + hdfs_file_name, 1)
    fields = lines.map(lambda line: line.split('\t'))
    six_fields = fields.filter(lambda parts: len(parts) == 6)
    records = six_fields.map(lambda parts: tuple(to_num(part) for part in parts))
    return clean_rdd(records)

In [None]:
# Creating Spark Context.
# getOrCreate() reuses an already active context, so this cell can be re-run
# without failing because a context already exists.
sc = pyspark.SparkContext.getOrCreate()
print('Spark connection established')

# Starting Spark session, so data can be processed as data frame
spark = pyspark.sql.SparkSession(sc)
print('Spark session enabled')

# Connecting to HDFS
# NOTE(review): this connects to host 'master', while the loader functions
# default to 'hdfs://192.168.178.19:9000/' - confirm both are the same namenode.
fs = pa.hdfs.connect('master', 9000)
print('HDFS Connection established')

In [None]:
# List the entries under '/flume' through the HDFS connection `fs`.
# The first entry of this list is the file loaded in the next cell.
filenames = fs.ls('/flume')

In [None]:
# Load the first listed file into a parsed and cleaned Spark RDD
test_rdd = get_rdd_from_file(filenames[0])
# Show the first cleaned record as a quick sanity check
test_rdd.first()

In [None]:
# Turn the RDD of 6-tuples into a Spark DataFrame, naming the columns by position.
# NOTE(review): only column names are given, so column types are inferred from the data;
# to_num can yield int or float for the same column - confirm inference copes with that.
test_df = test_rdd.toDF(schema=['index', 'timestamp', 'temperature', 'red', 'green', 'blue'])

In [None]:
# Inspect the type of the object returned by toDF
type(test_df)

In [None]:
# Number of rows in the DataFrame before the temperature filter below is applied
test_df.count()

In [None]:
# Discard rows whose temperature is not strictly between -30 and 50
test_df = test_df.where((test_df['temperature'] > -30) &
                        (test_df['temperature'] < 50))

In [None]:
# Convert the filtered Spark DataFrame into a pandas DataFrame for plotting
pd_df = test_df.toPandas()

In [None]:
# Timestamps of the first and the last row, in row order
first_ts = pd_df['timestamp'].values[0]
last_ts = pd_df['timestamp'].values[-1]

In [None]:
# Keep only rows whose timestamp lies strictly between the first and last timestamp
pd_df = pd_df.loc[(pd_df['timestamp'] > first_ts) & (pd_df['timestamp'] < last_ts)]

In [None]:
# Two stacked panels sharing the x axis: temperature on top, light channels below.
# NOTE(review): the x value is timestamp / 1000 / 60 - presumably milliseconds
# converted to minutes; confirm the timestamp unit.
f, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
axes[0].plot(pd_df['timestamp'] / 1000 / 60, pd_df['temperature'], 'm', label='Temperature [°C]')
for column, fmt, label in (('red',   'r', 'Red value in light'),
                           ('green', 'g', 'Green value in light'),
                           ('blue',  'b', 'Blue value in light')):
    axes[1].plot(pd_df['timestamp'] / 1000 / 60, pd_df[column], fmt, label=label)
for axis in axes:
    axis.grid()
    axis.legend()

In [None]:
# Stopping Spark Context; cells that use `sc` cannot run after this until it is created again
sc.stop()

In [25]:
# Three lists of integers, one per name prefix (bene, flo, domi).
# The next cell uses them to work out, for each name, which numbers in 1..499
# appear in neither of the other two lists.
# NOTE(review): what the numbers identify is not stated here - confirm with the author.
bene_numbers = [10,11,12,23,24,29,30,31,35,36,39,40,41,42,46,47,48,49,50,51,56,57,62,72,73,79,80,83,86,87,88,89,90,91,96,97,100,101,107,115,116,117,118,119,120,121,123,125,126,127,136,137,138,139,140,141,142,145,151,152,153,154,161,162,163,164,165,166,169,170,171,172,173,174,175,177,178,179,180,181,183,184,185,187,188,189,191,192,193,197,199,201,203,206,207,208,209,212,213,214,216,217,218,219,220,221,222,223,224,225,226,227,230,231,232,233,237,238,239,240,241,243,244,245,249,250,251,270,271,272,273,274,275,282,283,284,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,304,305,306,309,310,311,312,314,315,316,317,318,319,324,328,329,330,331,332,334,336,337,343,344,345,346,347,348,349,350,351,353,354,357,358,361,362,369,370,377,378,379,380,381,382,383,384,386,396,397,398,401,402,403,404,405,406,407,408,409,410,411,415,416,417,418,419,424,427,428,431,432,433,438,439,440,441,442,446,447,448,451,452,455,456,457,458,459,460,462,463,465,466,467,468,469,472,473,474,475,476,478,484,485,487,488]
flo_numbers  = [9,13,14,15,27,28,29,30,31,32,33,34,41,42,43,44,45,46,47,52,53,54,55,56,57,60,61,62,69,70,72,73,74,75,76,79,80,84,85,88,89,95,106,107,108,109,110,113,114,115,118,119,120,121,123,125,127,132,140,143,145,146,147,148,150,161,162,163,164,165,166,167,168,169,170,171,172,175,176,182,183,184,185,186,187,188,189,190,194,195,200,202,203,204,205,206,207,208,209,210,211,213,214,218,219,223,224,225,228,229,230,231,232,238,239,240,242,243,244,245,249,250,270,271,272,276,277,278,279,283,284,290,291,292,298,300,301,303,311,312,313,314,315,316,317,318,319,322,323,324,325,326,331,332,334,336,337,339,340,341,342,343,344,345,346,349,350,357,358,360,369,373,377,378,379,380,381,382,383,384,395,398,403,404,405,406,407,408,409,410,411,415,416,417,418,419,424,429,431,432,433,437,438,439,440,441,442,446,447,448,452,455,456,457,458,460,463,465,466,468,469,472,473,474,475,478,484,485,487,488]
domi_numbers = [32,33,34,35,36,45,60,61,62,69,70,83,84,85,95,106,108,113,114,115,116,117,118,119,120,121,131,140,141,143,145,165,166,167,168,169,170,171,172,173,174,175,176,185,186,190,193,199,203,206,207,208,209,210,211,213,214,216,217,222,223,224,225,227,228,229,230,231,234,237,240,241,243,244,245,249,250,251,254,256,257,270,271,272,280,281,282,283,284,287,288,289,290,291,292,298,307,308,309,310,314,320,321,322,323,324,330,331,332,336,337,338,345,346,350,357,358,359,369,371,372,373,377,378,379,380,381,382,383,384,393,394,395,406,407,408,409,410,411,415,416,417,418,419,424,431,432,433,438,439,440,441,442,446,447,448,455,456,457,458,463,465,468,469,472,473,475,478,484,485,487,488]

In [26]:
def _not_listed_by(*number_lists):
    """Return the numbers 1..499, as strings in ascending order, that occur in none of ``number_lists``."""
    listed = set().union(*number_lists)
    return [str(number) for number in range(1, 500) if number not in listed]


# Each *_release list holds the numbers that neither of the other two lists contains.
domi_release = _not_listed_by(bene_numbers, flo_numbers)
bene_release = _not_listed_by(domi_numbers, flo_numbers)
flo_release = _not_listed_by(bene_numbers, domi_numbers)

In [28]:
# Render bene_release as a single comma-separated string (shown as the cell output)
','.join(bene_release)

'1,2,3,4,5,6,7,8,10,11,12,16,17,18,19,20,21,22,23,24,25,26,37,38,39,40,48,49,50,51,58,59,63,64,65,66,67,68,71,77,78,81,82,86,87,90,91,92,93,94,96,97,98,99,100,101,102,103,104,105,111,112,122,124,126,128,129,130,133,134,135,136,137,138,139,142,144,149,151,152,153,154,155,156,157,158,159,160,177,178,179,180,181,191,192,196,197,198,201,212,215,220,221,226,233,235,236,246,247,248,252,253,255,258,259,260,261,262,263,264,265,266,267,268,269,273,274,275,285,286,293,294,295,296,297,299,302,304,305,306,327,328,329,333,335,347,348,351,352,353,354,355,356,361,362,363,364,365,366,367,368,370,374,375,376,385,386,387,388,389,390,391,392,396,397,399,400,401,402,412,413,414,420,421,422,423,425,426,427,428,430,434,435,436,443,444,445,449,450,451,453,454,459,461,462,464,467,470,471,476,477,479,480,481,482,483,486,489,490,491,492,493,494,495,496,497,498,499'