In [1]:
import pyspark as ps
# Uses all 4 cores
sc = ps.SparkContext('local[4]')

In [2]:
from settings import *
import numpy as np

# Summary
We're loading the data from a file but we're going to assume that in fact the data lives on a hadoop cluster.  This means that the data is too big to clean using bash or python - we have to clean it using spark.  It would be a lot easier to clean on one machine, since we're assuming it's big data we'll have to use some spark magic

In [3]:
HOME

'/Users/Brian/workplace/truemotion_project'

In [4]:
DATA_DIR

'/data/data-science-ml'

In [5]:
train_data = sc.textFile('{0}/{1}/train.txt'.format(HOME, DATA_DIR)).cache()

In [6]:
train_data.take(2)

[u'1.635533 0.024848 0.432087 -0.361914 -0.074776 -0.693481 -0.229621 0.261503 -0.089421 -0.020431 -0.008612 0.139754 ',
 u'1.547694 0.008754 0.319101 -0.297440 -0.007617 -0.636042 -0.296480 0.262700 -0.118514 -0.085128 0.030408 0.118123 ']

In [7]:
# Give each line a unique id, without having to traverse the entire dataset.
# I'm assuming we have massive data in this demented format already and I need to be able to clean it up efficiently.
zipped = train_data.zipWithUniqueId()

In [8]:
# Collect the indices of the breaks.  This is massively smaller than the full dataset and we can 
# use it as a broadcast variable.

In [9]:
breaks = zipped.filter(lambda x: x[0] == '').collect()
breaks_list = sorted([v for (k, v) in breaks])
breaks_list_broadcast = sc.broadcast(np.array(breaks_list))

In [10]:
def f(x):
    '''
    Find the smallest value greater than x[1] in breaks_list_broadcast.values()
    Output is: block_id, (row_id, row)
    '''
    
    # Index of the first double line break after the given line
    i = np.argmax(breaks_list_broadcast.value > x[1])
    br = breaks_list_broadcast.value[i]
    
    return br, (x[1], x[0])

In [11]:
# Now we remove the whitespace lines and merge the values into a single list

combined = zipped.filter(lambda x: len(x[0]) > 0).map(f).combineByKey(lambda value: [value],
                           lambda x, value: x + [value],
                           lambda x, y: x + y
                          )



In [12]:
# The data format is now
# key - index of the double line break indicating the end of the block
# value - (index of the line, line)

# We want this to eventually be: 
# key - index of the *block*
# value - numpy array representing the block

tst_value = combined.take(1)

In [40]:
def mapper(x):
    # Construct the numpy array of data
    key = x[0]
    value = x[1]
    
    cols = len(value[0][1].strip().split(' '))
    rows = len(value)
    
    arr = np.empty((rows, cols))
    
    for i in range(len(value)):
        arr[i] = np.array(map(float, value[i][1].strip().split(' ')))
        
    # find the index of the block by looking it up in the broadcast array
    block_index = np.where(breaks_list_broadcast.value == key)[0][0]
    return block_index, arr

In [41]:
combined.map(mapper).take(3)

[(249,
  array([[ 0.301151, -0.592762,  0.15638 , -0.765473,  0.07254 , -0.434564,
          -0.196727,  0.301915, -0.158216, -0.21333 , -0.100951,  0.116247],
         [ 0.347257, -0.515353,  0.095802, -0.673797,  0.133813, -0.392264,
          -0.180933,  0.212392, -0.186571, -0.233595, -0.087982,  0.165806],
         [ 0.299408, -0.669139,  0.144305, -0.558871,  0.158865, -0.309973,
          -0.146647,  0.083578, -0.255492, -0.276738, -0.033544,  0.21064 ],
         [ 0.260817, -0.740615,  0.16282 , -0.529698,  0.222024, -0.305218,
          -0.155922,  0.055352, -0.308668, -0.329735, -0.03566 ,  0.253833],
         [ 0.258932, -0.701701,  0.141344, -0.431243,  0.266766, -0.291811,
          -0.121186, -0.058712, -0.327745, -0.293294, -0.047923,  0.322501],
         [ 0.376241, -0.714511,  0.11607 , -0.312707,  0.331248, -0.289035,
          -0.108383, -0.138205, -0.377107, -0.193136, -0.009548,  0.282938],
         [ 0.15448 , -0.59784 ,  0.069385, -0.123061,  0.415896, -0.319115,