## Initializing Spark

In [1]:
from pyspark import SparkConf, SparkContext

# Describe the application: a display name and a local master (no cluster needed).
conf = SparkConf().setAppName("appName").setMaster("local")
# Pass the configuration to the context: a bare SparkContext() silently ignores `conf`.
# getOrCreate reuses an already-running context, so re-running this cell does not fail.
sc = SparkContext.getOrCreate(conf=conf)

22/11/01 20:05:19 WARN Utils: Your hostname, Alexs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.130 instead (on interface en0)
22/11/01 20:05:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/01 20:05:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import os
# PYTHONHASHSEED must be "random" or a decimal integer in [0, 4294967295]; a Python
# process started with any other value (such as 'myseed') aborts at startup.
# A fixed integer keeps hashing reproducible across clusters.
# NOTE(review): this assignment runs after the SparkContext was created above —
# confirm that the workers actually pick the value up at this point.
os.environ["PYTHONHASHSEED"] = "0"

## Loading data

In [3]:
# Read the input file as an RDD with one string element per line of text.
DATA_PATH = "./data/data_ok.csv"
data_rdd = sc.textFile(DATA_PATH)

In [4]:
# Peek at the first record to see how a raw line is laid out.
data_rdd.take(1)

                                                                                

['18.0 1526.0 12.0 8377.0 1.0 7.0 1460.0 240.0 38.0 46336.0 29200.0 0.0 1.0 9.0 1332.0 1453.0 66.0 29696.0 26400.0 0.0 14.049179621571296 4637.634193080685 71.16666666666667 703.0 0.0 151.81657426718016 114.44444444444444 703.0 0.0 182.97061762365087 177.91666666666666 1016.0 0.0 333.0308940457159 639.5 1271.0 0.0 475.7358335603209 310.0 703.0 101.0 202.27868564598364 18.0 1526.0 13.0 9845.0 1.0 7.0 1460.0 240.0 38.0 46336.0 29200.0 0.0 1.0 9.0 1332.0 1468.0 66.0 29696.0 26400.0 0.0 14.51397157040704 5323.818410551563 68.90322580645162 703.0 0.0 149.86151776130927 114.44444444444444 703.0 0.0 182.97061762365087 164.30769230769232 1016.0 0.0 323.420037582799 639.5 1271.0 0.0 475.7358335603209 310.0 703.0 101.0 202.27868564598364 18.0 1526.0 14.0 11165.0 1.0 7.0 1460.0 240.0 38.0 46336.0 29200.0 0.0 1.0 9.0 1332.0 1468.0 66.0 29696.0 26400.0 0.0 14.980565057551118 5941.198473293163 66.75 703.0 0.0 147.98775287164813 114.44444444444444 703.0 0.0 182.97061762365087 152.57142857142858 1016.

# V1: mean and variance of one column with RDD map/reduce

In [5]:
def parse_column(line, index=3):
    """Return field `index` of a space-separated line as a float.

    The default of 3 selects the fourth field, the column analysed in this section.
    """
    return float(line.split(" ")[index])


# column_rdd feeds several separate actions below (take, reduce, count, collect).
# Without persistence every action re-reads and re-parses the text file, so keep
# the parsed values in memory after the first computation.
column_rdd = data_rdd.map(parse_column).cache()

In [6]:
# Show the first ten parsed values of the selected column.
column_rdd.take(10)

[8377.0,
 29632.0,
 107392.0,
 188032.0,
 261472.0,
 342112.0,
 5811.0,
 9414.0,
 8640.0,
 1428.0]

The mean is:

In [7]:
# Sum the column and count its elements in two separate actions, then divide.
column_total = column_rdd.reduce(lambda acc, value: acc + value)
column_size = column_rdd.count()
# Broadcast the mean so closures running on the workers can read it.
mean_v1 = sc.broadcast(column_total / column_size)
mean_v1.value

1409608.9152

The variance is:

In [8]:
def squared_deviation(value):
    """Squared distance between `value` and the broadcast column mean."""
    return (value - mean_v1.value) ** 2


# Population variance: sum of squared deviations divided by the element count.
squared_deviation_total = column_rdd.map(squared_deviation).reduce(lambda acc, term: acc + term)
variance_v1 = squared_deviation_total / column_rdd.count()
variance_v1

5883755024032.205

# V2: the same statistics with NumPy on the collected column

In [9]:
import numpy as np

In [10]:
# Bring every value of the column back to the driver, then wrap it in a NumPy array.
column_values = column_rdd.collect()
data_nparray = np.array(column_values)

In [11]:
# Driver-side mean of the collected column, for comparison with the V1 result.
data_nparray.mean()

1409608.9152

In [12]:
# NumPy's default (ddof=0) is the population variance, i.e. it divides by the
# element count just like the V1 computation does.
data_nparray.var()

5883755024032.192

# V3: mean and variance of every column at once with key-value RDDs

In [13]:
def create_rdd(line):
    """Yield one (column_index, raw_text) pair per space-separated field of `line`.

    The field is left as a string; the cells that consume these pairs convert
    it to float themselves.
    """
    for position, field in enumerate(line.split(" ")):
        yield position, field

In [14]:
# One (column_index, raw_text) record per field of every line. The RDD is reused by
# three separate jobs below (countByKey, aggregateByKey, reduceByKey), so cache it
# to avoid re-reading and re-splitting the file for each of them.
column_number_rdd = data_rdd.flatMap(create_rdd).cache()

In [15]:
# Number of records seen for each column index, returned to the driver as a mapping.
records_per_column = column_number_rdd.countByKey()
# Broadcast the mapping so worker-side closures can look counts up by column index.
count_by_key = sc.broadcast(records_per_column)
count_by_key.value.values()

dict_values([5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 500

In [16]:
def add_field(running_total, raw_field):
    """Parse one raw text field and add it to a running total."""
    return running_total + float(raw_field)


def merge_totals(left, right):
    """Combine two partial totals of the same column."""
    return left + right


# Per-column sum: start every key at 0, parse-and-add each field, then merge partials.
sum_per_column_rdd = column_number_rdd.aggregateByKey(0, add_field, merge_totals)
sum_per_column_rdd.take(5)

                                                                                

[(0, 2139192.0), (2, 5033856.0), (4, 5000.0), (6, 7300000.0), (8, 701197.0)]

In [17]:
def to_column_mean(column_sum):
    """Turn a (column_index, sum) pair into a (column_index, mean) pair."""
    column, total = column_sum
    return column, total / count_by_key.value[column]


mean_rdd = sum_per_column_rdd.map(to_column_mean)
# Spot-check column 3, the column analysed in the earlier sections.
mean_rdd.lookup(3)

[1409608.9152]

In [18]:
# Collect the per-column means into a driver-side dict, then broadcast it.
means_by_column = mean_rdd.collectAsMap()
mean_dict = sc.broadcast(means_by_column)
mean_dict.value[3]

1409608.9152

In [19]:
def to_squared_deviation(record):
    """Map (column_index, raw_text) to (column_index, squared deviation from that column's mean)."""
    column, raw_field = record
    return column, (float(raw_field) - mean_dict.value[column]) ** 2


def to_column_variance(column_sum):
    """Divide a column's summed squared deviations by that column's record count."""
    column, total = column_sum
    return column, total / count_by_key.value[column]


# Population variance per column: squared deviations summed by key, then averaged.
var_rdd = (
    column_number_rdd
    .map(to_squared_deviation)
    .reduceByKey(lambda left, right: left + right)
    .map(to_column_variance)
)
var_rdd.lookup(3)

                                                                                

[5883755024032.205]