In [1]:
!pip install pyspark



In [2]:
import random
from math import sqrt

In [3]:
# Import the PySpark entry points; fail early with a clear message if the
# package is missing from this environment.
try:
    from pyspark.sql import SparkSession
    from pyspark.mllib.stat import Statistics
    from pyspark import SparkContext, SparkConf
except ImportError as e:
    # Chain the original error explicitly so the traceback still shows which
    # import actually failed.
    raise ImportError('PySpark is not Configured') from e

# Get the active SparkContext, or create one configured with master 'local[*]'.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local[*]'))
# Get the active SparkSession, or build one.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Two equal-length integer samples, each distributed as an RDD.
# (Alternative input for either sample: sc.parallelize(range(100)))
RDD1, RDD2 = (
    sc.parallelize(values)
    for values in (
        list(range(1, 11)),
        [7, 6, 5, 4, 5, 6, 7, 8, 9, 10],
    )
)

In [5]:
def calculateMean(RDD):
    """Return the arithmetic mean of the elements of ``RDD``.

    Computed as ``RDD.sum() / RDD.count()``.

    Raises:
        ZeroDivisionError: if ``RDD.count()`` is 0.
    """
    return RDD.sum() / RDD.count()


def calculateStdDev(RDD, mean, numElems):
    """Return the population standard deviation of the elements of ``RDD``.

    Args:
        RDD: collection exposing ``map`` and ``sum`` (e.g. a Spark RDD of numbers).
        mean: the value the squared deviations are measured from.
        numElems: divisor for the summed squared deviations; the caller supplies
            the element count (no ``n - 1`` correction is applied).

    Raises:
        ZeroDivisionError: if ``numElems`` is 0.
    """
    squaredDeviations = RDD.map(lambda x: pow(x - mean, 2))
    return sqrt(squaredDeviations.sum() / numElems)

In [6]:
# Element count of RDD1, computed once and reused as the divisor for the
# standard deviations of both RDDs and for the covariance.
# NOTE(review): this assumes RDD2 has the same number of elements as RDD1.
numElems = RDD1.count()

In [7]:
# Mean of each sample.
RDD1_mean, RDD2_mean = (calculateMean(rdd) for rdd in (RDD1, RDD2))

# Standard deviation of each sample about its own mean, dividing by numElems.
RDD1_sd, RDD2_sd = (
    calculateStdDev(rdd, mean, numElems)
    for rdd, mean in ((RDD1, RDD1_mean), (RDD2, RDD2_mean))
)

In [8]:
# Show both means rounded to three decimal places, separated by a space.
print(*(round(value, 3) for value in (RDD1_mean, RDD2_mean)))

5.5 6.7


In [9]:
# Show both standard deviations rounded to three decimal places, separated by a space.
print(*(round(value, 3) for value in (RDD1_sd, RDD2_sd)))

2.872 1.792


In [10]:
# Population covariance: pair the samples element-wise, multiply each pair's
# deviations from their respective means, then average over numElems.
deviationProducts = RDD1.zip(RDD2).map(
    lambda pair: (pair[0] - RDD1_mean) * (pair[1] - RDD2_mean)
)
cov = deviationProducts.sum() / numElems

In [15]:
# Report the covariance rounded to five decimal places.
print(f'Co-Variance : {round(cov, 5)}')

Co-Variance : 3.65


In [12]:
# Correlation coefficient: covariance normalised by the product of the two
# standard deviations.
sdProduct = RDD1_sd * RDD2_sd
corr = cov / sdProduct

In [14]:
# Report the correlation coefficient rounded to five decimal places.
print(f'Correlation : {round(corr, 5)}')

Correlation : 0.70927


**Correlation Matrix**

In [10]:
# Four 100-element feature columns for the correlation-matrix demo:
#   Feature1: 0..99
#   Feature2: 100..199 (Feature1 shifted by 100)
#   Feature3: 99..0    (Feature1 reversed)
#   Feature4: a random permutation of 0..99
# Feature4 is drawn from a dedicated, seeded generator so the matrix is
# reproducible across kernel restarts without touching the global `random` state.
FEATURE4_SEED = 42

Feature1 = sc.parallelize(range(100))
Feature2 = sc.parallelize(range(100, 200))
Feature3 = sc.parallelize(list(reversed(range(100))))
Feature4 = sc.parallelize(random.Random(FEATURE4_SEED).sample(range(100), 100))

In [19]:
def _flattenRow(nested):
    """Flatten one ``(((f1, f2), f3), f4)`` record from the chained zips into ``[f1, f2, f3, f4]``."""
    ((f1, f2), f3), f4 = nested
    return [f1, f2, f3, f4]


# Chain-zipping nests the tuples one level deeper per zip; flatten each record
# into a single four-element row.
nestedFeatures = Feature1.zip(Feature2).zip(Feature3).zip(Feature4)
zippedFeatures = nestedFeatures.map(_flattenRow)

In [20]:
# Compute the correlation matrix across the four feature columns, then print
# it under a heading.
correlationMatrix = Statistics.corr(zippedFeatures)
print(f'Correlation Matrix:\n{correlationMatrix}')

Correlation Matrix:
[[ 1.         1.        -1.         0.0970297]
 [ 1.         1.        -1.         0.0970297]
 [-1.        -1.         1.        -0.0970297]
 [ 0.0970297  0.0970297 -0.0970297  1.       ]]
