In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.6.tgz
!tar xvf spark-2.4.4-bin-hadoop2.6.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.6"
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName="PySpark_dataframe")

# Simple Statistical Analysis

In [None]:
import pyspark
# Reuse the SparkContext when one is already running (for example the one started by
# the setup cell). Only one context may be active at a time, so calling the
# SparkContext constructor again raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("rdd_statistical"))

## 1st Statistical Moment

The mean is the average of all values in the dataset.
$$Mean = \frac{1}{n}\sum_{i=1}^{n}a_i = \frac{1}{n}(a_1 + a_2 + \cdots + a_n)$$

In [None]:
# Distribute the integers 0..99 as the sample data set.
rdd = sc.parallelize([value for value in range(100)])

In [None]:
# Mean = (sum of all elements) / (number of elements).
n = rdd.count()
total = rdd.sum()
mean = total / n
print(mean)

49.5


The median is the middle number in a sequence of numbers.
1. Sort the list.
2. Pick the middle number.

In [None]:
# 0..99 with 101 placed in front and 102 at the end, so the input is not already sorted.
rdd = sc.parallelize([101, *range(100), 102])

In [None]:
# Step 1: order the values ascending and bring them back to the driver for display.
rdd.sortBy(lambda value: value).collect()

We need to add the index to each entry.

In [None]:
# zipWithIndex turns every sorted value into a (value, index) pair.
rdd.sortBy(lambda value: value).zipWithIndex().collect()

In [None]:
# Flip each (value, index) pair into (index, value) so the position becomes the key.
rdd.sortBy(lambda value: value).zipWithIndex().map(lambda pair: pair[::-1]).collect()

Notice that after the swap the key is the index and the value is the data point.

In [None]:
# Build an (index, value) RDD over the sorted data so elements can be fetched by position:
# zipWithIndex yields (value, index) and the map flips each pair.
sorted_list = rdd.sortBy(lambda x: x).zipWithIndex().map(lambda x: (x[1], x[0]))
n = sorted_list.count()
# Integer division keeps the lookup keys as ints (n / 2 is a float in Python 3).
middle = n // 2
if n % 2 == 1:
  # Odd count: the single middle element is the median.
  # lookup returns a list of matching values, so take its only element.
  median = sorted_list.lookup(middle)[0]
else:
  # Even count: the median is the average of the two elements around the centre.
  lower = sorted_list.lookup(middle - 1)[0]
  upper = sorted_list.lookup(middle)[0]
  median = (lower + upper) / 2.0
print(median)

50.5


Let's add some outliers.

In [None]:
# Same unsorted data as before, extended with two large outliers (1000 and 10000).
rdd = sc.parallelize([101, *range(100), 102, 1000, 10000])

In [None]:
# Mean of the data set that now contains the outliers.
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
print(mean)

155.31730769230768


In [None]:
# Build an (index, value) RDD over the sorted data so elements can be fetched by position:
# zipWithIndex yields (value, index) and the map flips each pair.
sorted_list = rdd.sortBy(lambda x: x).zipWithIndex().map(lambda x: (x[1], x[0]))
n = sorted_list.count()
# Integer division keeps the lookup keys as ints (n / 2 is a float in Python 3).
middle = n // 2
if n % 2 == 1:
  # Odd count: the single middle element is the median.
  # lookup returns a list of matching values, so take its only element.
  median = sorted_list.lookup(middle)[0]
else:
  # Even count: the median is the average of the two elements around the centre.
  lower = sorted_list.lookup(middle - 1)[0]
  upper = sorted_list.lookup(middle)[0]
  median = (lower + upper) / 2.0
print(median)

51.5


As we can see, the mean gets pulled into the direction of the outliers, whereas the median is far more resistant to the outliers. Isn't that amazing?

## 2nd Statistical Moment:
### Standard Deviation
* How wide the data is spread around the mean.
* If the $std$ is low most of the data points in the list are close to the mean.
* If the $std$ is high then the data points in the list are much wider spread around the mean.
$$std = \sqrt{\frac{1}{N}\sum_{i=1}^{N}(x_i - \hat{x})^2}$$
where $\hat{x}$ is the mean and $x_i$ is the data point

In [None]:
rdd = sc.parallelize([value for value in range(100)])
# Mean of the data set.
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
# Square each data point's deviation from the mean and add the squares up.
rdd.map(lambda x: (x - mean) ** 2).sum()

83325.0

In [None]:
# Dividing the sum of squared deviations by the element count "n" gives the variance.
rdd.map(lambda x: (x - mean) ** 2).sum() / n

833.25

In [None]:
from math import sqrt
# The standard deviation is the square root of the variance.
variance = rdd.map(lambda x: (x - mean) ** 2).sum() / n
std = sqrt(variance)

In [None]:
# Show the standard deviation computed in the previous cell.
print(std)

28.86607004772212


The value of the standard deviation is high, which makes sense as our data ranges from 0 to 99.

In [None]:
# One hundred identical values: every point equals the mean.
rdd = sc.parallelize([49] * 100)
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
print("The mean is:", mean)
# Standard deviation: square root of the mean squared deviation.
squared_dev_sum = rdd.map(lambda x: (x - mean) ** 2).sum()
std = sqrt(squared_dev_sum / n)
print("The std is: ",std)

The mean is: 49.0
The std is:  0.0


In [None]:
# One hundred identical values plus a single moderate outlier (100).
rdd = sc.parallelize([49] * 100 + [100])
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
print("The mean is:", mean)
# Standard deviation: square root of the mean squared deviation.
squared_dev_sum = rdd.map(lambda x: (x - mean) ** 2).sum()
std = sqrt(squared_dev_sum / n)
print("The std is: ",std)

The mean is: 49.504950495049506
The std is:  5.049504950495049


In [None]:
# One hundred identical values plus a single large outlier (1000).
rdd = sc.parallelize([49] * 100 + [1000])
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
print("The mean is:", mean)
# Standard deviation: square root of the mean squared deviation.
squared_dev_sum = rdd.map(lambda x: (x - mean) ** 2).sum()
std = sqrt(squared_dev_sum / n)
print("The std is: ",std)

The mean is: 58.415841584158414
The std is:  94.15841584158416


### variance
It is the standard deviation to the power of two.

## 3rd Statistical Moment: Skewness
* How asymmetric data is spread around the mean.
* The skew value can be positive, negative or undefined.

**negative Skew** indicates that the tail is on the left side of the distributions.

**positive Skew** indicates that the tail is on the right side of the distributions.

**zero Skew** means that the tail in both sides are balanced.

In [None]:
# Evenly spread data 0..99: symmetric around its mean.
rdd = sc.parallelize([value for value in range(100)])
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
std = sqrt(rdd.map(lambda x: (x - mean) ** 2).sum() / n)
# Skewness: mean cubed deviation from the mean, divided by the cubed standard deviation.
cubed_dev_sum = rdd.map(lambda x: (x - mean) ** 3).sum()
skew = (1 / n) * cubed_dev_sum / std ** 3
print("The mean is: ",mean)
print("The std is: ",std)
print("The skew is: ",skew)

The mean is:  49.5
The std is:  28.86607004772212
The skew is:  0.0


In [None]:
# Negative skew: a thousand copies of 1000 dominate, leaving 0..99 as a tail on the left.
rdd = sc.parallelize([*range(100)] + [1000] * 1000)
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
std = sqrt(rdd.map(lambda x: (x - mean) ** 2).sum() / n)
# Skewness: mean cubed deviation from the mean, divided by the cubed standard deviation.
cubed_dev_sum = rdd.map(lambda x: (x - mean) ** 3).sum()
skew = (1 / n) * cubed_dev_sum / std ** 3
print("The mean is: ",mean)
print("The std is: ",std)
print("The skew is: ",skew)

The mean is:  913.5909090909091
The std is:  273.38811224585896
The skew is:  -2.8513343104247486


In [None]:
# Positive skew: a thousand copies of -1000 dominate, leaving 0..99 as a tail on the right.
rdd = sc.parallelize([*range(100)] + [-1000] * 1000)
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
std = sqrt(rdd.map(lambda x: (x - mean) ** 2).sum() / n)
# Skewness: mean cubed deviation from the mean, divided by the cubed standard deviation.
cubed_dev_sum = rdd.map(lambda x: (x - mean) ** 3).sum()
skew = (1 / n) * cubed_dev_sum / std ** 3
print("The mean is: ",mean)
print("The std is: ",std)
print("The skew is: ",skew)

The mean is:  -904.5909090909091
The std is:  301.8355450920072
The skew is:  2.850385714443368


## 4th Statistical Moment: Kurtosis

* Reports on the shape of the data.
* Indicates outliers content within the data.
* The higher the Kurtosis measure is the more outliers are present and the longer the tails of the distribution.

In [None]:
# Evenly spread data 0..99.
rdd = sc.parallelize([value for value in range(100)])
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
std = sqrt(rdd.map(lambda x: (x - mean) ** 2).sum() / n)
# Skewness and kurtosis: mean 3rd / 4th power deviation, scaled by std to the same power.
cubed_dev_sum = rdd.map(lambda x: (x - mean) ** 3).sum()
skew = (1 / n) * cubed_dev_sum / std ** 3
fourth_dev_sum = rdd.map(lambda x: (x - mean) ** 4).sum()
ku = (1 / n) * fourth_dev_sum / std ** 4
print("The mean is: ",mean)
print("The std is: ",std)
print("The skew is: ",skew)
print("The kurtosis is: ",ku)

The mean is:  49.5
The std is:  28.86607004772212
The skew is:  0.0
The kurtosis is:  1.7997599759975997


In [None]:
# 0..99 plus a thousand copies of 1000.
rdd = sc.parallelize([*range(100)] + [1000] * 1000)
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
std = sqrt(rdd.map(lambda x: (x - mean) ** 2).sum() / n)
# Skewness and kurtosis: mean 3rd / 4th power deviation, scaled by std to the same power.
cubed_dev_sum = rdd.map(lambda x: (x - mean) ** 3).sum()
skew = (1 / n) * cubed_dev_sum / std ** 3
fourth_dev_sum = rdd.map(lambda x: (x - mean) ** 4).sum()
ku = (1 / n) * fourth_dev_sum / std ** 4
print("The mean is: ",mean)
print("The std is: ",std)
print("The skew is: ",skew)
print("The kurtosis is: ",ku)

The mean is:  913.5909090909091
The std is:  273.38811224585896
The skew is:  -2.8513343104247486
The kurtosis is:  9.142332316054704


In [None]:
# 0..99 plus a thousand copies of -1000.
rdd = sc.parallelize([*range(100)] + [-1000] * 1000)
n = rdd.count()
total = rdd.sum()
mean = total / float(n)
std = sqrt(rdd.map(lambda x: (x - mean) ** 2).sum() / n)
# Skewness and kurtosis: mean 3rd / 4th power deviation, scaled by std to the same power.
cubed_dev_sum = rdd.map(lambda x: (x - mean) ** 3).sum()
skew = (1 / n) * cubed_dev_sum / std ** 3
fourth_dev_sum = rdd.map(lambda x: (x - mean) ** 4).sum()
ku = (1 / n) * fourth_dev_sum / std ** 4
print("The mean is: ",mean)
print("The std is: ",std)
print("The skew is: ",skew)
print("The kurtosis is: ",ku)

The mean is:  -904.5909090909091
The std is:  301.8355450920072
The skew is:  2.850385714443368
The kurtosis is:  9.134733566834258


$$cov(X,Y) = \frac{1}{n}\sum_{i=1}^n(x_i - \hat{x})(y_i - \hat{y})$$

In [None]:
# Two identical columns, 0..99 each.
rddx = sc.parallelize([value for value in range(100)])
rddy = sc.parallelize([value for value in range(100)])
# Pair the i-th element of x with the i-th element of y.
rddxy = rddx.zip(rddy)
n = rddxy.count()
# Mean of each column.
meanx = rddx.sum() / rddx.count()
meany = rddy.sum() / rddy.count()
# Standard deviation of each column.
squared_dev_sum_x = rddx.map(lambda x: (x - meanx) ** 2).sum()
squared_dev_sum_y = rddy.map(lambda x: (x - meany) ** 2).sum()
stdx = sqrt(squared_dev_sum_x / n)
stdy = sqrt(squared_dev_sum_y / n)
print("The stdx is: ",stdx)
print("The stdy is: ",stdy)


The stdx is:  28.86607004772212
The stdy is:  28.86607004772212


In [None]:
# Peek at the first ten (x, y) pairs produced by the zip.
rddxy.take(10)

[(0, 0),
 (1, 1),
 (2, 2),
 (3, 3),
 (4, 4),
 (5, 5),
 (6, 6),
 (7, 7),
 (8, 8),
 (9, 9)]

In [None]:
# Covariance: mean of the products of the paired deviations from each column's mean.
# The name `conv` is kept because the correlation cell reads it.
conv = rddxy.map(lambda pair: (pair[0] - meanx) * (pair[1] - meany)).sum() / n

**Measure of dependency** Correlation
* 1 totally correlated.
* 0 No interaction 
* -1 inverse dependence

$$corr(X,Y) = \frac{cov(X,Y)}{\sigma_{x} \cdot \sigma_{y}}$$

In [None]:
# Correlation: the covariance normalised by the product of the two standard deviations.
corr = conv / (stdx * stdy)

In [None]:
# Display the correlation computed in the previous cell.
corr

1.0

In [None]:
rddx = sc.parallelize([value for value in range(100)])
# The second column holds the same values in reverse order (99..0).
rddy = sc.parallelize(list(range(100))[::-1])
# Pair the i-th element of x with the i-th element of y.
rddxy = rddx.zip(rddy)
n = rddxy.count()
# Mean of each column.
meanx = rddx.sum() / rddx.count()
meany = rddy.sum() / rddy.count()
# Standard deviation of each column.
squared_dev_sum_x = rddx.map(lambda x: (x - meanx) ** 2).sum()
squared_dev_sum_y = rddy.map(lambda x: (x - meany) ** 2).sum()
stdx = sqrt(squared_dev_sum_x / n)
stdy = sqrt(squared_dev_sum_y / n)
print("The stdx is: ",stdx)
print("The stdy is: ",stdy)
# Covariance: mean product of the paired deviations.
conv = rddxy.map(lambda pair: (pair[0] - meanx) * (pair[1] - meany)).sum() / n
print("The covariance is: ", conv)
# Correlation: covariance normalised by both standard deviations.
corr = conv / (stdx * stdy)
print("The correlation value is: ", corr)

The stdx is:  28.86607004772212
The stdy is:  28.86607004772212
The covariance is:  -833.25
The correlation value is:  -1.0


In [None]:
# Generate random samples
import random
# Fix the seed so the two shuffled columns, and therefore the statistics printed
# by this cell, are the same on every run.
random.seed(42)
# Each column is an independent random permutation of 0..99.
rddx = sc.parallelize(random.sample(range(100),100))
rddy = sc.parallelize(random.sample(range(100),100))
# Pair the i-th element of x with the i-th element of y.
rddxy = rddx.zip(rddy)
n = rddxy.count()
# Calculate the mean value
meanx = rddx.sum()/rddx.count()
meany = rddy.sum()/rddy.count()
# Calculate the STD
stdx = sqrt(rddx.map(lambda x: pow(x - meanx, 2)).sum() / n)
stdy = sqrt(rddy.map(lambda x: pow(x - meany, 2)).sum() / n)
print("The stdx is: ",stdx)
print("The stdy is: ",stdy)
# Covariance: mean product of the paired deviations from each column's mean.
conv = (rddxy.map(lambda x:(x[0]-meanx) * (x[1]-meany)).sum())/n
print("The covariance is: ", conv)
# Correlation: covariance normalised by both standard deviations.
corr = (conv)/(stdx*stdy)
print("The correlation value is: ", corr)

The stdx is:  28.86607004772212
The stdy is:  28.86607004772212
The covariance is:  64.98
The correlation value is:  0.07798379837983799


In [None]:
from pyspark.mllib.stat import Statistics

def _flatten_row(nested):
  """Turn the nested zip result (((c1, c2), c3), c4) into the flat row (c1, c2, c3, c4)."""
  ((first, second), third), fourth = nested
  return (first, second, third, fourth)

# Four columns: ascending, ascending with an offset of 100, descending, and shuffled.
column1 = sc.parallelize(range(100))
column2 = sc.parallelize(range(100,200))
column3 = sc.parallelize(list(range(100))[::-1])
column4 = sc.parallelize(random.sample(range(100),100))
# Each zip nests the previous pair one level deeper, so flatten every row afterwards.
zipped = column1.zip(column2).zip(column3).zip(column4)
data = zipped.map(_flatten_row)
# Print the correlation matrix computed over the four columns.
print(Statistics.corr(data))

[[ 1.          1.         -1.         -0.12255626]
 [ 1.          1.         -1.         -0.12255626]
 [-1.         -1.          1.          0.12255626]
 [-0.12255626 -0.12255626  0.12255626  1.        ]]
