# Powerful Exploratory Data Analysis with Machine Learning

In [1]:
# Locate the local Spark installation so that `pyspark` can be imported by the
# cells that follow (findspark adds it to the Python path).
import findspark
findspark.init()
# Display the Spark home directory that findspark resolved.
findspark.find()

'C:\\spark-3.2.1-bin-hadoop3.2'

In [2]:
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf

In [3]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if there is one; otherwise start a new session
# with default settings. Every later cell goes through this `spark` handle.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Configure a JSON reader in multi-line mode (the file holds one JSON document
# spread over many lines rather than one record per line), then load the file.
json_reader = spark.read.format("json").option("multiLine", "true")
df = json_reader.load("reviews.json")

In [5]:
# Bring the nested `paper.review` column to the driver and keep the value held
# in the first row's only column.
collected_rows = df.select('paper.review').collect()
review = collected_rows[0][0]

In [6]:
# `review` is a list of lists of review records (presumably one inner list per
# paper -- TODO confirm against the JSON schema). Flatten it once so that every
# column below is built from the same ordered sequence of records.
all_reviews = [record for paper_reviews in review for record in paper_reviews]

# Pull the fields of interest out of each record by position:
# 0 -> confidence, 1 -> evaluation, 4 -> orientation, 7 -> timespan.
confidence_col = [record[0] for record in all_reviews]
evaluation_col = [record[1] for record in all_reviews]
orientation_col = [record[4] for record in all_reviews]
timespan_col = [record[7] for record in all_reviews]

In [7]:
import pandas as pd
from pyspark.sql import SQLContext

# Initialise the column data: one list per review field, all in the same
# record order.
data = {
    'confidence': confidence_col,
    'evaluation': evaluation_col,
    'orientation': orientation_col,
    'timespan': timespan_col,
}

# Build the pandas frame in a single chain:
#   1. drop incomplete reviews first, because missing values cannot be cast to int;
#   2. cast the three score columns to integers with one `astype` mapping
#      (`timespan` is left as it is).
# The pandas stage gets its own name so that `review_data` only ever refers to
# the Spark DataFrame that the later cells work with.
review_pdf = (
    pd.DataFrame(data)
    .dropna()
    .astype({'confidence': int, 'orientation': int, 'evaluation': int})
)

# Hand the cleaned frame to Spark and preview the first rows.
review_data = spark.createDataFrame(data=review_pdf)
review_data.show()


+----------+----------+-----------+----------+
|confidence|evaluation|orientation|  timespan|
+----------+----------+-----------+----------+
|         4|         1|          0|2010-07-05|
|         4|         1|          1|2010-07-05|
|         5|         1|          1|2010-07-05|
|         4|         2|          1|2010-07-05|
|         4|         2|          0|2010-07-05|
|         4|         2|          0|2010-07-05|
|         4|         2|          1|2010-07-05|
|         3|         2|          1|2010-07-05|
|         3|         0|         -1|2010-07-05|
|         4|         2|          2|2010-07-05|
|         2|        -2|         -1|2010-07-05|
|         4|         2|          1|2010-07-05|
|         4|         2|          0|2010-07-05|
|         5|         2|          1|2010-07-05|
|         4|        -1|          0|2010-07-05|
|         4|        -2|         -1|2010-07-05|
|         4|         1|         -1|2010-07-05|
|         5|        -2|         -1|2010-07-05|
|         4| 

In [9]:
from pyspark.mllib.stat import Statistics

def _confidence_as_vector(row):
    """Wrap a row's first field (confidence) in a one-element list of ints."""
    return [int(row[0])]


# colStats works column-wise on an RDD of vectors, so each row is reduced to a
# single-element vector holding its confidence score.
confidencerd = review_data.rdd.map(_confidence_as_vector)

summary = Statistics.colStats(confidencerd)
# Mean of the only column, i.e. the mean confidence.
summary.mean()[0]

3.573200992555831

In [10]:
from math import sqrt 
# The summary exposes the variance only; its square root is the std. dev.
confidence_variance = summary.variance()[0]
sqrt(confidence_variance)  # std. dev.

0.8443410847884276

In [11]:
# Column-wise maximum from the colStats summary: the highest confidence score.
summary.max()

array([5.])

In [12]:
# Column-wise minimum from the colStats summary: the lowest confidence score.
summary.min()

array([1.])

### Using Pearson and Spearman coefficients to discover correlations

In [13]:
def _score_fields(row):
    """Keep a row's first three fields: confidence, evaluation, orientation."""
    return [row[0], row[1], row[2]]


# One vector of the three integer scores per review; `timespan` is left out.
metrics = review_data.rdd.map(_score_fields)
# Rank-based (Spearman) correlation matrix between the three score columns.
Statistics.corr(metrics, method="spearman")

array([[ 1.        , -0.04019695, -0.08341732],
       [-0.04019695,  1.        ,  0.78031129],
       [-0.08341732,  0.78031129,  1.        ]])

In [14]:
# Pearson correlation matrix over the columns held in `metrics`.
Statistics.corr(metrics, method="pearson")

array([[ 1.        , -0.03831504, -0.06309566],
       [-0.03831504,  1.        ,  0.76959781],
       [-0.06309566,  0.76959781,  1.        ]])

In [15]:
from pyspark.mllib.linalg import Vectors

# NOTE(review): these observations are hard-coded and are not derived from the
# review data loaded above -- confirm that they are the intended input.
observed_values = [0.13, 0.61, 0.8, 0.5, 0.3]
visitors_freq = Vectors.dense(observed_values)

# With a single vector, chiSqTest runs a goodness-of-fit test of the observed
# values against an expected distribution and returns a printable summary.
goodness_of_fit = Statistics.chiSqTest(visitors_freq)
print(goodness_of_fit)

Chi squared test summary:
method: pearson
degrees of freedom = 4 
statistic = 0.5852136752136753 
pValue = 0.9646925263439344 
No presumption against null hypothesis: observed follows the same distribution as expected..
