In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder,PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train a random forest on a PCA projection of the iris measurements and
# report its accuracy on a held-out 25% split.

spark = SparkSession.builder.getOrCreate()
irisDF = spark.read.option('header','true').option('inferSchema','true').csv('datasets/iris-dataset.txt')

# Encode the string 'class' column as the numeric 'label' column that the
# classifier and the evaluator read by default.
indexer = StringIndexer(inputCol='class',outputCol='label')
indexerModel = indexer.fit(irisDF)
irisDF = indexerModel.transform(irisDF)

# The first four columns are the numeric measurements; pack them into a
# single vector column ('features1') as input for PCA.
vec = VectorAssembler(inputCols=irisDF.columns[0:4],outputCol='features1')
irisDF = vec.transform(irisDF)
irisDF.show()

# Split BEFORE fitting PCA so that the held-out rows cannot influence the
# learned projection (fitting PCA on the full data leaks test information).
trainDF, testDF = irisDF.randomSplit([0.75,0.25],seed=1234)

# NOTE(review): k=1 keeps a single component, but the recorded
# explainedVariance output further down lists three values, which suggests it
# came from a run with k=3 -- confirm the intended k.
pca = PCA(inputCol='features1',outputCol='features',k=1)
pcaModel = pca.fit(trainDF)
trainDF = pcaModel.transform(trainDF)
testDF = pcaModel.transform(testDF)

# Keep irisDF carrying the projected 'features' column as well, so the
# inspection table below (and anything else reading irisDF) sees the same
# schema as before.
irisDF = pcaModel.transform(irisDF)
irisDF.select('features1','features','label').show(truncate=False)

# An explicit seed makes the forest (and therefore the reported accuracy)
# reproducible across kernel restarts. The classifier reads the default
# 'features' / 'label' columns.
rfClassifier = RandomForestClassifier(seed=1234)
model = rfClassifier.fit(trainDF)

resultDF = model.transform(testDF)
eva = MulticlassClassificationEvaluator(metricName='accuracy')
accuracy = eva.evaluate(resultDF)
print("Accuracy : ",accuracy)

+------------+-----------+------------+-----------+-----------+-----+-----------------+
|sepal-length|sepal-width|petal-length|petal-width|      class|label|        features1|
+------------+-----------+------------+-----------+-----------+-----+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|  0.0|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|  0.0|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|  0.0|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|  0.0|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|  0.0|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|  0.0|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|  0.0|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|  0.0|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|      

In [None]:
# Proportion of variance explained by each principal component of the fitted PCA model.
print(
    pcaModel.explainedVariance
)

[0.9246162071742686,0.05301556785053502,0.017185139525006728]


In [None]:
# Correlation test

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder,PCA
from pyspark.ml.stat import Correlation
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Reload the raw iris data so that 'features' is the plain 4-measurement
# vector, then print its Pearson and Spearman correlation matrices.
spark = SparkSession.builder.getOrCreate()
irisDF = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('datasets/iris-dataset.txt')
)

# String 'class' -> numeric 'label'.
indexer = StringIndexer(inputCol='class', outputCol='label')
indexerModel = indexer.fit(irisDF)
irisDF = indexerModel.transform(irisDF)

# Assemble the first four columns into the 'features' vector column.
vec = VectorAssembler(
    inputCols=irisDF.columns[0:4],
    outputCol='features',
)
irisDF = vec.transform(irisDF)
irisDF.show()

# Correlation.corr returns a one-row DataFrame; element 0 of that row is the
# correlation matrix. "pearson" is the method used when none is given.
r1 = Correlation.corr(irisDF, "features", "pearson").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(irisDF, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))

+------------+-----------+------------+-----------+-----------+-----+-----------------+
|sepal-length|sepal-width|petal-length|petal-width|      class|label|         features|
+------------+-----------+------------+-----------+-----------+-----+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|  0.0|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|  0.0|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|  0.0|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|  0.0|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|  0.0|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|  0.0|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|  0.0|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|  0.0|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|      



Spearman correlation matrix:
DenseMatrix([[ 1.        , -0.15945652,  0.88138639,  0.83442065],
             [-0.15945652,  1.        , -0.30342065, -0.27751107],
             [ 0.88138639, -0.30342065,  1.        ,  0.93600335],
             [ 0.83442065, -0.27751107,  0.93600335,  1.        ]])


In [None]:
from pyspark.ml.feature import MaxAbsScaler
# Rescale every component of 'features' by that component's largest absolute
# value, then show the original and rescaled vectors side by side.
scaler = MaxAbsScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)
scalerModel = scaler.fit(irisDF)
scaledData = scalerModel.transform(irisDF)

(
    scaledData
    .select(["features", "scaledFeatures"])
    .show(truncate=False)
)

+-----------------+-------------------------------------------------------------------------------+
|features         |scaledFeatures                                                                 |
+-----------------+-------------------------------------------------------------------------------+
|[5.1,3.5,1.4,0.2]|[0.6455696202531644,0.7954545454545454,0.20289855072463767,0.08000000000000002]|
|[4.9,3.0,1.4,0.2]|[0.620253164556962,0.6818181818181818,0.20289855072463767,0.08000000000000002] |
|[4.7,3.2,1.3,0.2]|[0.5949367088607594,0.7272727272727273,0.18840579710144928,0.08000000000000002]|
|[4.6,3.1,1.5,0.2]|[0.5822784810126581,0.7045454545454546,0.2173913043478261,0.08000000000000002] |
|[5.0,3.6,1.4,0.2]|[0.6329113924050632,0.8181818181818181,0.20289855072463767,0.08000000000000002]|
|[5.4,3.9,1.7,0.4]|[0.6835443037974683,0.8863636363636364,0.2463768115942029,0.16000000000000003] |
|[4.6,3.4,1.4,0.3]|[0.5822784810126581,0.7727272727272727,0.20289855072463767,0.12]               |
