In [25]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit,ParamGridBuilder

# Train a random-forest classifier on the iris dataset and pick its
# hyper-parameters with a train/validation split over a small parameter grid.

# One seed shared by every randomized step (outer split, forest construction,
# inner validation split) so that reruns select the same "best" model.
SEED = 1

# Start a Spark session, or reuse the one that is already running.
spark = SparkSession.builder.appName('Spark DataFrame Introduction').getOrCreate()

# Load the CSV; the first row is the header and column types are inferred.
irisDF = (
    spark.read
    .option('inferSchema', 'true')
    .option('header', 'true')
    .option('delimiter', ',')
    .csv('datasets/iris-dataset.txt')
)

# Encode the string 'class' column as a numeric 'label' column.
indexer = StringIndexer(inputCol='class', outputCol='label')
indexerModel = indexer.fit(irisDF)
irisDF = indexerModel.transform(irisDF)

# Pack the four measurement columns into a single 'features' vector column.
vec = VectorAssembler(
    inputCols=['sepal-length', 'sepal-width', 'petal-length', 'petal-width'],
    outputCol='features',
)
irisDF = vec.transform(irisDF)

# Keep only the two columns the classifier and evaluator read.
irisDF = irisDF.select('features', 'label')

# Outer 80/20 split; testDF is not used below and stays available for a
# final evaluation of the selected model.
trainDF, testDF = irisDF.randomSplit([0.8, 0.2], seed=SEED)

rf = RandomForestClassifier(seed=SEED)
eva = MulticlassClassificationEvaluator(metricName='accuracy')

# 3 * 3 * 2 = 18 parameter combinations to try.
myParams = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [1, 2, 3])
    .addGrid(rf.maxDepth, [4, 5, 6])
    .addGrid(rf.impurity, ['gini', 'entropy'])
    .build()
)

validator = TrainValidationSplit(
    parallelism=4,  # evaluate up to 4 parameter combinations concurrently
    estimator=rf,
    estimatorParamMaps=myParams,
    # Fraction of trainDF used for fitting; the remainder is the validation
    # set (here: 80% of the 80% training split).
    trainRatio=0.8,
    evaluator=eva,
    seed=SEED,
)
model = validator.fit(trainDF)

print("Best Model hesaplandı")
# getNumTrees is a property on the fitted forest model, hence no call
# parentheses; getMaxDepth and getImpurity are regular getter methods.
print("Best NumTrees : ", model.bestModel.getNumTrees)
print("Best MaxDepth : ", model.bestModel.getMaxDepth())
print("Best Impurity : ", model.bestModel.getImpurity())



Best Model hesaplandı
Best NumTrees :  3
Best MaxDepth :  4
Best Impurity :  gini


DataFrame[features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]