In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name "Trees".
spark = (
    SparkSession.builder
    .appName("Trees")
    .getOrCreate()
)

In [None]:
# Read the College CSV: the first row is the header and column types are
# inferred from the file contents.
csv_reader = spark.read.options(header = True, inferSchema = True)
data = csv_reader.csv("/kaggle/input/pyspark-ml-trees/College.csv")

# Print the inferred column names and types.
data.printSchema()

In [None]:
# Preview the first 10 rows of the loaded frame.
data.show(n = 10)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# The 17 input columns that are packed into the single "features" vector column.
feature_columns = [
    "Apps", "Accept", "Enroll", "Top10perc", "Top25perc",
    "F_Undergrad", "P_Undergrad", "Outstate", "Room_Board",
    "Books", "Personal", "PhD", "Terminal", "S_F_Ratio",
    "perc_alumni", "Expend", "Grad_Rate",
]

assembler = VectorAssembler(inputCols = feature_columns, outputCol = "features")

# Append the assembled "features" column to the loaded data.
output = assembler.transform(data)

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string column "Private" as the numeric label "PrivateIndex".
indexer = StringIndexer(inputCol = "Private", outputCol = "PrivateIndex")

# Start from the raw `data` frame instead of reading and overwriting `output`:
# the previous form dropped "Private" from `output`, so running this cell a
# second time failed. Built this way, the cell can be re-run any number of times.
indexed = indexer.fit(data).transform(data)

# Keep only the two columns the classifiers consume.
output = assembler.transform(indexed).select("features", "PrivateIndex")

In [None]:
# Random train/test split with relative weights 0.7 / 0.3; the seed is fixed.
split_weights = [0.7, 0.3]
train, test = output.randomSplit(split_weights, seed = 42)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

In [None]:
# Settings shared by all three tree-based classifiers. The explicit seed makes
# the fitted models repeatable: without one, the estimators fall back to a
# default seed that is not guaranteed to be the same across kernel sessions.
shared_params = dict(labelCol = "PrivateIndex",
                     featuresCol = "features",
                     predictionCol = "prediction",
                     seed = 42)

dtc = DecisionTreeClassifier(**shared_params)  # single decision tree
rfc = RandomForestClassifier(**shared_params)  # random forest
gbt = GBTClassifier(**shared_params)           # gradient-boosted trees

In [None]:
# Fit each estimator on the training split, in the order tree -> forest -> GBT.
dtc_model, rfc_model, gbt_model = [
    estimator.fit(train) for estimator in (dtc, rfc, gbt)
]

In [None]:
# Apply every fitted model to the held-out test split.
dtc_predictions, rfc_predictions, gbt_predictions = [
    fitted.transform(test) for fitted in (dtc_model, rfc_model, gbt_model)
]

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator shared by all models: compares "prediction" against the
# "PrivateIndex" label and reports accuracy.
evaluator_params = {
    "labelCol": "PrivateIndex",
    "predictionCol": "prediction",
    "metricName": "accuracy",
}
evaluator = MulticlassClassificationEvaluator(**evaluator_params)

In [None]:
# Score each model's test-set predictions with the shared evaluator.
dtc_accuracy, rfc_accuracy, gbt_accuracy = [
    evaluator.evaluate(scored)
    for scored in (dtc_predictions, rfc_predictions, gbt_predictions)
]

In [None]:
# Report each model's accuracy as a percentage, one line per model with a
# dashed rule between consecutive lines.
report_lines = [
    "DecisionTreeClassifier: {}".format(dtc_accuracy*100),
    "RandomForestClassifier: {}".format(rfc_accuracy*100),
    "GradientBoostingClassifier: {}".format(gbt_accuracy*100),
]
rule = "\n" + "-"*50 + "\n"
print(rule.join(report_lines))

In [None]:
# Display the fitted decision tree's feature importances as the cell output.
dtc_model.featureImportances

In [None]:
################################################################################################################################