In [1]:
import findspark
findspark.init('/home/aditya/spark-3.1.1-bin-hadoop2.7')
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Treg").getOrCreate()

In [3]:
from pyspark.ml.classification import (DecisionTreeClassifier, 
                                       RandomForestClassifier,GBTClassifier)

In [7]:
df = spark.read.csv("College.csv", inferSchema=True, header=True)

In [10]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F.Undergrad: integer (nullable = true)
 |-- P.Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room.Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S.F.Ratio: double (nullable = true)
 |-- perc.alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad.Rate: integer (nullable = true)



In [20]:
from pyspark.sql.functions import countDistinct, count
df.select(countDistinct('Private')).show()
df.select(count('Private')).show()

+-----------------------+
|count(DISTINCT Private)|
+-----------------------+
|                      2|
+-----------------------+

+--------------+
|count(Private)|
+--------------+
|           777|
+--------------+



In [21]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [25]:
indr = StringIndexer(inputCol='Private',outputCol='PrivateIndex')
dfnew = indr.fit(df).transform(df)

In [46]:

dfnew = dfnew.withColumnRenamed('F.Undergrad','F_Undergrad')
dfnew = dfnew.withColumnRenamed('P.Undergrad','P_Undergrad')
dfnew = dfnew.withColumnRenamed('Room.Board','Room_Board')
dfnew = dfnew.withColumnRenamed('S.F.Ratio','S_F_Ratio')
dfnew = dfnew.withColumnRenamed('perc.alumni','perc_alumni')
dfnew = dfnew.withColumnRenamed('Grad.Rate','Grad_Rate')

dfnew.columns

['_c0',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate',
 'PrivateIndex']

In [49]:
asbler = VectorAssembler(inputCols=['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 "F_Undergrad",
 "P_Undergrad",
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate',], outputCol='features')

In [50]:
finaldata = asbler.transform(dfnew)

In [52]:
##split data into train and test
train_data, test_data = finaldata.select(['features','PrivateIndex']).randomSplit([0.7,0.3])

In [66]:
dt_classifier = DecisionTreeClassifier(labelCol='PrivateIndex')
rdf_classifier = RandomForestClassifier(labelCol='PrivateIndex',numTrees=100)
gbt_classifier = GBTClassifier(labelCol='PrivateIndex')

In [67]:
dt_model = dt_classifier.fit(train_data)
rdf_model = rdf_classifier.fit(train_data)
gbt_model = gbt_classifier.fit(train_data)

In [68]:
dt_results = dt_model.transform(test_data)
rdf_results = rdf_model.transform(test_data)
gbt_results = gbt_model.transform(test_data)

In [69]:
from pyspark.ml.evaluation import (BinaryClassificationEvaluator, 
                                   MulticlassClassificationEvaluator)

In [70]:
myeval = MulticlassClassificationEvaluator(predictionCol='prediction',
                                      labelCol='PrivateIndex', metricName='accuracy')

In [71]:
myeval.evaluate(dt_results)

0.905829596412556

In [72]:
myeval.evaluate(rdf_results)

0.9282511210762332

In [73]:
myeval.evaluate(gbt_results)

0.9237668161434978

In [65]:
rdf_model.featureImportances

SparseVector(17, {0: 0.0244, 1: 0.0584, 2: 0.1439, 3: 0.013, 4: 0.0134, 5: 0.2482, 6: 0.0777, 7: 0.1693, 8: 0.037, 9: 0.0036, 10: 0.0058, 11: 0.0175, 12: 0.0095, 13: 0.0874, 14: 0.0258, 15: 0.0478, 16: 0.0173})