In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.\
    builder.\
    master("local").\
    appName('tree-methods-implementation-on-real-data').\
    getOrCreate()

In [2]:
df = spark.read.csv(
    path='D:/learn-ab/learning-PySpark/sample-data/college.csv',
    inferSchema=True,
    header=True
)
df.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [3]:
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [4]:
cols = df.columns
vals = [val for val in df.head(1)[0]]
for col, val in zip(cols, vals):
    print(f'{col} : {val}')


School : Abilene Christian University
Private : Yes
Apps : 1660
Accept : 1232
Enroll : 721
Top10perc : 23
Top25perc : 52
F_Undergrad : 2885
P_Undergrad : 537
Outstate : 7440
Room_Board : 3300
Books : 450
Personal : 2200
PhD : 70
Terminal : 78
S_F_Ratio : 18.1
perc_alumni : 12
Expend : 7041
Grad_Rate : 60


In [5]:
from pyspark.ml.feature import VectorAssembler

In [6]:
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [7]:
assembler = VectorAssembler(
    inputCols=[
        'Apps',
        'Accept',
        'Enroll',
        'Top10perc',
        'Top25perc',
        'F_Undergrad',
        'P_Undergrad',
        'Outstate',
        'Room_Board',
        'Books',
        'Personal',
        'PhD',
        'Terminal',
        'S_F_Ratio',
        'perc_alumni',
        'Expend',
        'Grad_Rate'
    ],
    outputCol='features'
)

In [8]:
df_final = assembler.transform(df)
df_final.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|[2186.0,1924

In [9]:
from pyspark.ml.feature import StringIndexer

In [10]:
indexer = StringIndexer(
    inputCol='Private',
    outputCol='private_index'
)
df_final = indexer.fit(df_final).transform(df_final)
df_final.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+-------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|private_index|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+-------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|          0.0|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|    

In [11]:
df_model = df_final.select('features', 'private_index')
df_model.show()

+--------------------+-------------+
|            features|private_index|
+--------------------+-------------+
|[1660.0,1232.0,72...|          0.0|
|[2186.0,1924.0,51...|          0.0|
|[1428.0,1097.0,33...|          0.0|
|[417.0,349.0,137....|          0.0|
|[193.0,146.0,55.0...|          0.0|
|[587.0,479.0,158....|          0.0|
|[353.0,340.0,103....|          0.0|
|[1899.0,1720.0,48...|          0.0|
|[1038.0,839.0,227...|          0.0|
|[582.0,498.0,172....|          0.0|
|[1732.0,1425.0,47...|          0.0|
|[2652.0,1900.0,48...|          0.0|
|[1179.0,780.0,290...|          0.0|
|[1267.0,1080.0,38...|          0.0|
|[494.0,313.0,157....|          0.0|
|[1420.0,1093.0,22...|          0.0|
|[4302.0,992.0,418...|          0.0|
|[1216.0,908.0,423...|          0.0|
|[1130.0,704.0,322...|          0.0|
|[3540.0,2001.0,10...|          1.0|
+--------------------+-------------+
only showing top 20 rows



In [12]:
df_model.printSchema()

root
 |-- features: vector (nullable = true)
 |-- private_index: double (nullable = false)



In [13]:
train_data, test_data = df_model.randomSplit([0.7, 0.3])

In [14]:
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier
)
from pyspark.ml import Pipeline

In [15]:
?DecisionTreeClassifier

[1;31mInit signature:[0m
[0mDecisionTreeClassifier[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mfeaturesCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'features'[0m[1;33m,[0m[1;33m
[0m    [0mlabelCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'label'[0m[1;33m,[0m[1;33m
[0m    [0mpredictionCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'prediction'[0m[1;33m,[0m[1;33m
[0m    [0mprobabilityCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'probability'[0m[1;33m,[0m[1;33m
[0m    [0mrawPredictionCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'rawPrediction'[0m[1;33m,[0m[1;33m
[0m    [0mmaxDepth[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m5[0m[1;33m,[0m[1;33m
[0m    [0mmaxBins[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m32[0m[1;33m,[0m[1;33m
[0m    [0mminInstancesPerNode[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mminInfoGain[0m[1;33m:[0m [0mfl

In [16]:
dc_clf = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='private_index'
)
rf_clf = RandomForestClassifier(
    featuresCol='features',
    labelCol='private_index'
)
gb_clf = GBTClassifier(
    featuresCol='features',
    labelCol='private_index'
)

In [17]:
dc_clf_model = dc_clf.fit(train_data)
rf_clf_model = rf_clf.fit(train_data)
gb_clf_model = gb_clf.fit(train_data)

In [18]:
dc_clf_preds = dc_clf_model.transform(test_data)
rf_clf_preds = rf_clf_model.transform(test_data)
gb_clf_preds = gb_clf_model.transform(test_data)

In [19]:
dc_clf_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- private_index: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [20]:
rf_clf_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- private_index: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [21]:
gb_clf_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- private_index: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [23]:
tree_clf_eval = BinaryClassificationEvaluator(
    labelCol='private_index'
)

In [24]:
print('AUC for Decision Tree Model     : ', tree_clf_eval.evaluate(dc_clf_preds))
print('AUC for Random Forest Model     : ', tree_clf_eval.evaluate(rf_clf_preds))
print('AUC for Gradient Boosting Model : ', tree_clf_eval.evaluate(gb_clf_preds))

AUC for Decision Tree Model     :  0.8652954224642064
AUC for Random Forest Model     :  0.9811453922161725
AUC for Gradient Boosting Model :  0.9628453317201046


In [25]:
?RandomForestClassifier

[1;31mInit signature:[0m
[0mRandomForestClassifier[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mfeaturesCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'features'[0m[1;33m,[0m[1;33m
[0m    [0mlabelCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'label'[0m[1;33m,[0m[1;33m
[0m    [0mpredictionCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'prediction'[0m[1;33m,[0m[1;33m
[0m    [0mprobabilityCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'probability'[0m[1;33m,[0m[1;33m
[0m    [0mrawPredictionCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'rawPrediction'[0m[1;33m,[0m[1;33m
[0m    [0mmaxDepth[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m5[0m[1;33m,[0m[1;33m
[0m    [0mmaxBins[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m32[0m[1;33m,[0m[1;33m
[0m    [0mminInstancesPerNode[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mminInfoGain[0m[1;33m:[0m [0mfl

In [26]:
rf_clf_2 = RandomForestClassifier(
    featuresCol='features',
    labelCol='private_index',
    numTrees=150
)
rf_clf_2_model = rf_clf_2.fit(train_data)
rf_clf_2_preds = rf_clf_2_model.transform(test_data)
print('AUC for Random Forest Model 2 : ', tree_clf_eval.evaluate(rf_clf_2_preds))

AUC for Random Forest Model 2 :  0.9815486993345431


In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [28]:
?MulticlassClassificationEvaluator

[1;31mInit signature:[0m
[0mMulticlassClassificationEvaluator[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mpredictionCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'prediction'[0m[1;33m,[0m[1;33m
[0m    [0mlabelCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'label'[0m[1;33m,[0m[1;33m
[0m    [0mmetricName[0m[1;33m:[0m [1;34m'MulticlassClassificationEvaluatorMetricType'[0m [1;33m=[0m [1;34m'f1'[0m[1;33m,[0m[1;33m
[0m    [0mweightCol[0m[1;33m:[0m [0mOptional[0m[1;33m[[0m[0mstr[0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmetricLabel[0m[1;33m:[0m [0mfloat[0m [1;33m=[0m [1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mbeta[0m[1;33m:[0m [0mfloat[0m [1;33m=[0m [1;36m1.0[0m[1;33m,[0m[1;33m
[0m    [0mprobabilityCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'probability'[0m[1;33m,[0m[1;33m
[0m    [0meps[0m[1;33m:[0m [0mfloat[0m [1;33m=[0m [1;36m1e-15[

In [29]:
tree_clf_eval_multi = MulticlassClassificationEvaluator(
    labelCol='private_index',
    metricName='accuracy'
)

In [30]:
print('Accuracy for Decision Tree Model     : ', tree_clf_eval_multi.evaluate(dc_clf_preds))
print('Accuracy for Random Forest Model     : ', tree_clf_eval_multi.evaluate(rf_clf_preds))
print('Accuracy for Gradient Boosting Model : ', tree_clf_eval_multi.evaluate(gb_clf_preds))
print('Accuracy for Random Forest Model 2   : ', tree_clf_eval_multi.evaluate(rf_clf_2_preds))

Accuracy for Decision Tree Model     :  0.9170305676855895
Accuracy for Random Forest Model     :  0.9475982532751092
Accuracy for Gradient Boosting Model :  0.9126637554585153
Accuracy for Random Forest Model 2   :  0.9344978165938864
