In [40]:
from __future__ import print_function
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()
from pyspark.sql import SparkSession
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import StandardScaler, VectorAssembler,VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [41]:
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("DecisionTreeRegression")\
        .getOrCreate()

In [42]:
dataset = spark.read.csv("Admission_Prediction.csv",header=True)

In [43]:
dataset.show()

+---------+-----------+-----------------+----+----+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating| SOP| LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+----+----+----+--------+---------------+
|   337.00|     118.00|                4|4.50|4.50|9.65|    1.00|           0.92|
|   324.00|     107.00|                4|4.00|4.50|8.87|    1.00|           0.76|
|     null|     104.00|                3|3.00|3.50|8.00|    1.00|           0.72|
|   322.00|     110.00|                3|3.50|2.50|8.67|    1.00|           0.80|
|   314.00|     103.00|                2|2.00|3.00|8.21|    0.00|           0.65|
|   330.00|     115.00|                5|4.50|3.00|9.34|    1.00|           0.90|
|   321.00|     109.00|             null|3.00|4.00|8.20|    1.00|           0.75|
|   308.00|     101.00|                2|3.00|4.00|7.90|    0.00|           0.68|
|   302.00|     102.00|                1|2.00|1.50|8.00|    0.00|           0.50|
|   323.00|     

In [44]:
dataset.printSchema()

root
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR: string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit: string (nullable = true)



In [45]:
from pyspark.sql.functions import col
new_data = dataset.select(*(col(c).cast("float").alias(c) for c in dataset.columns))

In [46]:
new_data.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR: float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit: float (nullable = true)



In [47]:
from pyspark.sql.functions import col, count, isnan, when

In [48]:
#checking for null ir nan type values in our columns
new_data.select([count(when(col(c).isNull(), c)).alias(c) for c in new_data.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|       15|         10|               15|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



In [49]:
from pyspark.ml.feature import Imputer

In [50]:
imputer = Imputer(inputCols=["GRE Score", "TOEFL Score","University Rating"], 
                  outputCols=["GRE Score", "TOEFL Score","University Rating"])
model = imputer.fit(new_data)

imputed_data = model.transform(new_data)

In [51]:
#checking for null ir nan type values in our columns
imputed_data.select([count(when(col(c).isNull(), c)).alias(c) for c in imputed_data.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|        0|          0|                0|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



In [52]:
features = imputed_data.drop('Chance of Admit')

In [53]:
#let's assemble our features together using vectorAssembler
assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol="features")

In [54]:
output = assembler.transform(imputed_data)

In [55]:
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output)

In [56]:
featureIndexer = featureIndexer.transform(output)

In [57]:
new_indexed_data = featureIndexer.select("indexedFeatures", "Chance of Admit")

In [58]:
training, test = new_indexed_data.randomSplit([0.7, 0.3])

In [59]:
training.show()

+--------------------+---------------+
|     indexedFeatures|Chance of Admit|
+--------------------+---------------+
|[290.0,100.0,1.0,...|           0.47|
|[290.0,104.0,4.0,...|           0.45|
|[293.0,97.0,2.0,2...|           0.64|
|[294.0,95.0,1.0,1...|           0.49|
|[295.0,96.0,2.0,1...|           0.47|
|[295.0,99.0,2.0,2...|           0.57|
|[295.0,101.0,2.0,...|           0.69|
|[296.0,95.0,2.0,3...|           0.44|
|[296.0,99.0,2.0,3...|           0.47|
|[297.0,96.0,2.0,2...|           0.43|
|[297.0,96.0,2.0,2...|           0.34|
|[297.0,98.0,2.0,2...|           0.59|
|[297.0,99.0,4.0,3...|           0.54|
|[297.0,101.0,3.0,...|           0.57|
|[298.0,92.0,1.0,2...|           0.51|
|[298.0,97.0,3.121...|           0.45|
|[298.0,98.0,2.0,1...|           0.44|
|[298.0,98.0,2.0,4...|           0.34|
|[298.0,99.0,1.0,1...|           0.53|
|[298.0,100.0,3.0,...|           0.58|
+--------------------+---------------+
only showing top 20 rows



In [60]:
# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures",labelCol="Chance of Admit")

In [61]:
# Train model.  This also runs the indexer.
model = dt.fit(training)

In [62]:
# Make predictions.
predictions = model.transform(test)

In [63]:
predictions.show()

+--------------------+---------------+------------------+
|     indexedFeatures|Chance of Admit|        prediction|
+--------------------+---------------+------------------+
|[294.0,93.0,1.0,1...|           0.46|0.4499999989162792|
|[295.0,93.0,1.0,2...|           0.46|0.4499999989162792|
|[295.0,99.0,1.0,2...|           0.37|0.5015384577787839|
|[296.0,97.0,2.0,1...|           0.49|0.4499999989162792|
|[296.0,99.0,2.0,2...|           0.61|0.5799999833106995|
|[296.0,101.0,1.0,...|            0.6|0.6899999976158142|
|[297.0,100.0,1.0,...|           0.52|0.5799999833106995|
|[298.0,101.0,4.0,...|           0.53|0.5855555468135409|
|[298.0,107.187751...|           0.46|0.5337499864399433|
|[299.0,100.0,1.0,...|           0.59|0.5843749959021807|
|[299.0,100.0,3.0,...|           0.63|0.5843749959021807|
|[299.0,100.0,3.0,...|           0.42|0.5015384577787839|
|[300.0,99.0,1.0,3...|           0.36|0.5015384577787839|
|[301.0,99.0,3.0,2...|           0.68|0.6822641491889954|
|[301.0,102.0,

In [64]:
evaluator = RegressionEvaluator(
    labelCol="Chance of Admit", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print ("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.0807329


In [65]:
spark.stop()