# Decision trees with PySpark - solutions

In [2]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Your turn 1

In [4]:
# Read the weight data set: the first row supplies the column names and
# the column types are inferred from the data.
WEIGHTS_PATH = "/FileStore/tables/sl5xl2bb1490305146134/weight.txt"
weights = spark.read.csv(WEIGHTS_PATH, header=True, inferSchema=True)

# Preview the first five rows.
weights.show(5)

### Part I

In [6]:
# Combine the 'Height' and 'Weight' columns into one vector column that
# the classifier can consume through its featuresCol.
body_assembler = VectorAssembler(outputCol='features',
                                 inputCols=['Height', 'Weight'])
weights = body_assembler.transform(weights)

# Turn the values of 'Sex' into numeric indices; the indexed column is
# used as the label further down.
sex_indexer = StringIndexer(outputCol='Sex (indexed)', inputCol='Sex')
sex_indexer_model = sex_indexer.fit(weights)
weights = sex_indexer_model.transform(weights)

weights.show(5)

In [7]:
# Fit a decision tree with depth capped at 2 that predicts the indexed
# sex from the assembled 'features' vector.
shallow_tree_params = dict(featuresCol='features',
                           labelCol='Sex (indexed)',
                           maxDepth=2)
dt_model = DecisionTreeClassifier(**shallow_tree_params).fit(weights)

In [8]:
# Score the rows the tree was fitted on and report the fraction of rows
# whose prediction matches the indexed label.
weights2 = dt_model.transform(weights)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="Sex (indexed)",
                                              metricName="accuracy")
# Single-argument call form of print: valid on Python 2 and Python 3.
print(evaluator.evaluate(weights2))

### Part II

In [10]:
# Build a second feature vector that extends the existing 'features'
# vector with the 'Age' column.
age_assembler = VectorAssembler(outputCol='features2',
                                inputCols=['features', 'Age'])
weights = age_assembler.transform(weights)

weights.show(5)

If we allow _maxDepth=2_, then the importance of _Age_ is **0**, and even if we allow _maxDepth=10_, the importance of _Age_ is only **0.0053**. This means that age is not an important feature for predicting the sex of the person.

In [12]:
# Refit on the vector that includes 'Age', with depth capped at 2, and
# show how much each input contributes to the fitted tree.
dt = DecisionTreeClassifier(featuresCol='features2',
                            labelCol='Sex (indexed)',
                            maxDepth=2)
dt_model = dt.fit(weights)
# Single-argument call form of print: valid on Python 2 and Python 3.
print(dt_model.featureImportances)

In [13]:
# Same experiment with a much deeper tree (depth capped at 10) to check
# whether 'Age' gains importance once more splits are allowed.
dt = DecisionTreeClassifier(featuresCol='features2',
                            labelCol='Sex (indexed)',
                            maxDepth=10)
dt_model = dt.fit(weights)
# Single-argument call form of print: valid on Python 2 and Python 3.
print(dt_model.featureImportances)

## Your turn 2

In [15]:
# Read the dessert data set: the first row supplies the column names and
# the column types are inferred from the data.
DESSERT_PATH = "/FileStore/tables/sdztx2671490282633198/dessert.csv"
dessert_raw = spark.read.csv(DESSERT_PATH, header=True, inferSchema=True)

# Give the dotted columns plain names, and replace the 'dessert' column
# with an integer copy called 'purchase'.
purchase_as_int = dessert_raw['dessert'].cast('int')
dessert = (
    dessert_raw
    .withColumnRenamed('day.of.week', 'weekday')
    .withColumnRenamed('num.of.guests', 'num_of_guests')
    .withColumn('purchase', purchase_as_int)
    .drop('dessert')
)

dessert.show(5)

> **NOTE:** The Boolean column "dessert" had to be converted into a different data type, otherwise the decision tree could not understand it as the target column.

### Part I

In [18]:
# Index the 'purchase' target so it can serve as the classifier's label.
purchase_indexer = StringIndexer(outputCol='purchase (Index)',
                                 inputCol='purchase')
purchase_indexer_model = purchase_indexer.fit(dessert)
dessert = purchase_indexer_model.transform(dessert)

dessert.show(5)

In [19]:
# Encode each categorical column in two steps: index its values, then
# expand the index into a one-hot vector column.
for feature in ['weekday', 'hour']:
    si = StringIndexer(inputCol=feature, outputCol=feature+'_ix')
    dessert = si.fit(dessert).transform(dessert)
    ohe = OneHotEncoder(inputCol=feature+'_ix', outputCol=feature+'_ohe')
    # Spark 2.x ships OneHotEncoder as a plain transformer, whereas from
    # Spark 3.0 on it is an estimator that must be fitted before it can
    # transform. Fit only when the API is there, so both versions work.
    if hasattr(ohe, 'fit'):
        ohe = ohe.fit(dessert)
    dessert = ohe.transform(dessert)

# Combine the two encoded columns and the guest count into the feature
# vector used by the classifier.
va = VectorAssembler(inputCols=['weekday_ohe', 'hour_ohe', 'num_of_guests'],
                     outputCol='features')
dessert = va.transform(dessert)
dessert.show(5)

In [20]:
# Fit a tree with depth capped at 5 on the dessert features.
purchase_tree = DecisionTreeClassifier(maxDepth=5,
                                       labelCol='purchase (Index)',
                                       featuresCol='features')
dt_model = purchase_tree.fit(dessert)

# Run the fitted model over the same rows and keep its output columns
# on the frame.
dessert = dt_model.transform(dessert)

dessert.show(5)

In [21]:
# Report the fraction of rows whose prediction matches the indexed
# purchase label.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="purchase (Index)",
                                              metricName="accuracy")
# Single-argument call form of print: valid on Python 2 and Python 3.
print(evaluator.evaluate(dessert))

### Part II

In [23]:
# Re-express the guest count as a string category, and remove the numeric
# column together with the feature vector and the model output columns
# that are rebuilt below.
stale_columns = ['num_of_guests', 'features', 'prediction',
                 'rawPrediction', 'probability']
guests_as_text = dessert['num_of_guests'].cast('string')
dessert = dessert.withColumn('num_of_guests_cat', guests_as_text)
dessert = dessert.drop(*stale_columns)

# Index the new category column.
guests_indexer = StringIndexer(outputCol='num_of_guests_cat_ix',
                               inputCol='num_of_guests_cat')
dessert = guests_indexer.fit(dessert).transform(dessert)

# Reassemble the feature vector, this time with the indexed guest
# category in place of the raw count.
categorical_inputs = ['weekday_ohe', 'hour_ohe', 'num_of_guests_cat_ix']
guests_assembler = VectorAssembler(outputCol='features',
                                   inputCols=categorical_inputs)
dessert = guests_assembler.transform(dessert)

dessert.show(5)

In [24]:
# Refit the tree (depth still capped at 5) on the rebuilt feature vector.
categorical_tree_params = dict(featuresCol='features',
                               labelCol='purchase (Index)',
                               maxDepth=5)
dt_model = DecisionTreeClassifier(**categorical_tree_params).fit(dessert)

# Run the refitted model over the same rows and keep its output columns
# on the frame.
dessert = dt_model.transform(dessert)

dessert.show(5)

In [25]:
# Report the fraction of rows whose prediction matches the indexed
# purchase label for the refitted model.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="purchase (Index)",
                                              metricName="accuracy")
# Single-argument call form of print: valid on Python 2 and Python 3.
print(evaluator.evaluate(dessert))