* Master DAC - BDLE
* Author: Mohamed-Amine Baazizi
* Affiliation: LIP6 - Faculté des Sciences - Sorbonne Université
* Email: mohamed-amine.baazizi@lip6.fr
* October 2024

# Spark Setup

In [None]:
%%capture
!pip install -q pyspark

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/200.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m194.6/200.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip list|grep spark

In [None]:
from pyspark.sql import SparkSession, Row
import pydeequ

# Build (or reuse) a local-mode session named "pyDeequ". The Maven coordinates
# exposed by pydeequ are passed to the session's jar package/exclude settings.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyDeequ")
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

In [None]:
# Bare expression: displays the SparkSession object to confirm it was created.
spark

# ML

## Vectors

In [None]:
from pyspark.ml.linalg import Vectors


In [None]:
# Dense vectors list every coordinate explicitly.
vec1 = Vectors.dense(1.0, 1.0, 18.0)
vec2 = Vectors.dense(0.0, 2.0, 20.0)

# Sparse vectors are given as (size, indices, values). Indices are integer
# positions: a float such as 1.2 would be silently truncated to 1.
vec3 = Vectors.sparse(3, [0, 2], [1.0, 18.0])
vec4 = Vectors.sparse(3, [0, 1, 2], [2.0, 3.0, 11.0])

# `vectors` is an RDD (not a DataFrame), so it is inspected with collect();
# DataFrame-only helpers such as printSchema()/show() are not available on it.
vectors = spark.sparkContext.parallelize([vec1, vec2, vec3, vec4])
vectors.collect()

[DenseVector([1.0, 1.0, 18.0]),
 DenseVector([0.0, 2.0, 20.0]),
 SparseVector(3, {0: 1.0, 2: 18.0}),
 SparseVector(3, {0: 2.0, 1: 3.0, 2: 11.0})]

## Decision tree (DT) data loading

In [None]:
# Toy data set: one tuple per record, fields in the order
# (age, income, student, credit_rating, label).
tuples = [
    ("young", "high", "no", "fair", "no"),
    ("young", "high", "no", "excellent", "no"),
    ("middle", "high", "no", "fair", "yes"),
    ("senior", "medium", "no", "fair", "yes"),
    ("senior", "low", "yes", "fair", "yes"),
    ("senior", "low", "yes", "excellent", "no"),
    ("middle", "low", "yes", "excellent", "yes"),
    ("young", "medium", "no", "fair", "no"),
    ("young", "low", "yes", "fair", "yes"),
    ("senior", "medium", "yes", "fair", "yes"),
    ("young", "medium", "yes", "excellent", "yes"),
    ("middle", "medium", "no", "excellent", "yes"),
    ("middle", "high", "yes", "fair", "yes"),
    ("senior", "medium", "no", "excellent", "no"),
]
print(len(tuples))

14


In [None]:
# Load the tuples into a DataFrame; every column, label included, is a string.
schema = 'age string, income string, student string, credit_rating string, label string'
rows_rdd = spark.sparkContext.parallelize(tuples)
data = rows_rdd.toDF(schema)

data.printSchema()
data.show()

root
 |-- age: string (nullable = true)
 |-- income: string (nullable = true)
 |-- student: string (nullable = true)
 |-- credit_rating: string (nullable = true)
 |-- label: string (nullable = true)

+------+------+-------+-------------+-----+
|   age|income|student|credit_rating|label|
+------+------+-------+-------------+-----+
| young|  high|     no|         fair|   no|
| young|  high|     no|    excellent|   no|
|middle|  high|     no|         fair|  yes|
|senior|medium|     no|         fair|  yes|
|senior|   low|    yes|         fair|  yes|
|senior|   low|    yes|    excellent|   no|
|middle|   low|    yes|    excellent|  yes|
| young|medium|     no|         fair|   no|
| young|   low|    yes|         fair|  yes|
|senior|medium|    yes|         fair|  yes|
| young|medium|    yes|    excellent|  yes|
|middle|medium|     no|    excellent|  yes|
|middle|  high|    yes|         fair|  yes|
|senior|medium|     no|    excellent|   no|
+------+------+-------+-------------+-----+



## Transformations

### String indexer

In [None]:
from  pyspark.ml.feature import StringIndexer

In [None]:
# Fit a StringIndexer on a single categorical column, then append its index column.
field = 'age'
age_indexer = StringIndexer(inputCol=field, outputCol=f'indexed_{field}')
age_indexer_model = age_indexer.fit(data)
df_age_idx = age_indexer_model.transform(data)
df_age_idx.show()


+------+------+-------+-------------+-----+-----------+
|   age|income|student|credit_rating|label|indexed_age|
+------+------+-------+-------------+-----+-----------+
| young|  high|     no|         fair|   no|        1.0|
| young|  high|     no|    excellent|   no|        1.0|
|middle|  high|     no|         fair|  yes|        2.0|
|senior|medium|     no|         fair|  yes|        0.0|
|senior|   low|    yes|         fair|  yes|        0.0|
|senior|   low|    yes|    excellent|   no|        0.0|
|middle|   low|    yes|    excellent|  yes|        2.0|
| young|medium|     no|         fair|   no|        1.0|
| young|   low|    yes|         fair|  yes|        1.0|
|senior|medium|    yes|         fair|  yes|        0.0|
| young|medium|    yes|    excellent|  yes|        1.0|
|middle|medium|     no|    excellent|  yes|        2.0|
|middle|  high|    yes|         fair|  yes|        2.0|
|senior|medium|     no|    excellent|   no|        0.0|
+------+------+-------+-------------+-----+-----

In [None]:
def string_index_cols(cols, prefix):
  """Build an unfitted StringIndexer that indexes several columns at once.

  Args:
    cols: names of the input columns to index.
    prefix: string prepended to each input name to form its output column name.

  Returns:
    A StringIndexer whose i-th output column is named prefix + cols[i].
  """
  prefixed = [prefix + name for name in cols]
  return StringIndexer(inputCols=cols, outputCols=prefixed)

In [None]:
# Index 'age' and 'income' together with the multi-column helper.
prefix = 'indexed_'
fields = ['age', 'income']
age_income_indexer = string_index_cols(fields, prefix)
age_income_model = age_income_indexer.fit(data)
df_age_income_idx = age_income_model.transform(data)
df_age_income_idx.show()

+------+------+-------+-------------+-----+-----------+--------------+
|   age|income|student|credit_rating|label|indexed_age|indexed_income|
+------+------+-------+-------------+-----+-----------+--------------+
| young|  high|     no|         fair|   no|        1.0|           1.0|
| young|  high|     no|    excellent|   no|        1.0|           1.0|
|middle|  high|     no|         fair|  yes|        2.0|           1.0|
|senior|medium|     no|         fair|  yes|        0.0|           0.0|
|senior|   low|    yes|         fair|  yes|        0.0|           2.0|
|senior|   low|    yes|    excellent|   no|        0.0|           2.0|
|middle|   low|    yes|    excellent|  yes|        2.0|           2.0|
| young|medium|     no|         fair|   no|        1.0|           0.0|
| young|   low|    yes|         fair|  yes|        1.0|           2.0|
|senior|medium|    yes|         fair|  yes|        0.0|           0.0|
| young|medium|    yes|    excellent|  yes|        1.0|           0.0|
|middl

### IndexToString

In [None]:
from pyspark.ml.feature import IndexToString


In [None]:
# Map the numeric indices produced by age_indexer back to their string values.
age_rev_indexer = IndexToString(
    inputCol=age_indexer.getOutputCol(),
    outputCol='original_age',
)
df_orig_age = age_rev_indexer.transform(df_age_idx)
df_orig_age.show()


+------+------+-------+-------------+-----+-----------+------------+
|   age|income|student|credit_rating|label|indexed_age|original_age|
+------+------+-------+-------------+-----+-----------+------------+
| young|  high|     no|         fair|   no|        1.0|       young|
| young|  high|     no|    excellent|   no|        1.0|       young|
|middle|  high|     no|         fair|  yes|        2.0|      middle|
|senior|medium|     no|         fair|  yes|        0.0|      senior|
|senior|   low|    yes|         fair|  yes|        0.0|      senior|
|senior|   low|    yes|    excellent|   no|        0.0|      senior|
|middle|   low|    yes|    excellent|  yes|        2.0|      middle|
| young|medium|     no|         fair|   no|        1.0|       young|
| young|   low|    yes|         fair|  yes|        1.0|       young|
|senior|medium|    yes|         fair|  yes|        0.0|      senior|
| young|medium|    yes|    excellent|  yes|        1.0|       young|
|middle|medium|     no|    excelle

### One-hot encoder

In [None]:
from pyspark.ml.feature import OneHotEncoder


In [None]:
# One-hot encode the indexed age column. dropLast=False keeps one vector slot
# per category instead of dropping the last one.
# To encode several columns at once, pass inputCols/outputCols instead.
age_onehotenc = OneHotEncoder(
    inputCol=age_indexer.getOutputCol(),
    outputCol='cat_age',
    dropLast=False,
)
age_onehot_model = age_onehotenc.fit(df_age_idx)
df_age_onehot = age_onehot_model.transform(df_age_idx)
df_age_onehot.show()

+------+------+-------+-------------+-----+-----------+-------------+
|   age|income|student|credit_rating|label|indexed_age|      cat_age|
+------+------+-------+-------------+-----+-----------+-------------+
| young|  high|     no|         fair|   no|        1.0|(3,[1],[1.0])|
| young|  high|     no|    excellent|   no|        1.0|(3,[1],[1.0])|
|middle|  high|     no|         fair|  yes|        2.0|(3,[2],[1.0])|
|senior|medium|     no|         fair|  yes|        0.0|(3,[0],[1.0])|
|senior|   low|    yes|         fair|  yes|        0.0|(3,[0],[1.0])|
|senior|   low|    yes|    excellent|   no|        0.0|(3,[0],[1.0])|
|middle|   low|    yes|    excellent|  yes|        2.0|(3,[2],[1.0])|
| young|medium|     no|         fair|   no|        1.0|(3,[1],[1.0])|
| young|   low|    yes|         fair|  yes|        1.0|(3,[1],[1.0])|
|senior|medium|    yes|         fair|  yes|        0.0|(3,[0],[1.0])|
| young|medium|    yes|    excellent|  yes|        1.0|(3,[1],[1.0])|
|middle|medium|     

### Vector assembler

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Concatenate the two index columns into a single vector column.
cols = ['indexed_age', 'indexed_income']
vec_assembler = VectorAssembler(
    inputCols=cols,
    outputCol='ageIncomeVec',
)
df_age_income_vec = vec_assembler.transform(df_age_income_idx)
df_age_income_vec.show()

+------+------+-------+-------------+-----+-----------+--------------+------------+
|   age|income|student|credit_rating|label|indexed_age|indexed_income|ageIncomeVec|
+------+------+-------+-------------+-----+-----------+--------------+------------+
| young|  high|     no|         fair|   no|        1.0|           1.0|   [1.0,1.0]|
| young|  high|     no|    excellent|   no|        1.0|           1.0|   [1.0,1.0]|
|middle|  high|     no|         fair|  yes|        2.0|           1.0|   [2.0,1.0]|
|senior|medium|     no|         fair|  yes|        0.0|           0.0|   (2,[],[])|
|senior|   low|    yes|         fair|  yes|        0.0|           2.0|   [0.0,2.0]|
|senior|   low|    yes|    excellent|   no|        0.0|           2.0|   [0.0,2.0]|
|middle|   low|    yes|    excellent|  yes|        2.0|           2.0|   [2.0,2.0]|
| young|medium|     no|         fair|   no|        1.0|           0.0|   [1.0,0.0]|
| young|   low|    yes|         fair|  yes|        1.0|           2.0|   [1.

### Vector Indexer

In [None]:
from pyspark.ml.feature import VectorIndexer


In [None]:
# Index the assembled vector; maxCategories=3 is the threshold passed to
# VectorIndexer for deciding which vector slots count as categorical.
vecIndexer = VectorIndexer(
    inputCol='ageIncomeVec',
    outputCol='indexed_ageIncomeVec',
    maxCategories=3,
)
vecIndexerModel = vecIndexer.fit(df_age_income_vec)
df_age_income_vec_idx = vecIndexerModel.transform(df_age_income_vec)

df_age_income_vec_idx.show()


+------+------+-------+-------------+-----+-----------+--------------+------------+--------------------+
|   age|income|student|credit_rating|label|indexed_age|indexed_income|ageIncomeVec|indexed_ageIncomeVec|
+------+------+-------+-------------+-----+-----------+--------------+------------+--------------------+
| young|  high|     no|         fair|   no|        1.0|           1.0|   [1.0,1.0]|           [1.0,1.0]|
| young|  high|     no|    excellent|   no|        1.0|           1.0|   [1.0,1.0]|           [1.0,1.0]|
|middle|  high|     no|         fair|  yes|        2.0|           1.0|   [2.0,1.0]|           [2.0,1.0]|
|senior|medium|     no|         fair|  yes|        0.0|           0.0|   (2,[],[])|           (2,[],[])|
|senior|   low|    yes|         fair|  yes|        0.0|           2.0|   [0.0,2.0]|           [0.0,2.0]|
|senior|   low|    yes|    excellent|   no|        0.0|           2.0|   [0.0,2.0]|           [0.0,2.0]|
|middle|   low|    yes|    excellent|  yes|        2.0|

## Pipelines

#### String indexer

In [None]:
# Split the column names into the target and the feature columns.
label = 'label'
all_cols = data.columns
label_pos = all_cols.index(label)  # raises ValueError if the label column is missing
features_col = all_cols[:label_pos] + all_cols[label_pos + 1:]

In [None]:
# Prefix prepended to a column name to form the name of its indexed counterpart.
prefix = 'indexed_'

In [None]:
# Indexer for the target column; its output column is named prefix + label.
label_string_indexer = StringIndexer(
    inputCol=label,
    outputCol=prefix + label,
)

In [None]:
# One multi-column indexer for all feature columns; outputs are prefix + name.
features_str_col = [prefix + name for name in features_col]
features_string_indexer = StringIndexer(
    inputCols=features_col,
    outputCols=features_str_col,
)


#### Vector assembler and indexer

In [None]:
# Assemble the indexed feature columns into one vector column named 'vector'.
vec_assembler = VectorAssembler(
    inputCols=features_string_indexer.getOutputCols(),
    outputCol='vector',
)


In [None]:
# Index the assembled 'vector' column and expose the result as 'features'.
vec_indexer = VectorIndexer(
    inputCol='vector',
    outputCol='features',
    maxCategories=3,
)

#### Pipeline building

In [None]:
# Stage order matters: the assembler reads the feature indexer's output columns,
# and the vector indexer reads the assembler's 'vector' column.
stages = [
    label_string_indexer,
    features_string_indexer,
    vec_assembler,
    vec_indexer,
]

In [None]:
from pyspark.ml import Pipeline


In [None]:
# Fit all stages on the raw data, apply them, and keep only the two columns
# the classifier needs.
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(data)
train_data = pipeline_model.transform(data).select("features", "indexed_label")
train_data.show()


+-----------------+-------------+
|         features|indexed_label|
+-----------------+-------------+
|[1.0,1.0,0.0,0.0]|          1.0|
|[1.0,1.0,0.0,1.0]|          1.0|
|[2.0,1.0,0.0,0.0]|          0.0|
|        (4,[],[])|          0.0|
|[0.0,2.0,1.0,0.0]|          0.0|
|[0.0,2.0,1.0,1.0]|          1.0|
|[2.0,2.0,1.0,1.0]|          0.0|
|    (4,[0],[1.0])|          1.0|
|[1.0,2.0,1.0,0.0]|          0.0|
|    (4,[2],[1.0])|          0.0|
|[1.0,0.0,1.0,1.0]|          0.0|
|[2.0,0.0,0.0,1.0]|          0.0|
|[2.0,1.0,1.0,0.0]|          0.0|
|    (4,[3],[1.0])|          1.0|
+-----------------+-------------+



## DT inference

In [None]:
from pyspark.ml.classification import DecisionTreeClassificationModel, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [None]:
# Train a decision tree on the prepared features and indexed label.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="indexed_label",
)
dtModel = dt.fit(train_data)
dtModel

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7ec3d9e4e412, depth=4, numNodes=13, numClasses=2, numFeatures=4

In [None]:
# Print the learned tree as nested If/Else rules over feature indices.
print(dtModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7ec3d9e4e412, depth=4, numNodes=13, numClasses=2, numFeatures=4
  If (feature 0 in {2.0})
   Predict: 0.0
  Else (feature 0 not in {2.0})
   If (feature 2 in {1.0})
    If (feature 3 in {0.0})
     Predict: 0.0
    Else (feature 3 not in {0.0})
     If (feature 0 in {1.0})
      Predict: 0.0
     Else (feature 0 not in {1.0})
      Predict: 1.0
   Else (feature 2 not in {1.0})
    If (feature 0 in {0.0})
     If (feature 3 in {0.0})
      Predict: 0.0
     Else (feature 3 not in {0.0})
      Predict: 1.0
    Else (feature 0 not in {0.0})
     Predict: 1.0



## Model Selection and Tuning

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder



In [None]:

# Candidate hyper-parameters for the grid search.
# The training set has only 14 rows, so minInstancesPerNode must stay small:
# with values >= 10 no split can leave enough rows in both children, and every
# candidate collapses to the same single-leaf tree.
# maxDepth is searched instead of maxBins: the features here are categorical with
# at most 3 categories, so maxBins values of 40 vs 42 would presumably give
# identical trees.
dt_paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [2, 4])
    .addGrid(dt.minInstancesPerNode, [1, 2])
    .build()
)
dt_paramGrid

[{Param(parent='DecisionTreeClassifier_7ec3d9e4e412', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 40,
  Param(parent='DecisionTreeClassifier_7ec3d9e4e412', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 10},
 {Param(parent='DecisionTreeClassifier_7ec3d9e4e412', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 40,
  Param(parent='DecisionTreeClassifier_7ec3d9e4e412', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Use BinaryClassificationEvaluator to evaluate our model.
# rawPredictionCol points at the model's 'rawPrediction' score vector rather than
# the hard 0/1 'prediction' column: the PR and ROC curves are built by sweeping a
# threshold over scores, so hard predictions would reduce them to a single point.
evaluatorPR = BinaryClassificationEvaluator(
    labelCol="indexed_label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderPR",
)
evaluatorAUC = BinaryClassificationEvaluator(
    labelCol="indexed_label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Build the cross-validation: k folds with k=5, selecting the parameter map that
# maximises area under the PR curve. Up to 2 candidate models are fitted in parallel.
# A fixed seed makes the random fold assignment reproducible across runs.
cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_paramGrid,
    evaluator=evaluatorPR,
    numFolds=5,
    parallelism=2,
    seed=42,
)


In [None]:
# Run the cross-validated grid search on the prepared training data.
cvModel = cv.fit(train_data)

In [None]:
# Inspect the tree selected by cross-validation.
bestModel = cvModel.bestModel
print(bestModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7ec3d9e4e412, depth=0, numNodes=1, numClasses=2, numFeatures=4
  Predict: 0.0



In [None]:
# Apply the cross-validated model to the same data it was trained on
# (these are training-set predictions, not a held-out evaluation).
train_pred = cvModel.transform(train_data)
train_pred.show()

+-----------------+-------------+-------------+--------------------+----------+
|         features|indexed_label|rawPrediction|         probability|prediction|
+-----------------+-------------+-------------+--------------------+----------+
|[1.0,1.0,0.0,0.0]|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[1.0,1.0,0.0,1.0]|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[2.0,1.0,0.0,0.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|        (4,[],[])|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[0.0,2.0,1.0,0.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[0.0,2.0,1.0,1.0]|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[2.0,2.0,1.0,1.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|    (4,[0],[1.0])|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[1.0,2.0,1.0,0.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|    (4,[2],[1.0])|          0.0|    [9.