In [1]:
from pyspark.sql import SparkSession 

spark = SparkSession.builder \
    .master('local[*]') \
    .appName("Intro") \
    .getOrCreate()



In [2]:
from pyspark.sql.types import StructField, StructType, StringType, DoubleType

custom_schema = StructType([
    StructField("Make", StringType(), True),
    StructField("Model", StringType(), True),
    StructField("Vehicle Class", StringType(), True),
    StructField("Cylinders", DoubleType(), True),
    StructField("Transmission", StringType(), True),
    StructField("Fuel Type", StringType(), True),
    StructField("Fuel Consumption City (L/100 km)", DoubleType(), True),
    StructField("Fuel Consumption Hwy (L/100 km)", DoubleType(), True),
    StructField("Fuel Consumption Comb (L/100 km)", DoubleType(), True),
    StructField("Fuel Consumption Comb (mpg)", DoubleType(), True),
    StructField("CO2", DoubleType(), True)])


In [3]:
# load data

co2_data = spark.read.format("csv")\
    .schema(custom_schema) \
    .option("header", True) \
    .load("CO2_Emissions_Canada.csv")

In [4]:
co2_data.take(2)

[Row(Make='ACURA', Model='ILX', Vehicle Class='COMPACT', Cylinders=2.0, Transmission='4', Fuel Type='AS5', Fuel Consumption City (L/100 km)=None, Fuel Consumption Hwy (L/100 km)=9.9, Fuel Consumption Comb (L/100 km)=6.7, Fuel Consumption Comb (mpg)=8.5, CO2=33.0),
 Row(Make='ACURA', Model='ILX', Vehicle Class='COMPACT', Cylinders=2.4, Transmission='4', Fuel Type='M6', Fuel Consumption City (L/100 km)=None, Fuel Consumption Hwy (L/100 km)=11.2, Fuel Consumption Comb (L/100 km)=7.7, Fuel Consumption Comb (mpg)=9.6, CO2=29.0)]

In [5]:
cols_only_continues_values = {'Fuel Consumption City (L/100 km)':0}
#                               "Fuel Consumption Hwy (L/100 km)",
#         "Fuel Consumption Comb (L/100 km)"}

In [6]:
co2_data = co2_data.fillna(0.0)

In [7]:
co2_data.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Vehicle Class: string (nullable = true)
 |-- Cylinders: double (nullable = false)
 |-- Transmission: string (nullable = true)
 |-- Fuel Type: string (nullable = true)
 |-- Fuel Consumption City (L/100 km): double (nullable = false)
 |-- Fuel Consumption Hwy (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (mpg): double (nullable = false)
 |-- CO2: double (nullable = false)



In [8]:
co2_data.take(2)

[Row(Make='ACURA', Model='ILX', Vehicle Class='COMPACT', Cylinders=2.0, Transmission='4', Fuel Type='AS5', Fuel Consumption City (L/100 km)=0.0, Fuel Consumption Hwy (L/100 km)=9.9, Fuel Consumption Comb (L/100 km)=6.7, Fuel Consumption Comb (mpg)=8.5, CO2=33.0),
 Row(Make='ACURA', Model='ILX', Vehicle Class='COMPACT', Cylinders=2.4, Transmission='4', Fuel Type='M6', Fuel Consumption City (L/100 km)=0.0, Fuel Consumption Hwy (L/100 km)=11.2, Fuel Consumption Comb (L/100 km)=7.7, Fuel Consumption Comb (mpg)=9.6, CO2=29.0)]

# Prep the data for regression

turn the feature columns into one indexed column:

In [9]:
from pyspark.ml.feature import FeatureHasher
from pyspark.sql.functions import col

cols = ["Make", "Model", "Vehicle Class","Cylinders","Transmission","Fuel Type",
        "Fuel Consumption City (L/100 km)", "Fuel Consumption Hwy (L/100 km)",
        "Fuel Consumption Comb (L/100 km)","Fuel Consumption Comb (mpg)"]

cols_only_continues = ["Fuel Consumption City (L/100 km)", "Fuel Consumption Hwy (L/100 km)",
        "Fuel Consumption Comb (L/100 km)"]

hasher = FeatureHasher(outputCol="hashed_features", inputCols=cols_only_continues)
data = hasher.transform(co2_data)
                       


In [10]:
data.select("hashed_features").show(5, truncate=False)

+---------------------------------------------+
|hashed_features                              |
+---------------------------------------------+
|(262144,[38607,109231,228390],[0.0,9.9,6.7]) |
|(262144,[38607,109231,228390],[0.0,11.2,7.7])|
|(262144,[38607,109231,228390],[0.0,6.0,5.8]) |
|(262144,[38607,109231,228390],[0.0,12.7,9.1])|
|(262144,[38607,109231,228390],[0.0,12.1,8.7])|
+---------------------------------------------+
only showing top 5 rows



In [11]:
data.select("hashed_features").take(1)

[Row(hashed_features=SparseVector(262144, {38607: 0.0, 109231: 9.9, 228390: 6.7}))]

In [12]:
data.select("hashed_features").show(5, truncate=False)

+---------------------------------------------+
|hashed_features                              |
+---------------------------------------------+
|(262144,[38607,109231,228390],[0.0,9.9,6.7]) |
|(262144,[38607,109231,228390],[0.0,11.2,7.7])|
|(262144,[38607,109231,228390],[0.0,6.0,5.8]) |
|(262144,[38607,109231,228390],[0.0,12.7,9.1])|
|(262144,[38607,109231,228390],[0.0,12.1,8.7])|
+---------------------------------------------+
only showing top 5 rows



In [13]:
data.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Vehicle Class: string (nullable = true)
 |-- Cylinders: double (nullable = false)
 |-- Transmission: string (nullable = true)
 |-- Fuel Type: string (nullable = true)
 |-- Fuel Consumption City (L/100 km): double (nullable = false)
 |-- Fuel Consumption Hwy (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (mpg): double (nullable = false)
 |-- CO2: double (nullable = false)
 |-- hashed_features: vector (nullable = true)



# time for selecting the most meaninful features:

In [14]:
from pyspark.ml.feature import UnivariateFeatureSelector

selector = UnivariateFeatureSelector(outputCol="selectedFeatures", featuresCol="hashed_features", labelCol="CO2")

selector.setFeatureType("continuous")
selector.setLabelType("continuous")

model = selector.fit(data)
data = model.transform(data)

 ## Tryout LDA clustring algo

In [15]:
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA


lda = LDA(k=2, seed=1, optimizer="em",featuresCol="selectedFeatures")
lda.setMaxIter(100)


lda.clear(lda.maxIter)
lda_model = lda.fit(data)
lda_model.setSeed(1)

# check if the model itself is distributed across Spark executres
lda_model.isDistributed()

True

In [16]:
lda_model.describeTopics().show()

+-----+-----------+--------------------+
|topic|termIndices|         termWeights|
+-----+-----------+--------------------+
|    0|   [48, 49]|[0.58104675033297...|
|    1|   [48, 49]|[0.58168999987474...|
+-----+-----------+--------------------+



In [17]:
lda_model.vocabSize()

50

In [18]:
lda_predictions = lda_model.transform(data)

In [19]:
lda_predictions.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Vehicle Class: string (nullable = true)
 |-- Cylinders: double (nullable = false)
 |-- Transmission: string (nullable = true)
 |-- Fuel Type: string (nullable = true)
 |-- Fuel Consumption City (L/100 km): double (nullable = false)
 |-- Fuel Consumption Hwy (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (mpg): double (nullable = false)
 |-- CO2: double (nullable = false)
 |-- hashed_features: vector (nullable = true)
 |-- selectedFeatures: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)



In [20]:
lda_predictions.select("topicDistribution").show(2,truncate=False)

+---------------------------------------+
|topicDistribution                      |
+---------------------------------------+
|[0.5000015176580933,0.4999984823419068]|
|[0.4999995563245016,0.5000004436754983]|
+---------------------------------------+
only showing top 2 rows



# Tryout KMeans

In [21]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans



kmeans = KMeans(k=3)
kmeans.setSeed(10)
kmeans.setFeaturesCol("selectedFeatures")

kmeans_model = kmeans.fit(data)
kmeans_model.getDistanceMeasure()




'euclidean'

In [22]:
kmeans_predictions = kmeans_model.transform(data)

In [23]:
kmeans_predictions.select("prediction").show(5, truncate=True)

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         2|
|         2|
+----------+
only showing top 5 rows



In [24]:
kmeans_predictions.select("prediction").distinct().show()

+----------+
|prediction|
+----------+
|         1|
|         2|
|         0|
+----------+



In [25]:
summary = kmeans_model.summary

In [26]:
summary.cluster.printSchema()

root
 |-- prediction: integer (nullable = false)



# Tryout GaussianMixture

In [27]:
from pyspark.ml.clustering import GaussianMixture

gm = GaussianMixture(k=42, tol=0.01, seed=10, featuresCol="selectedFeatures", maxIter=100)
gm_model = gm.fit(data)

gm_predictions = gm_model.transform(data)

Print the model params using `explainParams()` functionality:

In [28]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
params = gm_model.explainParams()
pp.pprint(params)

('aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\n'
 'featuresCol: features column name. (default: features, current: '
 'selectedFeatures)\n'
 'k: Number of independent Gaussians in the mixture model. Must be > 1. '
 '(default: 2, current: 42)\n'
 'maxIter: max number of iterations (>= 0). (default: 100, current: 100)\n'
 'predictionCol: prediction column name. (default: prediction)\n'
 'probabilityCol: Column name for predicted class conditional probabilities. '
 'Note: Not all models output well-calibrated probability estimates! These '
 'probabilities should be treated as confidences, not precise probabilities. '
 '(default: probability)\n'
 'seed: random seed. (default: 4621526457424974748, current: 10)\n'
 'tol: the convergence tolerance for iterative algorithms (>= 0). (default: '
 '0.01, current: 0.01)\n'
 'weightCol: weight column name. If this is not set or empty, we treat all '
 'instance weights as 1.0. (undefined)')


# Constructing - The Pipeline API

In [29]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[hasher,selector, gm])
# Fit the pipeline to training data.
pipeline_model = pipeline.fit(co2_data)

In [30]:
transformed_by_pipeline = pipeline_model.transform(co2_data)

In [31]:
transformed_by_pipeline.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Vehicle Class: string (nullable = true)
 |-- Cylinders: double (nullable = false)
 |-- Transmission: string (nullable = true)
 |-- Fuel Type: string (nullable = true)
 |-- Fuel Consumption City (L/100 km): double (nullable = false)
 |-- Fuel Consumption Hwy (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (L/100 km): double (nullable = false)
 |-- Fuel Consumption Comb (mpg): double (nullable = false)
 |-- CO2: double (nullable = false)
 |-- hashed_features: vector (nullable = true)
 |-- selectedFeatures: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: integer (nullable = false)



## Evaluating clustring models

Notice we are not using this evaluator for LDA since it outputs topicDistribution and not one numeric prdiction.



In [32]:
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol='selectedFeatures')
evaluator.setPredictionCol("prediction")

#evaluate with eucliden distance
print("kmeans: "+str(evaluator.evaluate(kmeans_predictions)))
print("GM: "+ str(evaluator.evaluate(gm_predictions)))

kmeans: 0.6791002214675337
GM: -0.1517797715036008


In [33]:
evaluator.isLargerBetter()

True

In [34]:
evaluator.setDistanceMeasure("cosine")
print("kmeans: "+str(evaluator.evaluate(kmeans_predictions)))
print("GM: "+ str(evaluator.evaluate(gm_predictions)))

kmeans: -0.07958234502129219
GM: -0.19012403274289733


In [35]:
evaluator.isLargerBetter()

True

In [36]:
evaluator.explainParams()

"distanceMeasure: The distance measure. Supported options: 'squaredEuclidean' and 'cosine'. (default: squaredEuclidean, current: cosine)\nfeaturesCol: features column name. (default: features, current: selectedFeatures)\nmetricName: metric name in evaluation (silhouette) (default: silhouette)\npredictionCol: prediction column name. (default: prediction, current: prediction)\nweightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. (undefined)"

### Since evaluator output for `isLargerBetter` was true, we can define that kmeans algorithm produced a better model than GM.

# Hyperparameters and Tuning experiments


In [37]:
from pyspark.ml.tuning import TrainValidationSplit , ParamGridBuilder

grid = ParamGridBuilder().addGrid(kmeans.maxIter, [20,50,100]).build()

tvs = TrainValidationSplit(estimator=kmeans, estimatorParamMaps=grid, evaluator=evaluator,
                           collectSubModels=True, parallelism=1, seed=42)
tvs_model = tvs.fit(data)
tvs_model.getTrainRatio()



0.75

In [38]:
tvs_model.validationMetrics

[-0.06270405194965402, -0.06402059325959049, -0.06402059325959049]

In [39]:
from pyspark.ml.tuning import TrainValidationSplit , ParamGridBuilder

grid = ParamGridBuilder().addGrid(kmeans.maxIter, [20,50,100]) \
        .addGrid(kmeans.distanceMeasure, ['euclidean','cosine']).build()


In [40]:
tvs_model.validationMetrics

[-0.06270405194965402, -0.06402059325959049, -0.06402059325959049]

In [41]:
from pyspark.ml.tuning import TrainValidationSplit , ParamGridBuilder

grid = ParamGridBuilder().addGrid(kmeans.maxIter, [20,50,100]) \
        .addGrid(kmeans.distanceMeasure, ['euclidean','cosine']) \
        .addGrid(evaluator.distanceMeasure, ['euclidean','cosine']).build()


tvs = TrainValidationSplit(estimator=kmeans, estimatorParamMaps=grid, evaluator=evaluator,
                           collectSubModels=True, parallelism=1, seed=42, trainRatio=0.8)
tvs_model = tvs.fit(data)
tvs_model.validationMetrics



[-0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769,
 -0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769,
 -0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769]

## Adding evaluator to the grid params:

In [42]:
from pyspark.ml.tuning import TrainValidationSplit , ParamGridBuilder


grid = ParamGridBuilder().addGrid(kmeans.maxIter, [20,50,100]) \
        .addGrid(kmeans.distanceMeasure, ['euclidean','cosine']) \
        .addGrid(evaluator.distanceMeasure, ['euclidean','cosine'])\
        .baseOn({kmeans.featuresCol: 'selectedFeatures'}) \
        .build()

tvs = TrainValidationSplit(estimator=kmeans, estimatorParamMaps=grid, evaluator=evaluator,
                           collectSubModels=True, parallelism=1, seed=42, trainRatio=0.8)
tvs_model = tvs.fit(data)
tvs_model.validationMetrics





[-0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769,
 -0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769,
 -0.06292946960479909,
 -0.06292946960479909,
 0.5520132682136769,
 0.5520132682136769]

In [43]:
tvs_model.subModels

[KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
 KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasur

In [44]:
arr_models = tvs_model.subModels

# Advanced Split

the subModels are printed here as an example, do not use for real systems!!!

In [45]:
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel


cv = CrossValidator(estimator=kmeans, estimatorParamMaps=grid, evaluator=evaluator,
                           collectSubModels=True,  parallelism=2, numFolds=3)

cv_model = cv.fit(data)
cv_model.subModels

[[KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=euclidean, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, distanceMeasure=cosine, numFeatures=50,
  KMeansModel: uid=KMeans_03239630e943, k=3, di

In [46]:
len(cv_model.subModels)

3

In [47]:
len(cv_model.subModels[0])

12

In [48]:
cv_model.avgMetrics

[-0.07863039137980392,
 -0.07863039137980392,
 0.5623024101653663,
 0.5623024101653663,
 -0.0793314856527143,
 -0.0793314856527143,
 0.5623024101653663,
 0.5623024101653663,
 -0.0793314856527143,
 -0.0793314856527143,
 0.5623024101653663,
 0.5623024101653663]