In [None]:
!pip install pyspark spark_sklearn -q

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab_pyspark")\
        .config('spark.ui.port', '4050')\
        .config('spark.executor.memory', '3g')\
        .getOrCreate()

In [None]:
sc = spark.sparkContext
spark

In [None]:
# Выборка https://www.kaggle.com/shivam2503/diamonds
import pandas as pd
pdf = pd.read_csv("diamonds.csv", header=0)
pdf.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [None]:
LableEncoding OneHotEncoding

## Feature engineering

In [None]:
pdf['cut'] = pdf['cut'].replace({'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4})
pdf['color'] = pdf['color'].replace({'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D': 6})
pdf['clarity'] = pdf['clarity'].replace({'I1': 0, 'SI1': 1, 'SI2': 2, 'VS1': 3, 'VS2': 4, 'VVS1': 5, 'VVS2': 6, 'IF': 7})
pdf.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,4,5,2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,5,1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,5,3,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,1,4,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,0,2,63.3,58.0,335,4.34,4.35,2.75


In [None]:
pdf.dtypes

carat      float64
cut          int64
color        int64
clarity      int64
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [None]:
labels = pdf['price'].values
featureNames = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']
features = pdf[featureNames].values

In [None]:
from sklearn.preprocessing import normalize
features = normalize(features, axis=0)
features

array([[0.00106702, 0.00553547, 0.005655  , ..., 0.0029123 , 0.00293078,
        0.00289958],
       [0.00097424, 0.0041516 , 0.005655  , ..., 0.00286806, 0.00282769,
        0.00275639],
       [0.00106702, 0.00138387, 0.005655  , ..., 0.00298603, 0.00299705,
        0.00275639],
       ...,
       [0.00324745, 0.00276773, 0.006786  , ..., 0.00417307, 0.00418262,
        0.00424794],
       [0.00398973, 0.0041516 , 0.002262  , ..., 0.00453434, 0.00450662,
        0.00446272],
       [0.00347941, 0.00553547, 0.006786  , ..., 0.0042984 , 0.00432253,
        0.0043434 ]])

In [None]:
from sklearn import linear_model

model = linear_model.Ridge().fit(features, labels)

In [None]:
model.coef_

array([317631.26407957,  -5570.00639578, -35361.60027062, -12561.03271112,
         -391.85286415,   2893.356447  , 113393.7440294 , 112724.63907073,
       112036.32664852])

## Parameter tuning со Spark

Parameter tuning - это задача тьюнинга (гипер) параметров ML алгоритма с целью повысить качество модели. Тренируются различные модели (каждая со своим набором параметров) на одном и том же наборе данных и далее все полученные модели тестируются на одном и том же отложенном наборе данных, что снижает риск переобучения.

k-fold cross validation:


 - Случайным образом разбиваем данные на к равных частей ("folds")
     -  i = 1, 2, ..., k, откладываем набор данных i как validation set.
     -  training set - все кроме i

     -  для каждого набора параметров тренируем модель, подсчитываем ошибку на k различных validation set, усредняем, находим лучший набор параметров

 - Тренируем модель с лучшим набором параметров на всех данных 


Для каждой пары (fold, parameter set) можно обучать модель независимо от всех остальных. Мы распараллелим эти задания: scikit-learn будет обучать модель на каждом executor'е. Это параллелизация очень эффективна, так как обучение моделей - самая вычислительно сложная часть ML workflow.

Если используются k фолдов и P различных наборов параметров, то во сколько раз можно ускорить вычисление?


### Отложим random test set


In [None]:
from sklearn.model_selection import train_test_split

trainingLabels, testLabels, trainingFeatures, testFeatures = train_test_split(labels, features, test_size=0.3)
ntrain, ntest = len(trainingLabels), len(testLabels)
print('Split data randomly into 2 sets: %d training and %d test instances.' % (ntrain, ntest))

Split data randomly into 2 sets: 37758 training and 16182 test instances.


### Разобьем данные и определим таски, которые будем параллелизировать
Каждое распределенное задание это пара - (fold, parameter set) pair.

In [None]:
from sklearn.model_selection import KFold
numFolds = 3 # more (10 or so) in practice
kf = KFold(n_splits=numFolds)

In [None]:
alphas = [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
tasks = []
for alpha in alphas:
    for fold in range(numFolds):
        tasks = tasks + [(alpha, fold)]

In [None]:
tasks

[(0.0, 0),
 (0.0, 1),
 (0.0, 2),
 (0.0001, 0),
 (0.0001, 1),
 (0.0001, 2),
 (0.001, 0),
 (0.001, 1),
 (0.001, 2),
 (0.01, 0),
 (0.01, 1),
 (0.01, 2),
 (0.1, 0),
 (0.1, 1),
 (0.1, 2),
 (1.0, 0),
 (1.0, 1),
 (1.0, 2),
 (10.0, 0),
 (10.0, 1),
 (10.0, 2),
 (100.0, 0),
 (100.0, 1),
 (100.0, 2),
 (1000.0, 0),
 (1000.0, 1),
 (1000.0, 2)]

In [None]:
len(tasks)

27

In [None]:
tasksRDD = spark.sparkContext.parallelize(tasks, numSlices = len(tasks))
tasksRDD.getNumPartitions()

27

In [None]:
tasksRDD.take(10)

[(0.0, 0),
 (0.0, 1),
 (0.0, 2),
 (0.0001, 0),
 (0.0001, 1),
 (0.0001, 2),
 (0.001, 0),
 (0.001, 1),
 (0.001, 2),
 (0.01, 0)]

### Broadcast dataset

In [None]:
trainingFeaturesBroadcast = spark.sparkContext.broadcast(trainingFeatures)
trainingLabelsBroadcast = spark.sparkContext.broadcast(trainingLabels)

### Запустим параллельную cross-validation

Определим функцию, которая будет запускаться на каждом worker'e, эта функция берет одну пару (1 hyperparameter alpha value + 1 fold index) и тренируем соотвествующую модель. Используем RDD.map для этого.

In [None]:
from sklearn import linear_model

def trainOneModel(alpha, fold):
    """
    Given 1 task (1 hyperparameter alpha value + 1 fold index), train the corresponding model.
    Return: model, score on the fold's test data, task info.
    """
    localTrainingFeatures = trainingFeaturesBroadcast.value
    localTrainingLabels = trainingLabelsBroadcast.value
    trainIndex, valIndex = [], []
    fold_ = 0 
    
    for trainIndex_, valIndex_ in kf.split(localTrainingFeatures):
        if fold_ == fold:
            trainIndex, valIndex = trainIndex_, valIndex_
            break
        fold_ += 1
    X_train, X_val = localTrainingFeatures[trainIndex], localTrainingFeatures[valIndex]
    Y_train, Y_val = localTrainingLabels[trainIndex], localTrainingLabels[valIndex]

    clf = linear_model.Ridge(alpha=alpha)
    clf.fit(X_train, Y_train)
    score = clf.score(X_val, Y_val)
    return clf, score, alpha, fold

In [None]:
trainedModelAndScores = tasksRDD.map(lambda alpha_fold: trainOneModel(alpha_fold[0], alpha_fold[1]))
trainedModelAndScores.cache()
trainedModelAndScores.count()

27

In [None]:
trainingFeaturesBroadcast.unpersist()
trainingLabelsBroadcast.unpersist()

### Соберем результаты для лучшей hyperparameter alpha

In [None]:
allScores = trainedModelAndScores.map(lambda x: (x[1], x[2], x[3])).collect()
avgScores = dict(map(lambda alpha: (alpha, 0.0), alphas))

In [None]:
for score, alpha, fold in allScores:
    avgScores[alpha] += score
for alpha in alphas:
    avgScores[alpha] /= numFolds
avgScores

{0.0: 0.8952171147519478,
 0.0001: 0.8948778676349275,
 0.001: 0.8925672219680543,
 0.01: 0.8794178595619816,
 0.1: 0.7321894107398945,
 1.0: 0.22994935735200964,
 10.0: 0.028707592359056216,
 100.0: 0.002891690528808185,
 1000.0: 0.00023805836686626355}

Теперь у нас есть список alpha values с соотвествующими средними scores, найдем среди них лучший.

In [None]:
bestAlpha = -1
bestScore = -1
for alpha in alphas:
    if avgScores[alpha] > bestScore:
        bestAlpha = alpha
        bestScore = avgScores[alpha]
print('Found best alpha: %g, which gives score: %g' % (bestAlpha, bestScore))

Found best alpha: 0, which gives score: 0.895217


### Обучим финальную модель с лучшим набором гиперпараметров

Так это только 1 таск, то запустим его на драйвере.

In [None]:
tunedClf = linear_model.Ridge(alpha=bestAlpha)
tunedClf.fit(trainingFeatures, trainingLabels)

Ridge(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

### Spark-sklearn
https://github.com/databricks/spark-sklearn

In [None]:
import sys
from spark_sklearn import GridSearchCV

In [None]:
parameters = {"alpha": alphas}
parameters

{'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}

In [None]:
est = linear_model.Ridge()

In [None]:
clf = GridSearchCV(spark.sparkContext, est, parameters, n_jobs=4)

In [None]:
clf.fit(trainingFeatures, trainingLabels)

GridSearchCV(cv=3, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       sc=<SparkContext master=local appName=Colab_pyspark>, scoring=None,
       verbose=0)

In [None]:
clf.best_estimator_

Ridge(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [None]:
clf.cv_results_

{'mean_fit_time': array([0.01274284, 0.00614285, 0.01570074, 0.01767349, 0.00673437,
        0.02195795, 0.01981934, 0.01356967, 0.01164532]),
 'mean_score_time': array([0.00102901, 0.00235232, 0.00088827, 0.00091434, 0.00108878,
        0.00186086, 0.00095224, 0.00089995, 0.00115554]),
 'mean_test_score': array([8.95217115e-01, 8.94877868e-01, 8.92567222e-01, 8.79417860e-01,
        7.32189411e-01, 2.29949357e-01, 2.87075924e-02, 2.89169053e-03,
        2.38058367e-04]),
 'mean_train_score': array([8.95436560e-01, 8.95157310e-01, 8.92643128e-01, 8.79559652e-01,
        7.32217107e-01, 2.29993214e-01, 2.87635299e-02, 2.94913831e-03,
        2.95660670e-04]),
 'param_alpha': masked_array(data=[0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                    1000.0],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': ({'alpha': 0.0},
  {'alpha': 0.0001},
  {'alpha': 0.0

## Model conversion

In [None]:
from spark_sklearn import Converter

In [None]:
converter = Converter(sc)

In [None]:
est = linear_model.LinearRegression()

In [None]:
type(est.fit(trainingFeatures, trainingLabels))

sklearn.linear_model.base.LinearRegression

In [None]:
spark_est = converter.toSpark(est)

In [None]:
type(spark_est)

pyspark.ml.regression.LinearRegressionModel

In [None]:
spark_est.coefficients, spark_est.intercept

(DenseVector([2324452.1579, 106742.4077, 253151.2192, 297410.8291, -1325948.0436, -435920.9465, -1388800.1117, 43757.9117, 7738.0288]),
 6043.163969614121)

In [None]:
from pyspark.sql.types import *
from pyspark.ml.linalg import DenseVector, VectorUDT

In [None]:
schema = StructType(fields=[
    StructField("features", VectorUDT()),
    StructField("labels", IntegerType())
])

In [None]:
test_df = spark.createDataFrame(zip(map(DenseVector, testFeatures), map(int, testLabels)), schema=schema)

In [None]:
test_df.show()

+--------------------+------+
|            features|labels|
+--------------------+------+
|[0.00788666607325...|  9276|
|[0.00709799946592...|  9596|
|[0.00259796058883...|  1580|
|[0.00231960766860...|  2180|
|[0.00139176460116...|   405|
|[0.00269074489558...|  1899|
|[0.00329384288941...|  2743|
|[0.00143815675453...|   503|
|[0.00320105858267...|  2235|
|[0.00190207828825...|   683|
|[0.00185568613488...|  1397|
|[0.00171650967476...|   681|
|[0.00561345055802...|  5529|
|[0.00463921533720...|  3977|
|[0.00468560749057...|  7455|
|[0.00477839179732...|  6558|
|[0.00180929398151...|  1107|
|[0.00139176460116...|   844|
|[0.00199486259499...|   696|
|[0.00236599982197...|  1778|
+--------------------+------+
only showing top 20 rows



In [None]:
spark_est.transform(test_df).show()

+--------------------+------+-------------------+
|            features|labels|         prediction|
+--------------------+------+-------------------+
|[0.00788666607325...|  9276| 11296.620231638692|
|[0.00709799946592...|  9596|  9858.416315002723|
|[0.00259796058883...|  1580| 1865.1855745771973|
|[0.00231960766860...|  2180|  1689.371573917042|
|[0.00139176460116...|   405|-1615.6485644556606|
|[0.00269074489558...|  1899| 2618.6867921700714|
|[0.00329384288941...|  2743|  3363.065116162206|
|[0.00143815675453...|   503|-460.47274751667373|
|[0.00320105858267...|  2235|  2132.080121836687|
|[0.00190207828825...|   683| -222.0904419904209|
|[0.00185568613488...|  1397|  1942.235590016593|
|[0.00171650967476...|   681| 199.81841346109195|
|[0.00561345055802...|  5529|  6781.623232343527|
|[0.00463921533720...|  3977|  5278.026004134679|
|[0.00468560749057...|  7455|  5548.580869659747|
|[0.00477839179732...|  6558|  5731.986774714591|
|[0.00180929398151...|  1107| 1357.6523872960688|


In [None]:
spark.stop()