In [63]:
import pandas as pd
from sklearn.datasets import load_boston

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Build (or reuse) a Spark session against a local master with 4 threads.
# NOTE(review): with a local[...] master the spark.executor.* settings below
# likely have no effect -- confirm before relying on them.
executor_memory = f'{int(2000/4.4)}mb'  # evaluates to '454mb'

spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("lesson_6")
    .config("spark.executor.instances", 2)
    .config("spark.executor.memory", executor_memory)
    .config("spark.executor.cores", 2)
    .getOrCreate()
)

In [59]:
# Load the Boston housing data into a pandas frame: one column per feature
# plus a 'target' column holding the regression label.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs against an older scikit-learn -- confirm the
# pinned version or switch the data source.
data = load_boston()
dataset = pd.DataFrame(data['data'], columns=data['feature_names'])
dataset['target'] = data['target']

# Convert to a Spark DataFrame and split it roughly 70/30 into train and test.
# The split is random, so a fixed seed is passed to keep the partition (and
# every metric computed from it) repeatable across kernel restarts.
df = spark.createDataFrame(dataset)
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [60]:
# Assemble all feature columns into a single vector column named "features".
va = VectorAssembler(inputCols=data['feature_names'], outputCol="features")

# Random forest regressor trained on that vector to predict the 'target' column.
# The seed is pinned explicitly so the fitted forest does not depend on the
# library's default seed and the results can be reproduced run to run.
lr = RandomForestRegressor(featuresCol='features', labelCol='target', seed=42)

In [61]:
# Chain the feature assembler and the regressor into a single pipeline.
pipeline = Pipeline(stages=[va, lr])

# Fit on the training split, then score that same split.
model = pipeline.fit(train)
train_predict = model.transform(train)

# Score the held-out test split and print predictions next to the true labels.
test_predict = model.transform(test)
shown_columns = ('features', 'target', 'prediction')
test_predict.select(*shown_columns).show(truncate=False)

+---------------------------------------------------------------------------+------+------------------+
|features                                                                   |target|prediction        |
+---------------------------------------------------------------------------+------+------------------+
|[0.0136,75.0,4.0,0.0,0.41,5.888,47.6,7.3197,3.0,469.0,21.1,396.9,14.8]     |18.9  |19.9396683595725  |
|[0.02055,85.0,0.74,0.0,0.41,6.383,35.7,9.1876,2.0,313.0,17.3,396.9,5.77]   |24.7  |23.139752879982176|
|[0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03]  |34.7  |38.43440655504626 |
|[0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21]   |28.7  |23.491556958326186|
|[0.03359,75.0,2.95,0.0,0.428,7.024,15.8,5.4011,3.0,252.0,18.3,395.62,1.98] |34.9  |37.03165499155128 |
|[0.04294,28.0,15.04,0.0,0.464,6.249,77.3,3.615,4.0,270.0,18.2,396.9,10.59] |20.6  |21.960818315680413|
|[0.04684,0.0,3.41,0.0,0.489,6.417,66.1,3.0923,2.0,270.0,17.8,39

In [76]:
# Compute mean absolute error on both splits. The metric is overridden per
# call through a param map, so the evaluator's own metricName is left as-is.
evaluator = RegressionEvaluator(labelCol='target')
mae_override = {evaluator.metricName: 'mae'}

mae_train, mae_test = (
    evaluator.evaluate(predictions, mae_override)
    for predictions in (train_predict, test_predict)
)

In [77]:
# Print the train/test MAE; the emitted text (including leading/trailing
# whitespace) is kept exactly as before.
report = (
    "\n"
    "    Scores:: \n"
    f"        train: {mae_train}, \n"
    f"        test: {mae_test}\n"
    "    "
)
print(report)


    Scores:: 
        train: 1.836783453413188, 
        test: 2.459304293044101
    
