# PySpark
### Python-интерфейс, вызывающий функции Spark, реализованные на Scala

+ Импорт данных
+ Разделение выборки на обучающую и тестовую
+ Линейная регрессия
+ Результаты

## Импорт данных

In [2]:
from pyspark.mllib.regression import LabeledPoint

# Read the CSV as text, parse every remaining line into floats and build a
# two-column DataFrame. The last CSV field is used as the label and all
# preceding fields as the feature vector.
data = (
    sc.textFile('/user/supp.bda08/boston.csv')
    # Drop every line containing 'target' (presumably the header row).
    .filter(lambda line: 'target' not in line)
    .map(lambda line: line.split(","))
    # Build a real list: on Python 3 `map()` returns a lazy iterator that
    # supports neither indexing nor slicing, which the next stage needs.
    .map(lambda fields: [float(value) for value in fields])
    .map(lambda values: LabeledPoint(values[-1], values[:-1]))
    .toDF(['features', 'label'])
)

data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.00632,18.0,2.3...| 24.0|
|[0.02731,0.0,7.07...| 21.6|
|[0.02729,0.0,7.07...| 34.7|
|[0.03237,0.0,2.18...| 33.4|
|[0.06905,0.0,2.18...| 36.2|
+--------------------+-----+
only showing top 5 rows



## Разделение выборки на обучающую и тестовую

In [3]:
# Random split with weights 0.85 (train) and 0.15 (test); the seed is fixed at 11.
train, test = data.randomSplit(weights=[0.85, 0.15], seed=11)

## Линейная регрессия

In [4]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor

# Configure the linear regression estimator.
lr = LinearRegression(
    maxIter=10,           # maximum number of optimizer iterations
    regParam=0.3,         # regularization strength
    elasticNetParam=0.8,  # elastic-net mixing parameter (see Spark ML docs)
)

# Fit on the training split, then add predictions for the test split.
lrModel = lr.fit(train)
results = lrModel.transform(test)



## Результаты

In [5]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the test-set predictions with RMSE, comparing the "prediction"
# column against the "label" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(results)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Preview the first few rows of the predictions.
results.show(5)

Root Mean Squared Error (RMSE) on test data = 5.25262
+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[0.02055,85.0,0.7...| 24.7| 25.94208720759296|
|[0.02187,60.0,2.9...| 31.1|30.976907086911904|
|[0.0351,95.0,2.68...| 48.5|37.224301270450525|
|[0.03584,80.0,3.3...| 23.5| 28.56204733440513|
|[0.05188,0.0,4.49...| 22.5| 22.08488638608624|
+--------------------+-----+------------------+
only showing top 5 rows

