# In [None]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StringType, IntegerType, TimestampType
from pyspark.sql import functions as F
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler, CountVectorizer, StringIndexer, IndexToString

# Train a logistic-regression pipeline on the housing training set, save the
# fitted pipeline, and print the predictions next to the true values.
#
# NOTE(review): `spark` is not created in this cell -- presumably it is the
# session object provided by the notebook / pyspark shell; confirm.

# Load the training data: comma-separated CSV with a header row, column types
# inferred from the data.
dftr = spark.read.load("data_train_py.csv",
                       format="csv", sep=",", inferSchema="true", header="true")

# median_house_value is treated as the target, by analogy with how "segment"
# was used in the lesson.

# Categorical features: index the strings, then one-hot encode the indices.
categoricalColumns = ['ocean_proximity']
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(
        inputCol=categoricalCol,
        outputCol=categoricalCol + 'Index',
    ).setHandleInvalid("keep")
    encoder = OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[categoricalCol + "classVec"],
    ).setHandleInvalid("keep")
    stages += [stringIndexer, encoder]

# Label: every distinct median_house_value becomes one class index.
# The indexer is fitted exactly ONCE and the fitted model itself is used as the
# pipeline stage. Previously the unfitted estimator was placed in the pipeline
# and fitted again by pipeline.fit(), while IndexToString received the labels
# of a separate, standalone fit. Two independent fits cost an extra pass over
# the data and are not guaranteed to order equally frequent labels the same
# way, which would make IndexToString decode predictions to the wrong value.
# NOTE(review): tie-break behaviour depends on the Spark version -- confirm.
# NOTE(review): with handleInvalid="keep" invalid labels go to an additional
# bucket that is not present in `.labels`; if the classifier ever predicts that
# bucket, IndexToString cannot decode it. Confirm "keep" is intended here.
label_stringIdx = StringIndexer(
    inputCol='median_house_value',
    outputCol='label',
).setHandleInvalid("keep")
label_stringIdx_fit = label_stringIdx.fit(dftr)
stages += [label_stringIdx_fit]

# Assemble the one-hot vectors and the numeric columns into a single
# "features" vector column.
numericCols = ['housing_median_age', 'total_rooms', 'total_bedrooms', 'median_income']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features").setHandleInvalid("keep")
stages += [assembler]

# Classifier over the assembled features.
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
stages += [lr]

# Map the predicted class index back to the original median_house_value string,
# using the labels of the very same fitted indexer that produced 'label'.
indexToStringEstimator = (
    IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predict_value")
    .setLabels(label_stringIdx_fit.labels)
)
stages += [indexToStringEstimator]

# Already-fitted stages (the label indexer model, IndexToString) are used as-is
# by Pipeline.fit; only the remaining estimators are fitted here.
pipeline = Pipeline().setStages(stages)
pipelineModel = pipeline.fit(dftr)

# Save the fitted pipeline model (to HDFS), overwriting any previous copy.
pipelineModel.write().overwrite().save("my_LR_model8")

# For illustration: show true vs. predicted values on the training data.
pipelineModel.transform(dftr).select("median_house_value", "predict_value").show(100)
