In [1]:
# Load the zip-code records; no schema is supplied, so Spark infers one from the JSON.
zpis = spark.read.format('json').load('/FileStore/tables/zips.json')

In [2]:
# Inspect the column names and types Spark inferred when reading the JSON file.
zpis.printSchema()

In [3]:
from pyspark.sql.functions import to_json, from_json, col, struct, lit
from pyspark.sql.types import StructType, StructField
from pyspark.ml.linalg import VectorUDT

# Build the JSON layout that VectorUDT deserialises from, using the `loc`
# array as the vector's values.
vector_struct = struct(
    lit(1).alias("type"),  # type 1 is dense, type 0 is sparse
    col("loc").alias("values"),
)
# Wrap the vector under a single field "v" and serialise the row to a JSON string.
json_vec = to_json(struct(vector_struct.alias("v")))

# Schema used to parse that JSON string back: one field "v" typed as an ML vector.
schema = StructType([StructField("v", VectorUDT())])

# Round-trip through JSON so `loc` comes back as a vector column.
parsed_vector = from_json(json_vec, schema).getItem("v")
with_parsed_vector = zpis.withColumn("parsed_vector", parsed_vector)

with_parsed_vector.show()

In [4]:
# NOTE(review): `zpis` has not been reassigned since it was loaded (the parsed
# vector column lives on `with_parsed_vector`), so this prints the same schema
# as the earlier printSchema() call.
zpis.printSchema()

In [5]:
from pyspark.sql.types import IntegerType

# Cast `pop` to a 32-bit integer column.
# DataFrames are immutable: withColumn returns a new frame, so the result must
# be bound back to `zpis`, otherwise the cast is silently discarded and every
# later cell keeps using the un-cast column.
zpis = zpis.withColumn('pop', zpis['pop'].cast(IntegerType()))

In [6]:
# Preview the rows of `zpis` as a text table.
zpis.show()

In [7]:
import pyspark.ml.feature as ft

In [8]:
# Index the three string columns so they can be assembled into a numeric
# feature vector.
#
# handleInvalid='keep': the indexers are fitted on the training split only and
# then applied to the test split. With the default ('error'), any value that
# did not occur in the training rows raises when the test output is evaluated;
# 'keep' maps such values to one extra index instead.
# NOTE(review): `_id` looks like a per-row identifier, in which case most test
# values would be unseen and the column carries little signal -- confirm it
# belongs in the feature set.
transformed_id = ft.StringIndexer(inputCol='_id', outputCol='id_en',
                                  handleInvalid='keep')
transformed_city = ft.StringIndexer(inputCol='city', outputCol='city_en',
                                    handleInvalid='keep')
transformed_state = ft.StringIndexer(inputCol='state', outputCol='state_en',
                                     handleInvalid='keep')

In [9]:
# Combine the three indexed columns into a single vector column named 'features'.
featurecreator = ft.VectorAssembler(
    inputCols=['id_en', 'city_en', 'state_en'],
    outputCol='features',
)

In [10]:
import pyspark.ml.classification as cl

In [11]:
# Classifier stage: regularised logistic regression capped at 10 iterations.
# NOTE(review): `pop` is used as the class label here; confirm that treating
# it as a categorical target (rather than a regression target) is intended.
logistic = cl.LogisticRegression(
    labelCol='pop',
    maxIter=10,
    regParam=0.01,
)

In [12]:
from pyspark.ml import Pipeline

In [13]:

# Stage order matters: the three indexers produce the columns the assembler
# consumes, and the classifier is the final stage.
pipeline = Pipeline(stages=[
    transformed_id,
    transformed_city,
    transformed_state,
    featurecreator,
    logistic,
])

In [14]:
# 70/30 train/test split with a fixed seed for repeatability.
zpis_train, zpis_test = zpis.randomSplit(weights=[0.7, 0.3], seed=100)

In [15]:
# Fit every pipeline stage on the training split only; the result is the fitted pipeline model.
model = pipeline.fit(zpis_train)

In [16]:
# Apply the fitted pipeline to the held-out split.
test_out = model.transform(zpis_test)