In [1]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

In [2]:
# Toy training set: each row is a (label, features) pair, where the label is
# 0.0 or 1.0 and the features are a dense vector of three floats.
training = spark.createDataFrame(
    [
        (1.0, Vectors.dense(0.0, 1.1, 0.1)),
        (0.0, Vectors.dense(2.0, 1.0, -1.0)),
        (0.0, Vectors.dense(2.0, 1.3, 1.0)),
        (1.0, Vectors.dense(0.0, 1.2, -0.5)),
    ],
    schema=["label", "features"],
)

In [3]:
# Display the training DataFrame to check the label and features columns.
training.show()

In [4]:
# Create a LogisticRegression instance. This instance is an Estimator:
# nothing is trained here; a model is only produced once fit() is called on it.
# maxIter and regParam are set at construction time.
lr = LogisticRegression(maxIter=10, regParam=0.01)

In [5]:
# explainParams() returns a single multi-line string documenting the estimator's
# parameters. Print it so the line breaks render; leaving it as a bare last
# expression would show the string's repr with literal "\n" escapes instead.
print(lr.explainParams())

In [6]:
# Learn a LogisticRegression model from the training DataFrame, using the
# parameters currently stored in lr.
# fit() on the Estimator returns the fitted model (a Transformer); it is bound
# to model1 and applied to the test data further down via transform().
model1 = lr.fit(training)

In [7]:
# Test rows, laid out like the training DataFrame: (label, features) with a
# dense three-element feature vector per row.
test = spark.createDataFrame(
    [
        (1.0, Vectors.dense(-1.0, 1.5, 1.3)),
        (0.0, Vectors.dense(3.0, 2.0, -0.1)),
        (1.0, Vectors.dense(0.0, 2.2, -1.5)),
    ],
    schema=["label", "features"],
)

In [8]:
# Apply the fitted model1 to the test DataFrame; the result is kept in `prediction`.
prediction = model1.transform(test)

In [9]:
# Display the rows of the transformed test DataFrame.
prediction.show()

In [10]:
# Show the param map extracted from model1 (last expression, so the notebook displays it).
model1.extractParamMap()

In [11]:
# A param map is a plain Python dict keyed by the estimator's parameter
# attributes. This one holds alternative values for three of lr's parameters,
# which differ from the maxIter/regParam given when lr was constructed.
paramMap = {
    lr.maxIter: 20,
    lr.regParam: 0.1,
    lr.threshold: 0.55,
}

paramMap

In [12]:
# Param maps are ordinary Python dicts, so they can be merged with the usual dict tools.
paramMap2 = {lr.probabilityCol: "myProbability"}  # rename the probability output column

# Start from a shallow copy so paramMap itself stays untouched, then overlay paramMap2.
paramMapCombined = dict(paramMap)
paramMapCombined.update(paramMap2)

paramMapCombined

In [13]:
# Fit the same estimator again, this time passing paramMapCombined as the second
# argument to fit(). Presumably these values take precedence over the ones stored
# on lr for this fit only -- confirm against the pyspark Estimator.fit docs.
model2 = lr.fit(training, paramMapCombined)

In [14]:
# Show model2's param map, for comparison with the one displayed for model1.
model2.extractParamMap()

In [15]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [16]:
# Labelled training documents, one (id, text, label) tuple per row.
# NOTE(review): this rebinds `training`, replacing the (label, features)
# DataFrame defined near the top of the notebook with one that has a different
# schema. Re-running an earlier cell such as `lr.fit(training)` after this point
# will see this text DataFrame instead.
training = spark.createDataFrame(
    [
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [17]:
# Display the training documents (id, text, label).
training.show()

In [18]:
# Tokenizer stage: configured to read the "text" column and write its output to "words".
tokenizer = Tokenizer(inputCol="text", outputCol="words")

In [19]:
# HashingTF stage: its input column is whatever the tokenizer writes to (queried via
# getOutputCol() rather than repeating the column name), and it writes to "features".
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

In [20]:
# Classifier used as the last pipeline stage.
# NOTE(review): this rebinds `lr`; the param maps built in earlier cells are keyed
# by attributes of the previous LogisticRegression instance.
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Chain the three stages in order: tokenizer -> hashingTF -> lr.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        lr,
    ]
)

# Fit the whole pipeline on the training documents.
model = pipeline.fit(training)

In [21]:
# Unlabelled test documents, one (id, text) tuple per row.
# NOTE(review): this rebinds `test`, replacing the (label, features) test
# DataFrame created earlier in the notebook.
test = spark.createDataFrame(
    [
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [22]:
# Run the fitted pipeline model over the test documents; the result is kept in `pred`.
pred = model.transform(test)

In [23]:
# Display the rows produced by the pipeline model for the test documents.
pred.show()