In [1]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [2]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session for this notebook, with off-heap memory
# switched on and sized at 10g.
spark = (
    SparkSession.builder
    .appName("Datacamp Pyspark Tutorial")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "10g")
    .getOrCreate()
)

In [3]:
# Toy training set: one tuple per row, in the order (label, feature1, feature2).
data = [
    (0, 1.0, 0.5),
    (1, 2.0, 1.5),
    (0, 0.5, 0.3),
    (1, 2.5, 1.7),
]
df = spark.createDataFrame(data, schema=["label", "feature1", "feature2"])

VectorAssembler is used to combine multiple feature columns into a single vector column.

LogisticRegression is the machine learning model we’re using to predict the label.

The Pipeline object chains together the steps for feature assembly and model training.

In [4]:
# Stage 1: pack the two raw feature columns into a single vector column, "features".
assembler = VectorAssembler(
    inputCols=["feature1", "feature2"],
    outputCol="features",
)

# Stage 2: logistic regression that reads the "features" vector and the "label" column.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

# Chain the stages in order: assemble the features first, then the classifier.
pipeline = Pipeline(
    stages=[assembler, lr],
)

In [5]:
# Fit the whole pipeline on the sample DataFrame.
model = pipeline.fit(df)

# Apply the fitted pipeline to the same DataFrame it was fitted on.
predictions = model.transform(df)

# Print the label, the assembled feature vector, and the predicted class per row.
(
    predictions
    .select("label", "features", "prediction")
    .show()
)

+-----+---------+----------+
|label| features|prediction|
+-----+---------+----------+
|    0|[1.0,0.5]|       0.0|
|    1|[2.0,1.5]|       1.0|
|    0|[0.5,0.3]|       0.0|
|    1|[2.5,1.7]|       1.0|
+-----+---------+----------+

