# Demo: Converting a PySpark logistic regression model to a Python (scikit-learn) model

In [1]:
# Display the active SparkSession. `spark` is not created in any visible cell —
# presumably it is injected by the PySpark kernel/shell (TODO confirm).
spark

In [2]:
import numpy as np
import pandas as pd

from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.types import *

In [3]:
# Helpers
# UDF that unpacks an ML vector column into a plain array<double> column:
# each vector is densified with toArray() and returned as a Python list.
vector_udf = F.udf(lambda vec: vec.toArray().tolist(), ArrayType(DoubleType()))

In [4]:
# Read the LIBSVM-formatted sample file into a DataFrame
# (the output below shows its two columns: `label` and `features`).
libsvm_reader = spark.read.format("libsvm")
data = libsvm_reader.load("inputs/mllib_sample_libsvm_data.txt")

# Preview the first two rows only.
data.show(2)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
+-----+--------------------+
only showing top 2 rows



In [5]:
# Split the rows roughly 70% / 30% into training and test sets.
# A fixed seed keeps the split — and therefore the fitted model and every
# output below — reproducible across "Restart Kernel -> Run All".
(trainData, testData) = data.randomSplit([0.7, 0.3], seed=42)

## Train a PySpark LogisticRegression model

In [6]:
from pyspark.ml.classification import LogisticRegression

# Configure the Spark estimator: at most 10 iterations, regularisation
# strength 0.3, elastic-net mixing 0.8.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit on the training split; the resulting model exposes the learned
# `coefficients` and `intercept` that are copied into sklearn later on.
lrModel = lr.fit(trainData)

In [7]:
# Score the held-out split with the fitted Spark model. The output below shows
# the added columns: rawPrediction, probability and prediction.
predictions = lrModel.transform(testData)

# Preview the first five scored rows.
predictions.show(n=5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[0.53918113792610...|[0.63162190899561...|       0.0|
|  0.0|(692,[122,123,148...|[0.21323955774631...|[0.55310879878587...|       0.0|
|  0.0|(692,[123,124,125...|[0.55931923418656...|[0.63629500958092...|       0.0|
|  0.0|(692,[124,125,126...|[0.10587391217610...|[0.52644378133346...|       0.0|
|  0.0|(692,[125,126,127...|[0.28816512069783...|[0.57154686487438...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



## Convert to a scikit-learn LogisticRegression model

In [8]:
# Initialize a Python (scikit-learn) model.
# The import is aliased so it does not rebind the name `LogisticRegression`,
# which an earlier cell already uses for the PySpark estimator; shadowing it
# would make the notebook depend on cell execution order.
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

py_model = SkLogisticRegression()

In [9]:
# Fit the sklearn model once on a sample so that it is in a "fitted" state;
# a later cell overwrites the learned weights with the Spark ones.
sample_columns = [
    "label",
    vector_udf("features").alias("features"),  # ML vector -> array<double>
    "probability",
]
sample_df = predictions.select(*sample_columns).toPandas()

# Stack the per-row feature arrays into one 2-D matrix; labels stay 1-D.
feats = np.array([np.array(row) for row in sample_df["features"].to_numpy()])
targets = sample_df["label"].to_numpy()

py_model.fit(feats, targets)

LogisticRegression()

In [10]:
# Set coefs and intercept from trained PySpark logistic regression model.
# - `coefficients` is a Spark ML Vector; toArray() is its documented conversion
#   to a dense NumPy array (instead of relying on implicit sequence conversion).
#   It is reshaped to one row because this is a binary model.
# - `intercept` is a plain float on the Spark side; sklearn stores `intercept_`
#   as an array of shape (1,), so it is wrapped accordingly.
py_model.coef_ = lrModel.coefficients.toArray().reshape(1, -1)
py_model.intercept_ = np.asarray(lrModel.intercept, dtype=float).reshape(1)

In [11]:
# Compare Python and Spark results on the same sampled rows.
# sklearn side: class probabilities from the model carrying the Spark weights.
py_preds = py_model.predict_proba(feats)

# Spark side: turn the per-row `probability` vectors into one 2-D array.
spark_prob_rows = [np.array(vec) for vec in sample_df["probability"].to_numpy()]
spark_preds = np.array(spark_prob_rows)

# True when both probability matrices agree within np.allclose's default tolerances.
np.allclose(py_preds, spark_preds)

True