In [19]:
import numpy
import pandas
import pyspark
from sklearn import datasets

from pybda.glm import GLM
from pybda.spark.features import assemble

In [7]:
conf = (pyspark.SparkConf()
         .setMaster("local")
         .set("spark.driver.memory", "1g")
         .set("spark.executor.memory", "1g"))
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [29]:
iris = datasets.load_iris()


features = ["sl", "sw", "pl", "pw"]
X = iris.data[iris.target < 2, :4]
mu = X.dot(numpy.array([-1, 2, -2, 1]))
eta = 1 / (1 + numpy.exp(-mu))
y_log = numpy.random.binomial(1, eta)
df = pandas.DataFrame(
  data=numpy.column_stack((X,y_log)),
  columns=features + [response])


In [23]:
features = list(boston.feature_names)
response =  "response"
df = pandas.DataFrame(
  data=numpy.column_stack((boston.data, boston.target[:,numpy.newaxis])),
  columns=features + [response])

In [17]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,response
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [59]:
spark_df = spark.createDataFrame(df)
spark_df = assemble(spark_df, features, True)

In [60]:
spark_df.take(5)

[Row(response=0.0, features=DenseVector([5.1, 3.5, 1.4, 0.2])),
 Row(response=0.0, features=DenseVector([4.9, 3.0, 1.4, 0.2])),
 Row(response=0.0, features=DenseVector([4.7, 3.2, 1.3, 0.2])),
 Row(response=0.0, features=DenseVector([4.6, 3.1, 1.5, 0.2])),
 Row(response=1.0, features=DenseVector([5.0, 3.6, 1.4, 0.2]))]

In [33]:
model = GLM(spark, response, features, family="binomial")
fit = model.fit(spark_df)

Could not compute p-values, t-values and SEs. Possibly due to singular vcov.


In [37]:
fit._GLMFit__model.summary.pValues

AttributeError: 'BinaryLogisticRegressionTrainingSummary' object has no attribute 'pValues'

In [61]:
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(family="binomial", link="logit")
glr.setLabelCol(response)
model = glr.fit(spark_df)
summary = model.summary

In [63]:
model.coefficients

DenseVector([-3.8447, 4.6132, 0.2264, 2.0322])

In [64]:
spark_df = model.transform(spark_df)

In [65]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol=response)

In [66]:
f1 = evaluator.evaluate(spark_df, {evaluator.metricName: "f1"})
accuracy = evaluator.evaluate(spark_df, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate( spark_df, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(spark_df, {evaluator.metricName: "weightedRecall"})

In [67]:
spark_df.take(5)

[Row(response=0.0, features=DenseVector([5.1, 3.5, 1.4, 0.2]), prediction=0.20282402802526672),
 Row(response=0.0, features=DenseVector([4.9, 3.0, 1.4, 0.2]), prediction=0.05183821406417523),
 Row(response=0.0, features=DenseVector([4.7, 3.2, 1.3, 0.2]), prediction=0.22487881331254464),
 Row(response=0.0, features=DenseVector([4.6, 3.1, 1.5, 0.2]), prediction=0.21942236593200545),
 Row(response=1.0, features=DenseVector([5.0, 3.6, 1.4, 0.2]), prediction=0.37216302656941064)]

In [69]:
spark.stop()