In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark

In [2]:
DClass = spark  \
    .read   \
    .option("inferSchema", "true")  \
    .option("header", "true")   \
    .csv("./data/census.csv")


In [3]:
DReg = spark    \
    .read   \
    .option("inferSchema", "true")  \
    .option("header", "true")   \
    .csv("./data/insurance.csv")

# Classification

## Logistic Regression

In [4]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import RFormula

supervised = RFormula(formula="income ~ . + age + workclass + education + occupation + race + sex")

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [5]:
fittedRF = supervised.fit(DClass)
preparedDF = fittedRF.transform(DClass)
preparedDF.show()

+---+-----------------+------------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+-------------+--------------+------+--------------------+-----+
|age|        workclass|final-weight|    education|education-num|      marital-status|        occupation|  relationship|               race|    sex|capital-gain|capital-loos|hour-per-week|native-country|income|            features|label|
+---+-----------------+------------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+-------------+--------------+------+--------------------+-----+
| 39|        State-gov|     77516.0|    Bachelors|         13.0|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|      2174.0|         0.0|         40.0| United-States| <=50K|(100,[0,5,9,12,25...|  0.0|
| 50| Self-emp-not-inc|     83311.0|    Bachelors|  

In [6]:
lrModel = lr.fit(preparedDF)

In [7]:
# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

Coefficients: (100,[],[])
Intercept: -1.1482462553407051


In [8]:
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")
mlrModel = mlr.fit(preparedDF)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

Multinomial coefficients: 2 X 100 CSRMatrix

Multinomial intercepts: [0.5741225268121906,-0.5741225268121906]


In [9]:
## METRICS

trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)

objectiveHistory:
0.5520112931916317
+---+---+
|FPR|TPR|
+---+---+
|0.0|0.0|
|1.0|1.0|
|1.0|1.0|
+---+---+

areaUnderROC: 0.5


LogisticRegression_69c721d873b2

In [10]:
## METRICS

trainingSummary = mlrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)

objectiveHistory:
0.5520112938845835
0.552011293732214
0.5520112933268099
0.5520112932253791
0.5520112932001097
0.5520112931936949
0.5520112931921582
0.5520112931917337
+---+---+
|FPR|TPR|
+---+---+
|0.0|0.0|
|1.0|1.0|
|1.0|1.0|
+---+---+

areaUnderROC: 0.5


LogisticRegression_69c721d873b2

In [11]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Coefficients: 
1 X 100 CSRMatrix

Intercept: [-1.1482462553407051]
objectiveHistory:
0.5520112931916317
False positive rate by label:
label 0: 1.0
label 1: 0.0
True positive rate by label:
label 0: 1.0
label 1: 0.0
Precision by label:
label 0: 0.7591904425539756
label 1: 0.0
Recall by label:
label 0: 1.0
label 1: 0.0
F-measure by label:
label 0: 0.8631134232991743
label 1: 0.0
Accuracy: 0.7591904425539756
FPR: 0.7591904425539756
TPR: 0.7591904425539756
F-measure: 0.6552674618087769
Precision: 0.5763701280653014
Recall: 0.7591904425539756
