In [1]:
MASTER = "local"
NUM_PROCESSORS = "8"
NUM_EXECUTORS = "4"
NUM_PARTITIONS = 10

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
import pyspark.sql.functions as psf
import json



In [2]:
conf = SparkConf()

conf.set("spark.app.name", "one_part_data")
conf.set("spark.master", MASTER)
conf.set("spark.executor.cores", NUM_PROCESSORS)
conf.set("spark.executor.instances", NUM_EXECUTORS)
conf.set("spark.executor.memory", "6g")
conf.set("spark.locality.wait", "0")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.kryoserializer.buffer.max", "2000")
conf.set("spark.executor.heartbeatInterval", "6000s")
conf.set("spark.network.timeout", "10000000s")
conf.set("spark.shuffle.spill", "true")
conf.set("spark.driver.memory", "15g")
conf.set("spark.driver.maxResultSize", "15g")

<pyspark.conf.SparkConf at 0x7f5dfb2ae470>

In [3]:
from pyspark.sql import SparkSession

# Create SparkSession 
spark = SparkSession.builder \
      .config(conf=conf) \
      .master("local[*]") \
      .appName("SparkByExamples.com") \
      .getOrCreate()
spark

23/01/03 16:01:15 WARN Utils: Your hostname, yagor-pc resolves to a loopback address: 127.0.1.1; using 192.168.0.107 instead (on interface enp7s0)
23/01/03 16:01:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/03 16:01:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [200]:
dataset = spark.read.csv("/home/yagor/Рабочий стол/mipt/lab3/notebook/nutrition_table.csv",header=True,inferSchema=True)

In [201]:
dataset.show()

+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+
|_c0|fat_100g|carbohydrates_100g|sugars_100g|proteins_100g|salt_100g|energy_100g|reconstructed_energy|            g_sum|exceeded|             product|
+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+
|  1|   28.57|             64.29|      14.29|         3.57|      0.0|     2243.0|             2267.85|            96.43|       0|Banana Chips Swee...|
|  2|   17.86|             60.71|      17.86|        17.86|    0.635|     1941.0|             2032.23|96.42999999999999|       0|             Peanuts|
|  3|   57.14|             17.86|       3.57|        17.86|  1.22428|     2540.0|              2835.7|            92.86|       0|Organic Salted Nu...|
|  7|   18.75|             57.81|      15.62|        14.06|   0.1397|     1833.0|             

In [202]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [203]:
feat_cols = [ #'_c0',
 'fat_100g',
 'carbohydrates_100g',
 'sugars_100g',
 'proteins_100g',
 'salt_100g',
 'energy_100g',
 'reconstructed_energy',
 'g_sum',
 'exceeded',
 #'product'
 ]

In [204]:
vec_assembler = VectorAssembler(inputCols = feat_cols, outputCol='features')

In [205]:
final_data = vec_assembler.transform(dataset)

In [206]:
final_data.show()

+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+--------------------+
|_c0|fat_100g|carbohydrates_100g|sugars_100g|proteins_100g|salt_100g|energy_100g|reconstructed_energy|            g_sum|exceeded|             product|            features|
+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+--------------------+
|  1|   28.57|             64.29|      14.29|         3.57|      0.0|     2243.0|             2267.85|            96.43|       0|Banana Chips Swee...|[28.57,64.29,14.2...|
|  2|   17.86|             60.71|      17.86|        17.86|    0.635|     1941.0|             2032.23|96.42999999999999|       0|             Peanuts|[17.86,60.71,17.8...|
|  3|   57.14|             17.86|       3.57|        17.86|  1.22428|     2540.0|              2835.7|            92.86|       0|Organic Sal

In [207]:
from pyspark.ml.feature import StandardScaler

In [208]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

In [209]:
final_data

DataFrame[_c0: int, fat_100g: double, carbohydrates_100g: double, sugars_100g: double, proteins_100g: double, salt_100g: double, energy_100g: double, reconstructed_energy: double, g_sum: double, exceeded: int, product: string, features: vector]

In [210]:
# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(final_data)

In [211]:
# Normalize each feature to have unit standard deviation.
cluster_final_data = scalerModel.transform(final_data)

## train model

In [329]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [331]:
for k in range(2,15):
    kmeans = KMeans(featuresCol='scaledFeatures',k=k)
    model = kmeans.fit(cluster_final_data)
    predictions = model.transform(cluster_final_data)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("With K={}".format(k))
    print("Silhouette with squared euclidean distance = " + str(silhouette))
    print('--'*30)

With K=2
Silhouette with squared euclidean distance = 0.7669581049866521
------------------------------------------------------------
With K=3
Silhouette with squared euclidean distance = 0.5431066472112437
------------------------------------------------------------
With K=4
Silhouette with squared euclidean distance = 0.41997615593243925
------------------------------------------------------------
With K=5
Silhouette with squared euclidean distance = 0.27268251497225104
------------------------------------------------------------
With K=6
Silhouette with squared euclidean distance = 0.28940347341689504
------------------------------------------------------------
With K=7
Silhouette with squared euclidean distance = 0.28943152207863865
------------------------------------------------------------
With K=8
Silhouette with squared euclidean distance = 0.29255586497353264
------------------------------------------------------------
With K=9
Silhouette with squared euclidean distance = 0.3

In [214]:
k = 2

kmeans = KMeans(featuresCol='scaledFeatures', predictionCol='pred_kmeans_cluster', k=k)

In [215]:
modelKMeans = kmeans.fit(cluster_final_data)

In [216]:
predictions = modelKMeans.transform(cluster_final_data)

## 2. Split on train and test and analys

In [217]:
predictions.show()

+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+--------------------+--------------------+-------------------+
|_c0|fat_100g|carbohydrates_100g|sugars_100g|proteins_100g|salt_100g|energy_100g|reconstructed_energy|            g_sum|exceeded|             product|            features|      scaledFeatures|pred_kmeans_cluster|
+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+--------------------+--------------------+-------------------+
|  1|   28.57|             64.29|      14.29|         3.57|      0.0|     2243.0|             2267.85|            96.43|       0|Banana Chips Swee...|[28.57,64.29,14.2...|[1.91358558312757...|                  1|
|  2|   17.86|             60.71|      17.86|        17.86|    0.635|     1941.0|             2032.23|96.42999999999999|       0|             Peanut

In [218]:
evaluator = ClusteringEvaluator(predictionCol='pred_kmeans_cluster')

In [219]:
silhouette = evaluator.evaluate(predictions)

In [220]:
print(f"With k={k} Silhouette with squared euclidean distance = " + str(silhouette))

With k=2 Silhouette with squared euclidean distance = 0.7669581049866521


In [221]:
centers=modelKMeans.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

Cluster Centers:
[0.24806769 0.44311114 0.33126029 0.50451036 0.11772674 0.57940681
 0.54971219 0.61099038 0.        ]
[1.19469736 1.86211665 1.15845923 1.16394279 0.11199035 2.26472
 2.25908682 2.41147805 0.1880228 ]


In [222]:
modelKMeans.transform(cluster_final_data).groupBy('pred_kmeans_cluster').count().show()

+-------------------+-----+
|pred_kmeans_cluster|count|
+-------------------+-----+
|                  1|22500|
|                  0|22528|
+-------------------+-----+



In [223]:
cols = ["_c0", "pred_kmeans_cluster"]
result = predictions.select(*cols)

In [224]:
result.show()

+---+-------------------+
|_c0|pred_kmeans_cluster|
+---+-------------------+
|  1|                  1|
|  2|                  1|
|  3|                  1|
|  7|                  1|
| 12|                  1|
| 15|                  1|
| 16|                  1|
| 19|                  1|
| 20|                  1|
| 21|                  1|
| 22|                  1|
| 23|                  1|
| 24|                  1|
| 26|                  1|
| 27|                  1|
| 28|                  1|
| 29|                  1|
| 31|                  1|
| 32|                  1|
| 34|                  1|
+---+-------------------+
only showing top 20 rows



In [225]:
(trainingData, testData) = predictions.randomSplit([0.7, 0.3])

In [226]:
trainingData.groupBy('pred_kmeans_cluster').count().show()

+-------------------+-----+
|pred_kmeans_cluster|count|
+-------------------+-----+
|                  1|15818|
|                  0|15856|
+-------------------+-----+



In [227]:
testData.count()

13354

In [228]:
trainingData.count()

31674

## 3. Classification model

In [229]:
from pyspark.ml.classification import MultilayerPerceptronClassifier, RandomForestClassifier, LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [230]:
RandomForest = RandomForestClassifier(labelCol="pred_kmeans_cluster", featuresCol="scaledFeatures", predictionCol='pred_from_randomforest_class', numTrees=20, maxDepth=3)


In [231]:
modelRandomForest = RandomForest.fit(trainingData)

In [232]:
# Make predictions.
predictions_RandomForest = modelRandomForest.transform(testData)

In [233]:

# Select example rows to display.
predictions_RandomForest.select("pred_from_randomforest_class", "pred_kmeans_cluster", "scaledFeatures").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="pred_kmeans_cluster", predictionCol="pred_from_randomforest_class", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_RandomForest)
print("Test Error = %g" % (1.0 - accuracy))

+----------------------------+-------------------+--------------------+
|pred_from_randomforest_class|pred_kmeans_cluster|      scaledFeatures|
+----------------------------+-------------------+--------------------+
|                         1.0|                  1|[1.21767539031358...|
|                         1.0|                  1|[0.39852412389251...|
|                         1.0|                  1|[1.17212977615444...|
|                         1.0|                  1|[1.48826992149438...|
|                         1.0|                  1|[3.12590266589302...|
+----------------------------+-------------------+--------------------+
only showing top 5 rows

Test Error = 0.0152014


In [234]:
# Select example rows to display.
predictions_RandomForest.select("pred_from_randomforest_class", "pred_kmeans_cluster", "scaledFeatures", 'rawPrediction', 'probability').show(5)

+----------------------------+-------------------+--------------------+--------------------+--------------------+
|pred_from_randomforest_class|pred_kmeans_cluster|      scaledFeatures|       rawPrediction|         probability|
+----------------------------+-------------------+--------------------+--------------------+--------------------+
|                         1.0|                  1|[1.21767539031358...|[0.22815366508993...|[0.01140768325449...|
|                         1.0|                  1|[0.39852412389251...|[1.23153962061928...|[0.06157698103096...|
|                         1.0|                  1|[1.17212977615444...|[0.22914498327826...|[0.01145724916391...|
|                         1.0|                  1|[1.48826992149438...|[0.22815366508993...|[0.01140768325449...|
|                         1.0|                  1|[3.12590266589302...|[0.61485495816030...|[0.03074274790801...|
+----------------------------+-------------------+--------------------+-----------------

In [235]:
predictions_RandomForest.groupBy('pred_from_randomforest_class').count().show()

+----------------------------+-----+
|pred_from_randomforest_class|count|
+----------------------------+-----+
|                         0.0| 6611|
|                         1.0| 6743|
+----------------------------+-----+



### Train all data

In [236]:
modelRandomForest = RandomForest.fit(predictions)

In [237]:
# Make predictions.
predictions_RandomForest = modelRandomForest.transform(testData)

In [238]:

# Select example rows to display.
predictions_RandomForest.select("pred_from_randomforest_class", "pred_kmeans_cluster", "scaledFeatures").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="pred_kmeans_cluster", predictionCol="pred_from_randomforest_class", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_RandomForest)
print("Test Error = %g" % (1.0 - accuracy))

+----------------------------+-------------------+--------------------+
|pred_from_randomforest_class|pred_kmeans_cluster|      scaledFeatures|
+----------------------------+-------------------+--------------------+
|                         1.0|                  1|[1.21767539031358...|
|                         1.0|                  1|[0.39852412389251...|
|                         1.0|                  1|[1.17212977615444...|
|                         1.0|                  1|[1.48826992149438...|
|                         1.0|                  1|[3.12590266589302...|
+----------------------------+-------------------+--------------------+
only showing top 5 rows

Test Error = 0.0149019


In [239]:
# Select example rows to display.
predictions_RandomForest.select("pred_from_randomforest_class", "pred_kmeans_cluster", "scaledFeatures", 'rawPrediction', 'probability').show(5)

+----------------------------+-------------------+--------------------+--------------------+--------------------+
|pred_from_randomforest_class|pred_kmeans_cluster|      scaledFeatures|       rawPrediction|         probability|
+----------------------------+-------------------+--------------------+--------------------+--------------------+
|                         1.0|                  1|[1.21767539031358...|[0.24694698725151...|[0.01234734936257...|
|                         1.0|                  1|[0.39852412389251...|[0.72138124290125...|[0.03606906214506...|
|                         1.0|                  1|[1.17212977615444...|[0.25618526728129...|[0.01280926336406...|
|                         1.0|                  1|[1.48826992149438...|[0.25743953148578...|[0.01287197657428...|
|                         1.0|                  1|[3.12590266589302...|[0.76189713156136...|[0.03809485657806...|
+----------------------------+-------------------+--------------------+-----------------

In [240]:
predictions_RandomForest.groupBy('pred_from_randomforest_class').count().show()

+----------------------------+-----+
|pred_from_randomforest_class|count|
+----------------------------+-----+
|                         0.0| 6643|
|                         1.0| 6711|
+----------------------------+-----+



## 4. Linear regression model and propability (logostic regression)

In [241]:
# LinRegression = LinearRegression(maxIter=30, regParam=0.1, elasticNetParam=0.1, labelCol="pred_from_randomforest_class", featuresCol="scaledFeatures")

# # Fit the model
# modelLinearRegression = LinRegression.fit(predictions_RandomForest)

# # Print the coefficients and intercept for linear regression
# print("Coefficients: %s" % str(modelLinearRegression.coefficients))
# print("Intercept: %s" % str(modelLinearRegression.intercept))

# # Summarize the model over the training set and print out some metrics
# trainingSummary = modelLinearRegression.summary
# print("numIterations: %d" % trainingSummary.totalIterations)
# print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
# trainingSummary.residuals.show()
# print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
# print("r2: %f" % trainingSummary.r2)

In [242]:
columns_to_drop = ['rawPrediction', 'probability']
predictions_RandomForest = predictions_RandomForest.drop(*columns_to_drop)

In [243]:
LogRegression = LogisticRegression(maxIter=10, regParam=0.1, elasticNetParam=1.0, labelCol="pred_from_randomforest_class", featuresCol="scaledFeatures", probabilityCol="lr_prob", predictionCol='pred_from_logregression_class')

In [244]:
lrModel = LogRegression.fit(predictions_RandomForest)

In [245]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Coefficients: 
1 X 9 CSRMatrix
(0,5) 0.377
(0,6) 0.4166
(0,7) 1.3429
Intercept: [-3.110554177500023]
objectiveHistory:
0.6931342157259093
0.646258270951314
0.4505111552967898
0.4301093899345221
0.4219084264618615
0.41995861358923153
0.4147380024893558
0.41378046200789664
0.4128858275367787
0.41223235884272824
0.41118234851734503
False positive rate by label:
label 0: 0.018775145283862316
label 1: 0.019569471624266144
True positive rate by label:
label 0: 0.9804305283757339
label 1: 0.9812248547161377
Precision by label:
label 0: 0.9810212381382738
label 1: 0.9806403574087863
Recall by label:
label 0: 0.9804305283757339
label 1: 0.9812248547161377
F-measure by label:
label 0: 0.9807257943080862
label 1: 0.9809325189929987
Accuracy: 0.9808297139433878
FPR: 0.019174330851516208
TPR: 0.9808297139433877
F-measure: 0.9808296829826741
Precision: 0.9808298280307712
Recall: 0.9808297139433877


## Create pipeline

In [314]:
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Bucketizer
from pyspark.sql import DataFrame
from typing import Iterable
from pyspark.mllib.evaluation import MulticlassMetrics

In [315]:
# CUSTOM TRANSFORMER ----------------------------------------------------------------
class ColumnDropper(Transformer):
    """
    A custom Transformer which drops all columns that have at least one of the
    words from the banned_list in the name.
    """
    def __init__(self, banned_list: Iterable[str]):
        super(ColumnDropper, self).__init__()
        self.banned_list = banned_list
    def _transform(self, df: DataFrame) -> DataFrame:
        df = df.drop(*[x for x in df.columns if any(y in x for y in self.banned_list)])
        return df

In [316]:
column_dropper = ColumnDropper(banned_list = ['rawPrediction', 'probability'])

In [317]:
vec_assembler = VectorAssembler(inputCols = feat_cols, outputCol='features')
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

In [318]:
k = 2

kmeans = KMeans(featuresCol='scaledFeatures', predictionCol='pred_kmeans_cluster', k=k)

In [319]:
RandomForest = RandomForestClassifier(labelCol="pred_kmeans_cluster", featuresCol="scaledFeatures", predictionCol='pred_from_randomforest_class', numTrees=20, maxDepth=3)


In [328]:
LogRegression = LogisticRegression(maxIter=10, regParam=0.1, elasticNetParam=1.0, labelCol="pred_from_randomforest_class", \
featuresCol="scaledFeatures", probabilityCol="lr_prob", predictionCol='pred_from_logregression_class')

In [321]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3])

In [322]:
pipeline = Pipeline(stages=[vec_assembler, scaler, kmeans, RandomForest, column_dropper, LogRegression])

model = pipeline.fit(trainingData)
output = model.transform(testData)

In [323]:
evaluator = ClusteringEvaluator(predictionCol='pred_kmeans_cluster', featuresCol='scaledFeatures')
silhouette = evaluator.evaluate(output)
print(f"With k={k} Silhouette with squared euclidean distance = " + str(silhouette))
output.groupBy('pred_kmeans_cluster').count().show()

With k=2 Silhouette with squared euclidean distance = 0.5415623545533162
+-------------------+-----+
|pred_kmeans_cluster|count|
+-------------------+-----+
|                  1| 6841|
|                  0| 6872|
+-------------------+-----+



In [324]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="pred_kmeans_cluster", predictionCol="pred_from_randomforest_class", metricName="accuracy")
accuracy = evaluator.evaluate(output)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0118136


In [325]:
y_true = output.select(['pred_kmeans_cluster']).collect()
y_pred = output.select(['pred_from_randomforest_class']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      6872
           1       0.98      0.99      0.99      6841

    accuracy                           0.99     13713
   macro avg       0.99      0.99      0.99     13713
weighted avg       0.99      0.99      0.99     13713



In [326]:
model.stages[5]

LogisticRegressionModel: uid=LogisticRegression_ebd55cf1cd6d, numClasses=2, numFeatures=9

In [327]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(model.stages[5].coefficientMatrix))
print("Intercept: " + str(model.stages[5].interceptVector))

trainingSummary = model.stages[5].summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Coefficients: 
1 X 9 CSRMatrix
(0,5) 0.4205
(0,6) 0.5186
(0,7) 1.2574
Intercept: [-3.1586289909975647]
objectiveHistory:
0.6931003685588204
0.6457369813387426
0.44699065550936784
0.42572991259192006
0.41897441485633996
0.4176530115931187
0.41390590278031647
0.41205438156191804
0.4124578837401556
0.41072719135269103
0.4098039394294418
False positive rate by label:
label 0: 0.007527357834145107
label 1: 0.013736618083322584
True positive rate by label:
label 0: 0.9862633819166774
label 1: 0.9924726421658548
Precision by label:
label 0: 0.9922787438359719
label 1: 0.9866063006979815
Recall by label:
label 0: 0.9862633819166774
label 1: 0.9924726421658548
F-measure by label:
label 0: 0.9892619186234556
label 1: 0.9895307769929365
Accuracy: 0.9893980520517324
FPR: 0.01066202796920009
TPR: 0.9893980520517324
F-measure: 0.9893976485280739
Precision: 0.9894150793439237
Recall: 0.9893980520517324
