In [1]:
# spark.stop()
# creating Spark session

from pyspark.sql import SparkSession

# Build (or reuse) the single SparkSession used by the whole notebook.
# The builder options have to be given on the FIRST getOrCreate() call:
# once a session exists, later getOrCreate() calls hand back that same
# session, so master / appName / spark.ui.port set afterwards cannot be
# applied to the already-running context.
spark = (
    SparkSession.builder
    .master("local")
    .appName("RDD")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

# SparkContext handle, used further down when the SQLContext is created.
sc = spark.sparkContext

# Keep the session as the last expression so Jupyter renders its summary.
spark

## SVM 

In [2]:
from pyspark.ml.classification import LinearSVC

# Read the LIBSVM-formatted sample file into a Spark DataFrame.
training = (
    spark.read
    .format("libsvm")
    .load("sample_libsvm_data.txt")
)

# Linear SVM estimator: 10 optimizer iterations, regularization 0.1.
lsvc = LinearSVC(regParam=0.1, maxIter=10)

# Train on the full sample set.
lsvcModel = lsvc.fit(training)

# Report the learned weights and bias of the fitted model.
print("Coefficients: " + str(lsvcModel.coefficients))
print("Intercept: " + str(lsvcModel.intercept))

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.00015154081891400172,-3.4357432628275696e-05,6.8868723771654e-05,0.0005825396368790324,0.0002658667437974877,-5.4448990232205866e-06,-0.000410876298911309,-0.00023771334401618933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0015051243197375345,0.0005056741785679464,0.0007739871946118443,-7.439317729362511e-05,2.2395429551533054e-07,2.1502767568913162e-05,4.000155795906807e-05,2.841045988826015e-05,1.2172703998609027e-05,-1.4702408529921009e-05,-4.00596456869388e-05,3.0693747761902103e-06,0.00015395475863074347,0.00015205858963404883,-0.00021785419667457335,0.0,0.0,0.0,0.0,0.

In [3]:
# Print the first 50 training rows without shortening the cell contents.
training.show(n=50, truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [4]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
import pandas as pd

In [5]:
# Fetch the scikit-learn breast-cancer dataset object.
bc = load_breast_cancer()

# One pandas column per feature, plus the class target as a trailing 'label' column.
df_bc = pd.DataFrame(bc.data, columns=bc.feature_names).assign(label=bc.target)

In [6]:
# Inspect the raw dataset object returned by load_breast_cancer() (its 'data', 'target', ... entries).
bc

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [7]:
# Plain-text dump of the pandas frame built above (feature columns plus 'label').
print(df_bc)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [8]:
# Wrap the SparkContext `sc` obtained from the SparkSession in the first cell.
sqlContext = SQLContext(sc)

# Convert the pandas frame (features + 'label') into a Spark DataFrame.
data = sqlContext.createDataFrame(df_bc)

# printSchema() writes the schema itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
data.printSchema()



root
 |-- mean radius: double (nullable = true)
 |-- mean texture: double (nullable = true)
 |-- mean perimeter: double (nullable = true)
 |-- mean area: double (nullable = true)
 |-- mean smoothness: double (nullable = true)
 |-- mean compactness: double (nullable = true)
 |-- mean concavity: double (nullable = true)
 |-- mean concave points: double (nullable = true)
 |-- mean symmetry: double (nullable = true)
 |-- mean fractal dimension: double (nullable = true)
 |-- radius error: double (nullable = true)
 |-- texture error: double (nullable = true)
 |-- perimeter error: double (nullable = true)
 |-- area error: double (nullable = true)
 |-- smoothness error: double (nullable = true)
 |-- compactness error: double (nullable = true)
 |-- concavity error: double (nullable = true)
 |-- concave points error: double (nullable = true)
 |-- symmetry error: double (nullable = true)
 |-- fractal dimension error: double (nullable = true)
 |-- worst radius: double (nullable = true)
 |-- worst 

In [9]:
# Names of the feature columns, taken from the dataset object.
features = bc.feature_names

# Combine all feature columns into a single vector column named 'features'.
va = VectorAssembler(outputCol="features", inputCols=features)

# Keep only the two columns the classifier reads: the vector and the label.
va_df = va.transform(data).select("features", "label")
va_df.show(n=50)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[17.99,10.38,122....|    0|
|[20.57,17.77,132....|    0|
|[19.69,21.25,130....|    0|
|[11.42,20.38,77.5...|    0|
|[20.29,14.34,135....|    0|
|[12.45,15.7,82.57...|    0|
|[18.25,19.98,119....|    0|
|[13.71,20.83,90.2...|    0|
|[13.0,21.82,87.5,...|    0|
|[12.46,24.04,83.9...|    0|
|[16.02,23.24,102....|    0|
|[15.78,17.89,103....|    0|
|[19.17,24.8,132.4...|    0|
|[15.85,23.95,103....|    0|
|[13.73,22.61,93.6...|    0|
|[14.54,27.54,96.7...|    0|
|[14.68,20.13,94.7...|    0|
|[16.13,20.68,108....|    0|
|[19.81,22.15,130....|    0|
|[13.54,14.36,87.4...|    1|
|[13.08,15.71,85.6...|    1|
|[9.504,12.44,60.3...|    1|
|[15.34,14.26,102....|    0|
|[21.16,23.04,137....|    0|
|[16.65,21.38,110....|    0|
|[17.14,16.4,116.0...|    0|
|[14.58,21.53,97.4...|    0|
|[18.61,20.25,122....|    0|
|[15.3,25.27,102.4...|    0|
|[17.57,15.05,115....|    0|
|[18.63,25.11,124....|    0|
|[11.84,18.7,7

In [10]:
# 90/10 train/test split. The seed is fixed (same value as the other splits in
# this notebook) so the split, and every metric derived from it, is
# reproducible across runs.
train, test = va_df.randomSplit([0.9, 0.1], seed=1234)

In [11]:
# Configure a LinearSVC (label column 'label', up to 50 iterations) and fit it
# on the training split; `lsvc` ends up holding the fitted model.
lsvc = LinearSVC(maxIter=50, labelCol="label").fit(train)

In [12]:
# Apply the fitted model to the test split; this adds the rawPrediction and
# prediction columns shown below.
pred = lsvc.transform(test)
pred.show(n=20)

+--------------------+-----+--------------------+----------+
|            features|label|       rawPrediction|prediction|
+--------------------+-----+--------------------+----------+
|[6.981,13.43,43.7...|    1|[-17.097935091095...|       1.0|
|[8.196,16.84,51.7...|    1|[-13.878875195172...|       1.0|
|[9.029,17.33,58.7...|    1|[1.31351407490927...|       0.0|
|[12.19,13.29,79.0...|    1|[-9.2160691132490...|       1.0|
|[12.31,16.52,79.1...|    1|[-7.3643803360363...|       1.0|
|[12.36,18.54,79.0...|    1|[-6.5780483175796...|       1.0|
|[12.45,15.7,82.57...|    0|[3.28057736294740...|       0.0|
|[13.05,19.31,82.6...|    1|[-9.4805831813953...|       1.0|
|[14.42,19.77,94.4...|    0|[0.18232310863180...|       0.0|
|[14.5,10.89,94.28...|    1|[-4.4415255790189...|       1.0|
|[14.54,27.54,96.7...|    0|[11.2891989231919...|       0.0|
|[14.78,23.94,97.4...|    0|[6.34046613186484...|       0.0|
|[15.1,22.02,97.26...|    0|[3.94809261254959...|       0.0|
|[15.46,19.48,101....|  

In [13]:
# Accuracy of the predictions on the test split ('label' vs 'prediction').
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
acc = evaluator.evaluate(pred)
print("Prediction Accuracy: ", acc)

# Collect labels and predictions in ONE Spark job so both lists come from the
# same rows in the same order, and unpack the Row objects into plain scalars
# before handing them to scikit-learn.
label_pred_rows = pred.select("label", "prediction").collect()
y_orig = [row["label"] for row in label_pred_rows]
y_pred = [row["prediction"] for row in label_pred_rows]

# Rows of the matrix are true classes, columns are predicted classes.
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

Prediction Accuracy:  0.9710144927536232
Confusion Matrix:
[[26  1]
 [ 1 41]]


## Naive Bayes

In [14]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Read the LIBSVM-formatted sample file.
data = (
    spark.read
    .format("libsvm")
    .load("sample_libsvm_data.txt")
)

# 80/20 train/test split with a fixed seed.
splits = data.randomSplit([0.8, 0.2], seed=1234)
train, test = splits

# Multinomial Naive Bayes with smoothing 1.0.
nb = NaiveBayes(modelType="multinomial", smoothing=1.0)

# Fit on the training split.
model = nb.fit(train)

# Score the test split and display a sample of the predictions.
predictions = model.transform(test)
predictions.show()

# Accuracy of 'prediction' against 'label' on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+-----+--------------------+--------------------+-----------+----------+
|label|            features|       rawPrediction|probability|prediction|
+-----+--------------------+--------------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|[-176798.45359013...|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,124...|[-189158.41593609...|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|[-210439.09448849...|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|[-171124.40702548...|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|[-213017.59308161...|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|[-183431.70141377...|  [1.0,0.0]|       0.0|
|  0.0|(692,[128,129,130...|[-246449.63384784...|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|[-144270.60315757...|  [1.0,0.0]|       0.0|
|  1.0|(692,[100,101,102...|[-144997.52281187...|  [0.0,1.0]|       1.0|
|  1.0|(692,[123,124,125...|[-138655.43253299...|  [0.0,1.0]|       1.0|
|  1.0|(692,[124,125,126...|[-128503.73534101...|  

In [15]:
# data.show()

## Multilayer Perceptron

In [16]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Read the multiclass LIBSVM sample file.
data = (
    spark.read
    .format("libsvm")
    .load("sample_multiclass_classification_data.txt")
)

# 60/40 train/test split with a fixed seed.
splits = data.randomSplit([0.6, 0.4], seed=1234)
train, test = splits

# Network topology: 4 input features, hidden layers of 5 and 4 units,
# and 3 output classes.
layers = [4, 5, 4, 3]

# Trainer configuration for the perceptron.
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)

# Fit on the training split.
model = trainer.fit(train)

# Score the test split and report accuracy of 'prediction' vs 'label'.
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.9523809523809523


In [17]:
# Print the first 50 rows of the multiclass sample without shortening the vectors.
data.show(n=50, truncate=False)

+-----+-----------------------------------------------------------+
|label|features                                                   |
+-----+-----------------------------------------------------------+
|1.0  |(4,[0,1,2,3],[-0.222222,0.5,-0.762712,-0.833333])          |
|1.0  |(4,[0,1,2,3],[-0.555556,0.25,-0.864407,-0.916667])         |
|1.0  |(4,[0,1,2,3],[-0.722222,-0.166667,-0.864407,-0.833333])    |
|1.0  |(4,[0,1,2,3],[-0.722222,0.166667,-0.694915,-0.916667])     |
|0.0  |(4,[0,1,2,3],[0.166667,-0.416667,0.457627,0.5])            |
|1.0  |(4,[0,2,3],[-0.833333,-0.864407,-0.916667])                |
|2.0  |(4,[0,1,2,3],[-1.32455E-7,-0.166667,0.220339,0.0833333])   |
|2.0  |(4,[0,1,2,3],[-1.32455E-7,-0.333333,0.0169491,-4.03573E-8])|
|1.0  |(4,[0,1,2,3],[-0.5,0.75,-0.830508,-1.0])                   |
|0.0  |(4,[0,2,3],[0.611111,0.694915,0.416667])                   |
|0.0  |(4,[0,1,2,3],[0.222222,-0.166667,0.423729,0.583333])       |
|1.0  |(4,[0,1,2,3],[-0.722222,-0.166667,-0.8644