In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext(appName = "module3_week4")

In [3]:
! echo $PYSPARK_SUBMIT_ARGS

--deploy-mode client --master local[2] --executor-memory 512m --driver-memory 512m --executor-cores 1 --num-executors 2 --conf spark.driver.maxResultSize=256m pyspark-shell


In [4]:
spark = SparkSession.Builder().getOrCreate() # required for dataframes

## Download a dataset (breast cancer diagnosis)

In [5]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data

--2017-07-31 11:09:06--  https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 124103 (121K) [text/plain]
Saving to: 'wdbc.data.11'


2017-07-31 11:09:07 (158 KB/s) - 'wdbc.data.11' saved [124103/124103]



In [6]:
!head -n 3 wdbc.data

842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [7]:
!wc -l wdbc.data

569 wdbc.data


In [8]:
#1) ID number 
#2) Diagnosis (M = malignant, B = benign) 
#3) Features

In [9]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer

In [10]:
# Load a text file and convert each line to a Row.

data = []

with open("wdbc.data") as infile:
    for line in infile:
        tokens = line.rstrip("\n").split(",")        
        y = tokens[1]
        features = Vectors.dense([float(x) for x in tokens[2:]])        
        
        data.append((y, features))

In [11]:
inputDF = spark.createDataFrame(data, ["label", "features"])

In [12]:
inputDF.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    M|[17.99,10.38,122....|
|    M|[20.57,17.77,132....|
|    M|[19.69,21.25,130....|
|    M|[11.42,20.38,77.5...|
|    M|[20.29,14.34,135....|
|    M|[12.45,15.7,82.57...|
|    M|[18.25,19.98,119....|
|    M|[13.71,20.83,90.2...|
|    M|[13.0,21.82,87.5,...|
|    M|[12.46,24.04,83.9...|
|    M|[16.02,23.24,102....|
|    M|[15.78,17.89,103....|
|    M|[19.17,24.8,132.4...|
|    M|[15.85,23.95,103....|
|    M|[13.73,22.61,93.6...|
|    M|[14.54,27.54,96.7...|
|    M|[14.68,20.13,94.7...|
|    M|[16.13,20.68,108....|
|    M|[19.81,22.15,130....|
|    B|[13.54,14.36,87.4...|
+-----+--------------------+
only showing top 20 rows



In [13]:
stringIndexer = StringIndexer(inputCol = "label", outputCol = "labelIndexed")
si_model = stringIndexer.fit(inputDF)
inputDF2 = si_model.transform(inputDF)

In [14]:
inputDF2.show()

+-----+--------------------+------------+
|label|            features|labelIndexed|
+-----+--------------------+------------+
|    M|[17.99,10.38,122....|         1.0|
|    M|[20.57,17.77,132....|         1.0|
|    M|[19.69,21.25,130....|         1.0|
|    M|[11.42,20.38,77.5...|         1.0|
|    M|[20.29,14.34,135....|         1.0|
|    M|[12.45,15.7,82.57...|         1.0|
|    M|[18.25,19.98,119....|         1.0|
|    M|[13.71,20.83,90.2...|         1.0|
|    M|[13.0,21.82,87.5,...|         1.0|
|    M|[12.46,24.04,83.9...|         1.0|
|    M|[16.02,23.24,102....|         1.0|
|    M|[15.78,17.89,103....|         1.0|
|    M|[19.17,24.8,132.4...|         1.0|
|    M|[15.85,23.95,103....|         1.0|
|    M|[13.73,22.61,93.6...|         1.0|
|    M|[14.54,27.54,96.7...|         1.0|
|    M|[14.68,20.13,94.7...|         1.0|
|    M|[16.13,20.68,108....|         1.0|
|    M|[19.81,22.15,130....|         1.0|
|    B|[13.54,14.36,87.4...|         0.0|
+-----+--------------------+------

### train/test split

In [15]:
(trainingData, testData) = inputDF2.randomSplit([0.7, 0.3], seed = 23)

### Training Decision Tree

In [16]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [17]:
decisionTree = DecisionTreeClassifier(labelCol = "labelIndexed")

In [18]:
dtModel = decisionTree.fit(trainingData)

In [19]:
dtModel.numNodes

29

In [20]:
dtModel.depth

5

In [21]:
dtModel.featureImportances

SparseVector(30, {1: 0.0589, 6: 0.0037, 10: 0.0112, 13: 0.0117, 20: 0.0324, 21: 0.0302, 22: 0.7215, 24: 0.01, 26: 0.0191, 27: 0.1013})

In [22]:
dtModel.numFeatures

30

In [23]:
print dtModel.toDebugString

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4653be4ce1bd9e589b64) of depth 5 with 29 nodes
  If (feature 22 <= 114.2)
   If (feature 27 <= 0.1613)
    If (feature 20 <= 16.57)
     If (feature 27 <= 0.1258)
      If (feature 10 <= 0.9289)
       Predict: 0.0
      Else (feature 10 > 0.9289)
       Predict: 1.0
     Else (feature 27 > 0.1258)
      If (feature 21 <= 32.85)
       Predict: 0.0
      Else (feature 21 > 32.85)
       Predict: 1.0
    Else (feature 20 > 16.57)
     If (feature 1 <= 16.54)
      Predict: 0.0
     Else (feature 1 > 16.54)
      If (feature 24 <= 0.1084)
       Predict: 0.0
      Else (feature 24 > 0.1084)
       Predict: 1.0
   Else (feature 27 > 0.1613)
    If (feature 13 <= 17.67)
     If (feature 1 <= 17.53)
      Predict: 0.0
     Else (feature 1 > 17.53)
      Predict: 1.0
    Else (feature 13 > 17.67)
     Predict: 1.0
  Else (feature 22 > 114.2)
   If (feature 26 <= 0.1904)
    If (feature 1 <= 19.63)
     Predict: 0.0
    Else (feature

In [24]:
predictions = dtModel.transform(testData)

In [25]:
predictions.select('label', 'labelIndexed', 'probability', 'prediction').show()

+-----+------------+--------------------+----------+
|label|labelIndexed|         probability|prediction|
+-----+------------+--------------------+----------+
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|       0.0|
|    B|         0.0|[0.99086757990867...|     

In [26]:
evaluator = MulticlassClassificationEvaluator(labelCol = "labelIndexed", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions)

print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0451977


### GBDT

In [27]:
from pyspark.ml.classification import GBTClassifier

In [28]:
gbdt = GBTClassifier(labelCol = "labelIndexed", featuresCol = "features", maxIter = 100, stepSize = 0.1)

In [29]:
gbdtModel = gbdt.fit(trainingData)

In [30]:
gbdtModel.featureImportances

SparseVector(30, {0: 0.0176, 1: 0.0631, 2: 0.0012, 3: 0.0066, 4: 0.0096, 5: 0.0068, 6: 0.0045, 7: 0.0106, 8: 0.0016, 9: 0.0, 10: 0.006, 11: 0.0025, 12: 0.0039, 13: 0.0077, 14: 0.0032, 15: 0.0088, 16: 0.0057, 17: 0.0007, 18: 0.0034, 19: 0.0023, 20: 0.1718, 21: 0.0266, 22: 0.4246, 23: 0.0839, 24: 0.0214, 25: 0.0126, 26: 0.0476, 27: 0.0419, 28: 0.0011, 29: 0.0026})

In [31]:
gbdtModel.toDebugString

u'GBTClassificationModel (uid=GBTClassifier_4a02b2ac733bb03672df) with 100 trees\n  Tree 0 (weight 1.0):\n    If (feature 22 <= 114.2)\n     If (feature 27 <= 0.1613)\n      If (feature 20 <= 16.57)\n       If (feature 27 <= 0.1258)\n        If (feature 10 <= 0.9289)\n         Predict: -0.9817351598173516\n        Else (feature 10 > 0.9289)\n         Predict: 1.0\n       Else (feature 27 > 0.1258)\n        If (feature 21 <= 32.85)\n         Predict: -0.7894736842105263\n        Else (feature 21 > 32.85)\n         Predict: 1.0\n      Else (feature 20 > 16.57)\n       If (feature 1 <= 16.54)\n        Predict: -1.0\n       Else (feature 1 > 16.54)\n        If (feature 24 <= 0.1084)\n         Predict: -1.0\n        Else (feature 24 > 0.1084)\n         Predict: 1.0\n     Else (feature 27 > 0.1613)\n      If (feature 13 <= 17.67)\n       If (feature 1 <= 17.53)\n        Predict: -1.0\n       Else (feature 1 > 17.53)\n        Predict: 1.0\n      Else (feature 13 > 17.67)\n       Predict: 1.0\

In [32]:
predictions = gbdtModel.transform(testData)
predictions.select('label', 'labelIndexed', 'prediction').show()

+-----+------------+----------+
|label|labelIndexed|prediction|
+-----+------------+----------+
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
+-----+------------+----------+
only showing top 20 rows



In [33]:
evaluator = MulticlassClassificationEvaluator(labelCol = "labelIndexed", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions)

print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0451977


### Random Forest

In [34]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel

In [35]:
rfClassifer = RandomForestClassifier(labelCol = "labelIndexed", numTrees = 100)

In [36]:
rfModel = rfClassifer.fit(trainingData)

In [37]:
rfModel.featureImportances

SparseVector(30, {0: 0.0466, 1: 0.0242, 2: 0.0748, 3: 0.0609, 4: 0.0037, 5: 0.0044, 6: 0.0464, 7: 0.0939, 8: 0.0052, 9: 0.0042, 10: 0.0097, 11: 0.004, 12: 0.0078, 13: 0.0239, 14: 0.0033, 15: 0.0029, 16: 0.0047, 17: 0.003, 18: 0.0026, 19: 0.0055, 20: 0.1514, 21: 0.0178, 22: 0.1206, 23: 0.1174, 24: 0.0101, 25: 0.0156, 26: 0.0256, 27: 0.0891, 28: 0.0123, 29: 0.0087})

In [38]:
rfModel.toDebugString

u'RandomForestClassificationModel (uid=rfc_3804477fc8d5) with 100 trees\n  Tree 0 (weight 1.0):\n    If (feature 10 <= 0.4455)\n     If (feature 6 <= 0.09847)\n      If (feature 3 <= 716.9)\n       If (feature 28 <= 0.3397)\n        If (feature 21 <= 29.33)\n         Predict: 0.0\n        Else (feature 21 > 29.33)\n         Predict: 0.0\n       Else (feature 28 > 0.3397)\n        If (feature 1 <= 19.63)\n         Predict: 0.0\n        Else (feature 1 > 19.63)\n         Predict: 1.0\n      Else (feature 3 > 716.9)\n       If (feature 21 <= 19.31)\n        Predict: 0.0\n       Else (feature 21 > 19.31)\n        Predict: 1.0\n     Else (feature 6 > 0.09847)\n      If (feature 11 <= 1.916)\n       If (feature 27 <= 0.1105)\n        Predict: 0.0\n       Else (feature 27 > 0.1105)\n        If (feature 28 <= 0.2687)\n         Predict: 0.0\n        Else (feature 28 > 0.2687)\n         Predict: 1.0\n      Else (feature 11 > 1.916)\n       Predict: 0.0\n    Else (feature 10 > 0.4455)\n     If (f

In [39]:
predictions = rfModel.transform(testData)
predictions.select('label', 'labelIndexed', 'prediction').show()

+-----+------------+----------+
|label|labelIndexed|prediction|
+-----+------------+----------+
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
|    B|         0.0|       0.0|
+-----+------------+----------+
only showing top 20 rows



In [40]:
evaluator = MulticlassClassificationEvaluator(labelCol = "labelIndexed", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions)

print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0338983
