In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext(appName = "module3_week4")

In [3]:
! echo $PYSPARK_SUBMIT_ARGS

--deploy-mode client --master local[2] --executor-memory 512m --driver-memory 512m --executor-cores 1 --num-executors 2 --conf spark.driver.maxResultSize=256m pyspark-shell


In [4]:
spark = SparkSession.Builder().getOrCreate() # required for dataframes

## Download a dataset (breast cancer diagnosis)

In [5]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data

--2017-07-31 11:23:55--  https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 124103 (121K) [text/plain]
Saving to: 'wdbc.data.14'


2017-07-31 11:23:56 (158 KB/s) - 'wdbc.data.14' saved [124103/124103]



In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer

In [7]:
# Load a text file and convert each line to a Row.

data = []

with open("wdbc.data") as infile:
    for line in infile:
        tokens = line.rstrip("\n").split(",")        
        y = tokens[1]
        features = Vectors.dense([float(x) for x in tokens[2:]])        
        
        data.append((y, features))

In [8]:
inputDF = spark.createDataFrame(data, ["label", "features"])

In [9]:
stringIndexer = StringIndexer(inputCol = "label", outputCol = "labelIndexed")
si_model = stringIndexer.fit(inputDF)

inputDF2 = si_model.transform(inputDF)

In [10]:
inputDF2.show()

+-----+--------------------+------------+
|label|            features|labelIndexed|
+-----+--------------------+------------+
|    M|[17.99,10.38,122....|         1.0|
|    M|[20.57,17.77,132....|         1.0|
|    M|[19.69,21.25,130....|         1.0|
|    M|[11.42,20.38,77.5...|         1.0|
|    M|[20.29,14.34,135....|         1.0|
|    M|[12.45,15.7,82.57...|         1.0|
|    M|[18.25,19.98,119....|         1.0|
|    M|[13.71,20.83,90.2...|         1.0|
|    M|[13.0,21.82,87.5,...|         1.0|
|    M|[12.46,24.04,83.9...|         1.0|
|    M|[16.02,23.24,102....|         1.0|
|    M|[15.78,17.89,103....|         1.0|
|    M|[19.17,24.8,132.4...|         1.0|
|    M|[15.85,23.95,103....|         1.0|
|    M|[13.73,22.61,93.6...|         1.0|
|    M|[14.54,27.54,96.7...|         1.0|
|    M|[14.68,20.13,94.7...|         1.0|
|    M|[16.13,20.68,108....|         1.0|
|    M|[19.81,22.15,130....|         1.0|
|    B|[13.54,14.36,87.4...|         0.0|
+-----+--------------------+------

### Cross-Validation 

In [11]:
from pyspark.ml.classification import DecisionTreeClassifier

In [12]:
decisionTree = DecisionTreeClassifier(labelCol = "labelIndexed")

In [13]:
from pyspark.ml import Pipeline

In [14]:
pipeline = Pipeline(stages = [decisionTree])

In [15]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [16]:
paramGrid = ParamGridBuilder()\
    .addGrid(decisionTree.maxDepth, [1, 2, 4, 5, 6, 7, 8])\
    .build()

In [None]:
#paramGrid = ParamGridBuilder()\
#    .addGrid(decisionTree.maxDepth, [1, 2, 4, 5, 6, 7, 8])\
#    .addGrid(decisionTree.minInstancesPerNode, [1, 2, 4, 5, 6, 7, 8])\
#    .build()

In [17]:
evaluator = MulticlassClassificationEvaluator(labelCol = "labelIndexed", predictionCol = "prediction", metricName = "accuracy") 

crossval = CrossValidator(estimator = pipeline,
                          estimatorParamMaps = paramGrid,
                          evaluator = evaluator,
                          numFolds = 10)

In [18]:
cvModel = crossval.fit(inputDF2)

In [19]:
cvModel.avgMetrics

[0.8924563921120648,
 0.9203744192767783,
 0.9418223975919139,
 0.9419915318207598,
 0.9457320946210345,
 0.938026791703486,
 0.9361037147804091]

In [20]:
print cvModel.bestModel.stages[0]

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4bceb658f839baff82ff) of depth 6 with 47 nodes


In [None]:
cvModel.transform(....)