In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StringIndexer, MinMaxScaler
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
'''from pyspark.ml.linalg import Vector, VectorUDT

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils'''

In [2]:
# initialize
# Build (or reuse) the session through the builder: the application name is passed to
# Spark when the context is created, and re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once" the way a bare SparkContext() does.
spark = SparkSession.builder.appName('exploreReadWrite').getOrCreate()
# keep `sc` available for the rest of the notebook (it is stopped in the last cell)
sc = spark.sparkContext
# show the number of cores
# NOTE(review): getExecutorMemoryStatus() is keyed by block manager, so this value looks
# like an executor/driver count rather than a CPU-core count - confirm what is wanted here.
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [3]:
# load the data
fil = '../data/Toddler_Autism_dataset_July_2018.csv'

# Explicit schema, in file column order: the integer-typed columns come first
# (case number, the ten A* answers, age and score), followed by the string-typed ones.
schem = StructType(
    [StructField(name, IntegerType())
     for name in ['Case_No', *['A%d'%k for k in range(1, 11)], 'Age_Mons', 'Qchat-10-Score']]
    + [StructField(name, StringType())
       for name in ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD',
                    'Who completed the test', 'Class/ASD Traits ']]
)

# read the CSV with a header row, applying the schema above instead of inferring one
asd = spark.read.load(fil, format='csv', schema=schem, header=True)
print('%d records'%asd.count())
asd.show(truncate=False)

1054 records
+-------+---+---+---+---+---+---+---+---+---+---+--------+--------------+---+--------------+--------+-------------------+------------------------+-----------------+
|Case_No|A1 |A2 |A3 |A4 |A5 |A6 |A7 |A8 |A9 |A10|Age_Mons|Qchat-10-Score|Sex|Ethnicity     |Jaundice|Family_mem_with_ASD|Who completed the test  |Class/ASD Traits |
+-------+---+---+---+---+---+---+---+---+---+---+--------+--------------+---+--------------+--------+-------------------+------------------------+-----------------+
|1      |0  |0  |0  |0  |0  |0  |1  |1  |0  |1  |28      |3             |f  |middle eastern|yes     |no                 |family member           |No               |
|2      |1  |1  |0  |0  |0  |1  |1  |0  |0  |0  |36      |4             |m  |White European|yes     |no                 |family member           |Yes              |
|3      |1  |0  |0  |0  |0  |0  |1  |1  |0  |1  |36      |4             |m  |middle eastern|yes     |no                 |family member           |Yes             

In [4]:
# response column name (the trailing space is part of the name used in the schema)
responseVar = 'Class/ASD Traits '
# number of records per response class
(asd
 .groupBy(responseVar)
 .count()
 .show())

+-----------------+-----+
|Class/ASD Traits |count|
+-----------------+-----+
|               No|  326|
|              Yes|  728|
+-----------------+-----+



In [5]:
''' prep the data for modeling '''
# set inputs: every column except the first (Case_No) and the last (the response)
inputColumns = asd.columns[1:-1]
# Map each string-typed input column to the name of its indexed counterpart.
# isinstance() is used rather than `is StringType()` so the check does not rely on
# StringType() handing back the very same object every time it is called.
strFeats = {c:c+'_int' for c in inputColumns if isinstance(asd.schema[c].dataType, StringType)}
inputColumns = [strFeats.get(c, c) for c in inputColumns]
# NOTE(review): 'Qchat-10-Score' is kept as a feature. The response looks like it may be
# derived from that score (and from A1-A10), which would explain the perfect accuracies
# reported further down - confirm, and consider excluding it.

# create all numerical features & numerical response
# With StringIndexer's default ordering the most frequent value gets index 0.0, so the
# majority class of the response ends up as label 0.0.
indxr = StringIndexer(inputCols=list(strFeats.keys()) + ['label_str'], outputCols=list(strFeats.values()) + ['label'])
asdML = asd.withColumnRenamed(responseVar, 'label_str')
asdML = indxr.fit(asdML).transform(asdML)

# create the features vector
assr = VectorAssembler(inputCols=inputColumns, outputCol='features')
asdML = assr.transform(asdML)

# talk
display(asdML.limit(10).toPandas())
asdML.select('features', 'label').show(truncate=True)
# Pull the vector out of the Row explicitly and wrap it in a 1-tuple: a Row is itself a
# tuple, so passing it straight to % only printed the vector because it got unpacked.
print('First row features = %s'%(asdML.select('features').take(1)[0]['features'],))

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,Family_mem_with_ASD,Who completed the test,label_str,label,Jaundice_int,Family_mem_with_ASD_int,Who completed the test_int,Ethnicity_int,Sex_int,features
0,1,0,0,0,0,0,0,1,1,0,...,no,family member,No,1.0,1.0,0.0,0.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ..."
1,2,1,1,0,0,0,1,1,0,0,...,no,family member,Yes,0.0,1.0,0.0,0.0,0.0,0.0,"(1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ..."
2,3,1,0,0,0,0,0,1,1,0,...,no,family member,Yes,0.0,1.0,0.0,0.0,2.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ..."
3,4,1,1,1,1,1,1,1,1,1,...,no,family member,Yes,0.0,0.0,0.0,0.0,5.0,0.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,5,1,1,0,1,1,1,1,1,1,...,yes,family member,Yes,0.0,0.0,1.0,0.0,0.0,1.0,"[1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
5,6,1,1,0,0,1,1,1,1,1,...,no,family member,Yes,0.0,0.0,0.0,0.0,4.0,0.0,"[1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
6,7,1,0,0,1,1,1,0,0,1,...,no,family member,Yes,0.0,1.0,0.0,0.0,1.0,0.0,"(1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, ..."
7,8,0,1,0,0,1,0,1,1,1,...,no,family member,Yes,0.0,1.0,0.0,0.0,1.0,0.0,"(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
8,9,0,0,0,0,0,0,1,0,0,...,no,family member,No,1.0,0.0,0.0,0.0,1.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
9,10,1,1,1,0,1,1,0,1,1,...,no,Health Care Professional,Yes,0.0,0.0,0.0,1.0,3.0,0.0,"[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, ..."


+--------------------+-----+
|            features|label|
+--------------------+-----+
|(17,[6,7,9,10,11,...|  1.0|
|(17,[0,1,5,6,10,1...|  0.0|
|(17,[0,6,7,9,10,1...|  0.0|
|[1.0,1.0,1.0,1.0,...|  0.0|
|[1.0,1.0,0.0,1.0,...|  0.0|
|[1.0,1.0,0.0,0.0,...|  0.0|
|(17,[0,3,4,5,8,10...|  0.0|
|(17,[1,4,6,7,8,9,...|  0.0|
|(17,[6,9,10,11,13...|  1.0|
|[1.0,1.0,1.0,0.0,...|  0.0|
|[1.0,0.0,0.0,1.0,...|  0.0|
|[1.0,1.0,1.0,1.0,...|  0.0|
|(17,[10,12,13,14]...|  1.0|
|[1.0,1.0,1.0,1.0,...|  0.0|
|(17,[10,13],[18.0...|  1.0|
|(17,[0,1,2,4,6,7,...|  0.0|
|(17,[10,13,15],[3...|  1.0|
|[1.0,1.0,1.0,0.0,...|  0.0|
|(17,[0,4,9,10,11,...|  1.0|
|[1.0,1.0,1.0,0.0,...|  0.0|
+--------------------+-----+
only showing top 20 rows

First row features = (17,[6,7,9,10,11,12,13,14],[1.0,1.0,1.0,28.0,3.0,1.0,2.0,1.0])


In [9]:
''' split for cross-val '''
# share of rows assigned to training, and the seed that makes the split repeatable
trainPerc = 0.7
randSeed = 42
trainASD, testASD = asdML.randomSplit(weights=[trainPerc, 1.0 - trainPerc], seed=randSeed)

# talk: list the first case numbers that landed in each split
for heading, subset in (('Training Cases', trainASD), ('Testing Cases', testASD)):
    print(heading)
    subset.select('Case_No').show()

Training Cases
+-------+
|Case_No|
+-------+
|      1|
|      2|
|      4|
|      5|
|      6|
|      8|
|     11|
|     12|
|     13|
|     17|
|     18|
|     19|
|     21|
|     23|
|     26|
|     27|
|     28|
|     32|
|     34|
|     37|
+-------+
only showing top 20 rows

Testing Cases
+-------+
|Case_No|
+-------+
|      3|
|      7|
|      9|
|     10|
|     14|
|     15|
|     16|
|     20|
|     22|
|     24|
|     25|
|     29|
|     30|
|     31|
|     33|
|     35|
|     36|
|     40|
|     43|
|     44|
+-------+
only showing top 20 rows



In [29]:
''' try logistic regression - using predefined train/test split '''
# create objects
# accuracy evaluator; the cross-validation cells below reuse this same `acc` object
acc = MulticlassClassificationEvaluator(metricName='accuracy')
logreg = LogisticRegression()

# train & eval
# fit on the training split, then score that same split through the model's evaluate() summary
fitModel = logreg.fit(trainASD.select('features', 'label'))
trainRes = fitModel.evaluate(trainASD.select('features', 'label'))
trainAcc = acc.evaluate(trainRes.predictions)

# now evaluate test accuracy
# transform() appends the prediction columns that the evaluator reads
testRes = fitModel.transform(testASD.select('features', 'label'))
testAcc = acc.evaluate(testRes)

# (message typo "Accurcy" corrected)
print('Train Accuracy = %0.3f, Test Accuracy = %0.3f'%(trainAcc, testAcc))

Train Accuracy = 1.000, Test Accurcy = 1.000


In [30]:
''' now use the builtin cross-validator '''
#estimator
logreg = LogisticRegression()
# parameters grid
# regParam is swept as well: elasticNetParam only sets the L1/L2 mix of the penalty, so
# with regParam left at its default of 0.0 every elasticNetParam value gives the same
# model. 0.0 stays in the grid so the original (unregularised) candidates are still tried.
params = (ParamGridBuilder()
          .addGrid(logreg.threshold, [0.4, 0.5, 0.6])
          .addGrid(logreg.regParam, [0.0, 0.01, 0.1])
          .addGrid(logreg.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
          .build())
# cross-validator
# seeded with the same randSeed as the train/test split so the fold assignment is repeatable
cv = CrossValidator(estimator=logreg, estimatorParamMaps=params, evaluator=acc, numFolds=5, seed=randSeed)

#run
fitModel = cv.fit(trainASD.select('features', 'label'))

In [47]:
# Take the winning model from the cross-validation result first: it has to be assigned
# before its parameters are printed, otherwise a fresh kernel raises NameError here.
bestModel = fitModel.bestModel

# best model parameters
print('Threshold = %0.3f, Elastic Net = %0.3f'%(bestModel.getThreshold(), bestModel.getElasticNetParam()))

# show performance of best model
testAcc = acc.evaluate(bestModel.transform(testASD.select('features', 'label')))
print('Best Model Test Accuracy = %0.3f'%testAcc)

Threshold = 0.400, Elastic Net = 0.000
Best Model Test Accuracy = 1.000


In [62]:
# only available for logistic regression
summ = bestModel.summary
# summary statistics of the columns in the summary's predictions frame
summ.predictions.describe().show()
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# objective history is printed explicitly (1-tuple keeps % from unpacking anything).
print('Objective History %s'%(summ.objectiveHistory,))

# list the public attributes exposed by the summary object
print('Summary Information %s'%[am for am in dir(summ) if not am.startswith('_')])

+-------+-------------------+-------------------+
|summary|              label|         prediction|
+-------+-------------------+-------------------+
|  count|                780|                780|
|   mean|0.30256410256410254|0.30256410256410254|
| stddev| 0.4596628666274773| 0.4596628666274773|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+

Summary Information ['accuracy', 'areaUnderROC', 'fMeasureByLabel', 'fMeasureByThreshold', 'falsePositiveRateByLabel', 'featuresCol', 'labelCol', 'labels', 'objectiveHistory', 'pr', 'precisionByLabel', 'precisionByThreshold', 'predictionCol', 'predictions', 'probabilityCol', 'recallByLabel', 'recallByThreshold', 'roc', 'scoreCol', 'totalIterations', 'truePositiveRateByLabel', 'weightCol', 'weightedFMeasure', 'weightedFalsePositiveRate', 'weightedPrecision', 'weightedRecall', 'weightedTruePositiveRate']


In [None]:
# shut down the SparkContext (`sc`) created in the initialize cell; run this last
sc.stop()