In [0]:
# Notebook-scoped PyPI dependencies, installed in the order listed below.
# Each entry is (package, version); a version of None means the package is
# requested without a version pin.
# NOTE(review): tensorframes, h5py, pandas and horovod are unpinned, so the
# resolved versions can drift between runs — consider pinning them.
_pypi_requirements = [
    ("keras", "2.2.4"),
    ("tensorflow", "1.13.1"),
    ("tensorframes", None),
    ("coverage", "4.4.1"),
    ("h5py", None),
    ("nose", "1.3.7"),
    ("parameterized", "0.6.1"),
    ("pillow", "4.1.1"),
    ("pygments", "2.2.0"),
    ("pandas", None),
    ("six", "1.10.0"),
    ("paramiko", "2.4.0"),
    ("PyNaCl", "1.2.1"),
    ("cloudpickle", "0.5.2"),
    ("horovod", None),
    ("wrapt", "1.10.11"),
    ("Deprecated", "1.2.7"),
]

for _package, _version in _pypi_requirements:
    if _version is None:
        dbutils.library.installPyPI(_package)
    else:
        dbutils.library.installPyPI(_package, _version)

# Restart the Python process after installing (presumably so the freshly
# installed libraries become importable — confirm against the Databricks
# library-utility docs), then show what is installed as the cell output.
dbutils.library.restartPython()
dbutils.library.list()

In [0]:
from pyspark.sql.functions import lit

# Training images of COVID-19 cases, read through the "image" data source;
# every row is tagged with the constant label 1.
coviddf = (
    spark.read.format("image")
    .load('/FileStore/ml/database/covid19')
    .withColumn("label", lit(1))
)

# Training images of normal cases, read the same way; constant label 0.
normaldf = (
    spark.read.format("image")
    .load('/FileStore/ml/database/normal')
    .withColumn("label", lit(0))
)

In [0]:
from pyspark.sql.functions import lit
# Held-out COVID-19 test images, read through the "image" data source;
# every row is tagged with the constant label 1.
coviddf_test = (
    spark.read.format("image")
    .load('/FileStore/ml/test_database/test_covid19')
    .withColumn("label", lit(1))
)

# Held-out normal test images, read the same way; constant label 0.
normaldf_test = (
    spark.read.format("image")
    .load('/FileStore/ml/test_database/test_normal')
    .withColumn("label", lit(0))
)


In [0]:
# Combine the COVID-19 (label 1) and normal (label 0) training DataFrames
# into the single training set that the pipeline is later fitted on.
train_df = coviddf.union(normaldf)

In [0]:
# Combine the COVID-19 (label 1) and normal (label 0) test DataFrames
# into the single held-out set used by the evaluation cells.
test_df = coviddf_test.union(normaldf_test)

In [0]:
#display(train_df.limit(2))

In [0]:
#train_df.count()

In [0]:
#test_df.count()

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer 

# Stage 1: turn the "image" column into a "features" column using sparkdl's
# DeepImageFeaturizer configured with the InceptionV3 model.
featurizer = DeepImageFeaturizer(
    modelName="InceptionV3",
    inputCol="image",
    outputCol="features",
)

# Stage 2: logistic regression on the "label" column. No featuresCol is
# given, so it uses the estimator's default (expected to be "features",
# matching the featurizer's output — confirm against the PySpark docs).
lr = LogisticRegression(
    labelCol="label",
    maxIter=10,
    regParam=0.05,
    elasticNetParam=0.3,
)

# Chain both stages and fit them on the merged training set.
p = Pipeline(stages=[featurizer, lr])

p_model = p.fit(train_df)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out test set with the fitted pipeline; later metric cells
# reuse this `tested_df`.
tested_df = p_model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Only the prediction and label columns are handed to the evaluator.
accuracy = evaluator.evaluate(tested_df.select("prediction", "label"))
print("Test set accuracy = " + str(accuracy))

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Reuses `tested_df` produced by an earlier evaluation cell instead of
# re-running p_model.transform(test_df).
evaluator = MulticlassClassificationEvaluator(metricName="f1")

f1_value = evaluator.evaluate(tested_df)
print("F1 = " + str(f1_value))

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Reuses `tested_df` produced by an earlier evaluation cell instead of
# re-running p_model.transform(test_df).
# The printed "precision" is the evaluator's weightedPrecision metric.
evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")

precision_value = evaluator.evaluate(tested_df)
print("precision = " + str(precision_value))

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Re-score the test set so this cell does not depend on an earlier cell's
# `tested_df`.
tested_df = p_model.transform(test_df)

# The printed "Recall" is the evaluator's weightedRecall metric.
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")

recall_value = evaluator.evaluate(tested_df)
print("Recall = " + str(recall_value))

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the test set and keep every output column. The previous version
# narrowed `tested_df` to ("label", "prediction"), which discarded the score
# column that a ROC curve needs.
tested_df = p_model.transform(test_df)

# Area under the ROC curve must be computed from a continuous score, not the
# hard 0/1 class in "prediction": with only two distinct values the curve
# collapses to a single operating point and the reported AUC is misleading.
# "rawPrediction" is the logistic-regression stage's default raw-score output
# column (the pipeline does not override it).
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName='areaUnderROC',
)

auc_value = evaluator.evaluate(tested_df)
print("AUC = " + str(auc_value))