In [2]:
# findspark.init() locates the local Spark installation and makes the
# pyspark package importable, so it has to run before the pyspark imports.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [3]:
from PIL import Image, ImageDraw
from pyspark.sql.functions import lit
import pyspark.sql.functions as F
from pyspark.ml.image import ImageSchema
from pyspark.ml.linalg import DenseVector, VectorUDT

In [4]:
def load_labeled_images(path, label):
    """Read every decodable image under ``path`` and tag each row with ``label``.

    Args:
        path: Directory handed to Spark's ``image`` data source.
        label: Integer class label written to a constant ``label`` column.

    Returns:
        A DataFrame holding the ``image`` struct column produced by the
        reader plus the added ``label`` column.
    """
    # dropInvalid=True skips files the image reader cannot decode.
    images = spark.read.format("image").option("dropInvalid", True).load(path)
    return images.withColumn("label", lit(label))


df_neg = load_labeled_images("AlzheimersDataset/Negative", 0)
df_pos = load_labeled_images("AlzheimersDataset/Positive", 1)

# Stack the two classes into one frame; both sides share the same schema,
# so the positional row-wise union lines the columns up correctly.
dataframe = df_neg.union(df_pos)

In [5]:
# NOTE(review): this bare property access looks like dead code but is
# presumably a deliberate warm-up -- it appears to make the driver-side
# ImageSchema object cache its field list before the UDF closure below is
# serialized and shipped to the workers. Confirm before removing it.
ImageSchema.imageFields

# Decode one image struct into a NumPy array and flatten it into a 1-D
# DenseVector so it can serve as an ML feature column.
img2vec = F.udf(
    lambda image_row: DenseVector(ImageSchema.toNDArray(image_row).flatten()),
    VectorUDT(),
)

# Keep only what the classifier needs: the flattened pixels and the label.
df = (
    dataframe
    .withColumn('features', img2vec("image"))
    .select("features", "label")
)
df.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,0.0,0.0,0.0,...|    0|
|[0.0,0.0,0.0,0.0,...|    0|
|[0.0,0.0,0.0,0.0,...|    0|
|[0.0,0.0,0.0,0.0,...|    0|
|[0.0,0.0,0.0,0.0,...|    0|
+--------------------+-----+
only showing top 5 rows



In [6]:
# Number of rows in the feature/label DataFrame (negative and positive images combined).
df.count()

6400

In [7]:
#image_row = 40
#spark_single_img = df_pos.select("image").collect()[image_row]
#(spark_single_img.image.origin, spark_single_img.image.mode, spark_single_img.image.nChannels )

#Image.frombytes(mode="L", data=bytes(spark_single_img.image.data), size=[spark_single_img.image.width,spark_single_img.image.height]).show()

In [8]:
#df.first()

In [9]:
#from pyspark.ml.feature import MinMaxScaler
#from pyspark.ml.linalg import Vectors

#scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
#scalerModel = scaler.fit(df)

# rescale each feature to range [min, max].
#scaledData = scalerModel.transform(df)
#print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
#scaledData.select("label", "scaledFeatures").show(5)

In [10]:
#scaledData = scaledData.select("scaledFeatures", "label")
#scaledData.show(5)

In [11]:
#finalDF = scaledData.withColumnRenamed("scaledFeatures", "features")
#finalDF.show(5)

In [13]:
# Split the feature/label rows 70/30 into training and test sets.
# A fixed seed keeps the split -- and therefore the test error reported
# further down -- repeatable across runs on the same input data.
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=42)

from pyspark.ml.classification import LinearSVC

# Linear SVM with L2 regularisation strength 0.1, capped at 10 optimiser iterations.
lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Fit the model on the training split only.
lsvcModel = lsvc.fit(trainingData)

# Print the coefficients and intercept for linear SVC.
# Left disabled: the coefficient vector has one entry per feature and
# produces a very long output line.
# print("Coefficients: " + str(lsvcModel.coefficients))
# print("Intercept: " + str(lsvcModel.intercept))

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out split with the fitted model.
predictions = lsvcModel.transform(testData)

# Preview a few scored rows next to their true labels.
predictions.select("prediction", "label", "features").show(5)

# Accuracy on the test split; its complement is reported as the error rate.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
|       1.0|    0|[0.0,0.0,0.0,0.0,...|
|       0.0|    0|[0.0,0.0,0.0,0.0,...|
+----------+-----+--------------------+
only showing top 5 rows

Test Error = 0.225707 


In [15]:
# Intercept (presumably lsvcModel.intercept, recorded from an earlier run): 2.3711518048939983