In [1]:
# Initial Configuration
import findspark 
findspark.init()

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local') # we are hosting clusters locally.
spark = SparkSession(sc) # create an interface to the spark session.

In [3]:
# Read the data
df = spark.read.parquet('hmp.parquet')

In [4]:
# display the dataframe
df.show()

+---+---+---+--------+--------------------+
|  x|  y|  z|   Class|              Source|
+---+---+---+--------+--------------------+
| 33| 36| 51|Eat_meat|Accelerometer-201...|
| 33| 36| 51|Eat_meat|Accelerometer-201...|
| 33| 35| 53|Eat_meat|Accelerometer-201...|
| 31| 37| 52|Eat_meat|Accelerometer-201...|
| 32| 36| 52|Eat_meat|Accelerometer-201...|
| 32| 36| 51|Eat_meat|Accelerometer-201...|
| 32| 36| 51|Eat_meat|Accelerometer-201...|
| 33| 36| 53|Eat_meat|Accelerometer-201...|
| 33| 35| 52|Eat_meat|Accelerometer-201...|
| 33| 36| 52|Eat_meat|Accelerometer-201...|
| 32| 35| 53|Eat_meat|Accelerometer-201...|
| 33| 36| 52|Eat_meat|Accelerometer-201...|
| 32| 38| 53|Eat_meat|Accelerometer-201...|
| 32| 37| 52|Eat_meat|Accelerometer-201...|
| 33| 35| 52|Eat_meat|Accelerometer-201...|
| 32| 36| 53|Eat_meat|Accelerometer-201...|
| 32| 36| 53|Eat_meat|Accelerometer-201...|
| 32| 36| 52|Eat_meat|Accelerometer-201...|
| 34| 36| 52|Eat_meat|Accelerometer-201...|
| 33| 36| 52|Eat_meat|Accelerome

In [5]:
# Split the dataset into 80:20 ratio
splits = df.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [6]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer


indexer = StringIndexer(inputCol="Class", outputCol="label")

vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")

normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

In [7]:
# Instantiate the Linear Regression model
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [8]:
# create a pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer,lr])

In [9]:
model = pipeline.fit(df_train)

In [10]:
prediction = model.transform(df_train)

In [11]:
prediction.printSchema()

root
 |-- x: integer (nullable = true)
 |-- y: integer (nullable = true)
 |-- z: integer (nullable = true)
 |-- Class: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- features_norm: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
binEval = MulticlassClassificationEvaluator().setMetricName("accuracy") .setPredictionCol("prediction").setLabelCol("label")
    
binEval.evaluate(prediction) 

0.1297830709134191

In [13]:
prediction = model.transform(df_test)

In [14]:
binEval.evaluate(prediction) 

0.12728243909981585