In [17]:
!pip install pyspark==2.4.5



In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [None]:
# delete files from previous runs
!rm -f hmp.parquet*

# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

In [4]:
df.show()

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 20| 51| 35|Accelerometer-201...|Brush_teeth|
| 18| 49| 34|Accelerometer-201...|Brush_teeth|
| 19| 48| 34|Accelerometer-201...|Brush_teeth|
| 16| 53| 34|Accelerometer-201...|Brush_teeth|
| 18| 52| 35|

In [5]:
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler,Normalizer
from pyspark.ml.linalg import Vectors

In [6]:
#preprocessing
indexer = StringIndexer(inputCol="class",outputCol="label")
encoder = OneHotEncoder(inputCol="label", outputCol="labelVec")
vA = VectorAssembler(inputCols=["x","y","z"],outputCol="features")
normalizer = Normalizer(inputCol="features",outputCol="features_norm",p=1.0)

In [7]:
df.createOrReplaceTempView('df')
df_class = spark.sql("select * from df where class in ('Use_telephone','Standup_chair')")


In [8]:
#for binary classification
splits = df_class.randomSplit([0.8,0.2])
df_train = splits[0]
df_test = splits[1]

In [9]:
#gradient boosting
from pyspark.ml.classification import GBTClassifier
GB = GBTClassifier(labelCol="label",featuresCol="features",maxIter=10)

In [10]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[indexer,encoder,vA,normalizer,GB])

In [11]:

#training
model = pipeline.fit(df_train)

In [12]:
pred = model.transform(df_train)

In [13]:
#evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
binEval = MulticlassClassificationEvaluator().setMetricName("accuracy").setPredictionCol("prediction").setLabelCol("label")
binEval.evaluate(pred)

0.9110592426522527

In [14]:
#testing
model = pipeline.fit(df_test)

In [15]:
pred = model.transform(df_train)

In [16]:
#evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
binEval = MulticlassClassificationEvaluator().setMetricName("accuracy").setPredictionCol("prediction").setLabelCol("label")
binEval.evaluate(pred)

0.9064832161174411