In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.6.tgz
!tar xvf spark-2.4.4-bin-hadoop2.6.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.6"
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName="PySpark_dataframe")

# Linear Regression

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

# Configure a local two-thread session, then fetch it (or create it if none exists yet).
session_builder = SparkSession.builder.master('local[2]').appName('clustering_app')
spark = session_builder.getOrCreate()

In [None]:
# delete files from previous runs
!rm -f hmp.parquet*

# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

--2020-01-13 09:51:13--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet [following]
--2020-01-13 09:51:14--  https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (911K) [application/octet-stream]
Saving to: ‘hmp.parquet’


2020-01-13 09:51:14 (14.5 MB/s) - ‘hmp.parquet’ saved [932997/932997]



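In this application we will try to predict an "energy" label computed from the $x$, $y$ and $z$ readings. For each activity class it is defined as $\sqrt{\sum x^2 + \sum y^2 + \sum z^2}$, i.e. the square root of the summed squares $(x*x)+(y*y)+(z*z)$ over all rows of that class. Note that this is not the actual physical energy, because the mass feature is missing.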

In [None]:
# Preview the first 20 rows; long string values are truncated for display.
df.show(n=20, truncate=True)

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 20| 51| 35|Accelerometer-201...|Brush_teeth|
| 18| 49| 34|Accelerometer-201...|Brush_teeth|
| 19| 48| 34|Accelerometer-201...|Brush_teeth|
| 16| 53| 34|Accelerometer-201...|Brush_teeth|
| 18| 52| 35|

Create a temporary view so that the dataframe can be queried with SQL statements.

In [None]:
# (Re-)register the dataframe as the SQL table 'df'; replaces any existing view of that name.
df.createOrReplaceTempView(name='df')

Generating the label column

In [None]:
# One row per activity class: the label is the square root of the summed
# squares of x, y and z over every row belonging to that class.
energy_query = """
select sqrt(sum(x*x)+sum(y*y)+sum(z*z)) as label, class
from df
group by class
"""
df_energy = spark.sql(energy_query)

# Make the per-class labels available to later SQL statements.
df_energy.createOrReplaceTempView('df_energy')

In [None]:
# Attach the per-class label to every raw reading.
# Only `label` is taken from df_energy: `select *` would return the join key
# `class` twice, and any later reference to `class` on the result would be
# ambiguous.
df_join = spark.sql(
    """
    select df.*, df_energy.label
    from df inner join df_energy
    on df.class = df_energy.class
    """
)

In [None]:
# Preview the first 20 joined rows; long string values are truncated for display.
df_join.show(n=20, truncate=True)

+---+---+---+--------------------+-----------+-----------------+-----------+
|  x|  y|  z|              source|      class|            label|      class|
+---+---+---+--------------------+-----------+-----------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, Normalizer

# Stage 1: pack the x, y and z columns into a single vector column 'features'.
vectorAssembler = VectorAssembler(
    outputCol='features',
    inputCols=['x', 'y', 'z'],
)

# Stage 2: rescale each 'features' vector using the p=1.0 norm and write the
# result to 'features_norm'.
normalizer = Normalizer(
    p=1.0,
    inputCol="features",
    outputCol="features_norm",
)


In [None]:
from pyspark.ml.regression import LinearRegression

# Elastic-net regularised linear regression (regParam=0.3, elasticNetParam=0.8).
# featuresCol is set explicitly: the Normalizer stage writes its output to
# 'features_norm', while LinearRegression reads 'features' by default, which
# would leave the normalisation without any effect on the fitted model.
lr = LinearRegression(featuresCol='features_norm', labelCol='label',
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [None]:
from pyspark.ml import Pipeline
# Chain the stages in execution order: assemble -> normalize -> regress.
pipeline_stages = [vectorAssembler, normalizer, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fit every pipeline stage on the labelled, joined dataframe.
model = pipeline.fit(dataset=df_join)

In [None]:
# Run the fitted pipeline over the same dataframe it was trained on.
pred = model.transform(dataset=df_join)

Get the R² (coefficient of determination) of the fitted regression model

In [None]:
# The fitted regression model is the last of the three pipeline stages;
# display the r2 value from its summary.
lr_model = model.stages[-1]
lr_model.summary.r2

0.03259100556263628