# Human Activity Recognition - Preprocessing

<b>Group 4</b>

Shreejaya Bharathan, Ivette Sulca, Sakshi Singla, Akansha Shrivastava, Roja Immani, Aakanksha Nallabothula Surya

In [1]:
from pyspark import SparkContext, SparkConf
import os 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import plotly.figure_factory as ff
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Ask the PySpark launcher to load the hadoop-aws 2.7.4 package, which the S3A filesystem
# settings further down rely on.
# NOTE(review): this is presumably only honoured if set before the SparkContext (JVM) is first
# started in this kernel — confirm with Restart Kernel -> Run All.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell'

In [3]:
# Create the SparkContext, or reuse the one that already exists in this kernel.
sc = SparkContext.getOrCreate()

### Fetching data from S3

In [4]:
# Configure Hadoop's S3 access on the running SparkContext.
hadoop_conf = sc._jsc.hadoopConfiguration()

# Route s3:// URIs through the S3A filesystem implementation.
hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# SECURITY: credentials are read from the environment instead of being committed with the
# notebook. The key pair that used to be hardcoded in this cell must be treated as compromised
# and rotated in AWS IAM. A missing variable raises KeyError here, which is intentional: fail
# early rather than attempt an unauthenticated read.
hadoop_conf.set('fs.s3a.access.key', os.environ['AWS_ACCESS_KEY_ID'])
hadoop_conf.set('fs.s3a.secret.key', os.environ['AWS_SECRET_ACCESS_KEY'])

In [5]:
# S3 location (s3a:// scheme) of the merged sensor readings.
file = "s3a://group-4-distributed-data-systems/merged.txt"
# Read the file as an RDD of text lines.
rdd = sc.textFile(file)

In [6]:
def toDoubleSafe(v):
    """Convert ``v`` to a float when possible, otherwise return it as a string.

    Args:
        v: Value to convert (each field of a split text line in this notebook).

    Returns:
        ``float(v)`` if that conversion succeeds; ``str(v)`` if it raises ``ValueError``.
    """
    try:
        parsed = float(v)
    except ValueError:
        # Not numeric text: hand the value back in string form.
        return str(v)
    return parsed

In [7]:
# Parse each raw line: drop its final character, split on commas, then convert every field
# with toDoubleSafe (floats where possible, strings otherwise).
# NOTE(review): the dropped character is presumably a per-line terminator — confirm against the
# raw file. This cell rebinds `rdd`, so re-running it without re-running the load cell will
# apply the parsing to already-parsed rows.
rdd = rdd.map(lambda line: line[:-1].split(',')).map(
    lambda fields: [toDoubleSafe(field) for field in fields]
)

In [8]:
# Peek at the first parsed record to sanity-check the field conversion.
rdd.first()

[1613.0,
 'phone',
 'gyro',
 'A',
 178468071944614.0,
 -0.020240024,
 -0.004261058,
 -0.023435818]

### Creating a Spark DataFrame from the RDD

In [9]:
# Explicit schema for the parsed records: (column name, Spark type) pairs in field order.
# Every column is declared nullable.
schema = StructType([
    StructField(col_name, col_type, True)
    for col_name, col_type in (
        ("person_id", DoubleType()),
        ("device", StringType()),
        ("sensor", StringType()),
        ("activity", StringType()),
        ("time_stamp", DoubleType()),
        ("x", DoubleType()),
        ("y", DoubleType()),
        ("z", DoubleType()),
    )
])

In [10]:
# Get (or reuse) a SparkSession, then build a DataFrame from the parsed RDD using the
# explicit schema defined above.
ss = SparkSession.builder.getOrCreate()
merged_df = ss.createDataFrame(rdd, schema)

In [11]:
# Mark the DataFrame for caching: it is filtered once per device/sensor pair in the
# feature-extraction cells below.
merged_df.cache()

DataFrame[person_id: double, device: string, sensor: string, activity: string, time_stamp: double, x: double, y: double, z: double]

In [12]:
# Preview the first five rows of the merged readings.
merged_df.show(5)

+---------+------+------+--------+-------------------+-------------------+------------+------------+
|person_id|device|sensor|activity|         time_stamp|                  x|           y|           z|
+---------+------+------+--------+-------------------+-------------------+------------+------------+
|   1613.0| phone|  gyro|       A|1.78468071944614E14|       -0.020240024|-0.004261058|-0.023435818|
|   1613.0| phone|  gyro|       A|1.78468104194617E14|         -2.5750105|  0.18109496|   1.3864417|
|   1613.0| phone|  gyro|       A|1.78468142811857E14|-1.5739281999999999|   0.6668556|    1.320928|
|   1613.0| phone|  gyro|       A|1.78468183987271E14|-1.5041533999999999|   1.7973675|    0.824781|
|   1613.0| phone|  gyro|       A|1.78468225406856E14|        -0.50786483|   1.6002935|  0.45833004|
+---------+------+------+--------+-------------------+-------------------+------------+------------+
only showing top 5 rows



### Feature Extraction

In [13]:
# Mean x/y/z of the phone gyroscope readings per (person_id, activity).
# The dict-style agg yields columns named 'avg(x)', 'avg(y)', 'avg(z)', which are then
# renamed to sensor-specific feature names.
phone_gyro_avg_df = (
    merged_df
    .filter("device=='phone' and sensor=='gyro'")
    .groupBy(["person_id", "activity"])
    .agg({"x": "avg", "y": "avg", "z": "avg"})
    .withColumnRenamed('avg(x)', 'xpgyro')
    .withColumnRenamed('avg(y)', 'ypgyro')
    .withColumnRenamed('avg(z)', 'zpgyro')
)

In [14]:
# Preview the first five rows of the phone gyroscope averages.
phone_gyro_avg_df.show(5)

+---------+--------+--------------------+--------------------+--------------------+
|person_id|activity|              xpgyro|              ypgyro|              zpgyro|
+---------+--------+--------------------+--------------------+--------------------+
|   1605.0|       P|-0.02017300288020...|-0.11814119451513522|-0.00850835712610...|
|   1610.0|       A|  0.0342472453562128|-0.03603498108390...| 0.07850520876840324|
|   1631.0|       A|0.017430668883800425|-0.07093975710237109|  0.0782874234309697|
|   1640.0|       D|-0.00183715203550...|0.001569241866935...|0.001051475461950...|
|   1639.0|       D|0.001665114776400...|6.063320751547568E-4|-9.91082868971936E-5|
+---------+--------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [15]:
# Mean x/y/z of the phone accelerometer readings per (person_id, activity).
# The dict-style agg yields columns named 'avg(x)', 'avg(y)', 'avg(z)', which are then
# renamed to sensor-specific feature names.
phone_accel_avg_df = (
    merged_df
    .filter("device=='phone' and sensor=='accel'")
    .groupBy(["person_id", "activity"])
    .agg({"x": "avg", "y": "avg", "z": "avg"})
    .withColumnRenamed('avg(x)', 'xpaccel')
    .withColumnRenamed('avg(y)', 'ypaccel')
    .withColumnRenamed('avg(z)', 'zpaccel')
)

In [16]:
# Mean x/y/z of the watch gyroscope readings per (person_id, activity).
# The dict-style agg yields columns named 'avg(x)', 'avg(y)', 'avg(z)', which are then
# renamed to sensor-specific feature names.
watch_gyro_avg_df = (
    merged_df
    .filter("device=='watch' and sensor=='gyro'")
    .groupBy(["person_id", "activity"])
    .agg({"x": "avg", "y": "avg", "z": "avg"})
    .withColumnRenamed('avg(x)', 'xwgyro')
    .withColumnRenamed('avg(y)', 'ywgyro')
    .withColumnRenamed('avg(z)', 'zwgyro')
)

In [17]:
# Mean x/y/z of the watch accelerometer readings per (person_id, activity).
# The dict-style agg yields columns named 'avg(x)', 'avg(y)', 'avg(z)', which are then
# renamed to sensor-specific feature names.
watch_accel_avg_df = (
    merged_df
    .filter("device=='watch' and sensor=='accel'")
    .groupBy(["person_id", "activity"])
    .agg({"x": "avg", "y": "avg", "z": "avg"})
    .withColumnRenamed('avg(x)', 'xwaccel')
    .withColumnRenamed('avg(y)', 'ywaccel')
    .withColumnRenamed('avg(z)', 'zwaccel')
)

In [18]:
# Combine the phone gyro and phone accel averages on (person_id, activity).
# No `how` is passed, so PySpark's default join type applies (inner, per the DataFrame.join
# docs): a pair present on only one side is dropped.
all_phone = phone_gyro_avg_df.join(phone_accel_avg_df, on=['person_id', 'activity'])

In [19]:
# Combine the watch gyro and watch accel averages on (person_id, activity).
# No `how` is passed, so PySpark's default join type applies (inner, per the DataFrame.join
# docs): a pair present on only one side is dropped.
all_watch = watch_gyro_avg_df.join(watch_accel_avg_df, on=['person_id', 'activity'])

In [20]:
# Final feature table: watch features followed by phone features, joined on
# (person_id, activity). The default (inner) join keeps only pairs present on both sides.
all_feats = all_watch.join(all_phone, on=['person_id', 'activity'])

In [21]:
# Preview the first five rows of the combined feature table.
all_feats.show(5)

+---------+--------+--------------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+
|person_id|activity|              xwgyro|              ywgyro|              zwgyro|           xwaccel|            ywaccel|             zwaccel|              xpgyro|              ypgyro|              zpgyro|            xpaccel|            ypaccel|            zpaccel|
+---------+--------+--------------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+
|   1605.0|       P|  0.0356906530786702| -0.2210499653278735|   0.324840953022115| 5.524821611024183|-3.9579231058513504|  3.6875186409674425|-0.02017300288020...|-0.11814119451513522|-0.00850835712