# Explore data from activity trackers.

##### Activity trackers are devices that are designed to measure and track various aspects of physical activity. These devices are often worn on the wrist, but they can also be worn on other parts of the body, such as the waist or ankle. Many activity trackers use sensors to measure things like steps taken, distance traveled, and calories burned. They may also track sleep patterns, heart rate, and other indicators of physical activity and health.

##### One of the key benefits of activity trackers is that they can provide users with valuable insights into their physical activity levels. By tracking various metrics over time, users can see how their activity levels change and identify patterns in their behavior. This can be useful for those who are trying to improve their fitness, lose weight, or simply maintain a healthy lifestyle.

##### In addition to tracking physical activity, many activity trackers also offer other features, such as the ability to connect with smartphone apps or online platforms to track progress and set goals. Some activity trackers also offer additional features, such as the ability to track nutrition and hydration, or to provide reminders and alerts to help users stay on track.

##### Overall, activity trackers are a useful tool for anyone looking to track and improve their physical activity levels. By providing valuable data and insights, they can help users make informed decisions about their health and wellness.

#### Data cleaning

Data cleaning is the process of identifying and correcting or removing errors and inconsistencies in data. In Python, data cleaning can be accomplished using a variety of tools and libraries. These tools provide functions and methods for handling missing values, correcting data types, and removing duplicates, among other tasks. Data cleaning is an important step in the data analysis process, as it helps ensure that the data is accurate and reliable, and that it can be effectively analyzed and used to draw meaningful insights.

In [1]:
#

In [2]:
import seaborn as sns #this is the plotting library I'll be using 
import numpy as np
import pandas as pd #"as pd" means that we can use the abbreviation in commands
import matplotlib.pyplot as plt #we need Matplotlib for setting the labels in the Seaborn graphs

In [3]:
import pandas as pd

# Read in the first data set
df1 = pd.read_csv('steps.csv',delimiter=';')

# Read in the second data set
df2 = pd.read_csv('survey.csv')

In [4]:
# Merge the two data sets
df = pd.merge(df1, df2, on='id')

In [5]:
df.shape

(929, 337)

In [6]:
x = df.isna().mean()*100 # store column wise percentage of NaN

In [7]:
col_with_allNaN = []
for i,j in zip(x,x.index):
    if i==100: # check for column with all NaN
        print(j)
        col_with_allNaN.append(j)

13-5-2014


In [8]:
df.dropna(axis = 1,how='all',inplace=True) # drop columns with all NaN, axis = 1 is for column

In [9]:
df.shape

(929, 336)

In [10]:
# Now keep participants with minimum 50% data in their respective rows.
df.dropna(axis = 0,how='all',inplace=True,thresh=.5*df1.shape[1])

In [11]:
df

Unnamed: 0,id,20-6-2013,21-6-2013,22-6-2013,23-6-2013,24-6-2013,25-6-2013,26-6-2013,27-6-2013,28-6-2013,...,11-5-2014,12-5-2014,city,gender,age,hh_size,education,education_1,height,weight
0,1,,,,,3941.0,15733.0,9929.0,12879.0,10541.0,...,,,Bordeaux,Male,25-34,4,4,Master or doctorate,178.0,98.0
1,2,,,10473.0,705.0,4287.0,5507.0,4024.0,3926.0,14595.0,...,,,Lille,Male,35-44,1,3,Bachelor,180.0,77.0
13,14,,,,,,,,,,...,,,Lille,Male,45-54,1,1,Master or doctorate,170.0,83.0
21,22,,,,,,,,,,...,,,Bordeaux,Female,45-54,4,4,Master or doctorate,170.0,73.0
27,28,,6262.0,16594.0,7708.0,6419.0,10889.0,7663.0,8728.0,15860.0,...,20492.0,3607.0,Lille,Female,35-44,3,3,Bachelor,162.0,58.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,914,,18543.0,18979.0,15262.0,16937.0,22538.0,20108.0,14040.0,18397.0,...,,,Bordeaux,Female,45-54,2,2,Primary,160.0,55.0
915,916,,8667.0,10471.0,659.0,12079.0,5212.0,7308.0,12229.0,7007.0,...,,,Lille,Female,35-44,1,1,High school,160.0,72.0
919,920,,,,,,,,,,...,,,Montpellier,Male,25-34,5,3,Bachelor,176.0,70.0
920,921,,,,,,,,,5308.0,...,,,Lille,Female,35-44,5,4,Bachelor,166.0,88.0


In [12]:
list(df1)

['id',
 '20-6-2013',
 '21-6-2013',
 '22-6-2013',
 '23-6-2013',
 '24-6-2013',
 '25-6-2013',
 '26-6-2013',
 '27-6-2013',
 '28-6-2013',
 '29-6-2013',
 '30-6-2013',
 '1-7-2013',
 '2-7-2013',
 '3-7-2013',
 '4-7-2013',
 '5-7-2013',
 '6-7-2013',
 '7-7-2013',
 '8-7-2013',
 '9-7-2013',
 '10-7-2013',
 '11-7-2013',
 '12-7-2013',
 '13-7-2013',
 '14-7-2013',
 '15-7-2013',
 '16-7-2013',
 '17-7-2013',
 '18-7-2013',
 '19-7-2013',
 '20-7-2013',
 '21-7-2013',
 '22-7-2013',
 '23-7-2013',
 '24-7-2013',
 '25-7-2013',
 '26-7-2013',
 '27-7-2013',
 '28-7-2013',
 '29-7-2013',
 '30-7-2013',
 '31-7-2013',
 '1-8-2013',
 '2-8-2013',
 '3-8-2013',
 '4-8-2013',
 '5-8-2013',
 '6-8-2013',
 '7-8-2013',
 '8-8-2013',
 '9-8-2013',
 '10-8-2013',
 '11-8-2013',
 '12-8-2013',
 '13-8-2013',
 '14-8-2013',
 '15-8-2013',
 '16-8-2013',
 '17-8-2013',
 '18-8-2013',
 '19-8-2013',
 '20-8-2013',
 '21-8-2013',
 '22-8-2013',
 '23-8-2013',
 '24-8-2013',
 '25-8-2013',
 '26-8-2013',
 '27-8-2013',
 '28-8-2013',
 '29-8-2013',
 '30-8-2013',
 '3

In [13]:
cols = list(df1)[1:] # get list of columns to apply mean()

In [14]:
cols = list(set(cols) - set(col_with_allNaN))  # remove columns that are already dropped earlier

In [15]:
cols

['22-8-2013',
 '23-8-2013',
 '15-1-2014',
 '26-2-2014',
 '10-12-2013',
 '28-7-2013',
 '14-2-2014',
 '12-12-2013',
 '5-4-2014',
 '18-1-2014',
 '19-7-2013',
 '2-3-2014',
 '26-8-2013',
 '29-9-2013',
 '6-2-2014',
 '21-6-2013',
 '30-10-2013',
 '20-9-2013',
 '29-1-2014',
 '20-11-2013',
 '24-11-2013',
 '3-2-2014',
 '7-2-2014',
 '30-1-2014',
 '24-12-2013',
 '2-8-2013',
 '16-2-2014',
 '23-9-2013',
 '17-2-2014',
 '8-3-2014',
 '15-9-2013',
 '4-5-2014',
 '5-10-2013',
 '6-5-2014',
 '14-11-2013',
 '16-3-2014',
 '26-3-2014',
 '15-11-2013',
 '13-7-2013',
 '9-3-2014',
 '28-11-2013',
 '13-10-2013',
 '29-6-2013',
 '23-7-2013',
 '8-8-2013',
 '12-8-2013',
 '3-11-2013',
 '1-4-2014',
 '8-5-2014',
 '9-7-2013',
 '27-6-2013',
 '20-8-2013',
 '4-3-2014',
 '25-9-2013',
 '2-7-2013',
 '30-6-2013',
 '8-1-2014',
 '8-11-2013',
 '28-10-2013',
 '29-4-2014',
 '6-12-2013',
 '16-1-2014',
 '22-3-2014',
 '9-5-2014',
 '7-4-2014',
 '26-7-2013',
 '10-9-2013',
 '12-10-2013',
 '13-1-2014',
 '19-10-2013',
 '1-10-2013',
 '17-10-2013

In [16]:
# mean_steps_per_participant, apply mean on columns stored in 'cols'

df['mean_steps_per_participant'] = df.loc[:,cols].mean(axis=1)

In [17]:
df

Unnamed: 0,id,20-6-2013,21-6-2013,22-6-2013,23-6-2013,24-6-2013,25-6-2013,26-6-2013,27-6-2013,28-6-2013,...,12-5-2014,city,gender,age,hh_size,education,education_1,height,weight,mean_steps_per_participant
0,1,,,,,3941.0,15733.0,9929.0,12879.0,10541.0,...,,Bordeaux,Male,25-34,4,4,Master or doctorate,178.0,98.0,10205.521212
1,2,,,10473.0,705.0,4287.0,5507.0,4024.0,3926.0,14595.0,...,,Lille,Male,35-44,1,3,Bachelor,180.0,77.0,5687.423313
13,14,,,,,,,,,,...,,Lille,Male,45-54,1,1,Master or doctorate,170.0,83.0,9214.798077
21,22,,,,,,,,,,...,,Bordeaux,Female,45-54,4,4,Master or doctorate,170.0,73.0,5667.045977
27,28,,6262.0,16594.0,7708.0,6419.0,10889.0,7663.0,8728.0,15860.0,...,3607.0,Lille,Female,35-44,3,3,Bachelor,162.0,58.0,7969.027586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,914,,18543.0,18979.0,15262.0,16937.0,22538.0,20108.0,14040.0,18397.0,...,,Bordeaux,Female,45-54,2,2,Primary,160.0,55.0,15213.159624
915,916,,8667.0,10471.0,659.0,12079.0,5212.0,7308.0,12229.0,7007.0,...,,Lille,Female,35-44,1,1,High school,160.0,72.0,7140.983471
919,920,,,,,,,,,,...,,Montpellier,Male,25-34,5,3,Bachelor,176.0,70.0,9950.074074
920,921,,,,,,,,,5308.0,...,,Lille,Female,35-44,5,4,Bachelor,166.0,88.0,13572.084507


In [19]:
import os

In [None]:
os.li

In [20]:
os.path.getsize('week1_ml_v1.ipynb')

51439

In [23]:
def convert_bytes(num):
    """
    this function will convert bytes to MB.... GB... etc
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0


def file_size(file_path):
    """
    this function will return the file size
    """
    if os.path.isfile(file_path):
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)


# Lets check the file size of MS Paint exe 
# or you can use any file path
file_path = r"week1_ml_v1.ipynb"
print(file_size(file_path))

51.0 KB


#### Exploratory data analysis
