# Data Scrubber

Load the CSV data from `../data/raw` and clean it for processing. Cleaned data will be stored in `../data/clean`.

In [5]:
import os
import glob
import pandas as pd
import util
import datetime as dt
import numpy as np

In [6]:
# Locate the raw sensor exports in `../data/raw`, relative to the working
# directory (the same convention the '../data/clean' paths in this notebook
# use).  Building the path explicitly avoids str.replace('notebooks', ...),
# which rewrites *every* occurrence of "notebooks" in the absolute path and,
# when the path has no such component, leaves the pattern equal to the
# working directory itself.
cur_dir = os.getcwd()
raw_dir = os.path.normpath(os.path.join(cur_dir, os.pardir, 'data', 'raw'))

# glob.escape() keeps characters such as '[' in the directory name from being
# read as wildcards; sorted() makes the load order reproducible because
# glob.glob() returns matches in arbitrary order.
exterior_files = sorted(glob.glob(os.path.join(glob.escape(raw_dir), '*exterior*')))
interior_files = sorted(glob.glob(os.path.join(glob.escape(raw_dir), '*interior*')))

In [7]:
# Column headers expected in the raw CSV exports; the first one is the
# timestamp column.
govee_columns = ['Timestamp for sample frequency every 1 min',
                 'Temperature_Celsius',
                 'Relative_Humidity']


def combine_no_time_duplicates(files_glob):
    """Merge several raw CSV exports into one frame with unique timestamps.

    Parameters
    ----------
    files_glob : iterable of str
        Paths of the CSV files to read (for example the result of
        ``glob.glob``).  May be empty.

    Returns
    -------
    pandas.DataFrame
        One row per distinct timestamp, with a fresh 0..n-1 index.  Only the
        ``govee_columns`` that are present in *every* input file are kept,
        in alphabetical order.  The timestamp column is parsed to
        ``datetime64`` using the ``%Y-%m-%d %H:%M:%S`` format.

    Raises
    ------
    KeyError
        If the timestamp column is missing from any of the input files.
    ValueError
        If a timestamp does not match the expected format.
    """
    time_col = govee_columns[0]

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated rows on every iteration.
    frames = [pd.read_csv(path) for path in files_glob]

    # Equivalent of an inner join against the expected header list: keep a
    # column only if every file provides it.
    shared = [col for col in sorted(govee_columns)
              if all(col in frame.columns for frame in frames)]

    if frames:
        all_data = pd.concat([frame[shared] for frame in frames],
                             ignore_index=True)
    else:
        all_data = pd.DataFrame(columns=shared)

    all_data[time_col] = pd.to_datetime(all_data[time_col],
                                        format="%Y-%m-%d %H:%M:%S")

    # Drop repeated timestamps (keeping the first occurrence) after parsing,
    # so that equal instants compare equal.
    all_data = (all_data
                .drop_duplicates(subset=time_col, keep='first')
                .reset_index(drop=True))

    return all_data

In [8]:
# Load each sensor's exports and tag every row with the location it came
# from.  Writing one CSV per location ('../data/clean/exterior.data.csv' and
# '../data/clean/interior.data.csv') is currently disabled.
all_exterior_data = combine_no_time_duplicates(exterior_files).assign(
    location=util.Locations.EXTERIOR.value
)

all_interior_data = combine_no_time_duplicates(interior_files).assign(
    location=util.Locations.INTERIOR.value
)


In [9]:
# Stack the exterior and interior readings into a single frame, then
# normalise the headers: the timestamp column becomes "timestamp" and the two
# measurement columns are lower-cased.
column_renames = {govee_columns[0]: "timestamp"}
column_renames.update({name: str(name).lower() for name in govee_columns[1:3]})

all_data = pd.concat([all_exterior_data, all_interior_data]).rename(
    columns=column_renames
)

In [10]:
# Readings taken before this instant are treated as invalid; keep only the
# rows stamped at or after it.
invalid_prior_to = dt.datetime(2019, 11, 9, 15, 30)
is_valid = all_data['timestamp'] >= invalid_prior_to
all_data = all_data.loc[is_valid]

# Write Clean Data

In [11]:
# Make sure the output directory exists first: to_csv() does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('../data/clean', exist_ok=True)
# Write the cleaned readings without the frame index, which carries no
# information of its own.
all_data.to_csv('../data/clean/sensor_data.csv', index=False)

In [12]:
# Show what is now present in the clean-data directory.
os.listdir('../data/clean/')

['exterior.data.csv', 'interior.data.csv', 'sensor_data.csv']

In [13]:
# Preview the first rows of the cleaned, combined frame.
all_data.head()

Unnamed: 0,relative_humidity,temperature_celsius,timestamp,location
30,36.4,7.6,2019-11-09 15:30:00,1
31,36.5,7.6,2019-11-09 15:31:00,1
32,36.8,7.6,2019-11-09 15:32:00,1
33,36.9,7.6,2019-11-09 15:33:00,1
34,37.6,7.6,2019-11-09 15:34:00,1
