# Data Scrubber

Load the raw sensor CSV files (exterior and interior) from `../data/raw`, combine them, and clean them for processing. The cleaned data is written to `../data/clean/sensor_data.csv`.

In [1]:
import os
import glob
import pandas as pd

In [2]:
# Locate the raw CSV files in the `data/raw` directory that sits next to the
# directory this notebook runs from (i.e. `../data/raw`, the same layout the
# `../data/clean` output path relies on).
# The path is built with os.path instead of a string replace on the working
# directory: a replace would rewrite *every* occurrence of "notebooks" in the
# path and would silently produce a wrong pattern when the working directory
# has a different name.
cur_dir = os.getcwd()
raw_dir = os.path.join(os.path.dirname(cur_dir), 'data', 'raw')

# glob returns matches in arbitrary order; sort so the files are always
# combined in the same order from run to run.
exterior_files = sorted(glob.glob(os.path.join(raw_dir, '*exterior*')))
interior_files = sorted(glob.glob(os.path.join(raw_dir, '*interior*')))

In [3]:
# Column headers expected in the raw CSV files; index 0 is the timestamp column.
govee_columns = ['Timestamp for sample frequency every 1 min',
                 'Temperature_Celsius',
                 'Relative_Humidity']


def combine_no_time_duplicates(files_glob):
    """Combine several raw CSV files into one frame with unique timestamps.

    Parameters
    ----------
    files_glob : iterable of str
        Paths of the CSV files to read (e.g. the result of ``glob.glob``).
        May be empty, in which case an empty frame is returned.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked in the order the files were given,
        restricted to the columns shared with ``govee_columns``, with the
        timestamp column parsed to ``datetime64`` and only the first row
        kept for each timestamp. The index is a fresh 0..n-1 range.

    Raises
    ------
    KeyError
        If the timestamp column is missing from any of the files (the inner
        join then drops it from the combined frame).
    ValueError
        If a timestamp does not match ``%Y-%m-%d %H:%M:%S``.
    """
    timestamp_col = govee_columns[0]

    # Read the files that were passed in (not a module-level list), so the
    # function works for any set of files.
    frames = [pd.read_csv(path) for path in files_glob]

    # The empty template frame plus join='inner' limits the result to the
    # expected columns and keeps the call valid when no files were given.
    # A single concat avoids re-copying the accumulated data for every file.
    template = pd.DataFrame(columns=govee_columns)
    all_data = pd.concat([template, *frames],
                         ignore_index=True,
                         join='inner',
                         sort=True)

    all_data[timestamp_col] = pd.to_datetime(all_data[timestamp_col],
                                             format="%Y-%m-%d %H:%M:%S")

    # Files can overlap in time; keep the first reading seen per timestamp.
    all_data = all_data.drop_duplicates(subset=timestamp_col, keep='first')

    return all_data.reset_index(drop=True)

In [5]:
# Combine each group of files and tag every row with where it was recorded.
# NOTE(review): `Locations` is not defined in the cells visible here; it is
# presumably an Enum declared in a cell that is not shown. Confirm it is in
# scope before running this cell.
all_exterior_data = combine_no_time_duplicates(exterior_files).assign(
    location=Locations.EXTERIOR.value
)
all_interior_data = combine_no_time_duplicates(interior_files).assign(
    location=Locations.INTERIOR.value
)

In [14]:
# Stack the exterior and interior readings into a single frame and normalise
# the column names: the timestamp column becomes "timestamp", the other two
# columns are lower-cased.
renamed_columns = {govee_columns[0]: "timestamp"}
renamed_columns.update({name: str(name).lower() for name in govee_columns[1:3]})

all_data = pd.concat([all_exterior_data, all_interior_data]).rename(columns=renamed_columns)

# Write Clean Data

In [15]:
# Write the combined readings to disk; the row index is left out of the file.
clean_csv_path = '../data/clean/sensor_data.csv'
all_data.to_csv(clean_csv_path, index=False)

In [13]:
# List the contents of the output directory to check that the file was written.
os.listdir('../data/clean/')

['sensor_data.csv']