This notebook:
1.  Drops device_id as redundant
2.  Converts 4 time features to datetime
3.  Converts burn_material_amount to float
4.  Leaves hotplate_temp as string
5.  Leaves distance_sensor_hotplate as string  
6.  Encodes specimen_class as encoded_specimen and drops the former  
    in_smoke: 1, clean_air: 0
7.  Drops burn_material_name since it is redundant with burn_material (the values differ only in capitalization)

In [1]:
import pandas as pd
import os

In [2]:
# Single CSV that this notebook both reads from and writes back to.
# NOTE(review): because the input and output paths are identical, the final
# to_csv overwrites the source file; a later re-run would then fail when it
# tries to drop the already-removed device_id column. Confirm this is intended
# or point output_file at a new file.
DATA_FILE = '../AA Dataset/aa_dryad_merged_cleaned_data2_update.csv'
input_file = DATA_FILE
output_file = DATA_FILE

In [3]:
# Load the merged CSV into a DataFrame; every later cell works on `df`.
df = pd.read_csv(input_file)

In [4]:
# Preview the first five rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,sensor_node_id,device_id,scan_time,temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,...,end_time,burn_material,burn_material_name,burn_material_amount,hotplate_start,hotplate_temp,distance_sensor_hotplate,venue,venue_type,specimen_class
0,146,me-sylvav1n21-e637a6d4-30ee-49a6-8ad,2021-09-01 09:01:36.817528009,18.8,19.0,19.6,19.6,19.7,20.0,20.1,...,2021-09-01 18:50:46.000000000,SP2,SP2,600g,2021-09-01 10:03:57.000000000,500,30m,hall,Hall,clean_air
1,146,me-sylvav1n21-e637a6d4-30ee-49a6-8ad,2021-09-01 09:02:13.375891000,19.2,19.3,19.8,19.8,19.9,20.2,20.2,...,2021-09-01 18:50:46.000000000,SP2,SP2,600g,2021-09-01 10:03:57.000000000,500,30m,hall,Hall,clean_air
2,146,me-sylvav1n21-e637a6d4-30ee-49a6-8ad,2021-09-01 09:02:50.057426000,19.3,19.4,19.9,19.9,20.0,20.3,20.3,...,2021-09-01 18:50:46.000000000,SP2,SP2,600g,2021-09-01 10:03:57.000000000,500,30m,hall,Hall,clean_air
3,146,me-sylvav1n21-e637a6d4-30ee-49a6-8ad,2021-09-01 09:03:26.565743000,19.3,19.5,20.0,20.0,20.1,20.3,20.4,...,2021-09-01 18:50:46.000000000,SP2,SP2,600g,2021-09-01 10:03:57.000000000,500,30m,hall,Hall,clean_air
4,146,me-sylvav1n21-e637a6d4-30ee-49a6-8ad,2021-09-01 09:04:03.635421991,19.4,19.5,20.0,20.0,20.1,20.4,20.4,...,2021-09-01 18:50:46.000000000,SP2,SP2,600g,2021-09-01 10:03:57.000000000,500,30m,hall,Hall,clean_air


In [5]:
# (rows, columns) of the table as loaded.
df.shape

(2498, 47)

In [6]:
# List every column name before deciding what to drop or convert.
df.columns

Index(['sensor_node_id', 'device_id', 'scan_time', 'temp_0', 'temp_1',
       'temp_2', 'temp_3', 'temp_4', 'temp_5', 'temp_6', 'temp_7', 'temp_8',
       'temp_9', 'humid_0', 'humid_1', 'humid_2', 'humid_3', 'humid_4',
       'humid_5', 'humid_6', 'humid_7', 'humid_8', 'humid_9', 'gas_scan_0',
       'gas_scan_1', 'gas_scan_2', 'gas_scan_3', 'gas_scan_4', 'gas_scan_5',
       'gas_scan_6', 'gas_scan_7', 'gas_scan_8', 'gas_scan_9', 'gas_scan_cnt',
       'trigger', 'experiment_id', 'start_time', 'end_time', 'burn_material',
       'burn_material_name', 'burn_material_amount', 'hotplate_start',
       'hotplate_temp', 'distance_sensor_hotplate', 'venue', 'venue_type',
       'specimen_class'],
      dtype='object')

Drop device_id since it is synonymous with sensor_node_id

In [7]:
# Remove the redundant identifier column; sensor_node_id is kept.
df = df.drop(columns=['device_id'])

In [8]:
# Per-column dtypes and non-null counts, used to spot text columns and missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2498 entries, 0 to 2497
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   sensor_node_id            2498 non-null   int64  
 1   scan_time                 2498 non-null   object 
 2   temp_0                    2498 non-null   float64
 3   temp_1                    2498 non-null   float64
 4   temp_2                    2498 non-null   float64
 5   temp_3                    2498 non-null   float64
 6   temp_4                    2498 non-null   float64
 7   temp_5                    2498 non-null   float64
 8   temp_6                    2498 non-null   float64
 9   temp_7                    2498 non-null   float64
 10  temp_8                    2498 non-null   float64
 11  temp_9                    2498 non-null   float64
 12  humid_0                   2498 non-null   float64
 13  humid_1                   2498 non-null   float64
 14  humid_2 

Note that burn_material_amount contains null values (428 of the 2498 rows).

Convert times to datetime

In [9]:
# Timestamp columns that arrive from the CSV as plain text.
times = ['scan_time', 'start_time', 'end_time', 'hotplate_start']
# Parse each one with pd.to_datetime; assigning under the existing names
# replaces the columns in place, so column order is unchanged.
df = df.assign(**{col: pd.to_datetime(df[col]) for col in times})

Convert burn_material_amount to float

In [10]:
# Remove the 'g' unit from the text values so they parse as floats.
amount_grams = df['burn_material_amount'].str.strip('g').astype(float)
df['burn_material_amount'] = amount_grams
# Record the unit in the column name now that it is gone from the values.
df = df.rename(columns={'burn_material_amount':'burn_material_amount(g)'})

Leave hotplate_temp as string since one of the values is "open fire."  These values will not be used for modeling anyway since these conditions won't exist in the field.

In [11]:
# Frequency of each hotplate_temp value, including the non-numeric ones.
df['hotplate_temp'].value_counts()

500          1099
250           627
open fire     428
320           344
Name: hotplate_temp, dtype: int64

Leave distance_sensor_hotplate as a string as well, since it takes only two values and is synonymous with the venue (30m is always Hall and 4m is always Chamber, as the venue_type grouping below shows).

In [12]:
# Pair each distance with its venue_type. count() reports non-null values per
# column, so a column with missing data shows fewer entries than the group size.
df.groupby(['distance_sensor_hotplate','venue_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sensor_node_id,scan_time,temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,...,experiment_id,start_time,end_time,burn_material,burn_material_name,burn_material_amount(g),hotplate_start,hotplate_temp,venue,specimen_class
distance_sensor_hotplate,venue_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
30m,Hall,1791,1791,1791,1791,1791,1791,1791,1791,1791,1791,...,1791,1791,1791,1791,1791,1363,1791,1791,1791,1791
4m,Chamber,707,707,707,707,707,707,707,707,707,707,...,707,707,707,707,707,707,707,707,707,707


Encode specimen_class as encoded_specimen where:  
    in_smoke = 1  
    clean_air = 0  

In [13]:
# Class balance of the label column before encoding.
df['specimen_class'].value_counts()

in_smoke     2017
clean_air     481
Name: specimen_class, dtype: int64

In [14]:
# Label -> integer code used for the encoded_specimen column.
air_encode = dict(in_smoke=1, clean_air=0)

In [15]:
# Translate the text labels through air_encode, then store them as integers.
specimen_codes = df['specimen_class'].map(air_encode)
df['encoded_specimen'] = specimen_codes.astype(int)

In [16]:
# Confirm the encoded counts match the specimen_class counts shown earlier.
df['encoded_specimen'].value_counts()

1    2017
0     481
Name: encoded_specimen, dtype: int64

In [17]:
# The text label is no longer needed once encoded_specimen exists.
df = df.drop(columns=['specimen_class'])

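Drop burn_material_name since it is redundant with burn_material: the checks below show that the two columns carry the same values apart from capitalization ("Scott Pine ..." vs. "Scott pine ..."). Future groups that need the name can recover it from burn_material.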

In [19]:
# Frequency of each burn_material value.
df['burn_material'].value_counts()

SP2                                 1834
Scott Pine branches in fire bowl     428
BM2                                  236
Name: burn_material, dtype: int64

In [20]:
# Pair each burn_material with its burn_material_name to check that the two
# columns line up one-to-one. count() reports non-null values per column.
df.groupby(['burn_material','burn_material_name']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sensor_node_id,scan_time,temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,...,experiment_id,start_time,end_time,burn_material_amount(g),hotplate_start,hotplate_temp,distance_sensor_hotplate,venue,venue_type,encoded_specimen
burn_material,burn_material_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BM2,BM2,236,236,236,236,236,236,236,236,236,236,...,236,236,236,236,236,236,236,236,236,236
SP2,SP2,1834,1834,1834,1834,1834,1834,1834,1834,1834,1834,...,1834,1834,1834,1834,1834,1834,1834,1834,1834,1834
Scott Pine branches in fire bowl,Scott pine branches in fire bowl,428,428,428,428,428,428,428,428,428,428,...,428,428,428,0,428,428,428,428,428,428


In [22]:
# burn_material_name duplicates burn_material, so keep only the latter.
df = df.drop(columns=['burn_material_name'])

One more check of data types ...

In [23]:
# Final dtype check before writing the file.
df.dtypes

sensor_node_id                       int64
scan_time                   datetime64[ns]
temp_0                             float64
temp_1                             float64
temp_2                             float64
temp_3                             float64
temp_4                             float64
temp_5                             float64
temp_6                             float64
temp_7                             float64
temp_8                             float64
temp_9                             float64
humid_0                            float64
humid_1                            float64
humid_2                            float64
humid_3                            float64
humid_4                            float64
humid_5                            float64
humid_6                            float64
humid_7                            float64
humid_8                            float64
humid_9                            float64
gas_scan_0                         float64
gas_scan_1 

In [24]:
# Write the cleaned table without the row index; the header row is written by default.
# NOTE(review): output_file is the same path as input_file, so this overwrites the source CSV.
df.to_csv(output_file, index=False)