In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

%matplotlib inline

## Dataset Description
#### Type: Timeseries, Multivariate

The dataset consists of the readings of 14 temperature-modulated MOX (metal-oxide) sensors.
Each experiment consists of 100 measurements: 10 experimental mixtures uniformly distributed in the range of 0 - 20 ppm and 10 replicates per concentration.

At the beginning of each experiment the gas chamber is cleaned for 15 minutes using a stream of 240 mln/min. After that, the gas mixtures are released at 240 mln/min, so the flow rate is assumed to be constant.

A single experiment lasted 25 hours (100 samples x 15 minutes / sample) and was replicated on 13 working days, spanning a total of 17 days.


## Data Loading

In [3]:
path = "./dataset_tempMod_CO_RH"

# Collect the data file names for the loading loop further down.
# Filtering on the ".csv" extension (rather than slicing off the last sorted
# entry) keeps the Readme and any other stray file (hidden files, checkpoints)
# out of the list regardless of where they sort.
# Sorting by name keeps the files in the order shown in the listing below.
files = sorted(name for name in os.listdir(path) if name.lower().endswith(".csv"))
files

['20160930_203718.csv',
 '20161001_231809.csv',
 '20161003_085624.csv',
 '20161004_104124.csv',
 '20161005_140846.csv',
 '20161006_182224.csv',
 '20161007_210049.csv',
 '20161008_234508.csv',
 '20161010_095046.csv',
 '20161011_113032.csv',
 '20161013_143355.csv',
 '20161014_184659.csv',
 '20161016_053656.csv']

In [None]:
# Load every CSV listed in `files`, keep the first 20 columns of each one and
# stack the pieces into a single DataFrame.
df_list = []

for fname in files:
    ptf = os.path.join(path, fname)
    try:
        temp_df = pd.read_csv(ptf)
    except (OSError, ValueError) as err:
        # OSError covers missing/unreadable files; pandas' parser errors
        # (ParserError, EmptyDataError) and decoding errors are ValueErrors.
        # Anything else (e.g. MemoryError, KeyboardInterrupt) is left to
        # propagate instead of being silently swallowed.
        print(ptf + " not loaded: " + repr(err))
        continue

    df_list.append(temp_df.iloc[:, :20])
    print(ptf + " " + "loaded")

# Fail with a clear message instead of pd.concat's generic error.
if not df_list:
    raise RuntimeError("No CSV file could be loaded from " + path)

df = pd.concat(df_list, axis=0)

./dataset_tempMod_CO_RH/20160930_203718.csv loaded
./dataset_tempMod_CO_RH/20161001_231809.csv loaded
./dataset_tempMod_CO_RH/20161003_085624.csv loaded
./dataset_tempMod_CO_RH/20161004_104124.csv loaded
./dataset_tempMod_CO_RH/20161005_140846.csv loaded
./dataset_tempMod_CO_RH/20161006_182224.csv loaded


In [None]:
# Preview the first ten rows of the combined frame.
df.head(n=10)

## Data structure and NA values
Now we are going to check the df memory usage and formats and the presence of NaN values in each column.

In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

In [None]:
# Lift the row-display limit so the per-column NaN counts are never truncated.
pd.options.display.max_rows = None
df.isnull().sum()

In [None]:
# Keep only the leading 20 columns, then re-inspect dtypes and memory usage.
leading_columns = slice(None, 20)
df = df.iloc[:, leading_columns]
df.info()

In [None]:
# Downcast the float columns to the smallest float dtype that can hold their
# values, then check the effect on memory usage.
fcols = df.select_dtypes(include='float').columns
df[fcols] = df[fcols].apply(lambda column: pd.to_numeric(column, downcast='float'))
df.info()

## Descriptive Statistics


In [None]:
# Render floats in fixed-point notation instead of scientific notation.
pd.options.display.float_format = '{:f}'.format
df.describe()

From the summary statistics of sensors R1 to R14, we can see that the maximum values are far above the 75th percentile (third quartile), which suggests the presence of outliers.

In [None]:
# Histogram of every numeric column; the trailing semicolon hides the repr output.
df.hist(figsize=(20, 20));

From the previous distribution charts we can see that the data tends to be left-skewed

In [None]:
# Box plots for every column except the first one (presumably 'Time (s)' -
# verify against the column order), with tick labels tilted for readability.
df.iloc[:, 1:].boxplot(figsize=(20, 10), rot=45);

In [None]:
# Flow rate against elapsed time, with grid lines toggled on the new axes.
ax = df.plot(x='Time (s)', y='Flow rate (mL/min)')
ax.grid()

The flow rate data contains outliers, probably due to instrument noise.

In [None]:
# Temperature against elapsed time, with the y-axis limited to 25-30.
ax = df.plot(x='Time (s)', y='Temperature (C)')
ax.set_ylim([25, 30])
ax.grid()

In [None]:
# Relative humidity against elapsed time, with grid lines.
ax = df.plot(x='Time (s)', y='Humidity (%r.h.)')
ax.grid()

In [None]:
# Convert the elapsed-seconds column into Timedelta values.
elapsed = pd.to_timedelta(df['Time (s)'], unit='s')
df['Time (s)'] = elapsed
df.head(n=10)

In [None]:
# Use the elapsed time as the index and average the readings over 45-second
# windows, labelling each window by its right edge.
#
# The guard makes the cell safe to re-run: once 'Time (s)' has become the
# index it is no longer a column, and setting it again would raise KeyError.
if 'Time (s)' in df.columns:
    df = df.set_index('Time (s)')

# A Timedelta is used as the rule instead of the '45S' string alias, whose
# upper-case 'S' unit is deprecated in recent pandas releases.
# NOTE(review): `df` is a concatenation of several recording files; if each
# file's 'Time (s)' restarts at zero, readings from different days sharing the
# same offset are averaged together here - confirm this is intended.
resampled = df.resample(pd.Timedelta(seconds=45), label='right').mean()
resampled.head()

In [None]:
# Move the time index back into a regular 'Time (s)' column.
# Guarded so that re-running the cell does not reset the index a second time
# and add a stray 'index' column.
if 'Time (s)' not in resampled.columns:
    resampled = resampled.reset_index()
resampled.head()

In [None]:
# Convert the Timedelta column back to elapsed seconds (integers).
# `Timedelta.seconds` is only the seconds *component* (0-86399) and drops whole
# days, so offsets past 24 h would wrap around; the dataset description states
# that one experiment lasts 25 hours. `total_seconds()` keeps the full duration.
# The dtype guard makes the cell a no-op when it is re-run after conversion.
if pd.api.types.is_timedelta64_dtype(resampled['Time (s)']):
    resampled['Time (s)'] = (
        resampled['Time (s)'].dt.total_seconds().astype('int64')
    )


In [None]:
# Preview the first ten rows of the resampled frame.
resampled.head(n=10)

In [None]:
# Flow rate against elapsed time after resampling, with grid lines enabled.
resampled.plot(x='Time (s)', y='Flow rate (mL/min)', grid=True)

In [None]:
# Pairwise scatter plots of time, CO concentration and the non-sensor channels.
columns = [
    'Time (s)',
    'CO (ppm)',
    'Humidity (%r.h.)',
    'Temperature (C)',
    'Flow rate (mL/min)',
    'Heater voltage (V)',
]
pd.plotting.scatter_matrix(resampled[columns], alpha=0.2, figsize=(12, 12));

In [None]:
# Pairwise scatter plots of time, CO concentration and sensors R7 through R14.
sensor_names = ['R{} (MOhm)'.format(number) for number in range(7, 15)]
columns = ['Time (s)', 'CO (ppm)'] + sensor_names
pd.plotting.scatter_matrix(resampled[columns], alpha=0.1, figsize=(15, 15));

## Data Pre-processing.

The boxplot chart from the exploration section shows that there are outliers in the sensor readings.

In [None]:
from sklearn.preprocessing import PowerTransformer, QuantileTransformer