In [1]:
import numpy as np
import pandas as pd

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns
from pylab import rcParams

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df['date_time'] = pd.to_datetime(df['date_time'])

In [5]:
df["day"] = df["date_time"].dt.day
df["hour"] = df["date_time"].dt.hour
df.head()

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides,day,hour
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7,10,18
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9,10,19
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1,10,20
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2,10,21
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8,10,22


In [6]:
targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

In [7]:
seasons = ['spring', 'summer', 'autumn', 'winter']

In [8]:
def season(date):
    """Return the season name for the month of ``date``.

    Only ``date.month`` is read. Months 12, 1, 2 give 'winter'; 3, 4, 5 give
    'spring'; 6, 7, 8 give 'summer'; 9, 10, 11 give 'autumn'. Any other month
    value falls through and the function returns ``None``.
    """
    month_groups = (
        ((12, 1, 2), 'winter'),
        ((3, 4, 5), 'spring'),
        ((6, 7, 8), 'summer'),
        ((9, 10, 11), 'autumn'),
    )
    month_number = date.month
    for members, label in month_groups:
        if month_number in members:
            return label
    return None

In [9]:
df['season'] = df['date_time'].apply(season)

In [10]:
# Month number (1-12) -> three-letter English month abbreviation.
# NOTE(review): the month() helper defined in the next cell uses its own local
# table and does not read this module-level dict.
months = {1 : 'Jan', 2 : 'Feb', 3 : 'Mar', 4 : 'Apr', 5 : 'May', 6 : 'Jun', 7 : 'Jul',
          8 : 'Aug', 9 : 'Sep', 10 : 'Oct', 11 : 'Nov', 12 : 'Dec'}

In [11]:
def month(date):
    """Return the three-letter English abbreviation for the month of ``date``.

    ``date.month`` is converted with ``int`` and looked up in a table keyed
    1 through 12; a value outside that table raises ``KeyError``.
    """
    abbreviations = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    lookup = {number: name for number, name in enumerate(abbreviations, start=1)}
    month_number = int(date.month)
    return lookup[month_number]

In [12]:
df['month'] = df['date_time'].apply(month)

In [13]:
columns = list(df.columns)

In [14]:
columns

['date_time',
 'deg_C',
 'relative_humidity',
 'absolute_humidity',
 'sensor_1',
 'sensor_2',
 'sensor_3',
 'sensor_4',
 'sensor_5',
 'target_carbon_monoxide',
 'target_benzene',
 'target_nitrogen_oxides',
 'day',
 'hour',
 'season',
 'month']

In [15]:
columns = list(df.columns)

In [16]:
columns

['date_time',
 'deg_C',
 'relative_humidity',
 'absolute_humidity',
 'sensor_1',
 'sensor_2',
 'sensor_3',
 'sensor_4',
 'sensor_5',
 'target_carbon_monoxide',
 'target_benzene',
 'target_nitrogen_oxides',
 'day',
 'hour',
 'season',
 'month']

In [17]:
def lag_feature(df, columns, lag):
    """Add a shifted copy of each listed column to ``df`` in place.

    For every name in ``columns`` a new column called ``lag_<lag>_<name>`` is
    written to ``df`` holding ``df[name].shift(lag)``. The function modifies
    ``df`` directly and returns nothing.
    """
    prefix = 'lag_' + str(lag) + '_'
    for source_name in columns:
        shifted = df[source_name].shift(lag)
        df[prefix + source_name] = shifted

In [18]:
def RWF(series, lag): # rolling window feature
    """Return the rolling-window object of ``series`` with window size ``lag``.

    No aggregation is applied here; the caller chooses one (``.max()``,
    ``.mean()``, ``.sum()``, ...) on the returned object.
    """
    window_view = series.rolling(window=lag)
    return window_view

In [19]:
def EWF(series, lag): # expanding window feature
    """Return the expanding-window object of ``series``.

    ``lag`` is forwarded as the first argument of ``series.expanding``, i.e.
    the minimum number of observations required before an aggregate is
    produced (``min_periods``). No aggregation is applied here.
    """
    growing_view = series.expanding(min_periods=lag)
    return growing_view

In [20]:
from statsmodels.api import tsa
from statsmodels.graphics import tsaplots

In [21]:
columns = list(df.columns)
columns

['date_time',
 'deg_C',
 'relative_humidity',
 'absolute_humidity',
 'sensor_1',
 'sensor_2',
 'sensor_3',
 'sensor_4',
 'sensor_5',
 'target_carbon_monoxide',
 'target_benzene',
 'target_nitrogen_oxides',
 'day',
 'hour',
 'season',
 'month']

In [22]:
# Cast every column from position 16 onward to int, value by value.
# NOTE(review): the `columns` listing printed by the preceding cell has exactly
# 16 entries, so `columns[16:]` is empty there and this loop does nothing.
# Presumably it was meant for columns appended after the first 16 — confirm
# or remove.
for col in columns[16:]:
    df[col] = df[col].apply(int)

In [23]:
seasons_dict = {'winter' : 1, 'spring' : 2, 'summer' : 3, 'autumn' : 4}

In [24]:
# Replace the season name with its integer code from `seasons_dict`.
# The names are rebuilt from `date_time` instead of being read from the current
# `season` column: once that column holds integer codes, looking them up in
# `seasons_dict` again raises KeyError, so the cell could not be re-run.
# Unknown season names still raise KeyError, as before.
df['season'] = df['date_time'].apply(season).apply(lambda name : seasons_dict[name])

In [25]:
from statsmodels.tsa import seasonal as ssn

In [26]:
def get_STL(series, period):
    """Fit an STL decomposition to ``series`` and return its three components.

    ``period`` is passed through to ``ssn.STL``. The result is a list in the
    order ``[trend, seasonal, resid]`` taken from the fitted object.
    """
    fitted = ssn.STL(series, period=period).fit()
    return [getattr(fitted, part) for part in ('trend', 'seasonal', 'resid')]

In [27]:
# Store the numeric month taken from `date_time`.
# This overwrites the `month` column that an earlier cell filled with
# three-letter abbreviations via month().
df['month'] = df['date_time'].dt.month

In [28]:
from sklearn.metrics import mean_absolute_error

In [29]:
def outliers_frame(series,conf_interval = True, boxplots = True, scale = 1.96, lag = 24 ,perc = [25,75]):
    """Flag unusual points of ``series`` with up to two detectors.

    Parameters
    ----------
    series : pandas.Series
        Values to inspect (the rolling detector calls ``series.rolling``).
    conf_interval : bool
        When truthy, flag points outside a band centred on the ``lag``-point
        rolling mean. The half-width of the band is
        ``mae + scale * deviation``, both measured between the series and its
        rolling mean from position ``lag`` onward.
    boxplots : bool
        When truthy, flag points strictly outside the fences
        ``low - 1.5 * IQR`` and ``high + 1.5 * IQR`` where ``low``/``high``
        are the two percentiles given by ``perc``.
    scale : float
        Multiplier applied to the deviation in the rolling band.
    lag : int
        Rolling window length and the offset used when measuring the error.
    perc : sequence of two numbers
        Percentiles used for the box-plot fences.

    Returns
    -------
    ``[anomalies, outliers]`` when both detectors are enabled — note the
    order: rolling-band mask first, box-plot mask second. The single mask
    when only one detector is enabled, and ``[]`` when neither is.
    """
    # Normalise the switches once so that every branch below agrees on which
    # detectors ran (a truthy value other than True used to skip the
    # computation but still reach the return that needs its result).
    use_band = bool(conf_interval)
    use_box = bool(boxplots)

    if use_box:
        q_low, q_high = np.percentile(series, perc)
        spread = q_high - q_low
        lower_fence = q_low - 1.5 * spread
        upper_fence = q_high + 1.5 * spread
        # Strict comparisons: a point sitting exactly on a fence is not an
        # outlier. With inclusive comparisons a series whose spread is zero
        # (for example a constant series) had every point flagged.
        outliers = np.logical_or(series < lower_fence, series > upper_fence)

    if use_band:
        rolling_mean = series.rolling(lag).mean()
        mae = mean_absolute_error(series[lag:], rolling_mean[lag:])
        deviation = np.std(series[lag:] - rolling_mean[lag:])
        margin = mae + scale * deviation
        lower_bound = rolling_mean - margin
        upper_bound = rolling_mean + margin
        anomalies = np.logical_or(series > upper_bound, series < lower_bound)

    if use_box and use_band:
        return [anomalies, outliers]
    if use_box:
        return outliers
    if use_band:
        return anomalies
    return []

In [30]:
columns = list(df.columns)

In [31]:
columns

['date_time',
 'deg_C',
 'relative_humidity',
 'absolute_humidity',
 'sensor_1',
 'sensor_2',
 'sensor_3',
 'sensor_4',
 'sensor_5',
 'target_carbon_monoxide',
 'target_benzene',
 'target_nitrogen_oxides',
 'day',
 'hour',
 'season',
 'month']

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7111 entries, 0 to 7110
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date_time               7111 non-null   datetime64[ns]
 1   deg_C                   7111 non-null   float64       
 2   relative_humidity       7111 non-null   float64       
 3   absolute_humidity       7111 non-null   float64       
 4   sensor_1                7111 non-null   float64       
 5   sensor_2                7111 non-null   float64       
 6   sensor_3                7111 non-null   float64       
 7   sensor_4                7111 non-null   float64       
 8   sensor_5                7111 non-null   float64       
 9   target_carbon_monoxide  7111 non-null   float64       
 10  target_benzene          7111 non-null   float64       
 11  target_nitrogen_oxides  7111 non-null   float64       
 12  day                     7111 non-null   int64   

In [33]:
# Map every column name except the first entry of `columns` to a one-element
# list holding that column's Series; derived series are appended to these
# lists by the loop in the following cell.
f_dict = {f_name : [df[f_name]] for f_name in columns[1:]} # features series dict

In [34]:
for key in f_dict.keys():
    # Only non-boolean columns other than the calendar columns get derived
    # series appended to their list.
    if df[key].dtype != bool and key not in ['day', 'hour', 'season', 'month']:
        # Rolling-window max / min / mean over 24 points.
        rolling = RWF(df[key], lag = 24)
        f_dict[key].append(rolling.max())
        f_dict[key].append(rolling.min())
        f_dict[key].append(rolling.mean())

        # Expanding-window max / min / mean (EWF forwards 24 to `expanding`).
        expanding = EWF(df[key], lag = 24)
        f_dict[key].append(expanding.max())
        f_dict[key].append(expanding.min())
        f_dict[key].append(expanding.mean())

        # STL components with period 24: trend, seasonal, residual.
        f_dict[key].extend(get_STL(df[key], period = 24))

        # outliers_frame returns [anomalies, outliers] in that order when both
        # detectors are enabled; unpack in the same order so each name holds
        # the mask it describes.
        anomalies, outliers = outliers_frame(df[key])

        # Count of flagged points in each 24-point window: box-plot outliers
        # first, rolling-band anomalies second.
        f_dict[key].append(RWF(outliers, lag = 24).sum())
        f_dict[key].append(RWF(anomalies, lag = 24).sum())
        