## Feature Generation for Alarm Type Prediction

    - Loads the labels that were generated in the earlier step (04-b).

In [None]:
import numpy as np
import pickle
import pandas as pd
import sys

In [None]:
# Read the label table and turn its 'date' column into real datetimes
# (the column keeps its position; only its dtype changes).
label_df = pd.read_csv('multiclass_labels.csv')
label_df = label_df.assign(date=pd.to_datetime(label_df['date']))
label_df

In [None]:
# Measurement table for all inverters. Later cells read its 'date' column,
# one 'Active Power (kW)' column per inverter, and the WS-20 MW / WS-5 MW
# module-temperature and POA-irradiance columns.
inv_df = pd.read_csv('all_inverters.csv')

In [None]:
# Aggregations computed over every rolling window; create_features passes
# this list to rolling_features as `funcs`.
np_funs = [np.mean, np.std, 'max', 'median']

def rolling_features(df, start_step, window_size, funcs):
    """Compute lagged rolling aggregates and give them flat column names.

    The input is first shifted down by ``start_step`` rows, then aggregated
    over a rolling window of ``window_size`` rows. A window only yields a
    value when all ``window_size`` entries are present (``min_periods``
    equals the window size); otherwise the result is NaN.

    Args:
        df: Frame whose columns are aggregated.
        start_step: Number of rows to shift before rolling.
        window_size: Rolling window length in rows.
        funcs: Aggregations handed to ``Rolling.agg``.

    Returns:
        A frame whose columns are named ``<column>_<function><window_size>``,
        built from the first two elements of each aggregated column key.
    """
    lagged = df.shift(start_step)
    roller = lagged.rolling(window_size, min_periods=window_size)
    features = roller.agg(funcs)

    # Flatten each aggregated column key into a single string label.
    flat_names = []
    for key in features.columns:
        flat_names.append(f"{key[0]}_{key[1]}{window_size}")
    features.columns = flat_names
    return features

def create_features(df, colnames, ROLLING_WINDOWS, timestamp_col=None):
    """Add calendar fields and lagged rolling statistics to ``df``.

    The calendar columns ``day``, ``dayofweek``, ``weekofyear`` and ``month``
    are derived from the timestamp column and written onto ``df`` itself, so
    the caller's frame is modified in place. For every column in ``colnames``
    and every window in ``ROLLING_WINDOWS`` the aggregations listed in the
    module-level ``np_funs`` are computed with ``rolling_features``, shifted
    by one row so the current row never contributes to its own features.

    Args:
        df: Frame holding the timestamp column and the columns in
            ``colnames``. Rows are aggregated in their existing order.
        colnames: Names of the columns to build rolling features for.
        ROLLING_WINDOWS: Window lengths, in rows.
        timestamp_col: Name of the datetime column. Defaults to the
            module-level ``TIMESTAMP_COL_NAME`` when omitted.

    Returns:
        ``df`` with the rolling-feature columns appended on the right. When
        there is nothing to roll (no columns or no windows) ``df`` itself is
        returned.
    """
    if timestamp_col is None:
        # Keep the historical behaviour of reading the notebook-level constant.
        timestamp_col = TIMESTAMP_COL_NAME

    # Feature engineering
    timestamps = df[timestamp_col]
    df["day"] = timestamps.apply(lambda ts: ts.day)
    df["dayofweek"] = timestamps.apply(lambda ts: ts.dayofweek)
    df["weekofyear"] = timestamps.apply(lambda ts: ts.isocalendar()[1])
    df["month"] = timestamps.apply(lambda ts: ts.month)

    # exclude the current time data - so shift rolling calcs by 1
    start_step = 1

    # The rolled frames keep df's index, so a single concat lines them up
    # row-for-row whatever index df carries, and the wide frame is copied
    # once instead of once per (column, window) pair.
    rolled = [
        rolling_features(df[[col]], start_step=start_step, window_size=window, funcs=np_funs)
        for col in colnames
        for window in ROLLING_WINDOWS
    ]
    if not rolled:
        return df
    return pd.concat([df] + rolled, axis=1)

In [None]:
# Rolling window lengths in rows: <days> * 12 * 24.
windows = [days * 12 * 24 for days in [1, 2, 3, 7, 14, 21, 30]]
TIMESTAMP_COL_NAME = 'date'
data = []
label_col = 'label_7006'

# The parsed timestamps are the same for every inverter, so parse them once
# instead of once per loop iteration.
inv_dates = pd.to_datetime(inv_df['date'])

for inverter in label_df['inverter'].unique():
    # Labels belonging to this inverter only.
    x = label_df[label_df.inverter == inverter]

    # This inverter's power column plus the two weather stations' readings.
    power_col = 'IN.GMRX.CHAR.' + inverter + '.Active Power (kW)'
    features = [power_col,
                'IN.GMRX.CHAR.WS-20 MW.Module Temperature (°C)',
                'IN.GMRX.CHAR.WS-20 MW.POA Irradiance (w/m²)',
                'IN.GMRX.CHAR.WS-5 MW.Module Temperature (°C)',
                'IN.GMRX.CHAR.WS-5 MW.POA Irradiance (w/m²)'
                ]
    columns = ['date'] + features
    inv_df_i = inv_df[columns].copy()
    inv_df_i['date'] = inv_dates
    inv_df_i.rename(columns={power_col: 'power',
                             'IN.GMRX.CHAR.WS-20 MW.Module Temperature (°C)': 'temp1',
                             'IN.GMRX.CHAR.WS-20 MW.POA Irradiance (w/m²)': 'rad1',
                             'IN.GMRX.CHAR.WS-5 MW.Module Temperature (°C)': 'temp2',
                             'IN.GMRX.CHAR.WS-5 MW.POA Irradiance (w/m²)': 'rad2'}, inplace=True)
    inv_df_i['hour'] = inv_df_i.date.dt.hour

    # Rolling features are built for power and the WS-20 MW readings only.
    df_ = create_features(inv_df_i, colnames=['power', 'temp1', 'rad1'], ROLLING_WINDOWS=windows)

    # Left merge keeps every label row, with or without matching features.
    df_ = x.merge(df_, on='date', how='left')
    df_['inverter'] = inverter

    # Class balance report. `.get` keeps the loop alive when an inverter has
    # only one of the two label values, and the ratio is NaN if it has neither.
    y = df_[label_col].value_counts()
    n_pos, n_neg = y.get(1, 0), y.get(0, 0)
    n_total = n_pos + n_neg
    ratio = n_pos / n_total if n_total else float('nan')
    print(inverter, x.shape[0], df_.shape[0], n_pos, n_neg, ratio)
    data.append(df_)

data = pd.concat(data, axis=0)
data.shape

In [None]:
# Inspect the distinct inverter identifiers present in the label table.
label_df['inverter'].unique()

In [None]:
# Inspect the timestamps of `df_`, the merged frame left over from the last
# iteration of the per-inverter loop.
df_['date']

In [None]:
# Inspect the label timestamps of `x`, the label subset left over from the
# last iteration of the per-inverter loop.
x['date']

In [None]:
# Restore the pickled dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open('inverter-data-v02.pkl', 'rb') as pkl_file:
    all_data = pickle.load(pkl_file)

# Report the size on stdout and also show it as the cell result.
print(all_data.shape)
# all_data.dropna(inplace=True)
all_data.shape

In [None]:
k = 12 * 24  # 1 day means 12 * 24 data points
dfg = all_data.groupby('inverter')
ndata = []
for inv, gdf in dfg:
    # Work in time order with a clean 0..n-1 index.
    gdf = gdf.sort_values('date').reset_index(drop=True)
    n = gdf.shape[0]

    # Only rows 0 .. n-k-1 receive a target; the final k rows stay NaN and
    # are dropped below.
    n_labelled = max(n - k, 0)

    # y[i] is 1 when the labels of rows i .. i+k-1 sum to more than zero,
    # otherwise 0. Missing labels count as 0. The window sums come from one
    # cumulative sum instead of slicing the frame once per row.
    # NOTE(review): exact for 0/1 (integer-valued) labels - confirm that is
    # what the 'label' column holds.
    label_values = gdf['label'].fillna(0).to_numpy(dtype=float)
    running = np.concatenate(([0.0], np.cumsum(label_values)))
    window_sums = running[k:k + n_labelled] - running[:n_labelled]

    y_values = np.full(n, np.nan)
    y_values[:n_labelled] = (window_sums > 0).astype(float)
    gdf['y'] = y_values

    gdf = gdf[~gdf.y.isnull()]
    ndata.append(gdf)

In [None]:
# Earliest and latest timestamp in `gdf`, the per-inverter frame left over
# from the labelling cell.
min_date, max_date = gdf['date'].min(), gdf['date'].max()

In [None]:
# Build the full 5-minute timestamp grid spanning min_date .. max_date.
pd.date_range(start=min_date, end=max_date, freq="5min")

In [None]:
# Row/column count of `gdf`.
gdf.shape

In [None]:
# Scratch check: add a one-day Timedelta to a fixed timestamp.
pd.to_datetime('2019-03-25 18:50:00') + pd.Timedelta(1, 'D')