In [None]:
import datetime
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import pickle

In [None]:
# Raw inverter table; per-inverter columns are selected from it later on.
inv_df = pd.read_csv('all_inverters.csv')

# Keep only alarms whose error code is one of the codes under study and whose
# `hod` value lies in [6, 18] inclusive (the bounds were originally 6..17).
target_codes = [7006, 3511, 7502, 7501, 3504, 6448, 1500, 7704]
alarm_df = pd.read_csv('all_alarms.csv')
is_target_code = alarm_df["Error Code"].isin(target_codes)
in_hod_range = alarm_df.hod.between(6, 18)
alarm_df = alarm_df[is_target_code & in_hod_range]
print(alarm_df.shape)

# Sorted list of the distinct controller names that still have alarms.
inverters = sorted(alarm_df["Controller Name"].drop_duplicates().tolist())

In [None]:
# Load the per-inverter label tables; later cells index `inv_labels` by
# inverter name and read label columns from each entry.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only point this at a file produced by a trusted pipeline.
with open('inverter-labels-v3.pkl', 'rb') as handle:
    inv_labels = pickle.load(handle)

In [None]:
# Count positive (1) and negative (0) labels per inverter for the chosen label
# column. `label_col` is also read by later cells.
label_col = 'label_1h'

label_rows = []
for inv, inv_label_df in inv_labels.items():
    counts = dict(inv_label_df[label_col].value_counts())
    label_rows.append({
        'inverters': inv,
        # value_counts() omits classes that never occur, so default both
        # classes to 0 instead of raising KeyError for a one-class inverter.
        'positive': counts.get(1, 0),
        'negative': counts.get(0, 0),
    })

# Explicit columns keep the schema stable even when `inv_labels` is empty.
label_df = pd.DataFrame(label_rows, columns=['inverters', 'positive', 'negative'])
label_df

In [None]:
# Write the per-inverter label counts to CSV, without the row index.
label_df.to_csv('inverter-faults-v2.csv', index=False)

In [None]:
# class ratio
mask = label_df['positive'] > 10
total = label_df[mask][['positive', 'negative']].apply(np.sum, axis=0)
100 * total['positive'] / (total['positive'] + total['negative'])

In [None]:
# Show the pooled positive / negative counts used for the class ratio.
total

## Generate Features

In [None]:
# Aggregations applied to every rolling window; each name becomes part of the
# generated feature column name.
# All entries are pandas reduction names. pandas has treated np.mean / np.std
# passed to .agg() as aliases for its own 'mean' / 'std' (sample std, ddof=1);
# that callable aliasing is deprecated, so naming the reductions explicitly
# keeps both the results and the column names stable across pandas versions.
np_funs = ['mean', 'std', 'max', 'median']

def rolling_features(df, start_step, window_size, funcs):
    """Compute lagged rolling aggregates for every column of ``df``.

    The frame is shifted down by ``start_step`` rows, then each aggregation in
    ``funcs`` is evaluated over a rolling window of ``window_size`` rows. A row
    only gets a value once the window is completely filled
    (``min_periods=window_size``); earlier rows are NaN.

    Returns a new DataFrame with one column per (source column, aggregation)
    pair, named ``<column>_<aggregation><window_size>``.
    """
    lagged = df.shift(start_step)
    windowed = lagged.rolling(window_size, min_periods=window_size)
    features = windowed.agg(funcs)
    # agg() with a list yields two-level (column, aggregation) labels; flatten them.
    flat_names = []
    for label in features.columns:
        flat_names.append("{col}_{stat}{win}".format(col=label[0], stat=label[1], win=window_size))
    features.columns = flat_names
    return features

def create_features(df, colnames, ROLLING_WINDOWS):
    """Add calendar fields and lagged rolling statistics to ``df``.

    Reads two module-level names: ``TIMESTAMP_COL_NAME`` (the datetime column
    of ``df``) and ``np_funs`` (the aggregations passed to
    ``rolling_features``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``TIMESTAMP_COL_NAME`` with a datetime dtype and every
        column in ``colnames``. The ``day``, ``dayofweek``, ``weekofyear`` and
        ``month`` columns are written onto this frame in place.
    colnames : list of str
        Columns to compute rolling statistics for.
    ROLLING_WINDOWS : list of int
        Window lengths, in rows.

    Returns
    -------
    pandas.DataFrame
        ``df`` (including the calendar columns) followed by one block of
        rolling-feature columns per (column, window) pair, in loop order.
        When there is nothing to compute, ``df`` itself is returned.
    """
    # Calendar features, vectorised through the .dt accessor.
    timestamps = df[TIMESTAMP_COL_NAME]
    df["day"] = timestamps.dt.day
    df["dayofweek"] = timestamps.dt.dayofweek
    # isocalendar().week is a nullable UInt32 column; cast to a plain integer.
    df["weekofyear"] = timestamps.dt.isocalendar().week.astype("int64")
    df["month"] = timestamps.dt.month

    # Exclude the current row from every window by lagging one step.
    start_step = 1
    feature_frames = [
        rolling_features(df[[col]], start_step=start_step, window_size=window, funcs=np_funs)
        for col in colnames
        for window in ROLLING_WINDOWS
    ]
    if not feature_frames:
        return df

    # The feature frames keep df's own index, so one column-wise concat lines
    # rows up correctly even when df does not have a default RangeIndex, and
    # avoids re-copying the growing frame on every loop iteration.
    return pd.concat([df] + feature_frames, axis=1)

In [None]:
# Rolling-window lengths in rows: a number of days times 12 * 24 rows per day
# (presumably 5-minute sampling — TODO confirm against the data).
ROWS_PER_DAY = 12 * 24
windows = [days * ROWS_PER_DAY for days in [1, 2, 3, 7, 14, 21, 30]]
TIMESTAMP_COL_NAME = 'date'  # read as a global by create_features
MIN_POSITIVE_LABELS = 10     # inverters need strictly more positives than this

# Weather-station columns shared by every inverter, mapped to short names.
weather_renames = {
    'IN.GMRX.CHAR.WS-20 MW.Module Temperature (°C)': 'temp1',
    'IN.GMRX.CHAR.WS-20 MW.POA Irradiance (w/m²)': 'rad1',
    'IN.GMRX.CHAR.WS-5 MW.Module Temperature (°C)': 'temp2',
    'IN.GMRX.CHAR.WS-5 MW.POA Irradiance (w/m²)': 'rad2',
}

inverter_frames = []
for inverter, labels_i in inv_labels.items():
    # NOTE(review): inverters are selected on the 'label' column while the
    # statistics below use `label_col` — confirm this is intentional.
    selection_counts = dict(labels_i['label'].value_counts())
    if selection_counts.get(True, 0) <= MIN_POSITIVE_LABELS:
        continue

    # One mapping drives both the column selection and the renaming, so the
    # two can no longer drift apart.
    renames = {'IN.GMRX.CHAR.' + inverter + '.Active Power (kW)': 'power'}
    renames.update(weather_renames)
    inv_df_i = inv_df[['date'] + list(renames)].rename(columns=renames)
    inv_df_i['date'] = pd.to_datetime(inv_df_i['date'])
    inv_df_i['hour'] = inv_df_i.date.dt.hour

    df_ = create_features(inv_df_i, colnames=['power', 'temp1', 'rad1'], ROLLING_WINDOWS=windows)
    # Left merge: keep every labelled row, attach features where dates match.
    df_ = labels_i.merge(df_, on='date', how='left')
    df_['inverter'] = inverter

    # Progress line: inverter, label rows, merged rows, positives, negatives,
    # positive fraction. Missing classes count as 0 instead of raising.
    counts = dict(df_[label_col].value_counts())
    n_pos, n_neg = counts.get(1, 0), counts.get(0, 0)
    n_labelled = n_pos + n_neg
    pos_fraction = n_pos / n_labelled if n_labelled else float('nan')
    print(inverter, labels_i.shape[0], df_.shape[0], n_pos, n_neg, pos_fraction)

    # The leftover debugging `sys.exit("HERE")` that used to follow is removed:
    # `sys` is never imported, and it aborted the loop on the first inverter.
    inverter_frames.append(df_)

data = pd.concat(inverter_frames, axis=0)
data.shape

In [None]:
# Percentage of positive labels in the assembled dataset.
total2 = data[label_col].value_counts()
n_positive = total2[1]
n_negative = total2[0]
100 * n_positive / (n_negative + n_positive)

In [None]:
# Persist the assembled `data` frame with the highest available pickle protocol.
# `pickle` is already imported in the notebook's top import cell, so the
# duplicate mid-notebook import is dropped.
with open('inverter-data-v03.pkl', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)