## Data Exploration & Visualization

In [None]:
import datetime
import pandas as pd
import random
import matplotlib.pyplot as plt

In [None]:
# Load 'all_inverters.csv' into inv_df and preview its first rows.
inv_df = pd.read_csv('all_inverters.csv')
inv_df.head()

In [None]:
# Error codes retained for this exploration.
target_codes = [7006, 3511, 7502, 7501, 3504, 6448, 1500, 7704]

# Load every alarm, then keep only the rows that carry one of the target
# codes and whose `hod` value (presumably hour of day - verify) lies in the
# inclusive range 6..17.
alarm_df = pd.read_csv('all_alarms.csv')
keep = alarm_df["Error Code"].isin(target_codes) & alarm_df["hod"].between(6, 17)  # original (6,17)
alarm_df = alarm_df.loc[keep]

print(alarm_df.shape)
alarm_df.head()

In [None]:
# Distinct controller names found in the filtered alarms, in ascending order.
inverters = alarm_df["Controller Name"].unique().tolist()
inverters.sort()
inverters

In [None]:
# Display the "Error Code" column of the filtered alarms.
alarm_df['Error Code']

In [None]:
# --- Settings ---------------------------------------------------------------
inverter_name = 'INV-19'      # controller whose alarms are explored
num_past_days = 7             # look-back window drawn before each alarm
plot_count = 5                # maximum number of charts to draw
alarm_plot = [7502, 1500]     # error codes whose alarms are plotted

# Signals drawn on every chart; 'date' is used as the x axis.
# Not included: the WS-20 MW / WS-5 MW Ambient Temperature columns (noted as
# "all nulls") and the WS-5 MW Module Temperature / POA Irradiance columns.
plotcols = ['date',
            'IN.GMRX.CHAR.'+inverter_name+'.Active Power (kW)',
            'IN.GMRX.CHAR.WS-20 MW.Module Temperature (°C)',
            'IN.GMRX.CHAR.WS-20 MW.POA Irradiance (w/m²)',
           ]

# --- Alarms of the selected inverter ----------------------------------------
# list.index raises ValueError when the inverter has no alarms in alarm_df,
# which surfaces a mistyped name immediately.
inverter_index = inverters.index(inverter_name)
alarm_df_i = alarm_df[alarm_df["Controller Name"] == inverter_name].copy()
alarm_df_i['Raised Time'] = pd.to_datetime(alarm_df_i['Raised Time'])
print("Number of alarms:", alarm_df_i.shape[0])

# --- Readings of the selected inverter --------------------------------------
# Same columns as the plot plus the raw 'Time' column.
columns = ['Time'] + plotcols
inv_df_i = inv_df[columns].copy()
inv_df_i['date'] = pd.to_datetime(inv_df_i["date"])
print(inv_df_i.shape)

# Rows per window; assumes 12 readings per hour (5-minute data) - TODO confirm.
num_rows = num_past_days * 24 * 12

# 'date' is already datetime and the frame does not change inside the loop,
# so sort it once here instead of converting and sorting for every alarm.
inv_sorted_i = inv_df_i.sort_values('date').reset_index(drop=True)

count = 0
for ii, row in alarm_df_i.iterrows():
    # Stop once plot_count charts exist (the former `count > plot_count`
    # check drew one chart too many).
    if count >= plot_count:
        break
    if row['Error Code'] not in alarm_plot:
        continue
    # Plot roughly half of the matching alarms, chosen at random.
    if random.random() > 0.5:
        df_ = inv_sorted_i[inv_sorted_i['date'] < row['Raised Time']]
        df_plot = df_.tail(num_rows)
        df_plot[plotcols].plot(x="date", title=f"{row['Error Code']}-{row['Raised Time']}")
        count += 1

In [None]:
# Per-controller count of alarms whose error code is listed in alarm_plot.
alarm_df[alarm_df['Error Code'].isin(alarm_plot)]['Controller Name'].value_counts()

In [None]:
# Cross-tabulate alarm counts: controller name (rows) by error code (columns).
pd.crosstab(alarm_df['Controller Name'], alarm_df['Error Code'])

In [None]:
# Number of filtered alarms per error code.
alarm_df['Error Code'].value_counts()

In [None]:
# Compute the look-back window [start_time, end_time] for the first alarm of
# the selected inverter. start_time, end_time and row stay defined for the
# cells that follow.
num_past_days = 10
num_rows = num_past_days * 24 * 12
for ii, row in alarm_df_i.iterrows():
    end_time = row['Raised Time']
    start_time = pd.to_datetime(end_time) - pd.Timedelta(num_past_days, 'D')
    print(start_time, end_time)
    # Only the first alarm is needed. `break` replaces the former sys.exit()
    # call: `sys` is never imported, so that line raised NameError, and
    # exiting the interpreter is the wrong way to leave a loop in a notebook.
    break

In [None]:
# Calendar date of the window start.
start_time.date()

In [None]:
# Check how the window start renders as a 'YYYY-MM-DD' string.
t = start_time.date()
t.strftime('%Y-%m-%d')

In [None]:
def get_all_days(start, end, date_format):
    """Return every calendar day from *start* to *end* as formatted strings.

    The time of day of both bounds is ignored: the result runs from the date
    of *start* to the date of *end*, both inclusive, one entry per day.

    Args:
        start: First instant of the period (anything ``pd.Timestamp`` accepts).
        end: Last instant of the period (anything ``pd.Timestamp`` accepts).
        date_format: ``strftime`` pattern applied to each day,
            e.g. ``'%Y-%m-%d'``.

    Returns:
        List of formatted day strings in chronological order. The date of
        *start* is always present, even when *end* precedes *start*.
    """
    # Work on midnight-aligned timestamps. Stepping the raw timestamps would
    # emit one day past `end` whenever start's time of day is earlier than
    # end's (e.g. 08:00 -> 09:00 two days later).
    first_day = pd.Timestamp(start).normalize()
    last_day = pd.Timestamp(end).normalize()
    one_day = pd.Timedelta(1, 'D')

    days = [first_day.strftime(date_format)]
    current = first_day
    while current < last_day:
        current += one_day
        days.append(current.strftime(date_format))
    return days

def get_smb_data(dates, data_dir):
    """Load the per-day SMB CSV files for *dates* into one DataFrame.

    Each date maps to the file ``min<YY><MM><DD>_smb.csv`` inside *data_dir*
    (two-digit year), e.g. ``'2021-03-05'`` -> ``min210305_smb.csv``.

    Args:
        dates: Iterable of day strings formatted as ``'YYYY-MM-DD'``.
        data_dir: Directory that contains the daily CSV files.

    Returns:
        The daily frames stacked row-wise in the order of *dates*; an empty
        DataFrame when *dates* is empty.

    Raises:
        ValueError: If a date string does not split into three '-' parts.
        FileNotFoundError: If the CSV file for one of the days is missing.
    """
    # Local import: the notebook only imports `os` in its final cell, so the
    # function must not depend on that cell having been run.
    import os

    frames = []
    for d in dates:
        year, month, day = d.split('-')
        file = 'min' + year[2:] + month + day + '_smb.csv'
        frames.append(pd.read_csv(os.path.join(data_dir, file)))

    # pd.concat refuses an empty list; an empty frame is the natural result.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [None]:
# Build the day strings covering start_time..end_time, then load the CSV
# files for those days from data/SMB/.
ds = get_all_days(start_time, end_time, '%Y-%m-%d')
df = get_smb_data(ds, data_dir='data/SMB/')

In [None]:
# For each error code of the selected inverter, collect its raise times in
# ascending order. Every group returns a one-element list wrapping the sorted
# times, and the trailing .apply(pd.Series) expands that list into columns
# (presumably a single column whose cells hold the sorted list - verify).
raised = alarm_df_i.groupby("Error Code").apply(lambda x: [sorted(list(x['Raised Time']))]).apply(pd.Series)

In [None]:
def get_features_single_alarm(inverter_df, fail_time, past_days, features):
    """Return the feature values recorded in the days leading up to an alarm.

    Selects the rows of *inverter_df* whose ``'date'`` falls in the half-open
    window ``[fail_time - past_days, fail_time)`` and returns the requested
    columns ordered by date.

    Args:
        inverter_df: DataFrame with a ``'date'`` column (datetimes or strings
            parseable by ``pd.to_datetime``) and the feature columns.
        fail_time: Instant the alarm was raised; rows at or after it are
            excluded.
        past_days: Length of the look-back window in days.
        features: List of column names to extract.

    Returns:
        2-D NumPy array of shape ``(rows_in_window, len(features))``, sorted
        by ascending date. *inverter_df* itself is not modified.

    Raises:
        KeyError: If ``'date'`` or one of *features* is not a column.
    """
    # Convert once and use the converted values for BOTH bounds and for the
    # sort; previously the lower bound compared the raw column, which failed
    # for string dates.
    dates = pd.to_datetime(inverter_df['date'])
    window_end = pd.to_datetime(fail_time)
    window_start = window_end - pd.Timedelta(past_days, 'D')

    in_window = (dates >= window_start) & (dates < window_end)
    # assign() returns a new frame, so the caller's data is left untouched.
    window = inverter_df.loc[in_window].assign(date=dates[in_window])
    return window.sort_values('date')[features].values

In [None]:
# Show the raise time currently held in `row` next to the instant one day before it.
row['Raised Time'], row['Raised Time'] - pd.Timedelta(1, 'D')

In [None]:
# Extract the 3-day feature matrix preceding the alarm currently held in `row`.
# The requested columns must exist in inv_df_i, which only carries the
# selected inverter's Active Power plus the WS-20 MW Module Temperature and
# POA Irradiance. The former hard-coded 'INV-01' power and WS-20 MW Ambient
# Temperature columns are not part of inv_df_i and raised KeyError.
# NOTE(review): Module Temperature stands in for the ambient column - confirm
# this is the intended second feature.
feature_cols = ['IN.GMRX.CHAR.'+inverter_name+'.Active Power (kW)',
                'IN.GMRX.CHAR.WS-20 MW.Module Temperature (°C)']
# dx is a 2-D NumPy array: one row per reading, one column per feature.
dx = get_features_single_alarm(inv_df_i, row['Raised Time'], 3, feature_cols)

In [None]:
# Plot the second feature column of dx against its row position.
plt.plot(dx[:,1])

In [None]:
import os

# Show the current working directory (the relative CSV paths above resolve against it).
os.getcwd()