In [None]:
from datetime import timedelta
from pathlib import Path
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Folder containing the wind-turbine CSV files read by the cells below.
# The path is relative, so it resolves against the kernel's working directory.
DATA_DIR = Path('../../Data/Wind Turbine')

## First look at data

### SCADA data

In [None]:
# Read the SCADA export, parse its timestamp column and append a 0/1 flag
# marking the rows whose 'Error' value is non-zero.
scada_df = pd.read_csv(DATA_DIR / 'scada_data.csv').assign(
    DateTime=lambda frame: pd.to_datetime(frame['DateTime'], format='%m/%d/%Y %H:%M'),
    HasError=lambda frame: frame['Error'].ne(0).astype(int),
)

In [None]:
# Snap every reading to its nearest 10-minute mark, then average the rows
# that land on the same mark.
scada_df['DateTimeR'] = scada_df['DateTime'].dt.round('10min')
scada_df_gr = scada_df.groupby('DateTimeR', as_index=False).mean()

# Complete 10-minute grid between the first and last rounded timestamps.
date_range = pd.Series(
    pd.date_range(
        scada_df_gr['DateTimeR'].min(),
        scada_df_gr['DateTimeR'].max(),
        freq='10min',
    ),
    name='DateTimeR',
)

# The outer join adds a row for each grid point that had no readings; those
# rows carry NaN in the data columns. HasMissing flags rows whose 'Time' is NaN.
scada_df_gr = pd.merge(scada_df_gr, date_range, how='outer', on='DateTimeR').sort_values('DateTimeR')
scada_df_gr['HasMissing'] = scada_df_gr['Time'].isna().astype(int)
scada_df_gr.head(60)


In [None]:
# Plot the first 5000 SCADA rows: wind speed and the 'Error' column share the
# top panel, rotation and power get one panel each.
data_to_plot = scada_df.head(5000)
fig, axes = plt.subplots(nrows=3, figsize=(20, 14))
top_ax, rotation_ax, power_ax = axes

sns.lineplot(data=data_to_plot, x='DateTime', y='WEC: ava. windspeed', ax=top_ax)
sns.lineplot(data=data_to_plot, x='DateTime', y='Error', ax=top_ax)
sns.lineplot(data=data_to_plot, x='DateTime', y='WEC: ava. Rotation', ax=rotation_ax)
sns.lineplot(data=data_to_plot, x='DateTime', y='WEC: ava. Power', ax=power_ax)

In [None]:
# List the column names of the SCADA frame.
scada_df.columns

### Faults

In [None]:
# Load the fault log and parse its timestamps.
fault_df = pd.read_csv(DATA_DIR / 'fault_data.csv')
fault_df['DateTime'] = pd.to_datetime(fault_df['DateTime'], format='%Y-%m-%d %H:%M:%S')
# Gap between each fault record and the row before it (NaT on the first row).
fault_df['TimeDiff'] = fault_df['DateTime'].diff()

# Snap each fault to its nearest 10-minute mark.
fault_df['DateTimeR'] = fault_df['DateTime'].dt.round(freq='10min')

# One row per 10-minute mark, holding the distinct fault codes seen at that
# mark as a comma-separated string (in order of first appearance).
grouped_fault_df = (
    fault_df.groupby('DateTimeR')['Fault']
    .agg(lambda codes: ','.join(codes.unique()))
    .rename('Faults')
    .rename_axis('DateTime')
    .reset_index()
)

# One 0/1 indicator column per fault code. get_dummies splits on the separator
# and matches whole codes, so a code is neither interpreted as a regular
# expression nor matched as a substring of another code. The reindex keeps the
# columns in order of first appearance in fault_df.
fault_indicators = (
    grouped_fault_df['Faults']
    .str.get_dummies(sep=',')
    .reindex(columns=fault_df['Fault'].unique(), fill_value=0)
    .add_prefix('Fault_')
)
grouped_fault_df = grouped_fault_df.join(fault_indicators)

# 10-minute grid from the first to the last rounded SCADA timestamp
# (scada_df_gr must already have been built by an earlier cell).
date_range = pd.Series(
    pd.date_range(
        start=scada_df_gr['DateTimeR'].min(),
        end=scada_df_gr['DateTimeR'].max(),
        freq='10min',
    ),
    name='DateTime',
)

# The outer join adds a row for every grid point without a fault; those rows
# have NaN in 'Faults' and in the indicator columns, and are flagged by HasMissing.
grouped_fault_df = grouped_fault_df.merge(date_range, how='outer', on='DateTime').sort_values('DateTime')
grouped_fault_df['HasMissing'] = grouped_fault_df['Faults'].isna().astype(int)
grouped_fault_df.head(60)


In [None]:
# Gaps of less than ten minutes between a fault record and the preceding row.
fault_df.loc[fault_df['TimeDiff'].lt(timedelta(minutes=10)), 'TimeDiff']

In [None]:
# Number of fault records for each distinct value of the 'Fault' column.
fault_df['Fault'].value_counts()

### Status data

In [None]:
# Read the status log, parse its 'Time' column (day-first format) and expose it
# under the name 'DateTime'.
status_df = (
    pd.read_csv(DATA_DIR / 'status_data.csv')
    .assign(Time=lambda frame: pd.to_datetime(frame['Time'], format='%d/%m/%Y %H:%M:%S'))
    .rename(columns={'Time': 'DateTime'})
)
status_df

In [None]:
# Number of status rows for each distinct value of the 'Main Status' column.
status_df['Main Status'].value_counts()

### Merged data

In [None]:
# Combine fault and status records on their exact timestamp (rows from either
# side are kept), order them chronologically and add a numeric code per fault
# label; rows without a listed label get NaN in FaultCode.
faults_status_df = (
    pd.merge(fault_df, status_df, how='outer', on='DateTime')
    .sort_values('DateTime')
    .assign(
        FaultCode=lambda frame: frame['Fault'].map({
            'AF': 1,
            'EF': 2,
            'GF': 3,
            'FF': 4,
            'MF': 5,
        })
    )
)
# Preview the first 60 rows.
faults_status_df.head(60)

In [None]:
# Three stacked scatter panels over time: fault code (coloured by code),
# then the 'T' and 'Value0' columns.
fig, axes = plt.subplots(nrows=3, figsize=(20, 14))
fault_ax, t_ax, value0_ax = axes

sns.scatterplot(data=faults_status_df, x='DateTime', y='FaultCode', hue='FaultCode', ax=fault_ax)
sns.scatterplot(data=faults_status_df, x='DateTime', y='T', ax=t_ax)
sns.scatterplot(data=faults_status_df, x='DateTime', y='Value0', ax=value0_ax)