# Wind Turbine SCADA Data Exploration

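This notebook explores the synthetic wind turbine SCADA dataset: it loads the data, summarizes basic statistics, and visualizes the failure labels, the remaining useful life (RUL) distribution, and the correlations between numeric features.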


In [None]:
import sys
from pathlib import Path
# Put the parent of the current working directory at the front of sys.path
# before the `src.*` imports below run.
# NOTE(review): presumably the notebook is launched from a subdirectory of the
# project root (e.g. notebooks/), so the parent is the root that contains
# `src` — confirm; started from anywhere else, the `src` imports will fail.
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data.data_loader import load_scada_data
from src.utils.config_utils import load_config

# Notebook-wide plotting defaults: 12x6 inch figures on seaborn's white grid
plt.rcParams["figure.figsize"] = 12, 6
sns.set_style("whitegrid")


## 1. Load Data


In [None]:
# Look up the dataset location in the project configuration
config = load_config()
data_path = config['data']['synthetic_data_file']

# Read the SCADA records, then report row count, time span and turbine count
df = load_scada_data(data_path)
print(
    f"Loaded {len(df)} records",
    f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}",
    f"Number of turbines: {df['turbine_id'].nunique()}",
    sep="\n",
)


## 2. Basic Statistics


In [None]:
# Structural overview of the frame (columns, dtypes, non-null counts).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() — that would append a stray "None" line.
print("\nData Info:")
df.info()

# Summary statistics for the columns describe() covers by default
print("\nBasic Statistics:")
print(df.describe())


## 3. Failure Analysis


In [None]:
# Mean of the `failure_within_horizon` column, expressed as a percentage
# NOTE(review): presumably a 0/1 per-record label — confirm against the loader
failure_rate = 100 * df['failure_within_horizon'].mean()
print(f"Overall failure rate: {failure_rate:.2f}%")

# Per-turbine sum of the same column
# NOTE(review): this counts flagged records, which may differ from the number
# of distinct failure events — confirm what the label represents
failures_by_turbine = (
    df.groupby('turbine_id')['failure_within_horizon']
    .sum()
)
print("\nFailures by turbine:")
print(failures_by_turbine)

# Bar chart of the per-turbine totals, drawn on an explicit axes
fig, ax = plt.subplots(figsize=(10, 6))
failures_by_turbine.plot(kind='bar', ax=ax)
ax.set_title('Number of Failures by Turbine')
ax.set_xlabel('Turbine ID')
ax.set_ylabel('Number of Failures')
fig.tight_layout()
plt.show()


## 4. Remaining Useful Life (RUL) Distribution


In [None]:
# Time-to-failure values with missing entries removed
rul_data = df['time_to_failure_hours'].dropna()
print("RUL statistics:")
print(rul_data.describe())

# Histogram of the remaining values, 50 bins, drawn on an explicit axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(rul_data, bins=50, edgecolor='black', alpha=0.7)
ax.set_xlabel('RUL (hours)')
ax.set_ylabel('Frequency')
ax.set_title('RUL Distribution')
fig.tight_layout()
plt.show()


## 5. Feature Correlations


In [None]:
# Numeric columns, minus the identifier, target and alarm-code columns that
# are listed in `exclude`
exclude = ['turbine_id', 'failure_within_horizon', 'time_to_failure_hours', 'alarm_code']
numeric_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns.tolist()
    if col not in exclude
]

# Pairwise correlations between the remaining columns (DataFrame.corr defaults)
corr_matrix = df[numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()
