# Hill of Towie Wind Turbine Power Prediction - Exploratory Data Analysis

This notebook contains exploratory data analysis for the wind turbine power prediction competition.

## Goals:
1. Load and examine the training and test datasets
2. Understand the data structure and features
3. Analyze distributions and patterns
4. Identify correlations with target variable
5. Explore temporal patterns if applicable
6. Detect outliers and missing values
7. Plan feature engineering strategies

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Setup plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

# Add src to path for imports
# NOTE(review): '../src' is a relative path, so it resolves against the
# kernel's current working directory — presumably the notebook's own folder;
# confirm if the project imports below fail.
import sys
sys.path.append('../src')

# Project-local modules, importable only after the sys.path change above
from config import TRAIN_FILE, TEST_FILE, SAMPLE_SUBMISSION_FILE
from utils import load_data, setup_logging

# Setup logging
logger = setup_logging(log_level="INFO")

# Simple confirmation that every import in this cell succeeded
print("Libraries imported successfully!")

## 1. Data Loading

In [None]:
# Read the three competition files through the project loader
print("Loading datasets...")
train_df = load_data(TRAIN_FILE)
test_df = load_data(TEST_FILE)
sample_submission = load_data(SAMPLE_SUBMISSION_FILE)

# Report the shape of each loaded frame, one line per dataset
print(
    f"Training data shape: {train_df.shape}",
    f"Test data shape: {test_df.shape}",
    f"Sample submission shape: {sample_submission.shape}",
    sep="\n",
)

## 2. Data Overview

In [None]:
# Basic info about training data
print("=== TRAINING DATA INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
train_df.info()
print("\n=== FIRST FEW ROWS ===")
display(train_df.head())

In [None]:
# Basic info about test data
print("=== TEST DATA INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
test_df.info()
print("\n=== FIRST FEW ROWS ===")
display(test_df.head())

In [None]:
# Preview the sample submission and list its columns
print("=== SAMPLE SUBMISSION ===")
submission_preview = sample_submission.head()
display(submission_preview)
submission_columns = sample_submission.columns.tolist()
print(f"Submission columns: {submission_columns}")

## 3. Statistical Summary

In [None]:
# Descriptive statistics for both datasets, training first and then test
for banner, frame in (
    ("=== TRAINING DATA STATISTICS ===", train_df),
    ("\n=== TEST DATA STATISTICS ===", test_df),
):
    print(banner)
    display(frame.describe())

## 4. Missing Values Analysis

In [None]:
# Per-column null counts and percentages, training set first
print("=== MISSING VALUES IN TRAINING DATA ===")
train_missing = train_df.isna().sum()
train_missing_pct = train_missing.mul(100).div(len(train_df))
missing_df = pd.DataFrame(
    {
        'Column': train_missing.index,
        'Missing Count': train_missing.values,
        'Missing %': train_missing_pct.values,
    }
)
missing_df = missing_df.sort_values('Missing Count', ascending=False)
# Only show the columns that actually contain nulls
train_has_gaps = missing_df['Missing Count'] > 0
display(missing_df[train_has_gaps])

# Same report for the test set
print("\n=== MISSING VALUES IN TEST DATA ===")
test_missing = test_df.isna().sum()
test_missing_pct = test_missing.mul(100).div(len(test_df))
missing_test_df = pd.DataFrame(
    {
        'Column': test_missing.index,
        'Missing Count': test_missing.values,
        'Missing %': test_missing_pct.values,
    }
)
missing_test_df = missing_test_df.sort_values('Missing Count', ascending=False)
test_has_gaps = missing_test_df['Missing Count'] > 0
display(missing_test_df[test_has_gaps])

## 5. Target Variable Analysis

In [None]:
# Identify target column.
# Heuristic: the first column whose name contains 'power'; otherwise the last
# numerical column; otherwise no target (target_col is None).
potential_targets = [col for col in train_df.columns if 'power' in col.lower()]
numerical_cols = train_df.select_dtypes(include=[np.number]).columns
if potential_targets:
    target_col = potential_targets[0]
elif len(numerical_cols) > 0:
    target_col = numerical_cols[-1]
else:
    # Nothing to fall back on: leave target_col as None instead of raising
    # IndexError on numerical_cols[-1]. The later cells guard on
    # `target_col in train_df.columns`, which is False for None.
    target_col = None

print(f"Assumed target column: {target_col}")

if target_col is not None:
    target = train_df[target_col]

    print("Target statistics:")
    print(f"  Mean: {target.mean():.2f}")
    print(f"  Std: {target.std():.2f}")
    print(f"  Min: {target.min():.2f}")
    print(f"  Max: {target.max():.2f}")
    print(f"  Median: {target.median():.2f}")
else:
    print("Could not identify target column. Please update manually.")

In [None]:
# Target distribution visualization: histogram, box plot, Q-Q plot and the
# target plotted against time (or against row position when no usable time
# column is found).
if target_col in train_df.columns:
    from scipy import stats

    target = train_df[target_col]
    # NaNs would propagate into the box statistics and the Q-Q fit, so the
    # distribution plots use only the observed (non-null) values.
    target_observed = target.dropna()

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Histogram
    axes[0, 0].hist(target_observed, bins=50, alpha=0.7, color='blue')
    axes[0, 0].set_title('Target Distribution')
    axes[0, 0].set_xlabel(target_col)
    axes[0, 0].set_ylabel('Frequency')

    # Box plot
    axes[0, 1].boxplot(target_observed)
    axes[0, 1].set_title('Target Box Plot')
    axes[0, 1].set_ylabel(target_col)

    # QQ plot
    stats.probplot(target_observed, dist="norm", plot=axes[1, 0])
    axes[1, 0].set_title('Target Q-Q Plot')

    # Time series plot (if we can identify a time column)
    time_cols = [col for col in train_df.columns if any(word in col.lower() for word in ['time', 'date', 'timestamp'])]
    time_data = None
    if time_cols:
        time_col = time_cols[0]
        if pd.api.types.is_datetime64_any_dtype(train_df[time_col]) or train_df[time_col].dtype == 'object':
            try:
                time_data = pd.to_datetime(train_df[time_col])
            except (ValueError, TypeError, OverflowError):
                # Column name looked like a timestamp but the values do not
                # parse; fall back to plotting against row position.
                time_data = None

    time_ax = axes[1, 1]
    if time_data is not None:
        time_ax.plot(time_data, target, alpha=0.7)
        time_ax.set_title(f'Target vs {time_col}')
        time_ax.set_xlabel(time_col)
        time_ax.set_ylabel(target_col)
    else:
        # Single fallback for: no time-like column, unsupported dtype, or a
        # failed datetime conversion.
        time_ax.scatter(range(len(target)), target, alpha=0.5)
        time_ax.set_title('Target vs Index')

    plt.tight_layout()
    plt.show()

## 6. Feature Analysis

In [None]:
# Numerical predictors: every numeric column except the target
numeric_frame = train_df.select_dtypes(include=[np.number])
numerical_features = numeric_frame.columns.tolist()
if target_col in numerical_features:
    numerical_features.remove(target_col)

# Only the first ten names are printed to keep the output short
numeric_preview = numerical_features[:10]
print(f"Numerical features ({len(numerical_features)}): {numeric_preview}...")

# Categorical features: object- and category-typed columns
categorical_frame = train_df.select_dtypes(include=['object', 'category'])
categorical_features = categorical_frame.columns.tolist()
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

In [None]:
# Rank numerical features by absolute correlation with the target
if target_col in train_df.columns and len(numerical_features) > 0:
    corr_columns = numerical_features + [target_col]
    target_corr = train_df[corr_columns].corr()[target_col]
    ranked_corr = target_corr.sort_values(key=abs, ascending=False)
    # The target's correlation with itself is dropped from the ranking
    correlations = ranked_corr.drop(target_col)

    print("Top 10 features correlated with target:")
    display(correlations.head(10).to_frame('Correlation'))

    # Horizontal bar chart of the 15 strongest correlations;
    # red marks negative values, blue everything else
    top_corr = correlations.head(15)
    bar_positions = range(len(top_corr))
    colors = np.where(top_corr.values < 0, 'red', 'blue').tolist()

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_corr.values, color=colors, alpha=0.7)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_corr.index)
    ax.set_xlabel('Correlation with Target')
    ax.set_title('Feature Correlation with Target')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

## 7. Feature Distributions

In [None]:
# Histograms for the eight features most correlated with the target
if target_col in train_df.columns and len(numerical_features) > 0:
    top_features = correlations.head(8).index.tolist()

    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    axes = axes.ravel()

    # Pair each feature with its own subplot; surplus subplots stay empty
    for ax, feature in zip(axes, top_features):
        if feature not in train_df.columns:
            continue
        ax.hist(train_df[feature], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

## 8. Correlation Matrix

In [None]:
# Heatmap of pairwise correlations among the ten top features and the target
if target_col in train_df.columns and len(numerical_features) > 0:
    top_features = correlations.index[:10].tolist()
    top_features.append(target_col)
    correlation_matrix = train_df[top_features].corr()

    # Mask the upper triangle (diagonal included) so each pair shows once
    mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        correlation_matrix,
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": .8},
        ax=ax,
    )
    ax.set_title('Correlation Matrix of Top Features')
    plt.tight_layout()
    plt.show()

## 9. Outlier Detection

In [None]:
# Detect outliers using IQR method for top features.
# A value is flagged when it lies more than 1.5 * IQR below the first
# quartile or above the third quartile.
#
# `correlations` is only created by the correlation cell when the target
# column exists and there are numerical features, so the same guard is used
# here to avoid a NameError.
if target_col in train_df.columns and len(numerical_features) > 0:
    top_features_for_outliers = correlations.head(5).index.tolist()
    total_rows = len(train_df)

    outlier_summary = []

    for feature in top_features_for_outliers:
        if feature not in train_df.columns:
            continue

        values = train_df[feature]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Count flagged rows from the boolean mask directly instead of
        # materialising a filtered copy of the whole DataFrame.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        outlier_count = int(is_outlier.sum())
        # Guard against an empty training frame (division by zero)
        outlier_pct = 100 * outlier_count / total_rows if total_rows else 0.0

        outlier_summary.append({
            'Feature': feature,
            'Outlier Count': outlier_count,
            'Outlier %': outlier_pct,
            'Lower Bound': lower_bound,
            'Upper Bound': upper_bound,
        })

    # Explicit columns keep the table header even when the summary is empty
    outlier_df = pd.DataFrame(
        outlier_summary,
        columns=['Feature', 'Outlier Count', 'Outlier %', 'Lower Bound', 'Upper Bound'],
    )
    print("Outlier Analysis:")
    display(outlier_df)

## 10. Temporal Analysis (if applicable)

In [None]:
# Time series analysis if we have datetime columns.
# Uses the first column whose name contains 'time', 'date' or 'timestamp' and
# plots the mean target by hour, weekday and month, plus the target over time.
time_cols = [col for col in train_df.columns if any(word in col.lower() for word in ['time', 'date', 'timestamp'])]

if time_cols and target_col in train_df.columns:
    time_col = time_cols[0]
    print(f"Analyzing temporal patterns using column: {time_col}")

    # Best-effort cell: any failure (unparseable timestamps, non-numeric
    # target, ...) is reported instead of aborting the notebook run.
    try:
        # Convert to datetime on a copy so train_df itself is not modified
        train_df_temp = train_df.copy()
        train_df_temp[time_col] = pd.to_datetime(train_df_temp[time_col])

        # Extract time features
        train_df_temp['hour'] = train_df_temp[time_col].dt.hour
        train_df_temp['day_of_week'] = train_df_temp[time_col].dt.dayofweek
        train_df_temp['month'] = train_df_temp[time_col].dt.month

        # Plot temporal patterns
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Hourly pattern
        hourly_avg = train_df_temp.groupby('hour')[target_col].mean()
        axes[0, 0].plot(hourly_avg.index, hourly_avg.values, marker='o')
        axes[0, 0].set_title('Average Target by Hour')
        axes[0, 0].set_xlabel('Hour')
        axes[0, 0].set_ylabel(f'Average {target_col}')

        # Daily pattern. Reindex to all seven weekdays (0=Mon ... 6=Sun) so
        # the bar heights always line up with the seven x positions, even
        # when some weekday never occurs in the data (its bar is then NaN).
        daily_avg = (
            train_df_temp.groupby('day_of_week')[target_col]
            .mean()
            .reindex(range(7))
        )
        day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        axes[0, 1].bar(range(7), daily_avg.values)
        axes[0, 1].set_title('Average Target by Day of Week')
        axes[0, 1].set_xlabel('Day of Week')
        axes[0, 1].set_ylabel(f'Average {target_col}')
        axes[0, 1].set_xticks(range(7))
        axes[0, 1].set_xticklabels(day_names)

        # Monthly pattern
        monthly_avg = train_df_temp.groupby('month')[target_col].mean()
        axes[1, 0].plot(monthly_avg.index, monthly_avg.values, marker='o')
        axes[1, 0].set_title('Average Target by Month')
        axes[1, 0].set_xlabel('Month')
        axes[1, 0].set_ylabel(f'Average {target_col}')

        # Time series plot (sampled if too many points). The sample is
        # seeded so the figure is reproducible across notebook runs.
        if len(train_df_temp) > 10000:
            sample_df = train_df_temp.sample(10000, random_state=42).sort_values(time_col)
        else:
            sample_df = train_df_temp.sort_values(time_col)

        axes[1, 1].plot(sample_df[time_col], sample_df[target_col], alpha=0.6)
        axes[1, 1].set_title('Target Over Time (Sample)')
        axes[1, 1].set_xlabel(time_col)
        axes[1, 1].set_ylabel(target_col)

        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Could not perform temporal analysis: {e}")
else:
    print("No datetime columns found for temporal analysis")

## 11. Train vs Test Distribution Comparison

In [None]:
# Compare feature distributions between train and test.
# Shared columns are taken in training-column order: a set intersection has
# no stable order for string names, so slicing it would pick a different
# handful of features on each kernel restart.
test_columns = set(test_df.columns)
common_features = [col for col in train_df.columns if col in test_columns]

# Same numeric-dtype selection as the rest of the notebook (covers every
# integer/float width, not only int64/float64); required on both sides.
numeric_train = set(train_df.select_dtypes(include=[np.number]).columns)
numeric_test = set(test_df.select_dtypes(include=[np.number]).columns)
common_numerical = [
    col for col in common_features
    if col in numeric_train and col in numeric_test
]

if len(common_numerical) > 0:
    # Plot distributions for the first six shared numerical features
    top_common_features = common_numerical[:6]

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()

    for ax, feature in zip(axes, top_common_features):
        train_values = train_df[feature].dropna()
        test_values = test_df[feature].dropna()

        # One set of bin edges for both samples, and densities rather than
        # raw counts, so sets of different sizes can be compared directly.
        combined = pd.concat([train_values, test_values])
        bins = np.histogram_bin_edges(combined, bins=30)

        ax.hist(train_values, bins=bins, density=True, alpha=0.7, label='Train', color='blue')
        ax.hist(test_values, bins=bins, density=True, alpha=0.7, label='Test', color='red')
        ax.set_title(f'Distribution Comparison: {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Density')
        ax.legend()

    # Hide subplots that have no feature assigned
    for unused_ax in axes[len(top_common_features):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 12. Key Insights and Next Steps

In [None]:
# Recap of what the earlier cells established
print("=== KEY INSIGHTS FROM EDA ===")
print(f"1. Dataset shape - Train: {train_df.shape}, Test: {test_df.shape}")

# Fall back to a placeholder when the target cell was never run
target_label = target_col if 'target_col' in locals() else 'Not identified'
print(f"2. Target column identified: {target_label}")
print(f"3. Number of numerical features: {len(numerical_features)}")
print(f"4. Number of categorical features: {len(categorical_features)}")

any_missing = train_df.isnull().sum().sum() > 0
print(f"5. Missing values present: {any_missing}")

temporal_available = len(time_cols) > 0 if 'time_cols' in locals() else False
print(f"6. Temporal features available: {temporal_available}")

# Only reported when the correlation cell produced a ranking
if 'correlations' in locals():
    best_feature = correlations.index[0]
    best_value = correlations.iloc[0]
    print(f"7. Top correlated feature: {best_feature} (correlation: {best_value:.3f})")

# Static checklist of follow-up work, printed one line at a time
next_steps = (
    "\n=== RECOMMENDED NEXT STEPS ===",
    "1. Feature Engineering:",
    "   - Create lag features if temporal data available",
    "   - Generate polynomial/interaction features for top correlated features",
    "   - Create statistical features (rolling means, std, etc.)",
    "   - Handle categorical variables with encoding",
    "\n2. Data Preprocessing:",
    "   - Handle missing values appropriately",
    "   - Consider outlier treatment",
    "   - Scale/normalize features if needed",
    "\n3. Model Development:",
    "   - Start with simple baseline models",
    "   - Try ensemble methods (XGBoost, LightGBM, CatBoost)",
    "   - Implement cross-validation strategy",
    "   - Consider neural networks for complex patterns",
    "\n4. Validation Strategy:",
    "   - Time-based splits if temporal data",
    "   - Stratified splits for stable validation",
    "   - Monitor overfitting carefully",
)
for step_line in next_steps:
    print(step_line)

## 13. Save Processed Data (Optional)

In [None]:
# Optionally save cleaned/processed versions of the data
# This cell can be uncommented and modified based on findings

# from utils import save_data
# from config import PROCESSED_DATA_DIR

# # Save processed training data
# save_data(train_df, PROCESSED_DATA_DIR / "train_processed.parquet")
# save_data(test_df, PROCESSED_DATA_DIR / "test_processed.parquet")

# print("Processed data saved successfully!")