# Exploratory Data Analysis - Microclimate Prediction

This notebook presents an exploratory data analysis (EDA) of the greenhouse microclimate dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling. Order matters: the palette is applied after the matplotlib
# style so that it is the setting left in effect.
plt.style.use('default')
sns.set_palette('husl')

# Load the training data and convert the 'timestamp' column to datetimes.
data_path = Path('../data/train.csv')
df = pd.read_csv(data_path).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

# Quick sanity check: table dimensions and the time span covered.
print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nBasic Statistics:")
# describe() returns a summary DataFrame, so this one does need print().
print(df.describe())

In [None]:
# Time series plots: one stacked panel per variable, all plotted against
# the 'timestamp' column.
# Each entry is (column, line colour, panel title, y-axis label).
time_series_panels = [
    ('temperature', 'red', 'Temperature Over Time', 'Temperature (°C)'),
    ('humidity', 'blue', 'Humidity Over Time', 'Humidity (%)'),
    ('light_intensity', 'orange', 'Light Intensity Over Time', 'Light Intensity (Lux)'),
]

fig, axes = plt.subplots(3, 1, figsize=(15, 12))

for ax, (column, colour, title, ylabel) in zip(axes, time_series_panels):
    ax.plot(df['timestamp'], df[column], color=colour, alpha=0.7)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)

# Only the bottom panel carries an x-axis label.
axes[2].set_xlabel('Time')

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# Columns included in the correlation matrix (this list is reused by the
# distribution plots further down).
numeric_cols = [
    'temperature',
    'humidity',
    'light_intensity',
    'co2_level',
    'soil_moisture',
    'air_pressure',
    'wind_speed',
]
correlation_matrix = df[numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Environmental Variables')
plt.tight_layout()
plt.show()

In [None]:
# Distribution plots: one histogram per column listed in numeric_cols.
# The grid is sized from the number of columns (4 panels per row) rather than
# assuming exactly seven variables, so editing numeric_cols can neither index
# past the grid nor leave stray empty panels. With seven columns this yields
# the same 2x4 grid and 20x10 figure as before.
grid_cols = 4
grid_rows = max(1, -(-len(numeric_cols) // grid_cols))  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(20, 5 * grid_rows))
axes = axes.ravel()

for i, col in enumerate(numeric_cols):
    axes[i].hist(df[col], bins=20, alpha=0.7, edgecolor='black')
    axes[i].set_title(f'Distribution of {col}')
    axes[i].set_xlabel(col)
    axes[i].set_ylabel('Frequency')
    axes[i].grid(True, alpha=0.3)

# Remove every subplot that did not receive a histogram
for unused_ax in axes[len(numeric_cols):]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.show()

In [None]:
# Daily patterns
# Tag each row with the hour component of its timestamp, then average each
# variable over all rows that share the same hour.
df['hour'] = df['timestamp'].dt.hour

readings_by_hour = df.groupby('hour')
hourly_temp = readings_by_hour['temperature'].mean()
hourly_humidity = readings_by_hour['humidity'].mean()
hourly_light = readings_by_hour['light_intensity'].mean()

# Each entry is (hourly series, line colour, panel title, y-axis label).
hourly_panels = [
    (hourly_temp, 'red', 'Average Temperature by Hour', 'Temperature (°C)'),
    (hourly_humidity, 'blue', 'Average Humidity by Hour', 'Humidity (%)'),
    (hourly_light, 'orange', 'Average Light Intensity by Hour', 'Light Intensity (Lux)'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (hourly_series, colour, title, ylabel) in zip(axes, hourly_panels):
    ax.plot(hourly_series.index, hourly_series.values, marker='o', color=colour)
    ax.set_title(title)
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Key Insights

1. **Temperature Patterns**: Temperature shows a clear daily cycle, peaking around midday (see the hourly-average plot).
2. **Humidity Relationship**: Humidity is inversely correlated with temperature, as expected (see the correlation matrix).
3. **Light Intensity**: Light intensity follows a strong daily pattern that tracks the natural sunlight cycle.
4. **Correlations**: Strong relationships between the environmental variables suggest that they can be predicted from one another.

These patterns indicate that the data is suitable for time-series prediction models.