# Weather Data Analysis
Analysis and visualization of weather observations and forecasts from EGLC (London City Airport)

**Data Sources:**
- Weather Underground (actual observations & forecasts)
- ThingSpeak API (Thames River water temperature)

In [None]:
# Import required libraries
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set plot style
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# Matplotlib release (3.6+) — confirm against the environment's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default colour cycle for the plots below.
sns.set_palette("husl")
# IPython magic (not Python syntax): render figures inline in the cell output.
%matplotlib inline

## 1. Load Data from SQLite Database

In [None]:
# Locate and connect to the SQLite database
import os

# Notebook kernels do not define __file__, so the data directory is resolved
# against the current working directory. (The previous
# os.path.abspath('__file__') expression used a string literal and therefore
# also reduced to the working directory.)
notebook_dir = os.getcwd()
db_path = os.path.join(notebook_dir, 'data', 'weather_data.db')

print(f"Looking for database at: {db_path}")
print(f"Database exists: {os.path.exists(db_path)}")

# sqlite3.connect() silently creates a brand-new empty database when the file
# is missing, which would only surface later as "no such table" errors.
# Fail early with an actionable message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(
        f"SQLite database not found at {db_path}. "
        "Start the notebook from the project root or adjust db_path."
    )

conn = sqlite3.connect(db_path)
print(f"✓ Connected to: {db_path}")

In [None]:
# Read every observation row (oldest first) into a DataFrame
query_obs = """
SELECT 
    id,
    observation_timestamp,
    location,
    temperature_f,
    dew_point_f,
    humidity_pct,
    wind_speed_mph,
    wind_direction,
    wind_gust_mph,
    pressure_in,
    precip_amount_in,
    condition,
    water_temp_0_35m_c,
    water_temp_2m_c,
    water_temp_7m_c,
    water_temp_entry_id,
    scrape_timestamp
FROM weather_observations
ORDER BY observation_timestamp
"""

df_obs = pd.read_sql_query(query_obs, conn)

# Both timestamp columns are parsed into pandas datetimes
for ts_col in ('observation_timestamp', 'scrape_timestamp'):
    df_obs[ts_col] = pd.to_datetime(df_obs[ts_col])

print(f"Loaded {len(df_obs)} weather observations")
df_obs.head()

In [None]:
# Read every forecast row (earliest forecast time first) into a DataFrame
query_forecast = """
SELECT 
    id,
    forecast_timestamp,
    location,
    temperature_f,
    feels_like_f,
    dew_point_f,
    humidity_pct,
    wind_speed_mph,
    wind_direction,
    pressure_in,
    precip_chance_pct,
    precip_amount_in,
    cloud_cover_pct,
    condition,
    scrape_timestamp
FROM weather_forecasts
ORDER BY forecast_timestamp
"""

df_forecast = pd.read_sql_query(query_forecast, conn)

# Timestamp parsing is skipped when the forecast table returned no rows
if not df_forecast.empty:
    for ts_col in ('forecast_timestamp', 'scrape_timestamp'):
        df_forecast[ts_col] = pd.to_datetime(df_forecast[ts_col])

print(f"Loaded {len(df_forecast)} weather forecasts")
print("No forecast data available yet") if df_forecast.empty else df_forecast.head()

In [None]:
# Close database connection
# The query results were already read into df_obs / df_forecast by
# pd.read_sql_query, so the cells below work from the DataFrames only.
conn.close()
print("Database connection closed")

## 2. Data Overview & Statistics

In [None]:
# Headline facts about the observation set: size, time span, locations, columns
banner = "=" * 70
obs_times = df_obs['observation_timestamp']

print(banner)
print("WEATHER OBSERVATIONS SUMMARY")
print(banner)
print(f"Total observations: {len(df_obs)}")
print(f"Date range: {obs_times.min()} to {obs_times.max()}")
print(f"Locations: {df_obs['location'].unique()}")
print(f"\nColumns: {len(df_obs.columns)}")
print(df_obs.columns.tolist())

In [None]:
# Descriptive statistics for the numeric measurement columns
summary_cols = [
    'temperature_f',
    'dew_point_f',
    'humidity_pct',
    'wind_speed_mph',
    'pressure_in',
    'water_temp_0_35m_c',
    'water_temp_2m_c',
    'water_temp_7m_c',
]
df_obs[summary_cols].describe()

In [None]:
# Per-column null counts, showing only columns that have at least one gap
print("Missing values in observations:")
missing = df_obs.isna().sum()
missing.loc[missing.gt(0)]

In [None]:
# Key observation columns, newest reading first
display_cols = [
    'observation_timestamp',
    'temperature_f',
    'humidity_pct',
    'wind_speed_mph',
    'pressure_in',
    'water_temp_0_35m_c',
    'condition',
]
rule = "=" * 100

print("\n" + rule)
print("ALL WEATHER OBSERVATIONS")
print(rule)
df_obs.loc[:, display_cols].sort_values(by='observation_timestamp', ascending=False)

## 3. Temperature Analysis

In [None]:
# Air temperature and dew point plotted against observation time
fig, ax = plt.subplots(figsize=(14, 6))

# (column, legend label, per-series line styling)
temperature_series = [
    ('temperature_f', 'Air Temperature', dict(marker='o', markersize=6)),
    ('dew_point_f', 'Dew Point', dict(marker='s', markersize=5, alpha=0.7)),
]
for column, series_label, line_style in temperature_series:
    ax.plot(df_obs['observation_timestamp'], df_obs[column],
            linewidth=2, label=series_label, **line_style)

ax.set_title('Air Temperature & Dew Point Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Time', fontsize=12)
ax.set_ylabel('Temperature (°F)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Temperature distribution: histogram with mean marker (left) and
# temperature / dew point box plots (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
temperature_values = df_obs['temperature_f'].dropna()
mean_temperature = df_obs['temperature_f'].mean()  # computed once, reused for line and label

ax1.hist(temperature_values, bins=15, edgecolor='black', alpha=0.7, color='coral')
ax1.axvline(mean_temperature, color='red', linestyle='--', linewidth=2,
            label=f'Mean: {mean_temperature:.1f}°F')
ax1.set_xlabel('Temperature (°F)', fontsize=12)
ax1.set_ylabel('Frequency', fontsize=12)
ax1.set_title('Temperature Distribution', fontsize=13, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Box plot
ax2.boxplot([temperature_values, df_obs['dew_point_f'].dropna()],
            patch_artist=True,
            boxprops=dict(facecolor='lightblue', alpha=0.7))
# Tick labels are set on the axes rather than through boxplot(labels=...):
# that keyword is deprecated in Matplotlib 3.9 (renamed to tick_labels),
# while set_xticklabels works on both older and newer releases.
ax2.set_xticklabels(['Temperature', 'Dew Point'])
ax2.set_ylabel('Temperature (°F)', fontsize=12)
ax2.set_title('Temperature & Dew Point Box Plot', fontsize=13, fontweight='bold')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Humidity & Pressure Analysis

In [None]:
# Humidity and pressure over time (two stacked panels sharing the time axis)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8), sharex=True)

# Humidity
ax1.plot(df_obs['observation_timestamp'], df_obs['humidity_pct'], 
         marker='o', linewidth=2, markersize=5, color='steelblue')
ax1.fill_between(df_obs['observation_timestamp'], df_obs['humidity_pct'], alpha=0.3)
ax1.set_ylabel('Humidity (%)', fontsize=12)
ax1.set_title('Humidity Over Time', fontsize=13, fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.set_ylim(0, 100)

# Pressure
pressure = df_obs['pressure_in']
ax2.plot(df_obs['observation_timestamp'], pressure, 
         marker='o', linewidth=2, markersize=5, color='darkgreen')
if pressure.notna().any():
    # Filling down to 0 (the fill_between default) stretches the y-axis to
    # include zero and flattens the pressure curve into a straight line.
    # Fill to a baseline just below the observed range and pin the limits.
    pressure_min, pressure_max = pressure.min(), pressure.max()
    padding = max((pressure_max - pressure_min) * 0.1, 0.01)
    ax2.fill_between(df_obs['observation_timestamp'], pressure,
                     pressure_min - padding, alpha=0.3)
    ax2.set_ylim(pressure_min - padding, pressure_max + padding)
ax2.set_xlabel('Time', fontsize=12)
ax2.set_ylabel('Pressure (in)', fontsize=12)
ax2.set_title('Atmospheric Pressure Over Time', fontsize=13, fontweight='bold')
ax2.grid(True, alpha=0.3)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 5. Wind Analysis

In [None]:
# Sustained wind speed over time, with gusts overlaid when any were recorded
fig, ax = plt.subplots(figsize=(14, 6))

obs_times = df_obs['observation_timestamp']
gusts = df_obs['wind_gust_mph']

ax.plot(obs_times, df_obs['wind_speed_mph'],
        color='purple', marker='o', markersize=6, linewidth=2, label='Wind Speed')

# The gust series is drawn only if at least one gust value is present
has_gusts = gusts.notna().any()
if has_gusts:
    ax.plot(obs_times, gusts,
            color='red', marker='s', markersize=5, linewidth=2, alpha=0.6,
            label='Wind Gust')

ax.set_title('Wind Speed Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Time', fontsize=12)
ax.set_ylabel('Wind Speed (mph)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of how often each wind direction was reported
wind_dir_counts = df_obs['wind_direction'].value_counts()

if wind_dir_counts.empty:
    print("No wind direction data available")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    wind_dir_counts.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
    ax.set_title('Wind Direction Distribution', fontsize=13, fontweight='bold')
    ax.set_xlabel('Wind Direction', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.grid(True, alpha=0.3, axis='y')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 6. Water Temperature Analysis

In [None]:
# Water temperature at the three recorded depths, plotted over time
fig, ax = plt.subplots(figsize=(14, 6))

# (column, marker, marker size, legend label, colour) for each depth
depth_series = [
    ('water_temp_0_35m_c', 'o', 6, '0.35m depth (surface)', 'lightskyblue'),
    ('water_temp_2m_c', 's', 5, '2m depth', 'royalblue'),
    ('water_temp_7m_c', '^', 5, '7m depth (deep)', 'darkblue'),
]
for column, marker_shape, marker_size, series_label, colour in depth_series:
    ax.plot(df_obs['observation_timestamp'], df_obs[column],
            marker=marker_shape, linewidth=2, markersize=marker_size,
            label=series_label, color=colour)

ax.set_title('Thames River Water Temperature by Depth', fontsize=14, fontweight='bold')
ax.set_xlabel('Time', fontsize=12)
ax.set_ylabel('Water Temperature (°C)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Descriptive statistics for the three water-temperature depth columns
depth_cols = ['water_temp_0_35m_c', 'water_temp_2m_c', 'water_temp_7m_c']
water_temps = df_obs.loc[:, depth_cols].describe()
print("\nWater Temperature Statistics (°C):")
water_temps

## 7. Multi-Variable Correlation Analysis

In [None]:
# Pairwise correlations between the numeric measurements, drawn as a heatmap
correlation_cols = [
    'temperature_f',
    'dew_point_f',
    'humidity_pct',
    'wind_speed_mph',
    'pressure_in',
    'water_temp_0_35m_c',
    'water_temp_2m_c',
    'water_temp_7m_c',
]

corr_matrix = df_obs[correlation_cols].corr()

heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Weather Variables Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot: air temperature vs surface water temperature, with a
# least-squares linear trend line
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(df_obs['temperature_f'], df_obs['water_temp_0_35m_c'], 
           alpha=0.6, s=100, edgecolor='black', label='Surface water (0.35m)')

# Add trendline
# Drop rows where EITHER value is missing so x and y stay paired row-by-row.
# Dropping NaNs from each column independently misaligns the pairs (or makes
# np.polyfit raise on a length mismatch) whenever the gaps differ.
paired = df_obs[['temperature_f', 'water_temp_0_35m_c']].dropna()

# A straight-line fit needs at least two distinct x values
if len(paired) >= 2 and paired['temperature_f'].nunique() >= 2:
    z = np.polyfit(paired['temperature_f'], paired['water_temp_0_35m_c'], 1)
    p = np.poly1d(z)
    # Sorted, NaN-free x values give a clean left-to-right line
    trend_x = np.sort(paired['temperature_f'].to_numpy())
    ax.plot(trend_x, p(trend_x),
            "r--", alpha=0.8, linewidth=2, label='Trend line')
else:
    print("Not enough paired air/water temperature readings to fit a trend line")

ax.set_xlabel('Air Temperature (°F)', fontsize=12)
ax.set_ylabel('Water Temperature (°C)', fontsize=12)
ax.set_title('Air Temperature vs Thames Water Temperature', fontsize=13, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 8. Weather Conditions Analysis

In [None]:
# Weather condition frequency: horizontal bar chart plus a text summary
condition_counts = df_obs['condition'].value_counts()

# Plotting an empty Series raises, so guard the same way the wind-direction
# cell does and report the absence of data instead.
if len(condition_counts) > 0:
    fig, ax = plt.subplots(figsize=(12, 6))
    condition_counts.plot(kind='barh', ax=ax, color='teal', edgecolor='black')
    ax.set_xlabel('Frequency', fontsize=12)
    ax.set_ylabel('Weather Condition', fontsize=12)
    ax.set_title('Weather Condition Distribution', fontsize=13, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()

    print("\nWeather Conditions Summary:")
    print(condition_counts)
else:
    print("No weather condition data available")

## 9. Time Series Overview Dashboard

In [None]:
# Four stacked time-series panels sharing one time axis:
# temperature, humidity, wind speed and surface water temperature
fig, axes = plt.subplots(4, 1, figsize=(16, 14), sharex=True)

# (column, line colour, legend label, y-axis label) for each panel, top to bottom
panels = [
    ('temperature_f', 'red', 'Temperature', 'Temp (°F)'),
    ('humidity_pct', 'blue', 'Humidity', 'Humidity (%)'),
    ('wind_speed_mph', 'purple', 'Wind Speed', 'Wind (mph)'),
    ('water_temp_0_35m_c', 'teal', 'Water Temp (0.35m)', 'Water Temp (°C)'),
]

for panel_ax, (column, colour, series_label, y_label) in zip(axes, panels):
    panel_ax.plot(df_obs['observation_timestamp'], df_obs[column],
                  marker='o', linewidth=2, markersize=4, color=colour, label=series_label)
    panel_ax.set_ylabel(y_label, fontsize=11)
    panel_ax.legend(loc='upper left')
    panel_ax.grid(True, alpha=0.3)

# The title sits on the top panel and the x label on the bottom one
axes[0].set_title('Weather Observation Dashboard - EGLC', fontsize=14, fontweight='bold')
axes[3].set_xlabel('Time', fontsize=12)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 10. Data Export (Optional)

In [None]:
# Export to CSV for further analysis
# df_obs.to_csv('weather_observations_export.csv', index=False)
# print("Data exported to weather_observations_export.csv")

# Export to Excel
# df_obs.to_excel('weather_observations_export.xlsx', index=False)
# print("Data exported to weather_observations_export.xlsx")

## 11. Summary Statistics

In [None]:
# Text recap: observation count, time span, and min / max / mean of the key measurements
divider = "=" * 80
obs_times = df_obs['observation_timestamp']
air_temp = df_obs['temperature_f']
humidity = df_obs['humidity_pct']
wind_speed = df_obs['wind_speed_mph']
surface_water = df_obs['water_temp_0_35m_c']

print(divider)
print("WEATHER DATA SUMMARY")
print(divider)
print(f"\nTotal Observations: {len(df_obs)}")
print(f"Date Range: {obs_times.min()} to {obs_times.max()}")
print(f"\nTemperature Range: {air_temp.min():.1f}°F - {air_temp.max():.1f}°F")
print(f"Average Temperature: {air_temp.mean():.1f}°F")
print(f"\nHumidity Range: {humidity.min():.0f}% - {humidity.max():.0f}%")
print(f"Average Humidity: {humidity.mean():.1f}%")
print(f"\nWind Speed Range: {wind_speed.min():.1f} - {wind_speed.max():.1f} mph")
print(f"Average Wind Speed: {wind_speed.mean():.1f} mph")
print(f"\nWater Temperature (0.35m): {surface_water.min():.2f}°C - {surface_water.max():.2f}°C")
print(f"Average Water Temperature: {surface_water.mean():.2f}°C")
print(divider)