# Weather Data Analysis (Google Colab Version)
Analysis and visualization of weather observations and forecasts from EGLC (London City Airport)

**Data Sources:**
- Weather Underground (actual observations & forecasts)
- ThingSpeak API (Thames River water temperature)

## Step 1: Mount Google Drive (for Colab)

In [None]:
# Mount Google Drive at /content/drive so files stored there (the database
# path configured in Step 2 lives under this mount point) are readable from
# this Colab runtime.
from google.colab import drive
# NOTE(review): presumably Colab asks for authorization the first time this runs — confirm.
drive.mount('/content/drive')

## Step 2: Set the path to your database
**Update the path below to match where your database is stored in Google Drive**

In [None]:
# IMPORTANT: Update this path to match your Google Drive folder structure
# Example: '/content/drive/MyDrive/Weather Predictor/data/weather_data.db'

import os

db_path = '/content/drive/MyDrive/Weather Predictor/data/weather_data.db'

# Verify that the path points at a regular file. A plain existence check would
# also accept a directory (for example the enclosing "data" folder), report it
# as "found", and only fail later when SQLite tries to open it.
if os.path.isfile(db_path):
    print(f"✓ Database found at: {db_path}")
    print(f"Database size: {os.path.getsize(db_path) / 1024:.2f} KB")
else:
    # Print enough context for the user to correct the path themselves.
    print(f"✗ Database not found at: {db_path}")
    print(f"\nCurrent directory: {os.getcwd()}")
    print("\nPlease update the db_path variable above to point to your database file.")
    print("\nYou can browse your Google Drive files to find the correct path.")

## Step 3: Import Libraries

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# Suppress all warnings for the rest of the session to keep cell output tidy.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn white-grid theme, figures rendered at 100 DPI.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.dpi': 100})

print("✓ All libraries imported successfully")

## Step 4: Connect to Database

In [None]:
# Connect to SQLite database
# sqlite3.connect() creates a brand-new, empty database when the target file
# does not exist, so a wrong db_path would only surface later as confusing
# "no such table" errors. Fail early with an actionable message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(
        f"Database not found at: {db_path}. "
        "Update db_path in Step 2 and re-run that cell before connecting."
    )

conn = sqlite3.connect(db_path)
print("✓ Connected to database")

## Step 5: Load Data from Database

In [None]:
# Load weather observations
# Reads the listed columns of every row in the weather_observations table,
# ordered by observation time, into the df_obs DataFrame used by the rest of
# the notebook. The water_temp_* columns sit alongside the weather readings in
# the same table.
query_obs = """
SELECT 
    id,
    observation_timestamp,
    location,
    temperature_f,
    dew_point_f,
    humidity_pct,
    wind_speed_mph,
    wind_direction,
    wind_gust_mph,
    pressure_in,
    precip_amount_in,
    condition,
    water_temp_0_35m_c,
    water_temp_2m_c,
    water_temp_7m_c,
    water_temp_entry_id,
    scrape_timestamp
FROM weather_observations
ORDER BY observation_timestamp
"""

# parse_dates converts both timestamp columns to datetimes at load time so the
# time-series plots further down can use them directly on the x-axis.
df_obs = pd.read_sql_query(query_obs, conn, parse_dates=['observation_timestamp', 'scrape_timestamp'])

print(f"✓ Loaded {len(df_obs)} weather observations")
# Last expression of the cell: the notebook renders a preview of the first rows.
df_obs.head()

In [None]:
# Load weather forecasts
# Every stored forecast row, earliest forecast time first.
query_forecast = """
SELECT 
    id,
    forecast_timestamp,
    location,
    temperature_f,
    feels_like_f,
    dew_point_f,
    humidity_pct,
    wind_speed_mph,
    wind_direction,
    pressure_in,
    precip_chance_pct,
    precip_amount_in,
    cloud_cover_pct,
    condition,
    scrape_timestamp
FROM weather_forecasts
ORDER BY forecast_timestamp
"""

df_forecast = pd.read_sql_query(
    sql=query_forecast,
    con=conn,
    parse_dates=['forecast_timestamp', 'scrape_timestamp'],
)

print(f"✓ Loaded {len(df_forecast)} weather forecasts")
# The forecast table may still be empty; only preview it when rows came back.
if len(df_forecast) == 0:
    print("No forecast data available yet")
else:
    display(df_forecast.head())

## Data Overview & Statistics

In [None]:
# Headline facts about the observations table: row count, covered time span,
# distinct locations, and the column inventory. Emitted as one block of text.
print("\n".join([
    "=" * 70,
    "WEATHER OBSERVATIONS SUMMARY",
    "=" * 70,
    f"Total observations: {len(df_obs)}",
    f"Date range: {df_obs['observation_timestamp'].min()} to {df_obs['observation_timestamp'].max()}",
    f"Locations: {df_obs['location'].unique()}",
    f"\nColumns: {len(df_obs.columns)}",
    str(df_obs.columns.tolist()),
]))

In [None]:
# Summary statistics for the main weather and water-temperature readings
df_obs.loc[:, [
    'temperature_f',
    'dew_point_f',
    'humidity_pct',
    'wind_speed_mph',
    'pressure_in',
    'water_temp_0_35m_c',
    'water_temp_2m_c',
    'water_temp_7m_c',
]].describe()

In [None]:
# Show every observation, newest first, restricted to the key columns
display_cols = [
    'observation_timestamp',
    'temperature_f',
    'humidity_pct',
    'wind_speed_mph',
    'pressure_in',
    'water_temp_0_35m_c',
    'condition',
]

df_obs.sort_values(by='observation_timestamp', ascending=False)[display_cols]

## Temperature Analysis

In [None]:
# Air temperature and dew point plotted against observation time
fig, ax = plt.subplots(figsize=(14, 6))

# One entry per line drawn: (source column, line-specific plot options)
for series_col, line_opts in [
    ('temperature_f', {'marker': 'o', 'markersize': 6, 'label': 'Air Temperature'}),
    ('dew_point_f', {'marker': 's', 'markersize': 5, 'alpha': 0.7, 'label': 'Dew Point'}),
]:
    ax.plot(df_obs['observation_timestamp'], df_obs[series_col], linewidth=2, **line_opts)

ax.set_title('Air Temperature & Dew Point Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Time', fontsize=12)
ax.set_ylabel('Temperature (°F)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)

# Slant the time labels so neighbouring ticks do not collide
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Water Temperature Analysis

In [None]:
# Thames water temperature over time, one line per sensor depth
fig, ax = plt.subplots(figsize=(14, 6))

# (source column, marker, marker size, legend label, colour), shallowest first
for depth_col, depth_marker, depth_size, depth_label, depth_color in [
    ('water_temp_0_35m_c', 'o', 6, '0.35m depth (surface)', 'lightskyblue'),
    ('water_temp_2m_c', 's', 5, '2m depth', 'royalblue'),
    ('water_temp_7m_c', '^', 5, '7m depth (deep)', 'darkblue'),
]:
    ax.plot(df_obs['observation_timestamp'], df_obs[depth_col],
            marker=depth_marker, linewidth=2, markersize=depth_size,
            label=depth_label, color=depth_color)

ax.set_title('Thames River Water Temperature by Depth', fontsize=14, fontweight='bold')
ax.set_xlabel('Time', fontsize=12)
ax.set_ylabel('Water Temperature (°C)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlations between the weather and water-temperature readings,
# drawn as an annotated heatmap centred on zero.
correlation_cols = [
    'temperature_f',
    'dew_point_f',
    'humidity_pct',
    'wind_speed_mph',
    'pressure_in',
    'water_temp_0_35m_c',
    'water_temp_2m_c',
    'water_temp_7m_c',
]

corr_matrix = df_obs[correlation_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
ax.set_title('Weather Variables Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

## Comprehensive Dashboard

In [None]:
# Four stacked panels sharing one time axis: temperature, humidity, wind
# speed and surface water temperature.
fig, axes = plt.subplots(4, 1, figsize=(16, 14), sharex=True)

# (source column, line colour, legend label, y-axis label), top panel first
panel_specs = [
    ('temperature_f', 'red', 'Temperature', 'Temp (°F)'),
    ('humidity_pct', 'blue', 'Humidity', 'Humidity (%)'),
    ('wind_speed_mph', 'purple', 'Wind Speed', 'Wind (mph)'),
    ('water_temp_0_35m_c', 'teal', 'Water Temp (0.35m)', 'Water Temp (°C)'),
]

for panel_ax, (panel_col, panel_color, panel_label, panel_ylabel) in zip(axes, panel_specs):
    panel_ax.plot(df_obs['observation_timestamp'], df_obs[panel_col],
                  marker='o', linewidth=2, markersize=4,
                  color=panel_color, label=panel_label)
    panel_ax.set_ylabel(panel_ylabel, fontsize=11)
    panel_ax.legend(loc='upper left')
    panel_ax.grid(True, alpha=0.3)

# The dashboard title sits on the top panel; the x-axis label on the bottom one.
axes[0].set_title('Weather Observation Dashboard - EGLC', fontsize=14, fontweight='bold')
axes[3].set_xlabel('Time', fontsize=12)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## ML Prediction: Temperature Forecasting

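Using weather variables and Thames water temperature to estimate air temperature. Note that the features and the target come from the same observation, so the models below estimate the concurrent air temperature rather than forecasting a future value.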

In [None]:
# Build the modelling table: predictor columns plus the air-temperature target
feature_cols = [
    'humidity_pct',
    'pressure_in',
    'wind_speed_mph',
    'water_temp_0_35m_c',
    'water_temp_2m_c',
    'water_temp_7m_c',
]

# Keep only rows where every feature and the target are present
df_ml = df_obs[[*feature_cols, 'temperature_f']].dropna()

print(f"Dataset size for ML: {len(df_ml)} observations")
print(f"\nFeatures used for prediction:")
print("\n".join(f"  - {col}" for col in feature_cols))
print(f"\nTarget variable: temperature_f")

In [None]:
# Rank the predictors with a quick Random Forest fitted on the full table
y = df_ml['temperature_f']
X = df_ml[feature_cols]

# Fixed seed keeps the importance scores reproducible between runs
rf_temp = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)

# Pair each feature with its score, highest score first
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': rf_temp.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    feature_importance['feature'],
    feature_importance['importance'],
    color='steelblue',
    edgecolor='black',
)
ax.set_title('Feature Importance for Temperature Prediction', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

print("\nFeature Importance Ranking:")
print(feature_importance.to_string(index=False))

In [None]:
# Hold out 20% of the rows (fixed seed) and benchmark three regressors on them
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

results = []

for name, model in models.items():
    # Fit on the training split, then predict the held-out split
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Error metrics on the held-out split
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"\n{name}:")
    print(f"  Mean Absolute Error: {mae:.2f}°F")
    print(f"  Root Mean Squared Error: {rmse:.2f}°F")
    print(f"  R² Score: {r2:.4f}")

    # Metrics are stored pre-formatted as strings for the comparison table
    results.append({
        'Model': name,
        'MAE (°F)': format(mae, '.2f'),
        'RMSE (°F)': format(rmse, '.2f'),
        'R² Score': format(r2, '.4f'),
    })

# Side-by-side comparison of all models
results_df = pd.DataFrame(results)
print("\n" + "=" * 70)
print("MODEL COMPARISON")
print("=" * 70)
display(results_df)

In [None]:
# Diagnostic plots for a freshly fitted Random Forest. It is treated as the
# best model here; check the comparison table to confirm that holds.
best_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = best_model.predict(X_test)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: predicted vs actual, with the y = x line marking a perfect fit
actual_span = [y_test.min(), y_test.max()]
ax1.scatter(y_test, y_pred_rf, alpha=0.6, s=100, edgecolor='black')
ax1.plot(actual_span, actual_span, 'r--', lw=2, label='Perfect Prediction')
ax1.set_title('Random Forest: Predicted vs Actual Temperature', fontsize=13, fontweight='bold')
ax1.set_xlabel('Actual Temperature (°F)', fontsize=12)
ax1.set_ylabel('Predicted Temperature (°F)', fontsize=12)
ax1.grid(True, alpha=0.3)
ax1.legend(fontsize=11)

# Right panel: residuals (actual minus predicted) against the prediction
residuals = y_test - y_pred_rf
ax2.scatter(y_pred_rf, residuals, alpha=0.6, s=100, edgecolor='black', color='coral')
ax2.axhline(y=0, color='r', linestyle='--', lw=2)
ax2.set_title('Residual Plot', fontsize=13, fontweight='bold')
ax2.set_xlabel('Predicted Temperature (°F)', fontsize=12)
ax2.set_ylabel('Residuals (°F)', fontsize=12)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric summary of the residuals
print(f"\nPrediction Error Distribution:")
for stat_name, stat_value in [
    ('Mean', residuals.mean()),
    ('Std', residuals.std()),
    ('Min', residuals.min()),
    ('Max', residuals.max()),
]:
    print(f"  {stat_name} Error: {stat_value:.2f}°F")

## Summary Statistics

In [None]:
# Closing recap: observation count, time span, and the range and mean of
# temperature, humidity, wind speed and surface water temperature.
print("\n".join([
    "=" * 80,
    "WEATHER DATA SUMMARY",
    "=" * 80,
    f"\nTotal Observations: {len(df_obs)}",
    f"Date Range: {df_obs['observation_timestamp'].min()} to {df_obs['observation_timestamp'].max()}",
    f"\nTemperature Range: {df_obs['temperature_f'].min():.1f}°F - {df_obs['temperature_f'].max():.1f}°F",
    f"Average Temperature: {df_obs['temperature_f'].mean():.1f}°F",
    f"\nHumidity Range: {df_obs['humidity_pct'].min():.0f}% - {df_obs['humidity_pct'].max():.0f}%",
    f"Average Humidity: {df_obs['humidity_pct'].mean():.1f}%",
    f"\nWind Speed Range: {df_obs['wind_speed_mph'].min():.1f} - {df_obs['wind_speed_mph'].max():.1f} mph",
    f"Average Wind Speed: {df_obs['wind_speed_mph'].mean():.1f} mph",
    f"\nWater Temperature (0.35m): {df_obs['water_temp_0_35m_c'].min():.2f}°C - {df_obs['water_temp_0_35m_c'].max():.2f}°C",
    f"Average Water Temperature: {df_obs['water_temp_0_35m_c'].mean():.2f}°C",
    "=" * 80,
]))

## Close Database Connection

In [None]:
# Close database connection
# Run this last: the data-loading cells need `conn` open, so they must be
# re-run after reconnecting (Step 4) if the connection has been closed.
conn.close()
print("✓ Database connection closed")