In [1]:
pip install openmeteo-requests requests-cache retry-requests

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Cell 2: Fetch weather data from Open-Meteo API

import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import datetime

def _as_text(value):
    """Return ``value`` as printable text: bytes are decoded, ``None`` becomes ``"n/a"``."""
    if value is None:
        return "n/a"
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    return str(value)


def _share(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    return count / total * 100 if total else 0.0


def _daily_to_frame(response, timezone):
    """Turn the daily section of an Open-Meteo response into a Date/temperature/rain_mm frame."""
    daily = response.Daily()
    # Variable indices follow the order of the "daily" list sent in the request.
    daily_temperature_2m_mean = daily.Variables(0).ValuesAsNumpy()
    daily_precipitation_sum = daily.Variables(1).ValuesAsNumpy()

    # NOTE(review): timestamps are stepped with a fixed Interval() from Time(); if the
    # requested period starts during daylight-saving time and crosses a DST change, the
    # local calendar date derived below may drift by a day - confirm against the API docs.
    timestamps = pd.date_range(
        start=pd.to_datetime(daily.Time(), unit="s", utc=True),
        end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=daily.Interval()),
        inclusive="left"
    )

    weather_df = pd.DataFrame(data={
        "date": timestamps,
        "temperature": daily_temperature_2m_mean,
        "rain_mm": daily_precipitation_sum,
    })

    # Convert to the requested local timezone and drop the time component
    weather_df['Date'] = pd.to_datetime(weather_df['date'].dt.tz_convert(timezone).dt.date)

    # Keep only necessary columns
    return weather_df[['Date', 'temperature', 'rain_mm']]


def _print_weather_summary(weather_df):
    """Print temperature, precipitation and pattern statistics for ``weather_df``."""
    total_days = len(weather_df)
    temperature = weather_df['temperature']
    rain = weather_df['rain_mm']

    print(f"\n" + "=" * 60)
    print("üìà WEATHER DATA SUMMARY")
    print("=" * 60)

    print(f"\nüìÖ Total days fetched: {total_days}")

    print(f"\nüå°Ô∏è Temperature Statistics (¬∞C):")
    print(f"   Mean:    {temperature.mean():.1f}¬∞C")
    print(f"   Min:     {temperature.min():.1f}¬∞C")
    print(f"   Max:     {temperature.max():.1f}¬∞C")
    print(f"   Std Dev: {temperature.std():.1f}¬∞C")

    print(f"\nüåßÔ∏è Precipitation Statistics (mm):")
    print(f"   Mean:    {rain.mean():.2f} mm/day")
    print(f"   Total:   {rain.sum():.1f} mm")
    print(f"   Max:     {rain.max():.1f} mm (single day)")

    rainy_days = (rain > 0).sum()
    meaningful_rain = (rain > 1).sum()
    cold_days = (temperature < 10).sum()
    warm_days = (temperature > 20).sum()

    print(f"\nüìä Weather Patterns:")
    print(f"   üåßÔ∏è Rainy days (>0mm):   {rainy_days:3d} ({_share(rainy_days, total_days):.1f}%)")
    print(f"   ‚òî Meaningful rain (>1mm): {meaningful_rain:3d} ({_share(meaningful_rain, total_days):.1f}%)")
    print(f"   ‚ùÑÔ∏è Cold days (<10¬∞C):    {cold_days:3d} ({_share(cold_days, total_days):.1f}%)")
    print(f"   ‚òÄÔ∏è Warm days (>20¬∞C):    {warm_days:3d} ({_share(warm_days, total_days):.1f}%)")


def fetch_london_weather(
    sales_csv_path='../data/coffee_sales_cleaned.csv',
    output_file='weather_data_london_real.csv',
    latitude=51.5085,
    longitude=-0.1257,
    timezone="Europe/London",
):
    """
    Fetch daily historical weather from the Open-Meteo archive API and save it as CSV.

    The requested period is the min/max of the ``Date`` column in ``sales_csv_path``;
    if that file does not exist, a fixed default range (2024-03-01 to 2025-03-23) is used.
    Progress and summary statistics are printed along the way.

    Args:
        sales_csv_path: CSV with a ``Date`` column that defines the period to fetch.
        output_file: Path the resulting weather CSV is written to.
        latitude: Latitude sent to the API (defaults to the original London value).
        longitude: Longitude sent to the API, signed (negative = west).
        timezone: Timezone name sent to the API and used to derive local dates.

    Returns:
        DataFrame with columns ``Date``, ``temperature`` and ``rain_mm``, or ``None``
        when the API request raises.
    """

    print("=" * 60)
    print("üåê OPEN-METEO WEATHER DATA FETCHER")
    print("=" * 60)

    # Step 1: Setup API client with caching and retry
    print("\nüì° Setting up Open-Meteo API client...")
    cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)
    print("‚úÖ Client configured with caching and auto-retry")

    # Step 2: Load coffee sales to get date range
    print("\nüìÇ Loading coffee sales data...")
    try:
        coffee_df = pd.read_csv(sales_csv_path)
        coffee_df['Date'] = pd.to_datetime(coffee_df['Date'])

        start_date = coffee_df['Date'].min().strftime('%Y-%m-%d')
        end_date = coffee_df['Date'].max().strftime('%Y-%m-%d')

        print(f"‚úÖ Coffee sales loaded: {len(coffee_df)} transactions")
        print(f"üìÖ Date range: {start_date} to {end_date}")

    except FileNotFoundError:
        # Report the path that was actually tried, then fall back to the known range.
        print(f"‚ö†Ô∏è Sales file not found: {sales_csv_path}")
        print("Using default date range...")
        start_date = "2024-03-01"
        end_date = "2025-03-23"

    # Step 3: Configure API request parameters
    print(f"\nüåç Fetching weather for London...")
    print(f"   Latitude: {latitude}¬∞N")
    # Signed value with an east suffix, matching how the response coordinates are printed.
    print(f"   Longitude: {longitude}¬∞E")
    print(f"   Timezone: {timezone}")
    print(f"   Period: {start_date} to {end_date}")

    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "daily": ["temperature_2m_mean", "precipitation_sum"],
        "timezone": timezone
    }

    # Step 4: Make API request
    print(f"\nüîÑ Making API request...")
    print(f"   URL: {url}")

    try:
        responses = openmeteo.weather_api(url, params=params)
        response = responses[0]

        print(f"\n‚úÖ API Response received!")
        print(f"   üìä Coordinates: {response.Latitude():.4f}¬∞N, {response.Longitude():.4f}¬∞E")
        print(f"   üèîÔ∏è Elevation: {response.Elevation():.1f} m above sea level")
        # Timezone() comes back as bytes and the abbreviation may be None; print both as text.
        print(f"   üïê Timezone: {_as_text(response.Timezone())} ({_as_text(response.TimezoneAbbreviation())})")
        print(f"   ‚è±Ô∏è UTC Offset: {response.UtcOffsetSeconds()} seconds")

    except Exception as e:
        # Any failure while requesting/reading the response header is reported and
        # signalled to the caller with None instead of a traceback.
        print(f"\n‚ùå API request failed: {e}")
        print("\nPossible solutions:")
        print("1. Check your internet connection")
        print("2. Verify Open-Meteo API is accessible (https://open-meteo.com)")
        print("3. Try again in a few minutes")
        return None

    # Step 5: Process daily weather data
    print(f"\nüìä Processing weather data...")
    weather_df = _daily_to_frame(response, timezone)

    # Step 6: Display statistics
    _print_weather_summary(weather_df)

    # Step 7: Save to CSV
    weather_df.to_csv(output_file, index=False)
    print(f"\nüíæ Weather data saved to: {output_file}")

    # Step 8: Display sample
    print(f"\nüìã Sample of Weather Data (first 10 days):")
    print(weather_df.head(10).to_string(index=False))

    print(f"\n" + "=" * 60)
    print("‚úÖ SUCCESS! Real weather data fetched from Open-Meteo API")
    print("=" * 60)

    print(f"\nüìù Next Steps:")
    print(f"1. Copy '{output_file}' to your project folder")
    print(f"2. Run the merge script to combine with coffee sales")
    print(f"3. Retrain XGBoost model with real weather features")

    return weather_df

# Execute the fetch; the result is None when the API request failed.
weather_data = fetch_london_weather()

# Report the outcome and, on success, render a preview of the frame.
if weather_data is None:
    print("\n‚ö†Ô∏è Please check the error messages above and try again.")
else:
    print("\nüéâ Weather data is ready for your AI pricing model!")
    preview = weather_data.head(10)  # first 10 rows, shown with the notebook's rich display
    display(preview)

üåê OPEN-METEO WEATHER DATA FETCHER

üì° Setting up Open-Meteo API client...
‚úÖ Client configured with caching and auto-retry

üìÇ Loading coffee sales data...
‚úÖ Coffee sales loaded: 3547 transactions
üìÖ Date range: 2024-03-01 to 2025-03-23

üåç Fetching weather for London...
   Latitude: 51.5085¬∞N
   Longitude: -0.1257¬∞W
   Timezone: Europe/London
   Period: 2024-03-01 to 2025-03-23

üîÑ Making API request...
   URL: https://archive-api.open-meteo.com/v1/archive

‚úÖ API Response received!
   üìä Coordinates: 51.4938¬∞N, -0.1630¬∞E
   üèîÔ∏è Elevation: 23.0 m above sea level
   üïê Timezone: b'Europe/London' (None)
   ‚è±Ô∏è UTC Offset: 0 seconds

üìä Processing weather data...

üìà WEATHER DATA SUMMARY

üìÖ Total days fetched: 388

üå°Ô∏è Temperature Statistics (¬∞C):
   Mean:    11.2¬∞C
   Min:     -2.3¬∞C
   Max:     24.3¬∞C
   Std Dev: 5.2¬∞C

üåßÔ∏è Precipitation Statistics (mm):
   Mean:    2.22 mm/day
   Total:   860.6 mm
   Max:     51.6 mm (single day)

ü

Unnamed: 0,Date,temperature,rain_mm
0,2024-03-01,5.1315,6.6
1,2024-03-02,5.041917,2.2
2,2024-03-03,3.752334,0.0
3,2024-03-04,5.304417,1.1
4,2024-03-05,7.048166,1.8
5,2024-03-06,5.471083,0.0
6,2024-03-07,6.669,0.3
7,2024-03-08,6.6565,0.0
8,2024-03-09,8.90025,0.4
9,2024-03-10,8.006499,8.6


In [3]:
# Cell 3: Merge weather data with coffee sales
#
# Reads the cleaned sales file and the weather CSV written by the previous cell,
# left-joins the daily weather onto every transaction by Date, derives two binary
# weather flags, prints a short report and writes the merged table to CSV.

import pandas as pd
import numpy as np

# Thresholds behind the binary weather flags (also used in the printed labels).
RAIN_THRESHOLD_MM = 1
COLD_THRESHOLD_C = 10

print("üîÑ MERGING WEATHER WITH COFFEE SALES")
print("=" * 60)

# Load coffee sales
print("\nüìÇ Loading coffee sales data...")
coffee_df = pd.read_csv('../data/coffee_sales_cleaned.csv')
coffee_df['Date'] = pd.to_datetime(coffee_df['Date'])
print(f"‚úÖ Loaded: {len(coffee_df)} transactions")

# Load weather data (from the file we just created)
print("\nüìÇ Loading weather data...")
weather_df = pd.read_csv('weather_data_london_real.csv')
weather_df['Date'] = pd.to_datetime(weather_df['Date'])
print(f"‚úÖ Loaded: {len(weather_df)} days")

# Merge on Date
print("\nüîó Merging datasets on Date...")
# validate='many_to_one' makes pandas raise if a Date appears more than once in the
# weather table; without it a duplicated day would silently duplicate transactions.
merged_df = coffee_df.merge(weather_df, on='Date', how='left', validate='many_to_one')
print(f"‚úÖ Merged dataset: {len(merged_df)} rows")

# Check for missing values
missing_weather = merged_df[['temperature', 'rain_mm']].isna().sum()
print(f"\nüîç Data quality check:")
print(f"   Missing temperature values: {missing_weather['temperature']}")
print(f"   Missing rain values: {missing_weather['rain_mm']}")

if missing_weather.sum() == 0:
    print("   ‚úÖ Perfect! No missing weather data!")
else:
    print(f"   ‚ö†Ô∏è {missing_weather.sum()} missing values found")
    # Comparisons against NaN are False, so such rows end up with both flags set to 0.
    print("   Rows without weather data will be flagged is_rainy=0 and is_cold=0")

# Create weather feature flags
print("\nüéØ Creating weather features...")
merged_df['is_rainy'] = (merged_df['rain_mm'] > RAIN_THRESHOLD_MM).astype(int)
merged_df['is_cold'] = (merged_df['temperature'] < COLD_THRESHOLD_C).astype(int)

# Statistics
print("\nüìä Weather Feature Statistics:")
print(f"   üåßÔ∏è Rainy transactions (>{RAIN_THRESHOLD_MM}mm): {merged_df['is_rainy'].sum():,} ({merged_df['is_rainy'].mean()*100:.1f}%)")
print(f"   ‚ùÑÔ∏è Cold transactions (<{COLD_THRESHOLD_C}¬∞C): {merged_df['is_cold'].sum():,} ({merged_df['is_cold'].mean()*100:.1f}%)")

# Price analysis by weather
# Each figure is the mean of the 'money' column over the transactions carrying that flag
# value; "Warm days" here means is_cold == 0 (not below the cold threshold).
print("\nüí∞ Average Price by Weather Conditions:")
print(f"   Rainy days: ¬£{merged_df[merged_df['is_rainy']==1]['money'].mean():.2f}")
print(f"   Dry days:   ¬£{merged_df[merged_df['is_rainy']==0]['money'].mean():.2f}")
print(f"   Cold days:  ¬£{merged_df[merged_df['is_cold']==1]['money'].mean():.2f}")
print(f"   Warm days:  ¬£{merged_df[merged_df['is_cold']==0]['money'].mean():.2f}")

# Save merged dataset
output_file = 'coffee_sales_with_weather_REAL.csv'
merged_df.to_csv(output_file, index=False)
print(f"\nüíæ Saved merged dataset to: {output_file}")

print("\n" + "=" * 60)
print("‚úÖ MERGE COMPLETE!")
print("=" * 60)

# Display sample
print("\nüìã Sample of Merged Data (first 5 rows):")
display(merged_df[['Date', 'Time', 'coffee_name', 'money', 'temperature', 'rain_mm', 'is_rainy', 'is_cold']].head())

print(f"\n‚úÖ Ready to retrain XGBoost model with REAL weather!")

üîÑ MERGING WEATHER WITH COFFEE SALES

üìÇ Loading coffee sales data...
‚úÖ Loaded: 3547 transactions

üìÇ Loading weather data...
‚úÖ Loaded: 388 days

üîó Merging datasets on Date...
‚úÖ Merged dataset: 3547 rows

üîç Data quality check:
   Missing temperature values: 0
   Missing rain values: 0
   ‚úÖ Perfect! No missing weather data!

üéØ Creating weather features...

üìä Weather Feature Statistics:
   üåßÔ∏è Rainy transactions (>1mm): 1,252 (35.3%)
   ‚ùÑÔ∏è Cold transactions (<10¬∞C): 1,493 (42.1%)

üí∞ Average Price by Weather Conditions:
   Rainy days: ¬£31.71
   Dry days:   ¬£31.61
   Cold days:  ¬£32.12
   Warm days:  ¬£31.30

üíæ Saved merged dataset to: coffee_sales_with_weather_REAL.csv

‚úÖ MERGE COMPLETE!

üìã Sample of Merged Data (first 5 rows):


Unnamed: 0,Date,Time,coffee_name,money,temperature,rain_mm,is_rainy,is_cold
0,2024-03-01,10:15:50.520000,Latte,38.7,5.1315,6.6,1,1
1,2024-03-01,12:19:22.539000,Hot Chocolate,38.7,5.1315,6.6,1,1
2,2024-03-01,12:20:18.089000,Hot Chocolate,38.7,5.1315,6.6,1,1
3,2024-03-01,13:46:33.006000,Americano,28.9,5.1315,6.6,1,1
4,2024-03-01,13:48:14.626000,Latte,38.7,5.1315,6.6,1,1



‚úÖ Ready to retrain XGBoost model with REAL weather!


In [4]:
# Cell: Organize files - Move CSVs to data folder
#
# Moves the two CSVs produced by the earlier cells from the working directory into
# the data directory, then lists every CSV found there with its size in KB.

import shutil
import os

print("üìÅ FILE ORGANIZATION")
print("=" * 60)

# Where the files currently live and where they should end up
source_dir = '.'  # Current directory (notebooks/)
dest_dir = '../data/'  # Data directory

# Files generated by the fetch and merge cells
files_to_move = [
    'weather_data_london_real.csv',
    'coffee_sales_with_weather_REAL.csv'
]

# Relocate each file, reporting any that is missing instead of failing
for filename in files_to_move:
    source = os.path.join(source_dir, filename)
    destination = os.path.join(dest_dir, filename)

    if not os.path.exists(source):
        print(f"‚ö†Ô∏è File not found: {filename}")
        continue

    shutil.move(source, destination)
    print(f"‚úÖ Moved: {filename}")
    print(f"   From: {source}")
    print(f"   To:   {destination}")

print("\n" + "=" * 60)
print("‚úÖ FILE ORGANIZATION COMPLETE!")
print("\nüìä Data folder now contains:")
for file in os.listdir('../data/'):
    if not file.endswith('.csv'):
        continue
    size_bytes = os.path.getsize(f'../data/{file}')
    size = size_bytes / 1024  # KB
    print(f"   üìÑ {file} ({size:.0f} KB)")

üìÅ FILE ORGANIZATION
‚úÖ Moved: weather_data_london_real.csv
   From: .\weather_data_london_real.csv
   To:   ../data/weather_data_london_real.csv
‚úÖ Moved: coffee_sales_with_weather_REAL.csv
   From: .\coffee_sales_with_weather_REAL.csv
   To:   ../data/coffee_sales_with_weather_REAL.csv

‚úÖ FILE ORGANIZATION COMPLETE!

üìä Data folder now contains:
   üìÑ coffee_sales_cleaned.csv (463 KB)
   üìÑ coffee_sales_with_weather_REAL.csv (528 KB)
   üìÑ Coffe_sales.csv (254 KB)
   üìÑ features_engineered_checkpoint.csv (486 KB)
   üìÑ weather_data_london_real.csv (10 KB)
