In [None]:
pip install requests pandas numpy matplotlib seaborn scipy requests-cache retry-requests

In [4]:
# ======== IMPORTS ========
import os
import sys
import time
import math
import random
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats

In [6]:
# VS Code script to analyze missing values for 2015, 2016, and 2017 Visual Crossing weather datasets

import pandas as pd
import os

# === Config ===
YEARS = [2015, 2016, 2017]
CHUNKSIZE = 200000  # Safe size for large CSVs
INPUT_DIR = "."

def check_missing_values(file_path):
    total_counts = None
    missing_counts = None

    for chunk in pd.read_csv(file_path, chunksize=CHUNKSIZE, low_memory=False):
        if total_counts is None:
            total_counts = chunk.count()
            missing_counts = chunk.isnull().sum()
        else:
            total_counts += chunk.count()
            missing_counts += chunk.isnull().sum()

    # Estimate total rows (average across columns)
    total_rows = sum(total_counts) / len(total_counts)
    
    # Build report
    missing_report = pd.DataFrame({
        "Missing Values": missing_counts,
        "Missing %": (missing_counts / total_rows * 100).round(2)
    }).sort_values(by="Missing %", ascending=False)

    return missing_report

if __name__ == "__main__":
    for year in YEARS:
        file_name = f"visualcrossing_weather_{year}.csv"
        if not os.path.exists(file_name):
            print(f"Skipping {year} - file not found: {file_name}")
            continue
        
        print(f"\n=== Processing {file_name} ===")
        report = check_missing_values(file_name)
        print(report)
        
        # Save a report for each year
        output_name = f"Weather_Missing_Report_{year}.csv"
        report.to_csv(output_name)
        print(f"Missing value report saved as {output_name}")




=== Processing visualcrossing_weather_2015.csv ===
                Missing Values  Missing %
preciptype              657199      50.75
solarradiation           48910       3.78
tempmin                      0       0.00
temp                         0       0.00
datetime                     0       0.00
tempmax                      0       0.00
precip                       0       0.00
humidity                     0       0.00
windspeed                    0       0.00
snow                         0       0.00
cloudcover                   0       0.00
latitude                     0       0.00
longitude                    0       0.00
year                         0       0.00
Missing value report saved as Weather_Missing_Report_2015.csv

=== Processing visualcrossing_weather_2016.csv ===
                Missing Values  Missing %
preciptype              495070      49.06
solarradiation           58194       5.77
tempmin                      0       0.00
temp                         0      

In [7]:
# Cleans Visual Crossing weather data (2015–2017):
# - Fills missing solarradiation with hybrid method (interpolation → rolling mean → ffill/bfill)
# - Fills preciptype using temperature + precipitation rules
# - Adds binary flags for rain, snow, freezingrain
# - Outputs cleaned weather files for each year

import pandas as pd
import os

# === Configuration ===
YEARS = [2015, 2016, 2017]
INPUT_DIR = "."   
OUTPUT_DIR = "./cleaned_weather"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Column names
DATE_COL = "datetime"
LAT_COL = "latitude"
LON_COL = "longitude"
SOLAR_COL = "solarradiation"
PRECIP_COL = "precip"
TEMP_COL = "temp"
PRECIPTYPE_COL = "preciptype"

# === Step 1: Hybrid Fill for Solar Radiation ===
def fill_solar(group):
    # Linear interpolation first (handles most gaps)
    group[SOLAR_COL] = group[SOLAR_COL].interpolate(method='linear', limit_direction='both')
    
    # Rolling mean (7-day window) for any remaining NaNs
    group[SOLAR_COL] = group[SOLAR_COL].fillna(group[SOLAR_COL].rolling(window=7, min_periods=1).mean())
    
    # Forward/backward fill as final fallback (edges only)
    group[SOLAR_COL] = group[SOLAR_COL].fillna(method='ffill').fillna(method='bfill')
    return group

# === Step 2: Fill Precipitation Type (Categorical) ===
def infer_preciptype(row):
    # Keep existing label if present
    if pd.notnull(row[PRECIPTYPE_COL]):
        return row[PRECIPTYPE_COL]
    
    # If no precipitation
    if pd.isnull(row[PRECIP_COL]) or row[PRECIP_COL] == 0:
        return "none"
    
    # Infer based on temperature
    if row[TEMP_COL] < 0:
        return "snow"
    elif 0 <= row[TEMP_COL] <= 2:
        return "freezingrain"
    else:
        return "rain"

# === Main Cleaning Function ===
def clean_weather_file(year):
    input_file = os.path.join(INPUT_DIR, f"visualcrossing_weather_{year}.csv")
    output_file = os.path.join(OUTPUT_DIR, f"visualcrossing_weather_{year}_cleaned.csv")
    
    print(f"Processing {year}...")
    df = pd.read_csv(input_file, parse_dates=[DATE_COL])
    df = df.sort_values(by=[LAT_COL, LON_COL, DATE_COL])
    
    # Fill solar radiation with hybrid method
    df = df.groupby([LAT_COL, LON_COL], group_keys=False).apply(fill_solar)
    
    # Fill preciptype using rules
    df[PRECIPTYPE_COL] = df.apply(infer_preciptype, axis=1)
    df[PRECIPTYPE_COL] = df[PRECIPTYPE_COL].fillna("unknown")
    
    # Add binary weather flags for ML/RL features
    df["is_rain"] = (df[PRECIPTYPE_COL] == "rain").astype(int)
    df["is_snow"] = (df[PRECIPTYPE_COL] == "snow").astype(int)
    df["is_freezingrain"] = (df[PRECIPTYPE_COL] == "freezingrain").astype(int)
    
    # Save cleaned file
    df.to_csv(output_file, index=False)
    print(f"Saved cleaned weather file for {year}: {output_file}")

if __name__ == "__main__":
    for year in YEARS:
        clean_weather_file(year)
    print("\nAll weather files (2015–2017) have been cleaned and saved in ./cleaned_weather")

Processing 2015...


  group[SOLAR_COL] = group[SOLAR_COL].fillna(method='ffill').fillna(method='bfill')
  df = df.groupby([LAT_COL, LON_COL], group_keys=False).apply(fill_solar)


Saved cleaned weather file for 2015: ./cleaned_weather\visualcrossing_weather_2015_cleaned.csv
Processing 2016...


  group[SOLAR_COL] = group[SOLAR_COL].fillna(method='ffill').fillna(method='bfill')
  df = df.groupby([LAT_COL, LON_COL], group_keys=False).apply(fill_solar)


Saved cleaned weather file for 2016: ./cleaned_weather\visualcrossing_weather_2016_cleaned.csv
Processing 2017...


  group[SOLAR_COL] = group[SOLAR_COL].fillna(method='ffill').fillna(method='bfill')
  df = df.groupby([LAT_COL, LON_COL], group_keys=False).apply(fill_solar)


Saved cleaned weather file for 2017: ./cleaned_weather\visualcrossing_weather_2017_cleaned.csv

All weather files (2015–2017) have been cleaned and saved in ./cleaned_weather
