In [None]:
pip install requests pandas numpy matplotlib seaborn scipy requests-cache retry-requests

In [4]:
# ======== IMPORTS ========
import os
import sys
import time
import math
import random
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats

In [5]:
# Merges cleaned Data_Co_Daily_By_Location.csv (lat/lon in degrees) with Visual Crossing weather files (2015–2017)

import pandas as pd
import os

# === Config ===
SALES_FILE = "Data_Co_Daily_By_Location.csv"      # Daily aggregated sales with degrees lat/lon
WEATHER_DIR = "./cleaned_weather"     # Cleaned weather files for 2015–2017
OUTPUT_DIR = "./merged_data_final"    # Output folder
os.makedirs(OUTPUT_DIR, exist_ok=True)

YEARS = [2015, 2016, 2017]

# Column names
SALES_DATE_COL = "date_only"
WEATHER_DATE_COL = "datetime"
LAT_SALES = "Latitude"
LON_SALES = "Longitude"

# === Load DataCo Sales ===
print("Loading DataCo daily sales...")
sales_df = pd.read_csv(SALES_FILE)

# Parse and normalize date
sales_df[SALES_DATE_COL] = pd.to_datetime(sales_df[SALES_DATE_COL], errors='coerce').dt.normalize()

# Rename lat/lon to match weather files
sales_df = sales_df.rename(columns={LAT_SALES: "latitude", LON_SALES: "longitude"})

# Round lat/lon for matching
sales_df["latitude"] = sales_df["latitude"].round(3)
sales_df["longitude"] = sales_df["longitude"].round(3)

print("\n=== Sample DataCo Coordinates ===")
print(sales_df[['latitude','longitude']].drop_duplicates().head())

# === Process each year ===
for year in YEARS:
    weather_file = os.path.join(WEATHER_DIR, f"visualcrossing_weather_{year}_cleaned.csv")
    if not os.path.exists(weather_file):
        print(f"\nSkipping {year} - weather file not found")
        continue

    print(f"\n=== Processing Year {year} ===")
    weather_df = pd.read_csv(weather_file)

    # Parse and normalize weather date
    weather_df[WEATHER_DATE_COL] = pd.to_datetime(weather_df[WEATHER_DATE_COL], errors='coerce').dt.normalize()

    # Round weather lat/lon for matching
    weather_df["latitude"] = weather_df["latitude"].round(3)
    weather_df["longitude"] = weather_df["longitude"].round(3)

    print(f"Weather sample coords for {year}:", weather_df[['latitude','longitude']].drop_duplicates().head().values.tolist())

    # Filter sales data for current year
    sales_year = sales_df[sales_df[SALES_DATE_COL].dt.year == year].copy()

    # Merge on date + lat/lon
    merged = pd.merge(
        sales_year,
        weather_df,
        left_on=[SALES_DATE_COL, "latitude", "longitude"],
        right_on=[WEATHER_DATE_COL, "latitude", "longitude"],
        how="left"
    ).drop(columns=[WEATHER_DATE_COL])

    # Save merged file
    output_file = os.path.join(OUTPUT_DIR, f"DataCo_Weather_{year}_Merged_new.csv")
    merged.to_csv(output_file, index=False)
    print(f"Saved merged dataset for {year}: {output_file} (Rows: {merged.shape[0]})")


Loading DataCo daily sales...

=== Sample DataCo Coordinates ===
   latitude  longitude
0    18.204    -66.371
1    18.208    -66.371
2    18.212    -66.371
3    18.217    -66.371
4    18.218    -66.371

=== Processing Year 2015 ===
Weather sample coords for 2015: [[17.982, -66.113], [18.007, -66.636], [18.018, -66.616], [18.025, -66.613], [18.025, -66.615]]
Saved merged dataset for 2015: ./merged_data_final\DataCo_Weather_2015_Merged_new.csv (Rows: 122814)

=== Processing Year 2016 ===
Weather sample coords for 2016: [[17.982, -66.113], [18.007, -66.636], [18.025, -66.613], [18.025, -66.615], [18.033, -66.852]]
Saved merged dataset for 2016: ./merged_data_final\DataCo_Weather_2016_Merged_new.csv (Rows: 89084)

=== Processing Year 2017 ===
Weather sample coords for 2017: [[-33.938, 18.571], [17.982, -66.113], [18.007, -66.636], [18.018, -66.616], [18.025, -66.613]]
Saved merged dataset for 2017: ./merged_data_final\DataCo_Weather_2017_Merged_new.csv (Rows: 323515)


In [6]:
# Concatenates DataCo + Weather merged files (2015–2017) into one dataset

import pandas as pd
import os

# === Config ===
MERGED_DIR = "./merged_data_final"    # Folder containing year-wise merged files
OUTPUT_FILE = "./final_datasets/DataCo_Weather_2015_2017_All_new.csv"
os.makedirs("./final_datasets", exist_ok=True)

# File paths
file_2015 = os.path.join(MERGED_DIR, "DataCo_Weather_2015_Merged_new.csv")
file_2016 = os.path.join(MERGED_DIR, "DataCo_Weather_2016_Merged_new.csv")
file_2017 = os.path.join(MERGED_DIR, "DataCo_Weather_2017_Merged_new.csv")

# Load and concatenate all years
print("Loading all year-wise merged files...")
df_2015 = pd.read_csv(file_2015)
df_2016 = pd.read_csv(file_2016)
df_2017 = pd.read_csv(file_2017)

all_df = pd.concat([df_2015, df_2016, df_2017], ignore_index=True)

# Save the combined dataset
all_df.to_csv(OUTPUT_FILE, index=False)
print(f"Saved combined dataset: {OUTPUT_FILE} (Rows: {all_df.shape[0]})")


Loading all year-wise merged files...
Saved combined dataset: ./final_datasets/DataCo_Weather_2015_2017_All_new.csv (Rows: 535413)


In [7]:
# Adds lag features (1, 7, 30 days) for Sales and Temperature to DataCo + Weather dataset.

import pandas as pd
import os

# === Config ===
INPUT_FILE = "./final_datasets/DataCo_Weather_2015_2017_All_new.csv"
OUTPUT_FILE = "./final_datasets/DataCo_Weather_Lagged_new.csv"

# Columns to lag
TARGET_COLS = ["Sales", "temp"]  

# Lag periods (in days)
LAGS = [1, 7, 30]

# Load dataset
print("Loading combined dataset...")
df = pd.read_csv(INPUT_FILE)

# Ensure date is datetime
df["date_only"] = pd.to_datetime(df["date_only"], errors='coerce')

# Sort by location and date
df = df.sort_values(by=["latitude", "longitude", "date_only"])

# Generate lag features for each target variable
for col in TARGET_COLS:
    for lag in LAGS:
        df[f"{col}_lag_{lag}"] = df.groupby(["latitude", "longitude"])[col].shift(lag)

# Save output
df.to_csv(OUTPUT_FILE, index=False)
print(f"Saved dataset with Sales + Temp lag features: {OUTPUT_FILE} (Rows: {df.shape[0]})")


Loading combined dataset...
Saved dataset with Sales + Temp lag features: ./final_datasets/DataCo_Weather_Lagged_new.csv (Rows: 535413)
