# Set Up

## Mount Google Drive

Skip this section if you are not using Google Colab:

In [7]:
from google.colab import drive

# Mount Google Drive and work from its root.
drive.mount('/content/drive')
%cd /content/drive/My Drive
# Clone the project repository. When the cell is re-run the directory already
# exists and git prints "fatal: destination path ... already exists"; the
# remaining lines still execute, so the checkout is simply updated below.
!git clone https://github.com/FranciscoLozCoding/cooling_with_code.git
# Enter the checkout and bring it up to date.
%cd cooling_with_code
!git pull

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive
fatal: destination path 'cooling_with_code' already exists and is not an empty directory.
/content/drive/My Drive/cooling_with_code
Already up to date.


## Import Libraries

Install the libraries that are not preinstalled in Google Colab (this cell can be skipped if you are not using Colab and already have them installed).

In [None]:
# Install the extra packages into the kernel's environment, one %pip call per package.
# NOTE(review): no versions are pinned, so each run resolves whatever is current
# at install time — consider pinning for reproducibility.
%pip install stackstac
%pip install pystac-client
%pip install planetary-computer
%pip install odc-stac
%pip install rioxarray
%pip install geopandas
%pip install geopy
%pip install folium

Collecting stackstac
  Downloading stackstac-0.5.1-py3-none-any.whl.metadata (8.1 kB)
Collecting dask>=2022.1.1 (from dask[array]>=2022.1.1->stackstac)
  Downloading dask-2025.2.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pyproj<4.0.0,>=3.0.0 (from stackstac)
  Downloading pyproj-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)
Collecting rasterio<2.0.0,>=1.3.0 (from stackstac)
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting xarray>=0.18 (from stackstac)
  Downloading xarray-2025.1.2-py3-none-any.whl.metadata (11 kB)
Collecting partd>=1.4.0 (from dask>=2022.1.1->dask[array]>=2022.1.1->stackstac)
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting importlib_metadata>=4.13.0 (from dask>=2022.1.1->dask[array]>=2022.1.1->stackstac)
  Downloading importlib_metadata-8.6.1-py3-none-any.whl.metadata (4.7 kB)
Collecting affine (from rasterio<2.0.0,>=1.3.0->stackstac)
  D

Collecting pystac-client
  Downloading pystac_client-0.8.6-py3-none-any.whl.metadata (3.0 kB)
Collecting pystac>=1.10.0 (from pystac[validation]>=1.10.0->pystac-client)
  Downloading pystac-1.12.2-py3-none-any.whl.metadata (4.6 kB)
Downloading pystac_client-0.8.6-py3-none-any.whl (41 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pystac-1.12.2-py3-none-any.whl (194 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m184.3/194.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.2/194.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pystac, pystac-

In [None]:
#data science
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import pickle

#custom tools for this project
from tools.environment import VALID_SPLIT, RANDOM_STATE
from tools.build_dataset import (
    generate_buffer_dataset,
    generate_median,
    generate_building_gdf,
    generate_traffic,
    generate_weather_data
)

# Generating a 200m Buffer Dataset

This notebook generates a new dataset using 200m buffer zones. After doing [04_EDA](/04_EDA.ipynb), we found that increasing the buffer zone from 150m to a value between 200m and 400m might help our models better capture the relationship between vegetation and UHI. For details on the features and how we generate our datasets, see our past notebooks:
- [01_dataset_generation](/01_dataset_generation.ipynb)
- [02_more_dataset_generation](/02_more_dataset_generation.ipynb)

>NOTE: we will use our custom tools here; for an in-depth explanation of these tools, see the notebooks above.

In [None]:
# Buffer radius handed to generate_buffer_dataset for both the training and the
# test dataset; twice this value (the diameter) is used in the output CSV names.
# NOTE(review): the introduction compares against a "150m" buffer — confirm
# whether that figure was a radius or a diameter so the datasets are comparable.
buffer_radius = 100  # radius in meters (diameter will be 200)

## Training Dataset

We will first create the training dataset.

In [None]:
# Satellite image median used as the raster source for the buffers.
median = generate_median()

# Building geodataframe used for the building-related features.
buildings_gdf = generate_building_gdf()

# UHI geodataframe with the traffic data attached.
uhi_gdf = generate_traffic()

# Pull the columns needed below out of the geodataframe in a single unpacking.
uhi, traffic_volume, latitudes, longitudes, datetimes = (
    uhi_gdf[column].values
    for column in ('UHI Index', 'traffic_volume', 'Latitude', 'Longitude', 'datetime')
)

# Apply the buffer zone around every point, then add the weather data
# to the resulting frame.
train_df = generate_weather_data(
    generate_buffer_dataset(
        latitudes, longitudes,
        buffer_radius, traffic_volume,
        median, buildings_gdf,
        UHI=uhi, datetimes=datetimes))

# Location and timestamp columns are not needed in the training set.
cols = ['Latitude', 'Longitude', 'datetime']
train_df = train_df.drop(columns=cols)

In [None]:
# Display summary statistics of the training frame (features plus the UHI target)
# as a quick sanity check of the generated values.
train_df.describe()

In [None]:
# Save the training set to a CSV file named after the buffer diameter
# (2 * buffer_radius); the DataFrame index is not written.
train_df.to_csv(f"{buffer_radius*2}m_buffer_dataset.csv", index=False)

## Test Dataset

Now, we will create our testing dataset.

In [None]:
# CSV path for the target variable of the testing dataset.
test_csv = "data/Testing_data_uhi_index.csv"

# Generate traffic data for the test locations, reading them from test_csv
# (the path is defined once above instead of being repeated as a literal).
test_uhi_df = generate_traffic(uhi_csv_file=test_csv)

# Read values into arrays
traffic_volume = test_uhi_df['traffic_volume'].values
latitudes = test_uhi_df['Latitude'].values
longitudes = test_uhi_df['Longitude'].values

# Apply Buffer Zone, reusing the satellite median and building geodataframe
# built for the training dataset. No UHI values or datetimes are passed here.
test_df = generate_buffer_dataset(
    latitudes, longitudes,
    buffer_radius, traffic_volume,
    median, buildings_gdf)

# Add the weather data
test_df = generate_weather_data(test_df)

# Drop variables we don't need.
# NOTE: We need the lat and lon here, since they are required in the final submission
# NOTE(review): 'UHI' is presumably an empty placeholder column because no UHI
# values were passed above — verify against generate_buffer_dataset.
cols = ['datetime', 'UHI']
test_df = test_df.drop(columns=cols)

In [None]:
# Display summary statistics of the test frame as a quick sanity check
# of the generated values.
test_df.describe()

In [None]:
# Save the test set to a CSV file named after the buffer diameter
# (2 * buffer_radius); the DataFrame index is not written.
test_df.to_csv(f"{buffer_radius*2}m_buffer_test_dataset.csv", index=False)

# Evaluating the 200m Buffer Dataset

Finally, let's evaluate how this new dataset does on a simple RandomForestRegressor.

In [None]:
# Work on a copy so train_df itself is never modified by the evaluation.
dataset = train_df.copy()

# Split the data into features (X) and target (y) BEFORE any scaling.
feature_frame = dataset.drop(columns=['UHI'])
x_names = list(feature_frame.columns)
x = feature_frame.values
y = dataset['UHI'].values

# Hold out a validation set.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y,
    test_size=VALID_SPLIT,
    random_state=RANDOM_STATE)

# Fit the scaler on the training rows only and reuse those statistics for the
# validation rows. Fitting on the full dataset before splitting would leak
# validation-set means/variances into preprocessing. The target is left in its
# original units: R-squared is unaffected by a linear rescaling of y, and any
# error metric added later stays interpretable as UHI.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_valid = sc.transform(x_valid)

# Train the Random Forest model on the training data
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=RANDOM_STATE,
    criterion="squared_error")
rf_model.fit(x_train, y_train)

# Make predictions on the training data
insample_predictions = rf_model.predict(x_train)

# Calculate R-squared score for in-sample predictions
print("In-Sample Evaluation:")
insample_r2 = r2_score(y_train, insample_predictions)
print(f"  R-squared: {insample_r2}")

# Make predictions on the validation data
out_of_sample_predictions = rf_model.predict(x_valid)

# Calculate R-squared score for out-of-sample predictions
print("Out-Of-Sample Evaluation:")
out_of_sample_r2 = r2_score(y_valid, out_of_sample_predictions)
print(f"  R-squared: {out_of_sample_r2}")

>TODO: Create a simple RF regressor and compare to 150m simple RF regressor