In [1]:
import os
# Supress Warnings
import warnings

from tqdm import tqdm

warnings.filterwarnings('ignore')

# Import common GIS tools
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import rioxarray as rio
import rasterio
import pandas as pd
from matplotlib.cm import RdYlGn,jet,RdBu

# Import Planetary Computer tools
import stackstac
import pystac_client
import planetary_computer
from odc.stac import stac_load

In [2]:
# Calculate NDVI
training_data = pd.read_csv("../data_test/training_data_uhi_index.csv")
print(training_data.columns)
training_data['datetime'] = pd.to_datetime(training_data['datetime'], format='%d-%m-%Y %H:%M')
training_data.describe()

Index(['Longitude', 'Latitude', 'datetime', 'UHI Index'], dtype='object')


Unnamed: 0,Longitude,Latitude,datetime,UHI Index
count,11229.0,11229.0,11229,11229.0
mean,-73.933927,40.8088,2021-07-24 15:34:29.056906240,1.000001
min,-73.994457,40.758792,2021-07-24 15:01:00,0.956122
25%,-73.955703,40.790905,2021-07-24 15:22:00,0.988577
50%,-73.932968,40.810688,2021-07-24 15:36:00,1.000237
75%,-73.909647,40.824515,2021-07-24 15:48:00,1.011176
max,-73.879458,40.859497,2021-07-24 15:59:00,1.046036
std,0.028253,0.023171,,0.016238


In [3]:
# Calculate the bounds for doing an archive data search
# bounds = (min_lon, min_lat, max_lon, max_lat)
lower_left = (40.75, -74.01)
upper_right = (40.88, -73.86)
bounds = (lower_left[1], lower_left[0], upper_right[1], upper_right[0])
time_window = "2021-07-23/2021-07-25"
height = 100
width = 100

In [4]:
stac = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")

search = stac.search(
    bbox=bounds,
    datetime=time_window,
    collections=["sentinel-2-l2a"],
    query={"eo:cloud_cover": {"lt": 30}},
)

In [5]:
items = list(search.get_items())
print('This is the number of scenes that touch our region:',len(items))
signed_items = [planetary_computer.sign(item).to_dict() for item in items]

This is the number of scenes that touch our region: 1


In [6]:
resolution = 10  # meters per pixel
scale = resolution / 111320.0 # degrees per pixel for crs=4326

In [7]:
data = stac_load(
    items,
    bands=["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B11", "B12"],
    crs="EPSG:4326",  # Latitude-Longitude
    resolution=scale,  # Degrees
    chunks={"x": 2048, "y": 2048},
    dtype="uint16",
    patch_url=planetary_computer.sign,
    bbox=bounds
)


In [8]:
df = train_feat = data.to_dataframe().reset_index()
print(df.head())

df['time'] = pd.to_datetime(df['time'])
df['time'] = df['time'].dt.strftime('%d-%m-%Y %H:%M')
display(df)
training_data['datetime'] = pd.to_datetime(training_data['datetime'], format='%d-%m-%Y %H:%M')

    latitude  longitude                    time  spatial_ref  B01  B02   B03  \
0  40.880031 -74.010016 2021-07-24 15:49:11.024         4326  666  639   728   
1  40.880031 -74.009926 2021-07-24 15:49:11.024         4326  666  639   728   
2  40.880031 -74.009837 2021-07-24 15:49:11.024         4326  666  395   579   
3  40.880031 -74.009747 2021-07-24 15:49:11.024         4326  666  562   775   
4  40.880031 -74.009657 2021-07-24 15:49:11.024         4326  710  919  1036   

    B04   B05   B06   B07   B08   B8A   B11   B12  
0   839  1023  2034  2064  1440  2438  1578  1083  
1   839  1023  2034  2064  1440  2438  1578  1083  
2   415   889  1850  2216  2354  2133  1554  1029  
3   688   889  1850  2216  2270  2133  1554  1029  
4  1108  1201  1566  1757  1584  1908  1742  1353  


Unnamed: 0,latitude,longitude,time,spatial_ref,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12
0,40.880031,-74.010016,24-07-2021 15:49,4326,666,639,728,839,1023,2034,2064,1440,2438,1578,1083
1,40.880031,-74.009926,24-07-2021 15:49,4326,666,639,728,839,1023,2034,2064,1440,2438,1578,1083
2,40.880031,-74.009837,24-07-2021 15:49,4326,666,395,579,415,889,1850,2216,2354,2133,1554,1029
3,40.880031,-74.009747,24-07-2021 15:49,4326,666,562,775,688,889,1850,2216,2270,2133,1554,1029
4,40.880031,-74.009657,24-07-2021 15:49,4326,710,919,1036,1108,1201,1566,1757,1584,1908,1742,1353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2419603,40.750045,-73.860358,24-07-2021 15:49,4326,1175,1412,1362,1602,1858,1940,1908,1588,2050,2202,1989
2419604,40.750045,-73.860268,24-07-2021 15:49,4326,1175,980,1290,1444,1858,1940,1908,1684,2050,2202,1989
2419605,40.750045,-73.860178,24-07-2021 15:49,4326,1182,1202,1326,1416,1530,1557,1903,1842,1797,1909,1721
2419606,40.750045,-73.860088,24-07-2021 15:49,4326,1182,1220,1398,1418,1530,1557,1903,1788,1797,1909,1721


In [9]:
training_data.rename(columns={'Latitude': 'latitude', 'Longitude': 'longitude'}, inplace=True)

In [10]:
import numpy as np
import pandas as pd

# Função para extrair as features de training_data para um ponto (lat, lon)
def extract_features_from_dataframe(lat, lon, df):
    # Calcula a distância euclidiana entre (lat, lon) e todas as linhas de df
    df["distance"] = np.sqrt((df["latitude"] - lat)**2 + (df["longitude"] - lon)**2)
    # Seleciona a linha com a menor distância (ponto mais próximo)
    closest_row = df.loc[df["distance"].idxmin()]
    # Retorna os valores dessa linha como dicionário
    return closest_row.to_dict()

# Itera sobre as linhas de df e extrai features do training_data (que contém o UHI Index)
features = []
for idx, row in df.iterrows():
    lat, lon = row["latitude"], row["longitude"]
    features.append(extract_features_from_dataframe(lat, lon, training_data))

# Cria um DataFrame com as features extraídas
extracted_features = pd.DataFrame(features)

# Realiza um left merge de df com as features extraídas usando o índice
df_with_features = df.merge(extracted_features, left_index=True, right_index=True, how='left')

# Remove colunas duplicadas, se existirem
df_with_features = df_with_features.loc[:, ~df_with_features.columns.duplicated()]

# Converte a distância de graus para metros (aproximadamente no equador)
conversion_factor = 111320  # metros por grau
df_with_features['distance_meters'] = df_with_features['distance'] * conversion_factor

# Agora, df_with_features contém o "UHI Index" e demais features extraídas de training_data
print(df_with_features.head())


   latitude_x  longitude_x              time  spatial_ref  B01  B02   B03  \
0   40.880031   -74.010016  24-07-2021 15:49         4326  666  639   728   
1   40.880031   -74.009926  24-07-2021 15:49         4326  666  639   728   
2   40.880031   -74.009837  24-07-2021 15:49         4326  666  395   579   
3   40.880031   -74.009747  24-07-2021 15:49         4326  666  562   775   
4   40.880031   -74.009657  24-07-2021 15:49         4326  710  919  1036   

    B04   B05   B06  ...   B08   B8A   B11   B12  longitude_y  latitude_y  \
0   839  1023  2034  ...  1440  2438  1578  1083   -73.938388   40.853352   
1   839  1023  2034  ...  1440  2438  1578  1083   -73.938388   40.853352   
2   415   889  1850  ...  2354  2133  1554  1029   -73.938388   40.853352   
3   688   889  1850  ...  2270  2133  1554  1029   -73.938388   40.853352   
4  1108  1201  1566  ...  1584  1908  1742  1353   -73.938388   40.853352   

             datetime UHI Index  distance  distance_meters  
0 2021-07-24 

In [11]:
df_with_features_100 = df_with_features[df_with_features['distance_meters']<100]
df_with_features_80 = df_with_features[df_with_features['distance_meters']<80]
df_with_features_150 = df_with_features[df_with_features['distance_meters']<150]
df_with_features_200 = df_with_features[df_with_features['distance_meters']<200]

In [12]:
training_data_with_features = df_with_features_150
print(training_data_with_features.shape)

(304467, 21)


In [13]:
# Exemplo: Criação de NDVI e SAVI
training_data_with_features['NDVI'] = (training_data_with_features['B08'] - training_data_with_features['B04']) / (training_data_with_features['B08'] + training_data_with_features['B04'] + 1e-6)
L = 0.5
training_data_with_features['SAVI'] = ((training_data_with_features['B08'] - training_data_with_features['B04']) * (1 + L)) / (training_data_with_features['B08'] + training_data_with_features['B04'] + L + 1e-6)

# Criação de NDBI
training_data_with_features['NDBI'] = (training_data_with_features['B11'] - training_data_with_features['B08']) / (training_data_with_features['B11'] + training_data_with_features['B08'] + 1e-6)

# Criação de MNDWI
training_data_with_features['MNDWI'] = (training_data_with_features['B03'] - training_data_with_features['B11']) / (training_data_with_features['B03'] + training_data_with_features['B11'] + 1e-6)

# Criação de EVI
training_data_with_features['EVI'] = 2.5 * (training_data_with_features['B08'] - training_data_with_features['B04']) / (training_data_with_features['B08'] + 6 * training_data_with_features['B04'] - 7.5 * training_data_with_features['B02'] + 1)

In [14]:
print(training_data_with_features.columns)
training_data_with_features = training_data_with_features.drop(columns=["latitude_x",
                                                                        "longitude_y",
                                                                        "latitude_x",
                                                                        "longitude_y",
                                                                        'datetime',
                                                                        'time'], axis=1, errors = 'ignore')

Index(['latitude_x', 'longitude_x', 'time', 'spatial_ref', 'B01', 'B02', 'B03',
       'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B11', 'B12', 'longitude_y',
       'latitude_y', 'datetime', 'UHI Index', 'distance', 'distance_meters',
       'NDVI', 'SAVI', 'NDBI', 'MNDWI', 'EVI'],
      dtype='object')


In [15]:
def remove_outliers_iqr(df, factor=3):
    print(df)
    df.reset_index(drop=True)
    df_clean = df.copy()
    # Get list of numeric columns
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()

    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR

        # Keep rows within the bounds
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
    return df_clean

# Example usage:
# Assuming df is your DataFrame
df_clean_factor3 = remove_outliers_iqr(training_data_with_features, factor=3)
df_clean_factor4 = remove_outliers_iqr(training_data_with_features, factor=4)
df_clean_factor5 = remove_outliers_iqr(training_data_with_features, factor=5)
df_clean_factor2 = remove_outliers_iqr(training_data_with_features, factor=2)
print("Data shape before outlier removal:", training_data_with_features.shape)
print("Data shape after outlier removal:", df_clean_factor3.shape)


         longitude_x  spatial_ref  B01  B02  B03  B04   B05   B06   B07   B08  \
358458    -73.932402         4326  341  792  941  843   909  1985  2300  2130   
358459    -73.932312         4326  310  616  791  810   929  1627  2130  1764   
358460    -73.932222         4326  310  749  803  750   929  1627  2130  1778   
358461    -73.932133         4326  310  749  803  750   929  1627  2130  1778   
358462    -73.932043         4326  310  444  613  530   914  2601  3347  2864   
...              ...          ...  ...  ...  ...  ...   ...   ...   ...   ...   
2279803   -73.959801         4326  889  532  777  585  1100  2664  2932  3208   
2279804   -73.959711         4326  889  637  850  688  1100  2664  2932  3440   
2279805   -73.959621         4326  889  637  850  688  1100  2664  2932  3440   
2279806   -73.959531         4326  956  731  877  810  1236  1834  2272  2876   
2279807   -73.959441         4326  956  843  975  862  1236  1834  2272  2182   

         ...   B12  latitud

In [None]:
import pandas as pd
import joblib
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler

# ------------------------------
# 1. Carregar Dados e Pré-processamento
# ------------------------------
# Supondo que `df1_filtered` seja o dataset processado disponível
# data = df1_filtered.copy()
# data = df_clean_factor3.copy()
data = df_clean_factor5.copy()
data = df_clean_factor4.copy()

# Remova as colunas irrelevantes
cols_to_drop = ["latitude_x",
                "longitude_y",
                "latitude_y",
                "longitude_x",
                "latitude",
                "longitude",
                "datetime",
                "distance",
                "distance_meters",
                "time",
                "spatial_ref",
                "EVI"]
data = data.drop(columns=cols_to_drop, errors="ignore")

# Defina a variável target
target = "UHI Index"
X = data.drop(target, axis=1)
y = data[target]

# ------------------------------
# 2. Construção do Pipeline
# ------------------------------
pipeline = Pipeline([
    ("select_from_model", SelectFromModel(
        estimator=ExtraTreesRegressor(n_estimators=300, random_state=42),
        threshold="1*mean"
    )),
    ("scaler", StandardScaler()),
    ("model", ExtraTreesRegressor(random_state=42, n_estimators=600, max_depth=150, min_samples_split=2))
])

# ------------------------------
# 3. Validação Cruzada (Cross-Validation)
# ------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-Fold Cross-Validation

# Avaliação do R² e MAPE usando cross-validation
r2_scores = cross_val_score(pipeline, X, y, cv=kf, scoring='r2')
mape_scores = cross_val_score(pipeline, X, y, cv=kf, scoring='neg_mean_absolute_percentage_error')

# Convertendo MAPE para valores positivos
mape_scores = np.abs(mape_scores)

print(f"Cross-Validation R²: {r2_scores.mean():.4f} ± {r2_scores.std():.4f}")
print(f"Cross-Validation MAPE: {mape_scores.mean() * 100:.2f}% ± {mape_scores.std() * 100:.2f}%")

# Treinar o pipeline final com todos os dados
pipeline.fit(X, y)
print("Modelo final treinado com todos os dados.")

# Obter as features selecionadas pelo `SelectFromModel`
selected_mask = pipeline.named_steps["select_from_model"].get_support()
selected_features = X.columns[selected_mask]
print("\nFeatures usadas no modelo final:", selected_features.tolist())

In [17]:
model_filename = "final_model_v1.pkl"
joblib.dump(pipeline, model_filename)
print("Pipeline salvo como '{}'".format(model_filename))

Pipeline salvo como 'final_model_v1.pkl'


In [18]:
sub_temp = pd.read_csv('../data/Submission_template.csv')
sub_temp.rename(columns={"Latitude": "latitude", "Longitude": "longitude"}, inplace=True)
features = []
for _, row in sub_temp.iterrows():
    lat, lon = row["latitude"], row["longitude"]
    features.append(extract_features_from_dataframe(lat, lon, train_feat))

val_features = pd.DataFrame(features)
val_data_with_features = pd.concat([sub_temp.reset_index(drop=True), val_features.reset_index(drop=True)], axis=1)
val_data_with_features = val_data_with_features.loc[:, ~val_data_with_features.columns.duplicated()]
val_data_with_features

Unnamed: 0,longitude,latitude,UHI Index,time,spatial_ref,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,distance
0,-73.971665,40.788763,,24-07-2021 15:49,4326,811,459,617,432,984,2089,2405,2502,2552,1474,893,0.000007
1,-73.971928,40.788875,,24-07-2021 15:49,4326,1208,667,800,745,1112,2076,2248,2288,2445,1751,1188,0.000023
2,-73.967080,40.789080,,24-07-2021 15:49,4326,899,955,1052,1188,979,995,1158,1246,1056,1101,763,0.000042
3,-73.972550,40.789082,,24-07-2021 15:49,4326,1193,1132,1364,1512,1866,1939,2076,1774,2196,2521,2346,0.000040
4,-73.969697,40.787953,,24-07-2021 15:49,4326,1097,1506,1642,1688,1294,2204,2411,2834,2601,2248,1848,0.000015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,-73.919388,40.813803,,24-07-2021 15:49,4326,1474,1086,1382,1474,1824,1553,1995,1578,1828,2421,2089,0.000025
1036,-73.931033,40.833178,,24-07-2021 15:49,4326,1014,576,883,965,2034,2393,2701,2664,2679,2019,1201,0.000045
1037,-73.934647,40.854542,,24-07-2021 15:49,4326,1268,1466,1608,1762,2040,2040,2246,1992,2184,2119,1682,0.000023
1038,-73.917223,40.815413,,24-07-2021 15:49,4326,1890,1066,1244,1368,2302,2587,2621,2094,2723,3066,2379,0.000029


In [19]:
# Exemplo: Criação de NDVI e SAVI
val_data_with_features['NDVI'] = (val_data_with_features['B08'] - val_data_with_features['B04']) / (val_data_with_features['B08'] + val_data_with_features['B04'] + 1e-6)
L = 0.5
val_data_with_features['SAVI'] = ((val_data_with_features['B08'] - val_data_with_features['B04']) * (1 + L)) / (val_data_with_features['B08'] + val_data_with_features['B04'] + L + 1e-6)

# Criação de NDBI
val_data_with_features['NDBI'] = (val_data_with_features['B11'] - val_data_with_features['B08']) / (val_data_with_features['B11'] + val_data_with_features['B08'] + 1e-6)

# Criação de MNDWI
val_data_with_features['MNDWI'] = (val_data_with_features['B03'] - val_data_with_features['B11']) / (val_data_with_features['B03'] + val_data_with_features['B11'] + 1e-6)

# Criação de EVI
val_data_with_features['EVI'] = 2.5 * (val_data_with_features['B08'] - val_data_with_features['B04']) / (val_data_with_features['B08'] + 6 * val_data_with_features['B04'] - 7.5 * val_data_with_features['B02'] + 1)

# val_data_with_features = val_data_with_features[['longitude', 'latitude', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06','B07', 'B08', 'B8A', 'B11', 'B12']]
copy_val_data = val_data_with_features[['B01', 'B02', 'B03', 'B04', 'B05', 'B06','B07', 'B08', 'B8A', 'B11', 'B12', 'NDVI', 'SAVI', 'NDBI', 'MNDWI']].copy()
pred_vals = pipeline.predict(copy_val_data)

In [20]:
data_to_send = pd.DataFrame()
data_to_send['UHI Index'] = pred_vals
data_to_send['Latitude'] = val_data_with_features['latitude']
data_to_send['Longitude'] = val_data_with_features['longitude']

data_to_send = data_to_send[['Longitude', 'Latitude', 'UHI Index']]
data_to_send.to_csv('../outputs/test_80_predicted_values.csv', index=False)

In [21]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyRegressor

# ----------------------------------------
# 1. Carregar e pré-processar os dados
# ----------------------------------------

# Supondo que `df_clean_factor5` esteja carregado no seu ambiente
data = df_clean_factor5.copy()

# Remover colunas irrelevantes
cols_to_drop = [
    "latitude_x", "longitude_y", "latitude_y", "longitude_x",
    "latitude", "longitude", "datetime", "distance",
    "distance_meters", "time", "spatial_ref", "EVI"
]
data = data.drop(columns=cols_to_drop, errors="ignore")

# Definir variável alvo (target)
target = "UHI Index"
X = data.drop(target, axis=1)
y = data[target]

# Dividir em treino e teste (necessário para o LazyRegressor)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------------------------------------
# 2. Seleção de features usando ExtraTrees
# ----------------------------------------

# Selecionar features relevantes com ExtraTrees
selector = SelectFromModel(
    ExtraTreesRegressor(n_estimators=300, random_state=42),
    threshold='1*mean'
)

# Fit selector apenas nos dados de treino
selector.fit(X_train, y_train)

# Reduzir dados de treino e teste para features selecionadas
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Obter nomes das features selecionadas
selected_features = X.columns[selector.get_support()]
print(f"\n✅ Features selecionadas ({len(selected_features)}): {selected_features.tolist()}")

# ----------------------------------------
# 3. Normalização dos dados (StandardScaler)
# ----------------------------------------

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# ----------------------------------------
# 4. LazyRegressor - Rodar todos os modelos
# ----------------------------------------

# Instanciar LazyRegressor com máxima verbosidade
lazy_regressor = LazyRegressor(verbose=1, ignore_warnings=False, custom_metric=None)

# Fit LazyRegressor (automaticamente treina e avalia dezenas de modelos)
models, predictions = lazy_regressor.fit(X_train_scaled, X_test_scaled, y_train, y_test)

# Exibir os resultados finais de todos os modelos ordenados por desempenho
print("\n🏅 Resultados completos dos modelos testados:")
print(models)



✅ Features selecionadas (7): ['B01', 'B05', 'B06', 'B07', 'B8A', 'B11', 'B12']


  2%|▏         | 1/42 [00:05<03:30,  5.14s/it]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.12965449462550094, 'Adjusted R-Squared': 0.12953356771778524, 'RMSE': np.float64(0.01565004688803899), 'Time taken': 5.136677980422974}


  5%|▍         | 2/42 [00:14<05:04,  7.62s/it]

{'Model': 'BaggingRegressor', 'R-Squared': 0.9349815153217749, 'Adjusted R-Squared': 0.9349724815711, 'RMSE': np.float64(0.004277482283021543), 'Time taken': 9.351735830307007}
{'Model': 'BayesianRidge', 'R-Squared': 0.08787064231905417, 'Adjusted R-Squared': 0.08774390990993619, 'RMSE': np.float64(0.016021309663830842), 'Time taken': 0.0681159496307373}


 10%|▉         | 4/42 [00:15<02:04,  3.27s/it]

{'Model': 'DecisionTreeRegressor', 'R-Squared': 0.939477235524832, 'Adjusted R-Squared': 0.9394688264152207, 'RMSE': np.float64(0.0041269497038525), 'Time taken': 1.2985453605651855}
{'Model': 'DummyRegressor', 'R-Squared': -2.4062779576672355e-05, 'Adjusted R-Squared': -0.0001630073904312912, 'RMSE': np.float64(0.01677548259328708), 'Time taken': 0.02166271209716797}
{'Model': 'ElasticNet', 'R-Squared': -2.4062779576672355e-05, 'Adjusted R-Squared': -0.0001630073904312912, 'RMSE': np.float64(0.01677548259328708), 'Time taken': 0.05158805847167969}


 17%|█▋        | 7/42 [00:16<00:53,  1.54s/it]

{'Model': 'ElasticNetCV', 'R-Squared': 0.08782146436057359, 'Adjusted R-Squared': 0.08769472511860776, 'RMSE': np.float64(0.016021741556892472), 'Time taken': 0.6700253486633301}


 19%|█▉        | 8/42 [00:16<00:42,  1.26s/it]

{'Model': 'ExtraTreeRegressor', 'R-Squared': 0.9326500283732179, 'Adjusted R-Squared': 0.9326406706827912, 'RMSE': np.float64(0.004353499572532836), 'Time taken': 0.2620096206665039}


 24%|██▍       | 10/42 [00:39<02:32,  4.78s/it]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.956429790998697, 'Adjusted R-Squared': 0.9564237372986313, 'RMSE': np.float64(0.0035015845647282987), 'Time taken': 22.859192848205566}
{'Model': 'GammaRegressor', 'R-Squared': 0.06839890942587645, 'Adjusted R-Squared': 0.06826947158950913, 'RMSE': np.float64(0.016191414532671176), 'Time taken': 0.1553936004638672}
GaussianProcessRegressor model failed to execute
Unable to allocate 303. GiB for an array with shape (201552, 201552) and data type float64


 29%|██▊       | 12/42 [01:02<03:42,  7.41s/it]

{'Model': 'GradientBoostingRegressor', 'R-Squared': 0.17185488150040373, 'Adjusted R-Squared': 0.17173981796793114, 'RMSE': np.float64(0.015265921355743242), 'Time taken': 22.12544560432434}


 31%|███       | 13/42 [01:02<02:48,  5.80s/it]

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': 0.2168443376378344, 'Adjusted R-Squared': 0.21673552499742355, 'RMSE': np.float64(0.01484546631048462), 'Time taken': 0.402968168258667}


 33%|███▎      | 14/42 [01:03<02:07,  4.54s/it]

{'Model': 'HuberRegressor', 'R-Squared': 0.08766631261028834, 'Adjusted R-Squared': 0.08753955181134165, 'RMSE': np.float64(0.016023104061849033), 'Time taken': 0.7032721042633057}


 36%|███▌      | 15/42 [01:04<01:37,  3.61s/it]

{'Model': 'KNeighborsRegressor', 'R-Squared': 0.686048438274272, 'Adjusted R-Squared': 0.6860048174463391, 'RMSE': np.float64(0.009399421419865583), 'Time taken': 0.977992057800293}
KernelRidge model failed to execute
Unable to allocate 303. GiB for an array with shape (201552, 201552) and data type float64
{'Model': 'Lars', 'R-Squared': 0.08787716580441296, 'Adjusted R-Squared': 0.08775043430167628, 'RMSE': np.float64(0.016021252372085425), 'Time taken': 0.06370091438293457}


 43%|████▎     | 18/42 [01:04<00:42,  1.76s/it]

{'Model': 'LarsCV', 'R-Squared': 0.08787716580441296, 'Adjusted R-Squared': 0.08775043430167628, 'RMSE': np.float64(0.016021252372085425), 'Time taken': 0.20237374305725098}
{'Model': 'Lasso', 'R-Squared': -2.4062779576672355e-05, 'Adjusted R-Squared': -0.0001630073904312912, 'RMSE': np.float64(0.01677548259328708), 'Time taken': 0.04638266563415527}


 48%|████▊     | 20/42 [01:05<00:28,  1.28s/it]

{'Model': 'LassoCV', 'R-Squared': 0.08782158074491486, 'Adjusted R-Squared': 0.08769484151911966, 'RMSE': np.float64(0.016021740534789986), 'Time taken': 0.6518111228942871}
{'Model': 'LassoLars', 'R-Squared': -2.4062779576672355e-05, 'Adjusted R-Squared': -0.0001630073904312912, 'RMSE': np.float64(0.01677548259328708), 'Time taken': 0.03746986389160156}


 57%|█████▋    | 24/42 [01:05<00:11,  1.57it/s]

{'Model': 'LassoLarsCV', 'R-Squared': 0.08787716580441296, 'Adjusted R-Squared': 0.08775043430167628, 'RMSE': np.float64(0.016021252372085425), 'Time taken': 0.18646788597106934}
{'Model': 'LassoLarsIC', 'R-Squared': 0.08787716580441296, 'Adjusted R-Squared': 0.08775043430167628, 'RMSE': np.float64(0.016021252372085425), 'Time taken': 0.06368350982666016}
{'Model': 'LinearRegression', 'R-Squared': 0.08787716580441296, 'Adjusted R-Squared': 0.08775043430167628, 'RMSE': np.float64(0.016021252372085425), 'Time taken': 0.04468536376953125}


 60%|█████▉    | 25/42 [01:13<00:33,  1.96s/it]

{'Model': 'LinearSVR', 'R-Squared': 0.03607532711599193, 'Adjusted R-Squared': 0.035941398200126984, 'RMSE': np.float64(0.016469914550854454), 'Time taken': 7.813507556915283}


 62%|██████▏   | 26/42 [01:15<00:32,  2.05s/it]

{'Model': 'MLPRegressor', 'R-Squared': 0.015147172809739096, 'Adjusted R-Squared': 0.015010336109587463, 'RMSE': np.float64(0.016647746939207186), 'Time taken': 2.405911922454834}


 69%|██████▉   | 29/42 [2:08:36<3:49:34, 1059.58s/it]

{'Model': 'NuSVR', 'R-Squared': 0.1767526308489613, 'Adjusted R-Squared': 0.17663824781599136, 'RMSE': np.float64(0.015220712170476276), 'Time taken': 7641.04612159729}
{'Model': 'OrthogonalMatchingPursuit', 'R-Squared': 0.06659845773343287, 'Adjusted R-Squared': 0.06646876974002525, 'RMSE': np.float64(0.016207053086853877), 'Time taken': 0.041831016540527344}
{'Model': 'OrthogonalMatchingPursuitCV', 'R-Squared': 0.08750356116427627, 'Adjusted R-Squared': 0.08737677775243735, 'RMSE': np.float64(0.01602453318098674), 'Time taken': 0.10080409049987793}


 74%|███████▍  | 31/42 [2:08:36<2:04:05, 676.86s/it] 

{'Model': 'PassiveAggressiveRegressor', 'R-Squared': -0.8603677347819745, 'Adjusted R-Squared': -0.8606262166331382, 'RMSE': np.float64(0.022880694208412336), 'Time taken': 0.07720017433166504}
{'Model': 'PoissonRegressor', 'R-Squared': 0.06831780995724479, 'Adjusted R-Squared': 0.06818836085281443, 'RMSE': np.float64(0.016192119279880894), 'Time taken': 0.031484365463256836}


 74%|███████▍  | 31/42 [12:42:38<4:30:36, 1476.08s/it]


KeyboardInterrupt: 

In [24]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# ------------------------------
# 1. Carregar Dados e Pré-processamento
# ------------------------------
data = df_clean_factor5.copy()

cols_to_drop = ["latitude_x", "longitude_y", "latitude_y", "longitude_x",
                "latitude", "longitude", "datetime", "distance",
                "distance_meters", "time", "spatial_ref", "EVI"]
data = data.drop(columns=cols_to_drop, errors="ignore")

target = "UHI Index"
X = data.drop(target, axis=1)
y = data[target]

# ------------------------------
# 2. Obter Ranking Global das Features
# ------------------------------
et_all = ExtraTreesRegressor(n_estimators=300, random_state=42)
et_all.fit(X, y)
feat_imp_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': et_all.feature_importances_
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

sorted_features = feat_imp_df['Feature'].tolist()

print("Ranked features by importance:")
print(feat_imp_df)

# ------------------------------
# 3. Seleção Forward Incremental com Exibição Imediata dos Resultados
# ------------------------------
kf = KFold(n_splits=10, shuffle=True, random_state=42)
results = []
current_subset = []  # Subconjunto atual de features

for i, feature in enumerate(sorted_features, start=1):
    current_subset.append(feature)
    print(f"\nAdded feature: {feature}. Current subset: {current_subset}")

    X_subset = X[current_subset]

    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("model", ExtraTreesRegressor(random_state=42, n_estimators=600, max_depth=150, min_samples_split=2))
    ])

    r2_scores = cross_val_score(pipeline, X_subset, y, cv=kf, scoring='r2', n_jobs=-1)
    mape_scores = np.abs(cross_val_score(pipeline, X_subset, y, cv=kf, scoring='neg_mean_absolute_percentage_error', n_jobs=-1))

    mean_r2 = np.mean(r2_scores)
    std_r2 = np.std(r2_scores)
    mean_mape = np.mean(mape_scores)
    std_mape = np.std(mape_scores)

    print(f"CV R²: {mean_r2:.4f} ± {std_r2:.4f}")
    print(f"CV MAPE: {mean_mape*100:.2f}% ± {std_mape*100:.2f}%")

    results.append({
        'Num_features': i,
        'Features': current_subset.copy(),
        'R2_mean': mean_r2,
        'R2_std': std_r2,
        'MAPE_mean': mean_mape,
        'MAPE_std': std_mape
    })

results_df = pd.DataFrame(results)
results_df.to_csv("forward_feature_selection_results.csv", index=False)

print("\nSummary of forward feature selection results:")
print(results_df)

# ------------------------------
# 4. Identificar o Melhor Conjunto (Menor MAPE)
# ------------------------------
best_idx = results_df['MAPE_mean'].idxmin()
best_result = results_df.iloc[best_idx]
print("\nBest subset based on lowest cross-validation MAPE:")
print(best_result)

joblib.dump(best_result, "best_feature_subset.pkl")
print("\nBest feature subset saved as 'best_feature_subset.pkl'")

Ranked features by importance:
   Feature  Importance
0      B01        0.22
1      B05        0.09
2      B12        0.09
3      B07        0.08
4      B8A        0.08
5      B11        0.08
6      B06        0.08
7     SAVI        0.04
8     NDVI        0.04
9    MNDWI        0.04
10     B02        0.04
11     B03        0.04
12    NDBI        0.03
13     B04        0.03
14     B08        0.03

Added feature: B01. Current subset: ['B01']
CV R²: 0.3824 ± 0.0053
CV MAPE: 1.01% ± 0.01%

Added feature: B05. Current subset: ['B01', 'B05']
CV R²: 0.9506 ± 0.0025
CV MAPE: 0.16% ± 0.00%

Added feature: B12. Current subset: ['B01', 'B05', 'B12']
CV R²: 0.9638 ± 0.0016
CV MAPE: 0.14% ± 0.00%

Added feature: B07. Current subset: ['B01', 'B05', 'B12', 'B07']
CV R²: 0.9644 ± 0.0014
CV MAPE: 0.14% ± 0.00%

Added feature: B8A. Current subset: ['B01', 'B05', 'B12', 'B07', 'B8A']
CV R²: 0.9645 ± 0.0013
CV MAPE: 0.14% ± 0.00%

Added feature: B11. Current subset: ['B01', 'B05', 'B12', 'B07', 'B8A', 'B1

In [25]:
import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer, mean_absolute_percentage_error

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# ------------------------------
# Assume previous forward selection results are saved in results_df
# results_df should have columns: 'Num_features', 'Features', 'R2_mean', 'R2_std', 'MAPE_mean', 'MAPE_std'
# ------------------------------
# For demonstration, let's load results_df from CSV (if previously saved)
results_df = pd.read_csv("forward_feature_selection_results.csv")

# Convert the string representation of list back to a list (if necessary)
# Here we assume 'Features' column is stored as a string representation of list
import ast
results_df['Features'] = results_df['Features'].apply(ast.literal_eval)

# Get the top 3 best feature subsets based on lowest MAPE_mean
top3 = results_df.nsmallest(3, 'MAPE_mean')
print("Top 3 best feature subsets based on CV MAPE:")
print(top3[['Num_features', 'Features', 'MAPE_mean']])

# Define cross-validation scheme
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring: use negative MAPE so that higher is better in GridSearchCV
scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Define parameter grids for LGBMRegressor and XGBRegressor
param_grid_lgbm = {
    'model__n_estimators': [100, 300, 600],
    'model__max_depth': [None, 10, 20],
    'model__learning_rate': [0.01, 0.05, 0.1]
}

param_grid_xgb = {
    'model__n_estimators': [100, 300, 600],
    'model__max_depth': [3, 6, 10],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__subsample': [0.8, 1.0]
}

# Prepare to store the best model info
best_overall_score = np.inf  # Lower MAPE is better
best_pipeline = None
best_features = None
best_model_name = None
results_list = []  # To log all tuning experiments

# Loop over each of the top 3 feature subsets
for idx, row in top3.iterrows():
    features_subset = row['Features']
    print(f"\nTuning for feature subset: {features_subset}")

    # Subset X for current feature set
    X_subset = X[features_subset]

    # Define common pipeline structure: scaling then model
    for model_name, regressor, param_grid in [
        ('LGBMRegressor', LGBMRegressor(random_state=42), param_grid_lgbm),
        ('XGBRegressor', XGBRegressor(random_state=42, objective='reg:squarederror'), param_grid_xgb)
    ]:
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("model", regressor)
        ])

        print(f"  Tuning {model_name}...")
        grid_search = GridSearchCV(
            pipeline,
            param_grid=param_grid,
            cv=kf,
            scoring=scorer,
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X_subset, y)

        best_score = -grid_search.best_score_  # Convert back to positive MAPE
        print(f"    Best {model_name} MAPE: {best_score:.4f} (params: {grid_search.best_params_})")

        # Save details
        results_list.append({
            'Model': model_name,
            'Features': features_subset,
            'Num_features': len(features_subset),
            'Best_MAPE': best_score,
            'Best_Params': grid_search.best_params_
        })

        # Check if this is the best overall model
        if best_score < best_overall_score:
            best_overall_score = best_score
            best_pipeline = grid_search.best_estimator_
            best_features = features_subset
            best_model_name = model_name

# Save the tuning experiments results as a DataFrame
tuning_results_df = pd.DataFrame(results_list)
tuning_results_df.to_csv("tuning_results.csv", index=False)
print("\nTuning experiments results:")
print(tuning_results_df)

# Save the best overall model
joblib.dump(best_pipeline, "best_model_pipeline.pkl")
print("\nBest overall model saved!")
print(f"Model: {best_model_name}")
print(f"Features: {best_features}")
print(f"Best MAPE: {best_overall_score:.4f}")


Top 3 best feature subsets based on CV MAPE:
   Num_features                             Features  MAPE_mean
5             6       [B01, B05, B12, B07, B8A, B11]       0.00
6             7  [B01, B05, B12, B07, B8A, B11, B06]       0.00
3             4                 [B01, B05, B12, B07]       0.00

Tuning for feature subset: ['B01', 'B05', 'B12', 'B07', 'B8A', 'B11']
  Tuning LGBMRegressor...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003703 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 251941, number of used features: 6
[LightGBM] [Info] Start training from score 0.998999
    Best LGBMRegressor MAPE: 0.0106 (params: {'model__learning_rate': 0.1, 'model__max_depth': 20, 'model__n_estimators': 600})
  Tuning XGBRegressor...
Fitting 5 folds for each of 54 candidates, to

In [33]:
final_copy_val_data = copy_val_data[best_features]
pred_vals_fin = best_pipeline.predict(final_copy_val_data)

In [34]:
data_to_send_fin = pd.DataFrame()
data_to_send_fin['UHI Index'] = pred_vals_fin
data_to_send_fin['Latitude'] = val_data_with_features['latitude']
data_to_send_fin['Longitude'] = val_data_with_features['longitude']

data_to_send_fin = data_to_send_fin[['Longitude', 'Latitude', 'UHI Index']]
data_to_send_fin.to_csv('../outputs/xgb_predicted_values.csv', index=False)

In [35]:
tuning_results_df

Unnamed: 0,Model,Features,Num_features,Best_MAPE,Best_Params
0,LGBMRegressor,"[B01, B05, B12, B07, B8A, B11]",6,0.01,"{'model__learning_rate': 0.1, 'model__max_dept..."
1,XGBRegressor,"[B01, B05, B12, B07, B8A, B11]",6,0.0,"{'model__learning_rate': 0.1, 'model__max_dept..."
2,LGBMRegressor,"[B01, B05, B12, B07, B8A, B11, B06]",7,0.01,"{'model__learning_rate': 0.1, 'model__max_dept..."
3,XGBRegressor,"[B01, B05, B12, B07, B8A, B11, B06]",7,0.0,"{'model__learning_rate': 0.1, 'model__max_dept..."
4,LGBMRegressor,"[B01, B05, B12, B07]",4,0.01,"{'model__learning_rate': 0.1, 'model__max_dept..."
5,XGBRegressor,"[B01, B05, B12, B07]",4,0.01,"{'model__learning_rate': 0.1, 'model__max_dept..."


In [40]:
import numpy as np
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from xgboost import XGBRegressor

# Assume X and y are already defined from your preprocessed data.
# Also assume best_features is defined from your previous tuning, e.g.:
best_features = ['B01', 'B05', 'B12', 'B07', 'B8A', 'B11', 'B06']

# Subset the data to the best features
X_best = X[best_features]

# Define cross-validation scheme and scoring metrics
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Negative MAPE scorer so that higher values indicate lower error
scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Define candidate parameter sets (including options with reduced n_estimators)
candidate_params = [
    {'model__learning_rate': 0.05, 'model__max_depth': 10, 'model__n_estimators': 300, 'model__subsample': 0.8},
    {'model__learning_rate': 0.1,  'model__max_depth': 10, 'model__n_estimators': 600, 'model__subsample': 0.8, 'model__reg_lambda': 1},
    {'model__learning_rate': 0.1,  'model__max_depth': 10, 'model__n_estimators': 600, 'model__subsample': 0.8, 'model__reg_alpha': 0.1},
    {'model__learning_rate': 0.1,  'model__max_depth': 8,  'model__n_estimators': 300, 'model__subsample': 0.8},
    {'model__learning_rate': 0.1,  'model__max_depth': 10, 'model__n_estimators': 300, 'model__subsample': 0.8, 'model__colsample_bytree': 0.8},
    {'model__learning_rate': 0.1,  'model__max_depth': 10, 'model__n_estimators': 600, 'model__subsample': 0.7},
    {'model__learning_rate': 0.05, 'model__max_depth': 10, 'model__n_estimators': 400, 'model__subsample': 0.8},
    {'model__learning_rate': 0.05, 'model__max_depth': 8,  'model__n_estimators': 300, 'model__subsample': 0.8, 'model__reg_lambda': 1, 'model__reg_alpha': 0.1},
    {'model__learning_rate': 0.1,  'model__max_depth': 8,  'model__n_estimators': 300, 'model__subsample': 0.7, 'model__colsample_bytree': 0.8},
    {'model__learning_rate': 0.05, 'model__max_depth': 8,  'model__n_estimators': 200, 'model__subsample': 0.7, 'model__reg_lambda': 1, 'model__colsample_bytree': 0.8}
]

# List to store results from all candidate experiments
overfitting_results = []

print("\nRunning experiments to reduce overfitting (including reducing n_estimators):")
# Loop over each candidate parameter set
for i, params in enumerate(candidate_params, start=1):
    print(f"\nExperiment {i} with parameters: {params}")
    # Build a pipeline with StandardScaler and XGBRegressor
    pipeline = Pipeline([
        ("scaler", RobustScaler()),
        ("model", XGBRegressor(random_state=42, objective='reg:squarederror'))
    ])
    # Set candidate parameters in the pipeline
    pipeline.set_params(**params)

    # Evaluate using cross-validation: MAPE and R²
    mape_scores = cross_val_score(pipeline, X_best, y, cv=kf, scoring=scorer, n_jobs=-1)
    cv_mape = -np.mean(mape_scores)  # Convert negative MAPE to positive value
    mape_std = np.std(-mape_scores)

    r2_scores = cross_val_score(pipeline, X_best, y, cv=kf, scoring='r2', n_jobs=-1)
    cv_r2 = np.mean(r2_scores)
    r2_std = np.std(r2_scores)

    print(f"CV MAPE: {cv_mape:.4f} (std: {mape_std:.4f}) | CV R²: {cv_r2:.4f} (std: {r2_std:.4f})")

    overfitting_results.append({
        'Experiment': i,
        'Parameters': params,
        'CV_MAPE_mean': cv_mape,
        'CV_MAPE_std': mape_std,
        'CV_R2_mean': cv_r2,
        'CV_R2_std': r2_std
    })

# Convert experiment results to a DataFrame and sort by CV_MAPE_mean
overfitting_df = pd.DataFrame(overfitting_results).sort_values(by='CV_MAPE_mean')
print("\nOverfitting Reduction Experiments Results:")
print(overfitting_df)

# Save the full experiments DataFrame to a CSV file
overfitting_df.to_csv("overfitting_reduction_experiments.csv", index=False)

# Select the best candidate experiment (lowest CV_MAPE_mean)
best_candidate = overfitting_df.iloc[0]
print("\nBest candidate experiment:")
print(best_candidate)

# Re-fit the best candidate pipeline on the entire training data using the best features
best_params_candidate = best_candidate['Parameters']
final_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", XGBRegressor(random_state=42, objective='reg:squarederror'))
])
final_pipeline.set_params(**best_params_candidate)
final_pipeline.fit(X_best, y)

# Save the final refined model
joblib.dump(final_pipeline, "best_model_overfitting_refined.pkl")
print("\nFinal refined model saved!")
print(f"Model: XGBRegressor")
print(f"Features: {best_features}")
print(f"Best CV MAPE after overfitting reduction experiments: {best_candidate['CV_MAPE_mean']:.4f}")
print(f"Best CV R²: {best_candidate['CV_R2_mean']:.4f}")



Running experiments to reduce overfitting (including reducing n_estimators):

Experiment 1 with parameters: {'model__learning_rate': 0.05, 'model__max_depth': 10, 'model__n_estimators': 300, 'model__subsample': 0.8}
CV MAPE: 0.0086 (std: 0.0001) | CV R²: 0.5675 (std: 0.0042)

Experiment 2 with parameters: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__n_estimators': 600, 'model__subsample': 0.8, 'model__reg_lambda': 1}
CV MAPE: 0.0044 (std: 0.0000) | CV R²: 0.8679 (std: 0.0018)

Experiment 3 with parameters: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__n_estimators': 600, 'model__subsample': 0.8, 'model__reg_alpha': 0.1}
CV MAPE: 0.0058 (std: 0.0000) | CV R²: 0.7853 (std: 0.0026)

Experiment 4 with parameters: {'model__learning_rate': 0.1, 'model__max_depth': 8, 'model__n_estimators': 300, 'model__subsample': 0.8}
CV MAPE: 0.0091 (std: 0.0001) | CV R²: 0.5215 (std: 0.0018)

Experiment 5 with parameters: {'model__learning_rate': 0.1, 'model__max_depth': 