# Importação das Bibliotecas

In [1]:
import boto3
import io
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
import pandas as pd
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
import sagemaker
from sagemaker.sklearn.model import SKLearnModel
from joblib import load
from joblib import dump
from joblib import load


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


# Coleta dos dados

In [2]:
# Criação do client do S3 
s3 = boto3.client('s3')

bucket = "bucket-518893645065"
prefix_raw = "weather_data/raw/"
file_historico = "weather_data/history/historical_data.csv"

In [3]:
# Busca e baixa os arquivos salvos diariamente a partir da api da OpenWeather
response = s3.list_objects_v2( 
    Bucket=bucket,
    Prefix=prefix_raw)

arquivos_raw = [ obj["Key"]  for obj in response.get("Contents", []) if obj["Key"].endswith(".parquet")] 

dfs = []
for key in arquivos_raw:
    buffer = io.BytesIO()
    s3.download_fileobj(bucket, key, buffer)
    buffer.seek(0)
    df = pq.read_table(buffer).to_pandas()
    dfs.append(df)

df_raw = pd.concat(dfs,ignore_index=True)

In [4]:
# Busca e baixa o arquivo com os dados historicos da api da OpenWeather
obj = s3.get_object(Bucket=bucket, Key=file_historico)
df_hist = pd.read_csv(io.BytesIO(obj['Body'].read()))

# print(df_hist)

# Dados Historicos

## Análise dos Dados

In [5]:
df_hist.head(2)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,283996800,1979-01-01 00:00:00 +0000 UTC,-10800,Casa,-23.527811,-46.656732,23.55,,20.1,24.08,...,,,,,,100,804,Clouds,overcast clouds,04n
1,284000400,1979-01-01 01:00:00 +0000 UTC,-10800,Casa,-23.527811,-46.656732,23.65,,20.2,24.19,...,,,,,,100,804,Clouds,overcast clouds,04n


In [6]:
df_hist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419726 entries, 0 to 419725
Data columns (total 28 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   dt                   419726 non-null  int64  
 1   dt_iso               419726 non-null  object 
 2   timezone             419726 non-null  int64  
 3   city_name            419726 non-null  object 
 4   lat                  419726 non-null  float64
 5   lon                  419726 non-null  float64
 6   temp                 419726 non-null  float64
 7   visibility           270086 non-null  float64
 8   dew_point            419726 non-null  float64
 9   feels_like           419726 non-null  float64
 10  temp_min             419726 non-null  float64
 11  temp_max             419726 non-null  float64
 12  pressure             419726 non-null  int64  
 13  sea_level            0 non-null       float64
 14  grnd_level           0 non-null       float64
 15  humidity         

In [7]:
# Verifica as colunas presentes em cada um dos dataframes 
columns_raw = df_raw.columns
columns_hist = df_hist.columns

# Verifica quais colunas estão presentes em ambos
igual_hist = [c for c in columns_hist if c in columns_raw]
diferentes_hist = [c for c in columns_hist if c not in columns_raw]

print("### HIST ###")
print("Iguais:", igual_hist)
print("Diferentes:", diferentes_hist)

igual_raw = [c for c in columns_raw if c in columns_hist]
diferentes_raw = [c for c in columns_raw if c not in columns_hist ]

print("### RAW ###")
print("Iguais:", igual_raw)
print("Diferentes:", diferentes_raw)

### HIST ###
Iguais: ['temp', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_id', 'weather_main']
Diferentes: ['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'visibility', 'dew_point', 'feels_like', 'sea_level', 'grnd_level', 'wind_gust', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h', 'weather_description', 'weather_icon']
### RAW ###
Iguais: ['weather_id', 'weather_main', 'temp', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all']
Diferentes: ['temp_feels_like', 'year', 'month', 'day', 'hour', 'day_duration']


## Tratamento dos Dados

### Renomeia a coluna fells_fike para temp_fell_like

In [8]:
df_hist = df_hist.rename(columns={"feels_like": "temp_feels_like"})
df_hist.columns

Index(['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'temp',
       'visibility', 'dew_point', 'temp_feels_like', 'temp_min', 'temp_max',
       'pressure', 'sea_level', 'grnd_level', 'humidity', 'wind_speed',
       'wind_deg', 'wind_gust', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h',
       'clouds_all', 'weather_id', 'weather_main', 'weather_description',
       'weather_icon'],
      dtype='object')

In [9]:
# Verifica as colunas presentes em cada um dos dataframes 
columns_raw = df_raw.columns
columns_hist = df_hist.columns

# Verifica quais colunas estão presentes em ambos
igual_hist = [c for c in columns_hist if c in columns_raw]
diferentes_hist = [c for c in columns_hist if c not in columns_raw]

print("### HIST ###")
print("Iguais:", igual_hist)
print("Diferentes:", diferentes_hist)

igual_raw = [c for c in columns_raw if c in columns_hist]
diferentes_raw = [c for c in columns_raw if c not in columns_hist ]

print("### RAW ###")
print("Iguais:", igual_raw)
print("Diferentes:", diferentes_raw)

### HIST ###
Iguais: ['temp', 'temp_feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_id', 'weather_main']
Diferentes: ['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'visibility', 'dew_point', 'sea_level', 'grnd_level', 'wind_gust', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h', 'weather_description', 'weather_icon']
### RAW ###
Iguais: ['weather_id', 'weather_main', 'temp', 'temp_feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all']
Diferentes: ['year', 'month', 'day', 'hour', 'day_duration']


### Retira as colunas que não serão utilizadas

In [10]:
# Adiciona a coluna de data
igual_hist.append("dt_iso") 
print("Iguais:", igual_hist)

# Mantém apenas as colunas comuns aos dois + a coluna de data
df_hist = df_hist[igual_hist]
print(f"Colunas atuais em df_hist: {df_hist.columns}")


Iguais: ['temp', 'temp_feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_id', 'weather_main', 'dt_iso']
Colunas atuais em df_hist: Index(['temp', 'temp_feels_like', 'temp_min', 'temp_max', 'pressure',
       'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_id',
       'weather_main', 'dt_iso'],
      dtype='object')


In [11]:
df_hist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419726 entries, 0 to 419725
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   temp             419726 non-null  float64
 1   temp_feels_like  419726 non-null  float64
 2   temp_min         419726 non-null  float64
 3   temp_max         419726 non-null  float64
 4   pressure         419726 non-null  int64  
 5   humidity         419726 non-null  int64  
 6   wind_speed       419726 non-null  float64
 7   wind_deg         419726 non-null  int64  
 8   clouds_all       419726 non-null  int64  
 9   weather_id       419726 non-null  int64  
 10  weather_main     419726 non-null  object 
 11  dt_iso           419726 non-null  object 
dtypes: float64(5), int64(5), object(2)
memory usage: 38.4+ MB


### Criação das colunas year, month, day e hour

In [12]:
# Converte a coluna dt_iso para datetime
df_hist["dt_iso"] = pd.to_datetime(
    df_hist["dt_iso"], 
    format="%Y-%m-%d %H:%M:%S %z UTC", 
    errors="coerce"
)

In [13]:
df_hist["year"] = df_hist["dt_iso"].dt.year
df_hist["month"] = df_hist["dt_iso"].dt.month
df_hist["day"] = df_hist["dt_iso"].dt.day
df_hist["hour"] = df_hist["dt_iso"].dt.hour

df_hist.head(1)

Unnamed: 0,temp,temp_feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,dt_iso,year,month,day,hour
0,23.55,24.08,23.02,24.66,1011,81,1.28,9,100,804,Clouds,1979-01-01 00:00:00+00:00,1979,1,1,0


In [14]:
# df_hist.drop(columns=["dt_iso"], inplace=True)
df_hist.drop(columns=["dt_iso"], inplace=True)

In [15]:
df_hist.info()
df_hist.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419726 entries, 0 to 419725
Data columns (total 15 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   temp             419726 non-null  float64
 1   temp_feels_like  419726 non-null  float64
 2   temp_min         419726 non-null  float64
 3   temp_max         419726 non-null  float64
 4   pressure         419726 non-null  int64  
 5   humidity         419726 non-null  int64  
 6   wind_speed       419726 non-null  float64
 7   wind_deg         419726 non-null  int64  
 8   clouds_all       419726 non-null  int64  
 9   weather_id       419726 non-null  int64  
 10  weather_main     419726 non-null  object 
 11  year             419726 non-null  int32  
 12  month            419726 non-null  int32  
 13  day              419726 non-null  int32  
 14  hour             419726 non-null  int32  
dtypes: float64(5), int32(4), int64(5), object(1)
memory usage: 41.6+ MB


Unnamed: 0,temp,temp_feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,year,month,day,hour
0,23.55,24.08,23.02,24.66,1011,81,1.28,9,100,804,Clouds,1979,1,1,0


# Dados Atuais

## Análise dos dados

In [16]:
df_raw.info()
df_raw.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   weather_id       241 non-null    int64  
 1   weather_main     241 non-null    object 
 2   temp             241 non-null    float64
 3   temp_feels_like  241 non-null    float64
 4   temp_min         241 non-null    float64
 5   temp_max         241 non-null    float64
 6   pressure         241 non-null    int64  
 7   humidity         241 non-null    int64  
 8   wind_speed       241 non-null    float64
 9   wind_deg         241 non-null    int64  
 10  clouds_all       241 non-null    int64  
 11  year             241 non-null    int64  
 12  month            241 non-null    int64  
 13  day              241 non-null    int64  
 14  hour             241 non-null    int64  
 15  day_duration     241 non-null    int64  
dtypes: float64(5), int64(10), object(1)
memory usage: 30.3+ KB


Unnamed: 0,weather_id,weather_main,temp,temp_feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,year,month,day,hour,day_duration
0,800,Clear,24.81,24.39,24.64,25.96,1019,40,1.54,120,0,2025,10,1,0,44246
1,800,Clear,24.18,23.7,23.9,24.64,1019,40,3.09,90,0,2025,10,1,1,44246


## Tratamento dos dados

### Retira a coluna day_duration

In [17]:
df_raw.drop(columns=["day_duration"], inplace=True)
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   weather_id       241 non-null    int64  
 1   weather_main     241 non-null    object 
 2   temp             241 non-null    float64
 3   temp_feels_like  241 non-null    float64
 4   temp_min         241 non-null    float64
 5   temp_max         241 non-null    float64
 6   pressure         241 non-null    int64  
 7   humidity         241 non-null    int64  
 8   wind_speed       241 non-null    float64
 9   wind_deg         241 non-null    int64  
 10  clouds_all       241 non-null    int64  
 11  year             241 non-null    int64  
 12  month            241 non-null    int64  
 13  day              241 non-null    int64  
 14  hour             241 non-null    int64  
dtypes: float64(5), int64(9), object(1)
memory usage: 28.4+ KB


In [18]:
# Verifica as colunas presentes em cada um dos dataframes 
columns_raw = df_raw.columns
columns_hist = df_hist.columns

# Verifica quais colunas estão presentes em ambos
igual = [c for c in columns_hist if c in columns_raw]
diferentes = [c for c in columns_hist if c not in columns_raw]

print("Iguais:", igual)
print("Diferentes:", diferentes)

igual = [c for c in columns_raw if c in columns_hist]
diferentes = [c for c in columns_raw if c not in columns_hist ]

print("Iguais:", igual)
print("Diferentes:", diferentes)


Iguais: ['temp', 'temp_feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_id', 'weather_main', 'year', 'month', 'day', 'hour']
Diferentes: []
Iguais: ['weather_id', 'weather_main', 'temp', 'temp_feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'year', 'month', 'day', 'hour']
Diferentes: []


In [19]:
print(columns_hist, columns_raw)

Index(['temp', 'temp_feels_like', 'temp_min', 'temp_max', 'pressure',
       'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_id',
       'weather_main', 'year', 'month', 'day', 'hour'],
      dtype='object') Index(['weather_id', 'weather_main', 'temp', 'temp_feels_like', 'temp_min',
       'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg',
       'clouds_all', 'year', 'month', 'day', 'hour'],
      dtype='object')


### Concatenação dos dataframes

In [20]:
df_all = pd.concat([df_hist, df_raw], ignore_index=True)
print(df_all.shape)      
print(df_all.columns)        
df_all.head()              



(419967, 15)
Index(['temp', 'temp_feels_like', 'temp_min', 'temp_max', 'pressure',
       'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_id',
       'weather_main', 'year', 'month', 'day', 'hour'],
      dtype='object')


Unnamed: 0,temp,temp_feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,year,month,day,hour
0,23.55,24.08,23.02,24.66,1011,81,1.28,9,100,804,Clouds,1979,1,1,0
1,23.65,24.19,22.98,24.67,1011,81,1.32,9,100,804,Clouds,1979,1,1,1
2,23.53,24.13,22.68,24.54,1012,84,1.35,2,100,804,Clouds,1979,1,1,2
3,20.61,21.08,20.0,21.74,1012,90,1.1,308,100,804,Clouds,1979,1,1,3
4,20.47,20.97,19.57,21.51,1012,92,1.6,215,100,500,Rain,1979,1,1,4


In [21]:
# Ordena os valores

In [22]:
df_all = df_all.sort_values(by=["year", "month", "day", "hour"]).reset_index(drop=True)



In [23]:
df_all.head(5)

Unnamed: 0,temp,temp_feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,year,month,day,hour
0,23.55,24.08,23.02,24.66,1011,81,1.28,9,100,804,Clouds,1979,1,1,0
1,23.65,24.19,22.98,24.67,1011,81,1.32,9,100,804,Clouds,1979,1,1,1
2,23.53,24.13,22.68,24.54,1012,84,1.35,2,100,804,Clouds,1979,1,1,2
3,20.61,21.08,20.0,21.74,1012,90,1.1,308,100,804,Clouds,1979,1,1,3
4,20.47,20.97,19.57,21.51,1012,92,1.6,215,100,500,Rain,1979,1,1,4


# Análise dos Dados

In [24]:
df_all.describe()

Unnamed: 0,temp,temp_feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,year,month,day,hour
count,419967.0,419967.0,419967.0,419967.0,419967.0,419967.0,419967.0,419967.0,419967.0,419967.0,419967.0,419967.0,419967.0,419967.0
mean,20.158817,20.279712,19.430135,20.850687,1016.435458,77.389292,3.162139,159.997581,56.58811,724.841457,2002.073822,6.496279,15.727088,11.546602
std,4.502631,4.784747,4.502127,4.55402,4.39958,16.07517,1.76521,101.879027,38.896072,131.76675,13.465246,3.453002,8.797023,6.939614
min,0.88,-1.54,0.88,2.18,1001.0,11.0,0.0,0.0,0.0,200.0,1979.0,1.0,1.0,0.0
25%,17.01,16.99,16.27,17.64,1013.0,68.0,2.06,90.0,20.0,701.0,1990.0,3.0,8.0,6.0
50%,20.06,20.3,19.35,20.77,1016.0,82.0,3.09,141.0,75.0,800.0,2002.0,7.0,16.0,12.0
75%,23.0,23.27,22.18,23.76,1019.0,89.0,4.12,206.0,98.0,803.0,2014.0,9.0,23.0,18.0
max,38.32,40.33,38.03,38.9,1034.0,100.0,30.8,360.0,100.0,804.0,2025.0,12.0,31.0,23.0


In [25]:
df_all.isnull().sum()

temp               0
temp_feels_like    0
temp_min           0
temp_max           0
pressure           0
humidity           0
wind_speed         0
wind_deg           0
clouds_all         0
weather_id         0
weather_main       0
year               0
month              0
day                0
hour               0
dtype: int64

In [26]:
df_all["weather_main"].value_counts()

weather_main
Clouds          195267
Clear            83375
Rain             72629
Mist             42137
Haze             16433
Drizzle           5318
Thunderstorm      3684
Fog                920
Smoke              182
Tornado             10
Dust                 9
Squall               3
Name: count, dtype: int64

In [27]:
# Criando o mapeamento
weather_map = df_all.groupby('weather_main')['weather_id'].first().to_dict()
print(weather_map)


{'Clear': 800, 'Clouds': 804, 'Drizzle': 300, 'Dust': 761, 'Fog': 741, 'Haze': 721, 'Mist': 701, 'Rain': 500, 'Smoke': 711, 'Squall': 771, 'Thunderstorm': 201, 'Tornado': 781}


# Feature Enginnering

### Adiciona as colunas relacionas a horarios especificos (11h, 16h e 21h)

In [28]:
# Filtra horas específicas
df_6h = df_all[df_all["hour"] == 6]
df_11h = df_all[df_all["hour"] == 11]
df_16h = df_all[df_all["hour"] == 16]
df_21h = df_all[df_all["hour"] == 21]

# Selecionar apenas colunas necessárias
features_6h = ["year", "month", "day", "temp", "humidity", "pressure",
               "clouds_all", "wind_speed", "wind_deg",
               "temp_min", "temp_max", "temp_feels_like"]

df_6h = df_6h[features_6h]
df_11h = df_11h[["year", "month", "day", "temp"]]
df_11h = df_11h.rename(columns={"temp": "temp_11h"})
df_16h = df_16h[["year", "month", "day", "temp"]]
df_16h = df_16h.rename(columns={"temp": "temp_16h"})
df_21h = df_21h[["year", "month", "day", "temp"]]
df_21h = df_21h.rename(columns={"temp": "temp_21h"})


# Merge pelo dia
df_model = df_6h.merge(df_11h, on=["year", "month", "day"])
df_model = df_model.merge(df_16h, on=["year", "month", "day"])
df_model = df_model.merge(df_21h, on=["year", "month", "day"])


In [29]:
df_model.info()
df_model.head(1)
df_model = df_model.dropna()
df_model.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18991 entries, 0 to 18990
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             18991 non-null  int64  
 1   month            18991 non-null  int64  
 2   day              18991 non-null  int64  
 3   temp             18991 non-null  float64
 4   humidity         18991 non-null  int64  
 5   pressure         18991 non-null  int64  
 6   clouds_all       18991 non-null  int64  
 7   wind_speed       18991 non-null  float64
 8   wind_deg         18991 non-null  int64  
 9   temp_min         18991 non-null  float64
 10  temp_max         18991 non-null  float64
 11  temp_feels_like  18991 non-null  float64
 12  temp_11h         18991 non-null  float64
 13  temp_16h         18991 non-null  float64
 14  temp_21h         18991 non-null  float64
dtypes: float64(8), int64(7)
memory usage: 2.2 MB


Unnamed: 0,year,month,day,temp,humidity,pressure,clouds_all,wind_speed,wind_deg,temp_min,temp_max,temp_feels_like,temp_11h,temp_16h,temp_21h
count,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0
mean,2002.707756,6.484545,15.70465,17.482156,87.853931,1016.003949,59.104997,2.327881,127.86462,16.755738,18.167592,17.567089,19.185532,24.063725,20.878141
std,13.340298,3.478144,8.770998,3.301751,8.618082,4.332973,40.279382,1.538949,100.672289,3.311883,3.310247,3.629733,3.762267,4.477194,3.951282
min,1979.0,1.0,1.0,3.37,30.0,1002.0,0.0,0.0,0.0,2.06,4.26,0.08,4.87,8.77,7.05
25%,1991.0,3.0,8.0,15.295,84.0,1013.0,20.0,1.5,53.0,14.48,15.96,15.185,16.52,21.01,18.22
50%,2004.0,7.0,16.0,17.76,89.0,1016.0,75.0,2.11,120.0,17.07,18.4,17.9,19.51,24.49,21.0
75%,2014.0,10.0,23.0,20.12,94.0,1019.0,100.0,3.1,170.0,19.36,20.72,20.48,22.01,27.4,23.52
max,2025.0,12.0,31.0,27.62,100.0,1032.0,100.0,27.8,360.0,27.06,29.29,28.25,29.78,36.77,35.22


# Treinamento

In [30]:
X = df_model[["temp", "humidity", "pressure", "clouds_all", "wind_speed", "wind_deg", "temp_min", "temp_max", "temp_feels_like"]]
y = df_model[["temp_11h", "temp_16h", "temp_21h"]]
X_train, X_test, y_train, y_test, = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Treino do modelo de regressão

In [31]:
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

xgb_reg = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model_temp = MultiOutputRegressor(xgb_reg)
model_temp.fit(X_train, y_train)


0,1,2
,estimator,"XGBRegressor(...state=42, ...)"
,n_jobs,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


# Avaliação

In [32]:
y_pred = model_temp.predict(X_test)

# Erro médio absoluto por target
mae_11h = mean_absolute_error(y_test["temp_11h"], y_pred[:, 0])
mae_16h = mean_absolute_error(y_test["temp_16h"], y_pred[:, 1])
mae_21h = mean_absolute_error(y_test["temp_21h"], y_pred[:, 2])

r2_11h = r2_score(y_test["temp_11h"], y_pred[:, 0])
r2_16h = r2_score(y_test["temp_16h"], y_pred[:, 1])
r2_21h = r2_score(y_test["temp_21h"], y_pred[:, 2])

print(f"MAE 11h: {mae_11h:.3f}, R²: {r2_11h:.3f}")
print(f"MAE 16h: {mae_16h:.3f}, R²: {r2_16h:.3f}")
print(f"MAE 21h: {mae_21h:.3f}, R²: {r2_21h:.3f}")


MAE 11h: 0.958, R²: 0.891
MAE 16h: 2.042, R²: 0.652
MAE 21h: 1.888, R²: 0.607


# Deploy

In [35]:
# from xgboost import XGBRegressor
# from sagemaker.xgboost.model import XGBoostModel
import sagemaker
from sagemaker.sklearn.model import SKLearnModel
import joblib, tarfile

joblib.dump(model_temp, "model.pkl")

with tarfile.open("model.tar.gz", "w:gz") as tar:
  tar.add("model.pkl")

model_path = "xgboost-sklearn/model.tar.gz" 
s3.upload_file("model.tar.gz", bucket_name, model_path)

role = sagemaker.get_execution_role()
session = sagemaker.Session()

model = SKLearnModel(
    model_data=f"s3://{bucket}/xgboost-sklearn/model.tar.gz",
    role=role,
    entry_point="inference.py",
    framework_version="1.2-1",
    py_version="py3",
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
    endpoint_name="multioutput-xgb-endpoint"
)
