In [None]:
import pandas as pd
import numpy as np
import holidays
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import plotly.express as px

## Feature Engineering

In [None]:
# ===========================================
# Settings and data
# ===========================================
# Shared random seed for the estimators trained in this notebook.
SEED = 42
# Brazilian national holidays for 2024 and 2025.
br_holidays = holidays.Brazil(years=[2024, 2025])
# National plus São Paulo state holidays for the same years.
# `subdiv` is the supported keyword for the subdivision code; the older `prov`
# keyword is deprecated in the `holidays` package.
sp_holidays = holidays.Brazil(years=[2024, 2025], subdiv='SP')

In [None]:
# Colab-only step: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the transformed dataset and parse its "Data" column as datetimes.
# NOTE(review): the path is relative to the working directory, and the recorded
# run of this cell failed with FileNotFoundError. Presumably the CSV lives
# somewhere under the mounted /content/drive — confirm and point the path there.
df_transformada = pd.read_csv("dados_transformados.csv")
#df_transformada = pd.read_csv("cardapio_final (1).csv")
df_transformada["Data"] = pd.to_datetime(df_transformada["Data"])

FileNotFoundError: [Errno 2] No such file or directory: 'dados_transformados.csv'

In [None]:
# Preview the first rows of the loaded dataset.
df_transformada.head()

In [None]:
# Load the menu table; its "cardapio_trans" and "Ferias" columns are joined
# onto the main dataset by date in the merge cell.
df_categorizada = pd.read_csv("cardapio_final.csv")
# Parse this frame's OWN "Data" column. Assigning the parsed dates of
# df_transformada here would overwrite the menu dates by index alignment and
# make the later date-based merge pair unrelated rows.
df_categorizada["Data"] = pd.to_datetime(df_categorizada["Data"])
df_categorizada.head()

In [None]:
# Left join on "Data": every row of df_transformada is kept and receives the
# "cardapio_trans" and "Ferias" values recorded for the same date.
# NOTE(review): if a date appears more than once in df_categorizada the join
# repeats the matching rows of df_transformada — confirm dates are unique there.
df = pd.merge(
    left=df_transformada,
    right=df_categorizada.loc[:, ["Data", "cardapio_trans", "Ferias"]],
    how="left",
    on="Data",
)

In [None]:
# ===========================================
# Feature Engineering
# ===========================================
# The lag features below are positional (shift by N rows), so the rows must be
# in chronological order first. The stable sort keeps the current order of
# rows that share a date and is a no-op when the data is already sorted.
df = df.sort_values("Data", kind="stable").reset_index(drop=True)

# Categorical encoding: each column is cast to str and label-encoded in place.
# The fitted encoders are kept in le_dict so the same label -> code mapping can
# be reused when building a prediction row.
# NOTE(review): because the columns are overwritten, re-running this cell
# without re-running the merge cell encodes the integer codes again and
# le_dict no longer holds the original labels.
categorical_cols = ["refeicao", "Dia_Semana", "cardapio_trans", "Ferias"]
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    le_dict[col] = le

# Calendar features
df["is_weekend"] = (df["Data"].dt.weekday >= 5).astype(int)

# Holiday dates as Timestamps so they compare directly with the "Data" column.
sp_holidays_ts = [pd.Timestamp(f) for f in sp_holidays]
df["feriado"] = df["Data"].isin(sp_holidays_ts).astype(int)


def holiday_week(x):
    """Return 1 if the Monday-to-Sunday week containing `x` has a holiday, else 0."""
    start_week = x - pd.Timedelta(days=x.weekday())  # Monday
    end_week = start_week + pd.Timedelta(days=6)     # Sunday
    return int(any(start_week <= f <= end_week for f in sp_holidays_ts))


df["is_holiday_week"] = df["Data"].apply(holiday_week)
# Absolute distance, in days, to the nearest holiday (before or after the date).
df["days_to_holiday"] = df["Data"].apply(
    lambda x: min(abs((x - f).days) for f in sp_holidays_ts)
)

# Lag features: target value 1 and 7 rows earlier.
# NOTE(review): the shift runs over all rows, not per "refeicao"; if the table
# holds several meals per date, lag_1 is the previous row rather than the
# previous day — confirm this is intended.
df["lag_1"] = df["total_refeicao"].shift(1)
df["lag_7"] = df["total_refeicao"].shift(7)
# Rolling means (rolling_3 / rolling_7 and a noised rolling_3_mod) were tried
# earlier and are currently disabled.

# Drop rows with a NaN in any column (this includes the first rows, whose lags
# are undefined). `.copy()` makes the result an independent frame so the column
# assignments below never write into a view of the previous one.
df = df.dropna().copy()

# Additional calendar features
df["mes"] = df["Data"].dt.month
df["semana_ano"] = df["Data"].dt.isocalendar().week.astype(int)

## Modelagem

In [None]:
# ===========================================
# Train / test split
# ===========================================
# Chronological hold-out: rows dated up to and including split_date are used
# for training, strictly later rows for testing.
split_date = '2025-04-30'
train_df = df.loc[df["Data"] <= split_date]
test_df = df.loc[df["Data"] > split_date]

# An earlier feature set also used "cardapio_padrao", "precip", "tmin", "tmax",
# "rolling_3" and "rolling_7"; the list below is the one currently in use.
features = [
    "refeicao",
    "Dia_Semana",
    "tavg",
    "is_weekend",
    "feriado",
    "is_holiday_week",
    "lag_1",
    "lag_7",
    "mes",
    "semana_ano",
    "days_to_holiday",
    "cardapio_trans",
    "Ferias",
]
target = "total_refeicao"

In [None]:
# One-hot encode the (label-encoded) categorical columns.
# The training matrix defines the column layout: the first level of each
# categorical column is dropped and acts as the all-zeros baseline.
X_train = pd.get_dummies(train_df[features], columns=categorical_cols, drop_first=True)
# The test matrix is encoded WITHOUT drop_first and then aligned to the
# training columns. Dropping the first level independently on the test set
# would remove a different level whenever the training baseline is absent from
# the test period, and after reindexing that real category would be read as
# the baseline. Aligning to X_train discards the baseline column (if present)
# and zero-fills levels that only occur in training.
X_test = pd.get_dummies(test_df[features], columns=categorical_cols, drop_first=False)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Target series taken from the same train / test frames as the feature matrices.
y_train, y_test = train_df[target], test_df[target]

In [None]:
# ===========================================
# Treinamento com RandomForest + GridSearch
# ===========================================
# param_grid = {
#     "n_estimators": [200, 400, 800],
#     "max_depth": [None, 10, 20],
#     "min_samples_split": [2, 5, 10],
#     "min_samples_leaf": [1, 2, 4]
# }
#
# grid = GridSearchCV(
#     RandomForestRegressor(random_state=SEED),
#     param_grid,
#     cv=3,
#     scoring="r2",
#     n_jobs=-1
# )
#
# grid.fit(X_train, y_train)
# print("Melhores parâmetros:", grid.best_params_)
#
# best_rf = grid.best_estimator_

In [None]:
# Random forest with fixed hyper-parameters.
# Previously tried: n_estimators=300, max_depth=12, min_samples_split=2, min_samples_leaf=1.
rf_params = {
    "n_estimators": 400,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 2,
}
model = RandomForestRegressor(random_state=SEED, **rf_params)
model.fit(X_train, y_train)

In [None]:
# ===========================================
# Model evaluation
# ===========================================
# Predict the hold-out period with the random forest.
# (An earlier version predicted with best_rf, the GridSearchCV best estimator.)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# r2 holds the coefficient of determination scaled to a percentage.
r2 = r2_score(y_test, y_pred) * 100

print("\n--- Random Forest ---")
print(f"MAE: {mae:.2f} alunos")
print(f"RMSE: {rmse:.2f} alunos")
# The value is already multiplied by 100, so it is labelled as a percentage;
# without the sign it can be misread as a raw R² on the 0-1 scale.
print(f"R²: {r2:.3f}%")

# Feature importances of the fitted forest, highest first.
feat_names = X_train.columns
importances = model.feature_importances_

feature_importance = pd.DataFrame({
    "feature": feat_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)
print("\n")
print(feature_importance)

In [None]:
# Alternative model: HistGradientBoosting with default hyper-parameters,
# trained and scored on the same split as the random forest.
hgb = HistGradientBoostingRegressor(random_state=SEED)
hgb.fit(X_train, y_train)
y_pred_hgb = hgb.predict(X_test)

mae_hgb = mean_absolute_error(y_test, y_pred_hgb)
rmse_hgb = np.sqrt(mean_squared_error(y_test, y_pred_hgb))
# r2_hgb holds the coefficient of determination scaled to a percentage.
r2_hgb = r2_score(y_test, y_pred_hgb) * 100

print("\n--- HistGradientBoosting ---")
print(f"MAE: {mae_hgb:.2f} alunos")
print(f"RMSE: {rmse_hgb:.2f} alunos")
# The value is already multiplied by 100, so it is labelled as a percentage;
# without the sign it can be misread as a raw R² on the 0-1 scale.
print(f"R²: {r2_hgb:.3f}%")

In [None]:
# Visualisation: actual values against both models' predictions over the test period.
resultados = test_df[["Data"]].copy().assign(
    Real=y_test.values,
    Previsto_RF=y_pred,
    Previsto_HGB=y_pred_hgb,
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(resultados["Data"], resultados["Real"], label="Real", linewidth=2)
ax.plot(resultados["Data"], resultados["Previsto_RF"], "--", label="Random Forest")
ax.plot(resultados["Data"], resultados["Previsto_HGB"], ":", label="HistGradientBoosting")
ax.set_title("Previsão de Demanda de Refeições", fontsize=14)
ax.set_xlabel("Data")
ax.set_ylabel("Total de Refeições")
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()


# Métodos de Aprendizado não supervisionado

In [None]:
# K-means
print(df.columns.tolist())

cols = ['Dia_Semana', 'cardapio_trans', 'refeicao',
        'Ferias', 'feriado', 'is_holiday_week',
        'tavg', 'precip']

# One-hot encoding without drop_first.
# The categorical columns were label-encoded to integers in the feature
# engineering cell, and get_dummies only encodes object/category columns by
# default. They are therefore listed explicitly; otherwise the arbitrary
# integer codes would be fed to K-means as if they were ordered magnitudes.
kmeans_cat_cols = ['Dia_Semana', 'cardapio_trans', 'refeicao', 'Ferias']
X = pd.get_dummies(df[cols], columns=kmeans_cat_cols, drop_first=False, dtype=float)

# Standardise so every column contributes on the same scale to the distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# For each candidate k, fit once and record both diagnostics:
# the inertia (for the elbow method) and the silhouette score.
inertias = []
sil_scores = []
K = range(2, 12)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=SEED)
    labels = kmeans.fit_predict(X_scaled)
    inertias.append(kmeans.inertia_)
    sil = silhouette_score(X_scaled, labels)
    sil_scores.append(sil)

for k, sil in zip(K, sil_scores):
    print(f"k={k} → Silhouette={sil:.4f}")

## Testar modelo para predição

In [None]:
def prever_consumo(model, df, le_dict, X_train, data_prevista,
                   refeicao, cardapio_trans, tavg, ferias=None, feriados=None):
    """Predict the meal count for one date / meal / menu combination.

    A single feature row is built with the same steps used for training
    (label encoding, calendar and holiday features, lags, one-hot encoding)
    and aligned to the columns of ``X_train`` before calling ``model.predict``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    df : DataFrame with at least the columns "Data" and "total_refeicao";
        the lag features are read from it.
    le_dict : dict mapping column name -> fitted label encoder (objects with
        ``classes_`` and ``transform``) for "refeicao", "cardapio_trans" and
        "Dia_Semana" (and "Ferias" when ``ferias`` is given).
    X_train : DataFrame whose columns define the model's input layout.
    data_prevista : date to predict; anything ``pd.to_datetime`` accepts.
        Any time-of-day component is discarded.
    refeicao, cardapio_trans : category labels as they appear in the
        encoders' ``classes_``.
    tavg : value for the "tavg" feature.
    ferias : optional label for the "Ferias" feature. When omitted, the
        "Ferias" inputs are left at zero, i.e. the training baseline level.
    feriados : optional iterable of holiday dates. When omitted, the
        module-level ``sp_holidays`` is used.

    Returns
    -------
    The model's prediction for the row, rounded to 2 decimals.

    Notes
    -----
    A label that is not in the encoder's ``classes_`` prints a warning and is
    encoded as 0.
    lag_1 is the first "total_refeicao" value recorded on the latest date in
    ``df`` and lag_7 the first value recorded 7 days before that date (falling
    back to lag_1 when that date is missing). Both are relative to the latest
    date in ``df``, not to ``data_prevista``.
    """
    # Single-row frame that is filled in feature by feature.
    new = pd.DataFrame({"Data": [pd.to_datetime(data_prevista).normalize()]})

    # Weekday names in Portuguese, matching the labels the "Dia_Semana"
    # encoder is looked up with.
    dias_pt = {
        0: "Segunda",
        1: "Terça",
        2: "Quarta",
        3: "Quinta",
        4: "Sexta",
        5: "Sábado",
        6: "Domingo"
    }
    dia_idx = new["Data"].dt.weekday.iloc[0]
    dia_semana = dias_pt[dia_idx]

    # Label encoding with a fallback for categories the encoder has not seen.
    def safe_transform(col, value):
        if value in le_dict[col].classes_:
            return le_dict[col].transform([value])[0]
        print(f"Aviso: nova categoria '{value}' em '{col}'. Definindo como 0.")
        return 0

    new["refeicao"] = safe_transform("refeicao", refeicao)
    new["cardapio_trans"] = safe_transform("cardapio_trans", cardapio_trans)
    new["Dia_Semana"] = safe_transform("Dia_Semana", dia_semana)
    one_hot_cols = ["refeicao", "cardapio_trans", "Dia_Semana"]
    if ferias is not None:
        # Labels are passed through str() because the training cell encodes
        # the column after casting it to str.
        new["Ferias"] = safe_transform("Ferias", str(ferias))
        one_hot_cols.append("Ferias")

    # Weather
    new["tavg"] = tavg

    # Holidays
    holiday_source = sp_holidays if feriados is None else feriados
    sp_holidays_ts = [pd.Timestamp(f) for f in holiday_source]
    data = new["Data"].iloc[0]

    new["is_weekend"] = int(dia_idx >= 5)
    new["feriado"] = int(data in sp_holidays_ts)

    def holiday_week(x):
        # 1 if the Monday-to-Sunday week containing x has a holiday, else 0.
        start = x - pd.Timedelta(days=x.weekday())
        end = start + pd.Timedelta(days=6)
        return int(any(start <= f <= end for f in sp_holidays_ts))

    new["is_holiday_week"] = holiday_week(data)
    new["days_to_holiday"] = min(abs((data - f).days) for f in sp_holidays_ts)

    # Calendar
    new["mes"] = new["Data"].dt.month
    new["semana_ano"] = new["Data"].dt.isocalendar().week.astype(int)

    # Lags taken from the most recent data available in df.
    ultima_data = df["Data"].max()
    lag_1 = df.loc[df["Data"] == ultima_data, "total_refeicao"].values[0]

    data_7 = ultima_data - pd.Timedelta(days=7)
    valores_7 = df.loc[df["Data"] == data_7, "total_refeicao"].values
    lag_7 = valores_7[0] if len(valores_7) else lag_1

    new["lag_1"] = lag_1
    new["lag_7"] = lag_7

    # Model input. drop_first must be False here: with a single row every
    # categorical column has exactly one level, so drop_first=True would drop
    # all dummies and every prediction would silently use the baseline meal,
    # menu and weekday. Reindexing to X_train's columns discards the dummy of
    # a baseline level and zero-fills the levels this row does not have.
    new_X = new.drop(columns=["Data"])
    new_X = pd.get_dummies(new_X, columns=one_hot_cols, drop_first=False)
    new_X = new_X.reindex(columns=X_train.columns, fill_value=0).astype(float)

    # Prediction
    pred = model.predict(new_X)[0]
    return round(pred, 2)


In [None]:
# Scenario: "Almoço" with "frango" on 2025-10-30, tavg = 20, predicted by the random forest.
cenario = {
    "data_prevista": "2025-10-30",
    "refeicao": "Almoço",
    "cardapio_trans": "frango",
    "tavg": 20,
}
previsao = prever_consumo(model, df, le_dict, X_train, **cenario)

print("Previsão de alunos:", previsao)

In [None]:
# Scenario: "Almoço" with "carne bovina" on 2025-10-30, tavg = 20, predicted by model_weighted.
# NOTE(review): model_weighted is not defined in any cell visible here; unless it
# is created elsewhere this cell fails with NameError — confirm where it is trained.
cenario = {
    "data_prevista": "2025-10-30",
    "refeicao": "Almoço",
    "cardapio_trans": "carne bovina",
    "tavg": 20,
}
previsao = prever_consumo(model_weighted, df, le_dict, X_train, **cenario)

print("Previsão de alunos:", previsao)

In [None]:
# Scenario: "Jantar" with "frango" on 2025-10-30, tavg = 20, predicted by model_weighted.
# NOTE(review): model_weighted is not defined in any cell visible here; unless it
# is created elsewhere this cell fails with NameError — confirm where it is trained.
cenario = {
    "data_prevista": "2025-10-30",
    "refeicao": "Jantar",
    "cardapio_trans": "frango",
    "tavg": 20,
}
previsao = prever_consumo(model_weighted, df, le_dict, X_train, **cenario)

print("Previsão de alunos:", previsao)

In [None]:
# Scenario: "Almoço" with "frango" on 2025-10-30, tavg = 20, predicted by the
# HistGradientBoosting model.
cenario = {
    "data_prevista": "2025-10-30",
    "refeicao": "Almoço",
    "cardapio_trans": "frango",
    "tavg": 20,
}
previsao = prever_consumo(hgb, df, le_dict, X_train, **cenario)

print("Previsão de alunos:", previsao)

In [None]:
# Display the random forest estimator (its repr lists the configured hyper-parameters).
model

In [None]:
# Table of the random forest's feature_importances_, one row per input column,
# ordered from most to least important.
feat_names = X_train.columns
importances = model.feature_importances_

feature_importance = (
    pd.DataFrame({"feature": feat_names, "importance": importances})
    .sort_values(by="importance", ascending=False)
)

print(feature_importance)


In [None]:
# Same importance table for model_weighted.
# NOTE(review): model_weighted is not defined in any cell visible here; unless it
# is created elsewhere this cell fails with NameError — confirm where it is trained.
feat_names = X_train.columns
importances = model_weighted.feature_importances_

feature_importance = (
    pd.DataFrame({"feature": feat_names, "importance": importances})
    .sort_values(by="importance", ascending=False)
)

print(feature_importance)


In [None]:
from sklearn.tree import export_graphviz
import graphviz


In [None]:
# Take the first tree of the random forest and render it with Graphviz.
estimator = model.estimators_[0]

dot_data = export_graphviz(
    estimator,
    out_file=None,  # return the DOT source as a string instead of writing a file
    # Passed as a plain list: a pandas Index is not accepted as `feature_names`
    # by every scikit-learn release, while a list always is.
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
    special_characters=True
)

graph = graphviz.Source(dot_data)
graph.render("arvore_random_forest")  # saved as PDF
graph
