# Predição RU Unicamp

Objetivo: Prever quantos alunos irão a cada restaurante (RU, RA, RS) em um determinado dia.

In [None]:
# Bibliotecas

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import os
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
import numpy as np
import logging
import holidays



In [None]:
# Notebook-wide configuration
# Calendar years for which the holiday calendars are built.
# NOTE(review): rows dated outside these years will presumably never match a holiday
# when the calendar is iterated later on - confirm against the dataset's date range.
HOLIDAY_YEARS = [2024, 2025]

# National holidays only.
br_holidays = holidays.Brazil(years=HOLIDAY_YEARS)
# National holidays plus those of São Paulo state. `subdiv` is the supported keyword
# for choosing a subdivision; the older `prov` keyword is deprecated.
sp_holidays = holidays.Brazil(years=HOLIDAY_YEARS, subdiv='SP')

# Seed shared by every stochastic estimator so results are reproducible.
SEED = 59

## Carregando os Dados

In [None]:
# Load the pre-transformed dataset (an earlier version of this notebook read
# "dados_volumetria_cardapio.csv" instead).
df = pd.read_csv("dados_transformados.csv")
# The temperature columns are not always present in this file; dropping them
# unconditionally raised KeyError, so columns that are missing are skipped.
df = df.drop(columns=['tmin', 'tmax'], errors='ignore')
df.head()

KeyError: "['tmin', 'tmax'] not found in axis"

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Renomeando colunas
#df.columns = [
#    "data", "ru_almoco", "ra_almoco", "rs_almoco", "total_almoco", "ru_janta", "ra_janta", "rs_janta", "total_janta", "total_dia",
 #   "dia_semana", "mes", "dia_mes", "ano", "tipo_cardapio", "acompanhamento", "prato", "guarnicao", "opcao_vegetal", "salada", "sobremesa",
  #  "suco", "guarnicao_categoria", "prato_categoria"
#]

In [None]:
# Parse the "Data" column into datetimes so the .dt accessor can be used by later cells.
# NOTE(review): the date format is inferred by pandas - confirm the CSV is not day-first.
df["Data"] = df["Data"].pipe(pd.to_datetime)
df.columns

In [None]:
# Integer-encode the categorical columns. One LabelEncoder is fitted per column on the
# column's string representation and kept in le_dict, keyed by column name.
# NOTE(review): this overwrites the columns in place, so running the cell a second time
# re-encodes the integer codes (as strings) rather than the original labels.
categorical_cols = ['refeicao', 'cardapio_padrao', 'cardapio_vegano', 'Dia_Semana']
le_dict = {name: LabelEncoder().fit(df[name].astype(str)) for name in categorical_cols}
for col, le in le_dict.items():
    df[col] = le.transform(df[col].astype(str))


In [None]:
# Weekend flag stored as 0/1: weekday index 5 is Saturday and 6 is Sunday.
df["is_weekend"] = df["Data"].dt.weekday.ge(5).astype(int)

In [None]:
# Turn every date yielded by iterating the SP holiday calendar into a pandas Timestamp
# so it can be compared with the datetime "Data" column.
sp_holidays_ts = list(map(pd.Timestamp, sp_holidays))
# feriado is 1 when the row's date is one of those holidays, 0 otherwise.
df["feriado"] = df["Data"].isin(sp_holidays_ts).astype(int)

def holiday_week(x, holiday_dates=None):
    """Return 1 if the Monday-Sunday week containing ``x`` includes a holiday, else 0.

    Parameters
    ----------
    x : timestamp-like
        The date to classify. Any time-of-day component is ignored.
    holiday_dates : iterable of date-like, optional
        Holidays to check against. Defaults to the notebook-level
        ``sp_holidays_ts`` list when omitted.

    Returns
    -------
    int
        1 when at least one holiday falls inside the week, 0 otherwise
        (also 0 when ``x`` is missing).
    """
    if holiday_dates is None:
        holiday_dates = sp_holidays_ts
    if pd.isna(x):
        return 0

    # Compare calendar days only: without normalising, a timestamp carrying a
    # time of day would push the week start past a holiday falling on that Monday.
    day = pd.Timestamp(x).normalize()
    start_week = day - pd.Timedelta(days=day.weekday())   # Monday of the week
    end_week = start_week + pd.Timedelta(days=6)          # Sunday of the week
    return int(any(
        start_week <= pd.Timestamp(h).normalize() <= end_week
        for h in holiday_dates
    ))

# Flag every row whose Monday-Sunday week contains at least one SP holiday.
df["is_holiday_week"] = df["Data"].map(holiday_week)

In [None]:
# Preview the first 16 rows to check the is_weekend, feriado and is_holiday_week columns.
df.head(16)

### Modelagem

In [None]:
def prepare_data(df, target_col, test_size):
    """Build train/test feature matrices and targets for predicting ``target_col``.

    The caller's frame is never modified: all work happens on a copy, so the
    notebook cells that call this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Data" column, ``target_col`` and the categorical columns
        'refeicao', 'cardapio_padrao', 'cardapio_vegano' and 'Dia_Semana'.
    target_col : str
        Name of the column to predict.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_names)`` where
        ``feature_names`` is the list of columns in ``X``.

    Notes
    -----
    The split uses ``shuffle=False``, so rows are split in the order they appear
    in ``df``; sort by date beforehand if a chronological hold-out is intended.
    """
    categorical_cols = ['refeicao', 'cardapio_padrao',
                        'cardapio_vegano', 'Dia_Semana']

    data = df.copy()

    for col in categorical_cols:
        # Columns that are already numeric (e.g. encoded by an earlier cell) are kept
        # as they are: re-encoding their string form would reorder the codes
        # lexicographically ("10" sorts before "2").
        if pd.api.types.is_numeric_dtype(data[col]):
            continue
        data[col] = LabelEncoder().fit_transform(data[col].astype(str))

    # The target must not be among the features (leaving it in lets the model read
    # the answer directly), and the raw date column is not a usable model input.
    # The encoded categorical columns are kept as features.
    # NOTE(review): any other column derived from the target should be removed too -
    # verify against the CSV schema.
    X = data.drop(columns=[target_col, "Data"])
    y = data[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )

    return X_train, X_test, y_train, y_test, X.columns.tolist()

In [None]:
def train_model(df, target_col, model, parameters, test_size=0.91):
    """Fit ``model`` on the training split and report metrics on the held-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data, passed to ``prepare_data``.
    target_col : str
        Column to predict.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    parameters : dict
        Accepted for call compatibility; it is not read by this function.
    test_size : float, default 0.91
        Forwarded to ``prepare_data``.
        NOTE(review): with the default, 91% of the rows are held out and only 9%
        are used for training - confirm this is intended (0.09 may have been meant).

    Returns
    -------
    dict
        Keys "mae", "rmse", "r2" (plain coefficient, not a percentage) and
        "features" (the list of feature columns used).
    """
    X_train, X_test, y_train, y_test, features_used = prepare_data(df, target_col, test_size)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # Kept as the plain coefficient: scaling by 100 while printing it under a bare
    # "R²" label made the reported value read as an impossible score.
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae:.2f} alunos")
    print(f"RMSE: {rmse:.2f} alunos")
    print(f"R²: {r2:.3f}")

    return {"mae": mae, "rmse": rmse, "r2": r2, "features": features_used}


In [None]:
# Random-forest baseline. The hyperparameters live in one dict that is used both
# to build the estimator and as the `parameters` argument of train_model.
params = {"n_estimators": 400, "random_state": SEED}
model = RandomForestRegressor(**params)

train_model(df, target_col="total_refeicao", model=model, parameters=params)

In [None]:
# Inspect the frame after the modelling run.
# Only the last expression of a cell is rendered, so `df.columns` shows nothing here;
# the visible output is the 12-row preview.
df.columns
df.head(12)