# Notebook 03 — Modelagem (Machine Learning)
Treinamento de um modelo de Regressão Linear, avaliação e salvamento.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import joblib

# Load the cleaned tips dataset from the processed-data folder.
processed_csv = Path("data") / "processed" / "tips_clean.csv"
df = pd.read_csv(processed_csv)

# Target is the 'tip' column; every remaining column is used as a feature.
y = df['tip']
X = df.drop(columns=['tip'])

# Quick sanity check on the dimensions of features and target.
print("X shape:", X.shape, "y shape:", y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Train:", X_train.shape, "Test:", X_test.shape)

In [None]:
# Fit an ordinary least-squares linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Quick evaluation on the training data (in-sample R2).
y_train_pred = model.predict(X_train)
print("Train R2:", r2_score(y_train, y_train_pred))

In [None]:
# Evaluate the fitted model on the held-out test split.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the explicit square root works on both old and new versions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Test R2: {r2:.4f}")
print(f"Test RMSE: {rmse:.4f}")

In [None]:
# Pair each learned coefficient with its feature name, then order them
# from the largest to the smallest value for inspection.
coeffs = pd.Series(model.coef_, index=X.columns)
coeffs = coeffs.sort_values(ascending=False)
display(coeffs)

print("Intercept:", model.intercept_)

In [None]:
# Make sure the output folder exists (no error if it is already there).
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Serialize the fitted estimator to disk with joblib.
model_path = MODEL_DIR.joinpath("linear_regression.pkl")
joblib.dump(model, model_path)
print(f"Saved model to: {model_path}")