# Actividad Árboles con Advertising.csv

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

In [2]:
# Load the advertising dataset from a CSV file located relative to the working directory.
df = pd.read_csv("Advertising.csv")

In [3]:
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [4]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  200 non-null    int64  
 1   TV          200 non-null    float64
 2   radio       200 non-null    float64
 3   newspaper   200 non-null    float64
 4   sales       200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


In [9]:
# Drop the "Unnamed: 0" column (it looks like a row index saved into the CSV).
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [11]:
# Predictors are the TV, radio and newspaper columns; the target is sales.
X = df.loc[:, ["TV", "radio", "newspaper"]]
y = df.loc[:, "sales"]

In [13]:
# Registry of scores: model label -> R^2 value.
results = dict()

In [15]:
def report_fit(name, y_true, y_pred):
    """Score a set of predictions with R^2, print the score and record it.

    Args:
        name: Label for the model/evaluation; used in the printed line and
            as the key under which the score is stored in ``results``.
        y_true: Observed target values.
        y_pred: Predicted target values, passed to ``r2_score`` with ``y_true``.

    Returns:
        The R^2 value computed by ``r2_score``.

    Side effects:
        Prints one formatted line and writes ``results[name]`` (an existing
        entry with the same label is overwritten).
    """
    score = r2_score(y_true, y_pred)
    # Label is left-aligned and padded to 30 characters so printed scores line up.
    print(f"{name:30s}  R^2 = {score:.4f}")
    results[name] = score
    return score

In [17]:
# Benchmark: linear regression fitted and scored on the same full dataset (no split).
lin_full = LinearRegression()
lin_full.fit(X, y)
report_fit("LinearRegression (full)", y, lin_full.predict(X))

LinearRegression (full)         R^2 = 0.8972


0.8972106381789522

In [19]:
# Decision tree fitted and scored on the same full dataset (no split).
tree_full = DecisionTreeRegressor(random_state=42)
tree_full.fit(X, y)
report_fit("DecisionTree (full)", y, tree_full.predict(X))

DecisionTree (full)             R^2 = 1.0000


1.0

In [21]:
# Side-by-side comparison of the two scores obtained without a split.
lin_full_r2 = results["LinearRegression (full)"]
tree_full_r2 = results["DecisionTree (full)"]
print(f"LinearRegression R^2: {lin_full_r2:.4f}")
print(f"DecisionTree    R^2: {tree_full_r2:.4f}")

LinearRegression R^2: 0.8972
DecisionTree    R^2: 1.0000


Observaciones (sin split):
- LinearRegression (full) = Tiene un buen ajuste de 0.8972, pero podría estar aún más ajustado.
- DecisionTree (full) = El R^2 siendo 1.00 es una señal clara de sobreajuste. Sin usar train/test split el árbol memoriza el dataset completo.

In [24]:
# 4) Train/Test split 80/20 (fixed seed so the partition is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit both models on the training portion before scoring either of them.
lin_split = LinearRegression()
lin_split.fit(X_train, y_train)
tree_split = DecisionTreeRegressor(random_state=42)
tree_split.fit(X_train, y_train)

# Report order is kept (linear train/test, then tree train/test) because it
# determines both the printed order and the insertion order in `results`.
report_fit("LinearRegression (train)", y_train, lin_split.predict(X_train))
report_fit("LinearRegression (test)", y_test, lin_split.predict(X_test))
report_fit("DecisionTree (train)", y_train, tree_split.predict(X_train))
# Left as the cell's final expression so its return value is displayed.
report_fit("DecisionTree (test)", y_test, tree_split.predict(X_test))

LinearRegression (train)        R^2 = 0.8957
LinearRegression (test)         R^2 = 0.8994
DecisionTree (train)            R^2 = 1.0000
DecisionTree (test)             R^2 = 0.9311


0.9310914968293178

Observaciones con split 80/20:
- LinearRegression (test) = El R² pasó de 0.8972 (full) a 0.8994 en test, con 0.8957 en train. El rendimiento es más estable y generaliza razonablemente.
- DecisionTree (test) = El R² bajó de 1.0000 (full y train) a 0.9311 en test. Supera a la regresión lineal en test, pero la brecha entre train y test muestra alta varianza (capacidad muy alta en train).

In [41]:
# NOTE(review): this whole cell is a single string literal, so none of the code
# inside it runs; the cell only evaluates (and echoes) the string. It holds a
# disabled Pipeline-based version of the degree-2 polynomial experiment.
# TODO: delete this cell or restore it as real code.
'''
#polynomial degree=2
poly = PolynomialFeatures(degree=2, include_bias=False)

poly_lin = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lin",  LinearRegression())
])
poly_lin.fit(X_train, y_train)

report_fit("Poly+Linear (train)", y_train, poly_lin.predict(X_train))
report_fit("Poly+Linear (test)",  y_test,  poly_lin.predict(X_test))
'''

'\n#polynomial degree=2\npoly = PolynomialFeatures(degree=2, include_bias=False)\n\npoly_lin = Pipeline([\n    ("poly", PolynomialFeatures(degree=2, include_bias=False)),\n    ("lin",  LinearRegression())\n])\npoly_lin.fit(X_train, y_train)\n\nreport_fit("Poly+Linear (train)", y_train, poly_lin.predict(X_train))\nreport_fit("Poly+Linear (test)",  y_test,  poly_lin.predict(X_test))\n'

In [29]:
# Repeat the previous steps using degree-2 polynomial features (no bias column).
poly = PolynomialFeatures(degree=2, include_bias=False)

In [31]:
# Polynomial benchmark: expand X with the degree-2 transformer, then fit and
# score a linear regression on the same full dataset (no split).
X_poly_full = poly.fit_transform(X)
lin_poly_full = LinearRegression()
lin_poly_full.fit(X_poly_full, y)
report_fit("B | Poly+LR (full)", y, lin_poly_full.predict(X_poly_full))

B | Poly+LR (full)              R^2 = 0.9865


0.9865057435307856

In [33]:
# Decision tree on the polynomial-expanded features, fitted and scored on the
# same full dataset (no split).
tree_poly_full = DecisionTreeRegressor(random_state=42)
tree_poly_full.fit(X_poly_full, y)
report_fit("B | Poly+Tree (full)", y, tree_poly_full.predict(X_poly_full))

B | Poly+Tree (full)            R^2 = 1.0000


1.0

In [35]:
# Side-by-side comparison of the polynomial-feature scores obtained without a split.
poly_lr_full_r2 = results['B | Poly+LR (full)']
poly_tree_full_r2 = results['B | Poly+Tree (full)']
print(f"Poly+LR   R^2: {poly_lr_full_r2:.4f}")
print(f"Poly+Tree R^2: {poly_tree_full_r2:.4f}")

Poly+LR   R^2: 0.9865
Poly+Tree R^2: 1.0000


Observaciones con polinomios sin split
- Poly+LR (full)= La regresión polinómica de grado 2 mejora notablemente el ajuste respecto a la lineal simple, ya que capta interacciones y relaciones no lineales entre TV, radio y newspaper.
- Poly+Tree (full)= El árbol con variables polinómicas memoriza completamente el conjunto. Los árboles ya modelan no linealidades por sí mismos, así que expandir variables no aporta y refuerza el overfitting.
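- Solo el Poly+LR aumenta su R² al usar términos cuadráticos (de 0.8972 a 0.9865); el árbol ya alcanzaba 1.0000 sin ellos, así que no puede mejorar. La mejora del Poly+LR parece genuina, pero sin un conjunto de prueba todavía no se puede afirmar que generalice; el R² perfecto del Poly+Tree solo confirma que memoriza los datos con los que se entrenó.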

In [37]:
# 80/20 train/test split with polynomial features: the transformer is fitted on
# the training rows and then applied to the test rows.
X_train_p = poly.fit_transform(X_train)
X_test_p = poly.transform(X_test)

# Fit both models on the expanded training set before scoring either of them.
lin_poly_split = LinearRegression()
lin_poly_split.fit(X_train_p, y_train)
tree_poly_split = DecisionTreeRegressor(random_state=42)
tree_poly_split.fit(X_train_p, y_train)

# Report order is kept (linear train/test, then tree train/test) because it
# determines both the printed order and the insertion order in `results`.
report_fit("B | Poly+LR (train)", y_train, lin_poly_split.predict(X_train_p))
report_fit("B | Poly+LR (test)", y_test, lin_poly_split.predict(X_test_p))
report_fit("B | Poly+Tree (train)", y_train, tree_poly_split.predict(X_train_p))
# Left as the cell's final expression so its return value is displayed.
report_fit("B | Poly+Tree (test)", y_test, tree_poly_split.predict(X_test_p))

B | Poly+LR (train)             R^2 = 0.9861
B | Poly+LR (test)              R^2 = 0.9869
B | Poly+Tree (train)           R^2 = 1.0000
B | Poly+Tree (test)            R^2 = 0.9486


0.9485799997029806

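Observaciones de polinomios con train/test split.
- Poly+LR (train/test) = Excelente desempeño y muy estable entre entrenamiento (0.9861) y prueba (0.9869) → el modelo generaliza bien y captura relaciones no lineales sin sobreajustar.
- Poly+Tree = R² de 1.0000 en train y 0.9486 en test: mejora ligeramente frente al árbol sin polinomios (0.9311 en test), pero la brecha entre train y test indica que sigue sobreajustando y generaliza peor que Poly+LR (0.9869 en test).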

In [39]:
# Gather every recorded score into a table ranked from highest to lowest R^2.
score_rows = list(results.items())
summary = pd.DataFrame(score_rows, columns=["Modelo", "R^2"])
summary = summary.sort_values("R^2", ascending=False)
summary = summary.reset_index(drop=True)
display(summary)

Unnamed: 0,Modelo,R^2
0,DecisionTree (full),1.0
1,DecisionTree (train),1.0
2,B | Poly+Tree (full),1.0
3,B | Poly+Tree (train),1.0
4,B | Poly+LR (test),0.986918
5,B | Poly+LR (full),0.986506
6,B | Poly+LR (train),0.986105
7,B | Poly+Tree (test),0.94858
8,DecisionTree (test),0.931091
9,LinearRegression (test),0.899438
