<a href="https://colab.research.google.com/github/cesargar1507/DatasetsUB/blob/main/M6_AI4_GarciaCesar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [20]:
# Location of the house-sales CSV (act4_kc_house_data.csv) in the author's GitHub repo.
ruta = (
    "https://raw.githubusercontent.com/cesargar1507/DatasetsUB/main/"
    "act4_kc_house_data.csv"
)
# The comma is read_csv's default delimiter, so it does not need to be passed explicitly.
df = pd.read_csv(ruta)

In [21]:
# Preview the first rows of the loaded frame (head() shows 5 rows by default).
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [22]:
# Count missing values per column; isnull() is an alias of isna().
df.isnull().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

**PREPROCESADO DE LOS DATOS**

In [23]:
# Scale every area column by the same factor (0.092903, the square-feet to
# square-metres ratio). The column names keep their 'sqft_' prefix even though
# the stored values are rescaled.
# NOTE: the columns are overwritten in place, so running this cell a second time
# applies the factor again.
SQFT_TO_M2 = 0.092903
for area_col in (
    'sqft_living',
    'sqft_lot',
    'sqft_above',
    'sqft_basement',
    'sqft_living15',
    'sqft_lot15',
):
    df[area_col] = df[area_col] * SQFT_TO_M2

In [24]:
# Keep only the YYYYMMDD prefix of the timestamp (e.g. '20141013T000000' -> 20141013)
# and store it as an explicit integer. Leaving the column as a string would make it an
# object-dtype feature that the estimators can only use through implicit
# string-to-float coercion; an int64 column carries the same numeric value openly.
# The leading astype(str) keeps the cell safe to re-run once the column is already numeric.
df['date'] = df['date'].astype(str).str.slice(0, 8).astype('int64')

**DIVISIÓN DE DATOS EN ENTRENAMIENTO Y VALIDACIÓN**

In [25]:
# Target is the sale price; features are every remaining column except the row identifier.
y = df['price']
X = df.drop(['id', 'price'], axis='columns')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**MODELO ÁRBOLES DE DECISIÓN**

In [26]:
# Baseline decision tree with default hyperparameters. fit() returns the estimator
# itself, so construction and training are chained into one expression.
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

**MODELO RANDOM FOREST**

In [27]:
# Baseline random forest with default hyperparameters. fit() returns the estimator
# itself, so construction and training are chained into one expression.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

**MODELO GRADIENT BOOSTING**

In [28]:
# Baseline gradient boosting model with default hyperparameters. fit() returns the
# estimator itself, so construction and training are chained into one expression.
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

In [29]:
# Score each baseline model's predictions against the held-out test targets.
r2_tree, r2_rf, r2_gb = (
    r2_score(y_test, predicciones)
    for predicciones in (y_pred_tree, y_pred_rf, y_pred_gb)
)

# Print one line per model, in the same order as the models were trained.
for etiqueta, valor in (
    ("Árbol de Decisión", r2_tree),
    ("Random Forest", r2_rf),
    ("Gradient Boosting", r2_gb),
):
    print(f"R² {etiqueta}: {valor}")

R² Árbol de Decisión: 0.6997745902592047
R² Random Forest: 0.8510606536544804
R² Gradient Boosting: 0.8572563170407257


In [30]:
# Hyperparameter search for the decision tree: 6 x 3 x 3 = 54 candidates, each
# scored with 5-fold cross-validated R² computed on the training split only.
param_grid_tree = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

tree = DecisionTreeRegressor(random_state=42)
# n_jobs=-1 runs the 270 cross-validation fits on all available cores instead of
# one after another; the estimator is seeded, so the search stays reproducible.
grid_search_tree = GridSearchCV(tree, param_grid_tree, cv=5, scoring='r2', n_jobs=-1)
grid_search_tree.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on the whole training split.
best_tree_model = grid_search_tree.best_estimator_
# NOTE: these assignments rebind y_pred_tree and r2_tree, replacing whatever values
# those names held before this cell ran.
y_pred_tree = best_tree_model.predict(X_test)
# The reported R² is measured on the test split, not the cross-validation score.
r2_tree = r2_score(y_test, y_pred_tree)
print(f"Mejor R² Árbol de Decisión: {r2_tree}")
print(f"Mejores Hiperparámetros Árbol de Decisión: {grid_search_tree.best_params_}")

Mejor R² Árbol de Decisión: 0.7778096206681626
Mejores Hiperparámetros Árbol de Decisión: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}


In [None]:
# Hyperparameter search for the random forest: 3 x 4 x 3 x 3 = 108 candidates, each
# scored with 5-fold cross-validated R² computed on the training split only.
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

rf = RandomForestRegressor(random_state=42)
# 108 candidates x 5 folds = 540 forest fits (plus the final refit). n_jobs=-1 spreads
# them over all available cores instead of running them serially; the estimator is
# seeded, so the search stays reproducible.
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='r2', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on the whole training split.
best_rf_model = grid_search_rf.best_estimator_
# NOTE: these assignments rebind y_pred_rf and r2_rf, replacing whatever values
# those names held before this cell ran.
y_pred_rf = best_rf_model.predict(X_test)
# The reported R² is measured on the test split, not the cross-validation score.
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Mejor R² Random Forest: {r2_rf}")
print(f"Mejores Hiperparámetros Random Forest: {grid_search_rf.best_params_}")

In [None]:
# Hyperparameter search for gradient boosting: 3^5 = 243 candidates, each scored
# with 5-fold cross-validated R² computed on the training split only.
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

gb = GradientBoostingRegressor(random_state=42)
# 243 candidates x 5 folds = 1215 boosting fits (plus the final refit). n_jobs=-1
# spreads them over all available cores instead of running them serially; the
# estimator is seeded, so the search stays reproducible.
grid_search_gb = GridSearchCV(gb, param_grid_gb, cv=5, scoring='r2', n_jobs=-1)
grid_search_gb.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on the whole training split.
best_gb_model = grid_search_gb.best_estimator_
# NOTE: these assignments rebind y_pred_gb and r2_gb, replacing whatever values
# those names held before this cell ran.
y_pred_gb = best_gb_model.predict(X_test)
# The reported R² is measured on the test split, not the cross-validation score.
r2_gb = r2_score(y_test, y_pred_gb)
print(f"Mejor R² Gradient Boosting: {r2_gb}")
print(f"Mejores Hiperparámetros Gradient Boosting: {grid_search_gb.best_params_}")