In [1]:
# =====================================================
# 🏠 House Prices - Pipeline Aprimorado
# =====================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso


# =====================================================
# 1. Data loading
# =====================================================
caminho_train = "/home/akel/PycharmProjects/Kaggle/HousePrices/data/train.csv"
caminho_test = "/home/akel/PycharmProjects/Kaggle/HousePrices/data/test.csv"

train = pd.read_csv(caminho_train)
test = pd.read_csv(caminho_test)

# Keep only the numeric columns of the training frame.
# NOTE(review): because object-dtype columns are removed here, the later
# `select_dtypes(include=['object'])` selection on `train` comes back empty,
# so the categorical branch of the preprocessor receives no columns.
# Confirm this restriction is intentional.
train = train.select_dtypes(include=['number'])


# =====================================================
# 2. Initial preprocessing
# =====================================================
# Maximum fraction of missing values a training column may have before it is
# dropped. The code applies 10% (the old comment advertised 40%); the effective
# value is kept so the results do not change. Adjust here if needed.
LIMITE_NULOS = 0.10

# Per-column fraction of missing values in the training frame.
fracao_nulos = train.isnull().sum() / train.shape[0]
colunas_nulas = train.columns[fracao_nulos > LIMITE_NULOS]

train = train.drop(columns=colunas_nulas)
# The test frame may not contain every dropped column, so missing labels are
# ignored instead of raising.
test = test.drop(columns=colunas_nulas, errors='ignore')

# Split the remaining training columns by dtype; the identifier and the target
# are excluded from the numeric feature list.
num_features = train.select_dtypes(include=['number']).columns.drop(['Id', 'SalePrice'])
cat_features = train.select_dtypes(include=['object']).columns

# =====================================================
# 3. Features / target separation (log-transformed target)
# =====================================================
# The model is trained on log1p(SalePrice); predictions are converted back
# with expm1 further down.
y_log = np.log1p(train['SalePrice'])
X = train.drop(columns=['Id', 'SalePrice'])

# Hold out 30% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_log,
    test_size=0.3,
    random_state=42,
)

# =====================================================
# 4. Preprocessors
# =====================================================
# Numeric columns: fill missing values with the median, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown='ignore' is set on the encoder).
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature list to its own transformer.
preprocessador = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

# =====================================================
# 5. Model pipeline (Ridge with internal cross-validation)
# =====================================================
# RidgeCV chooses the regularization strength among these candidates using
# 5-fold cross-validation.
alphas_candidatos = [0.1, 1.0, 10.0, 50.0, 100.0]

pipeline_rd = Pipeline([
    ('preprocess', preprocessador),
    ('ridge', RidgeCV(alphas=alphas_candidatos, cv=5)),
])

# =====================================================
# 6. Training and evaluation
# =====================================================
# Fit on the training split and predict the validation split (log scale).
y_pred_log = pipeline_rd.fit(X_train, y_train).predict(X_val)

# Undo the log1p transform so the metrics are computed on the price scale.
y_true = np.expm1(y_val)
y_pred = np.expm1(y_pred_log)

# Função de avaliação
def avaliar_modelo(y_true, y_pred):
    """Print and return MAE, RMSE and R2 for a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A dict with the keys ``'MAE'``, ``'RMSE'`` and ``'R2'``.
    """
    # All metrics are computed before anything is printed.
    resultados = {
        'MAE': mean_absolute_error(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'R2': r2_score(y_true, y_pred),
    }

    print("üìä RESULTADOS DE VALIDA√á√ÉO:")
    print(f"MAE : {resultados['MAE']:.3f}")
    print(f"RMSE: {resultados['RMSE']:.3f}")
    print(f"R¬≤  : {resultados['R2']:.3f}")
    return resultados

# Report the hold-out metrics (values are already back on the price scale).
avaliar_modelo(y_true, y_pred)

# =====================================================
# 7. Global cross-validation (more robust)
# =====================================================
# 5-fold R2 of the whole pipeline on all training rows; the target here is the
# log-transformed one (y_log).
scores = cross_val_score(pipeline_rd, X, y_log, scoring='r2', cv=5)
r2_medio, r2_desvio = scores.mean(), scores.std()

print("\nüîÅ Valida√ß√£o cruzada (5-fold):")
print(f"R¬≤ m√©dio: {r2_medio:.4f} ¬± {r2_desvio:.4f}")



# NOTE(review): the CSV export below is disabled, and `submissao` is not
# defined anywhere in the visible part of this file, so no submission file is
# produced. The message used to claim the file had been saved; it now reports
# what actually happens. Build `submissao` and re-enable the export before
# restoring a success message.
# submissao.to_csv("/home/akel/PycharmProjects/Kaggle/HousePrices/submissao_ridge.csv", index=False)
print("\nNenhum arquivo CSV foi salvo: o to_csv acima continua comentado.")


üìä RESULTADOS DE VALIDA√á√ÉO:
MAE : 18824.545
RMSE: 28836.617
R²  : 0.881

🔁 Validação cruzada (5-fold):
R² médio: 0.8495 ± 0.0534

‚úÖ Arquivo de submiss√£o salvo como 'submissao_ridge.csv'
