In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)



/kaggle/input/train1/train1.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [3]:
# Load the training data and preview the first rows.
train_csv_path = '/kaggle/input/train1/train1.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Fraction of missing values in each column (0.0 = none missing, 1.0 = all missing).
df.isna().mean()

Id               0.000000
MSSubClass       0.000000
MSZoning         0.000000
LotFrontage      0.177397
LotArea          0.000000
                   ...   
MoSold           0.000000
YrSold           0.000000
SaleType         0.000000
SaleCondition    0.000000
SalePrice        0.000000
Length: 81, dtype: float64

In [5]:
# Fraction of missing values, limited to columns that have at least one missing
# value, in ascending order. Columns missing more than ~20% are hard to use.
null_share = df.isnull().mean()
null_share[null_share > 0].sort_values()

Electrical      0.000685
MasVnrArea      0.005479
BsmtQual        0.025342
BsmtCond        0.025342
BsmtFinType1    0.025342
BsmtExposure    0.026027
BsmtFinType2    0.026027
GarageCond      0.055479
GarageQual      0.055479
GarageFinish    0.055479
GarageYrBlt     0.055479
GarageType      0.055479
LotFrontage     0.177397
FireplaceQu     0.472603
MasVnrType      0.597260
Fence           0.807534
Alley           0.937671
MiscFeature     0.963014
PoolQC          0.995205
dtype: float64

In [6]:
# Drop every column whose fraction of missing values exceeds 0.17, i.e. 17%
# (not 0.17%), then show the resulting shape.
max_null_share = 0.17
null_share_by_col = df.isnull().mean()
missing_proportion = null_share_by_col[null_share_by_col > 0].sort_values()
cols_to_drop = missing_proportion[missing_proportion > max_null_share].index
df = df.drop(columns=cols_to_drop)
df.shape


(1460, 74)

In [7]:
# Split column names by dtype: object-dtype columns go to categorical_cols, all
# other dtypes to numerical_cols. 'SalePrice' and 'Id' are excluded from the
# numerical list.
non_feature_cols = ('SalePrice', 'Id')
categorical_cols = df.select_dtypes(include='object').columns.tolist()
numerical_cols = [
    name
    for name in df.select_dtypes(exclude='object').columns.tolist()
    if name not in non_feature_cols
]

In [8]:
# Target is 'SalePrice'; the features are every remaining column except 'Id'.
y = df['SalePrice']
X = df.drop(columns=['SalePrice', 'Id'])

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Shapes of X_train, X_test, y_train and y_test, in that order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1168, 72), (292, 72), (1168,), (292,))

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ("imputer2", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

# Route each list of columns to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Train the random forest. random_state is fixed (same seed as the train/test
# split) so that Restart & Run All reproduces the same model and predictions;
# without it every run yields different results.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_transformed, y_train)


In [12]:
# Predict on the preprocessed test features with the fitted random forest.
y_pred = rf.predict(X_test_transformed)

In [13]:
# Display the predictions produced for the test split.
y_pred

array([142237.  , 323873.  , 116048.  , 152667.68, 317113.68,  87852.  ,
       217262.65, 150209.14,  87337.  , 127838.12, 152724.5 , 124130.5 ,
       116104.5 , 204895.85, 177010.  , 132890.25, 194378.8 , 134248.5 ,
       115903.5 , 205047.22, 160739.55, 221086.52, 175625.17, 124385.1 ,
       192178.04, 174432.61, 183458.95, 107892.  , 176541.5 , 194897.88,
       120379.5 , 245605.93, 173568.52, 114258.  , 258955.46, 148344.  ,
       140318.45, 202781.02, 311411.84, 108165.32, 124393.04, 236250.17,
       120733.5 , 365842.3 , 135281.1 , 151546.87, 116391.5 , 128520.  ,
       386520.41, 147283.23, 121732.  , 198360.  , 121308.47, 352368.25,
       139135.  , 237384.45, 191608.7 , 151315.75, 142645.  , 113571.32,
        78060.5 , 146491.6 , 309835.94, 281421.69, 281353.99, 213841.78,
       111610.  , 310001.2 , 114139.33, 165154.43, 128361.77, 130942.5 ,
       113745.5 ,  90353.  , 437615.03, 172073.99, 312279.87, 297288.72,
       137453.25, 124485.5 , 101859.5 , 103699.5 , 