In [2]:
import numpy as np
import pandas as pd

# Name of the CSV file; the relative path expects it to sit in the same folder as the Jupyter Notebook
file_name = 'train.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_name)

# Show the first rows of the dataframe to confirm the file was read correctly
# NOTE(review): a bare `df.head()` as the last expression would render the rich table instead of plain text.
print(df.head())




   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [4]:
# Fraction of null values in each column (mean of the boolean null mask)
df.isnull().mean()

Id               0.000000
MSSubClass       0.000000
MSZoning         0.000000
LotFrontage      0.177397
LotArea          0.000000
                   ...   
MoSold           0.000000
YrSold           0.000000
SaleType         0.000000
SaleCondition    0.000000
SalePrice        0.000000
Length: 81, dtype: float64

In [5]:
# Fraction of nulls for the columns that have at least one null, sorted ascending.
# Above roughly 20% missing, a column becomes hard to use.
df.isnull().mean().loc[lambda share: share > 0].sort_values()

Electrical      0.000685
MasVnrArea      0.005479
BsmtQual        0.025342
BsmtCond        0.025342
BsmtFinType1    0.025342
BsmtExposure    0.026027
BsmtFinType2    0.026027
GarageCond      0.055479
GarageQual      0.055479
GarageFinish    0.055479
GarageYrBlt     0.055479
GarageType      0.055479
LotFrontage     0.177397
FireplaceQu     0.472603
MasVnrType      0.597260
Fence           0.807534
Alley           0.937671
MiscFeature     0.963014
PoolQC          0.995205
dtype: float64

In [6]:
# Exclude from the analysis every column whose fraction of null values is above 0.17 (i.e. 17%).
# Step 1: null fraction per column, restricted to columns with any nulls, sorted ascending.
missing_proportion = df.isnull().mean().loc[lambda share: share > 0].sort_values()
# Step 2: labels of the columns that exceed the threshold.
cols_to_drop = missing_proportion.loc[missing_proportion.gt(0.17)].index
# Step 3: rebind df to the frame without those columns and show the resulting shape.
df = df.drop(cols_to_drop, axis=1)
df.shape


(1460, 74)

In [7]:
# Partition the columns by dtype: object-dtype columns are the categorical ones,
# every other dtype counts as numerical. 'SalePrice' and 'Id' are kept out of the
# numerical list.
categorical_cols = list(df.select_dtypes(include='object').columns)

numerical_cols = [
    col
    for col in df.select_dtypes(exclude='object').columns
    if col not in ('SalePrice', 'Id')
]

In [8]:
# Feature matrix: all columns except 'SalePrice' and 'Id'.
X = df.drop(columns=['SalePrice', 'Id'])
# Target vector: the 'SalePrice' column.
y = df.loc[:, 'SalePrice']

In [9]:
# Hold out 20% of the rows as a test set; random_state=42 makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Sanity check: shapes of the train/test feature frames and target series
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1168, 72), (292, 72), (1168,), (292,))

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

# Transformation pipeline for the numerical columns: fill missing values with the column median
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Transformation pipeline for the categorical columns: fill missing values with the
# most frequent category, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ("imputer2", SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' so categories that appear only in the test set do not raise at transform time
    ("encoder", OneHotEncoder(handle_unknown='ignore'))
])

# Column transformer: route each group of columns to its own pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Fit the transformations on the training set only, then apply them to both sets
# (the test set is transformed with the statistics learned from the training data)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Train the RandomForestRegressor model.
# random_state is fixed (same seed as the train/test split) so that a
# Restart & Run All reproduces the same forest and the same predictions.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_transformed, y_train)


In [12]:
# Predict the target for the transformed test set with the fitted random forest
y_pred = rf.predict(X_test_transformed)

In [13]:
# Display the predicted values
# NOTE(review): this dumps the entire prediction array; a slice such as y_pred[:10] would keep the output short.
y_pred

array([140667.  , 317660.4 , 117424.  , 154699.5 , 322729.61,  87145.83,
       207265.26, 150249.9 ,  88241.49, 131856.84, 156124.95, 120023.16,
       110150.  , 206145.4 , 180410.27, 132699.75, 197275.05, 136890.07,
       115795.  , 205462.21, 165576.08, 223285.95, 176368.64, 124129.25,
       196126.6 , 176123.84, 183982.88, 107838.  , 176803.67, 194584.48,
       122617.1 , 247335.55, 177128.98, 113643.5 , 257164.14, 144219.  ,
       146361.9 , 204416.47, 304865.51, 104008.48, 125238.  , 241185.09,
       120567.  , 381244.2 , 134217.6 , 148939.5 , 115032.99, 127589.5 ,
       396454.96, 148480.68, 121562.83, 200884.5 , 126305.17, 340276.63,
       139384.  , 238779.42, 193906.  , 152176.5 , 145063.  , 113899.  ,
        79027.  , 148807.25, 315206.79, 272875.71, 288575.51, 204742.3 ,
       111939.  , 311821.95, 116930.5 , 162981.06, 127738.84, 132048.25,
       114279.33,  91313.  , 449423.71, 174405.08, 309551.06, 300895.22,
       136934.25, 121819.5 , 103148.  , 102493.66, 