In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the training and test splits from their CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/path/to/train.csv', '/path/to/test.csv')
)


In [None]:
# Summary statistics, training split first, then test split.
for frame in (train_data, test_data):
    print(frame.describe())

# Per-column missing-value counts, largest first, in the same split order.
for frame in (train_data, test_data):
    missing_counts = frame.isnull().sum()
    print(missing_counts.sort_values(ascending=False))

# Histograms of the training frame's columns, 50 bins each, on a large canvas.
train_data.hist(figsize=(20, 15), bins=50)
plt.show()


In [None]:
# Columns that must never be fed to the model as predictors:
#   - 'SalePrice' is the target. The pipeline is fitted on the training frame
#     with this column dropped, so listing it here would make ColumnTransformer
#     look up a column that does not exist and fail during fit.
#   - 'Id' is the row identifier written to the submission file, not a feature.
# errors='ignore' keeps this cell working if either column is absent.
non_feature_columns = ['SalePrice', 'Id']
feature_frame = train_data.drop(columns=non_feature_columns, errors='ignore')

# Split the remaining columns by dtype so each group gets its own preprocessing.
numeric_features = feature_frame.select_dtypes(include=['int64', 'float64']).columns
categorical_features = feature_frame.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through
# transform without raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer. Columns listed in neither group
# (including 'Id') are dropped by ColumnTransformer's default remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [None]:
# Separate the target column from the predictors.
y_train = train_data['SalePrice']
X_train = train_data.drop(columns='SalePrice')

# Chain preprocessing and a 100-tree random forest (fixed seed) into a single
# estimator, then fit it on the training split.
model = RandomForestRegressor(random_state=42, n_estimators=100)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])
pipeline.fit(X_train, y_train)


In [None]:
# Run the fitted pipeline on the test frame, then preview the first five
# predicted values as a quick sanity check.
predictions = pipeline.predict(test_data)
head_predictions = predictions[:5]
print(head_predictions)


In [None]:
# Pair each test-row identifier with its predicted sale price and write the
# result to 'submission.csv' without the DataFrame index.
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': predictions})
output.to_csv('submission.csv', index=False)
# Confirmation message (Portuguese: "Submission file created successfully!").
# The original literal was mojibake (UTF-8 bytes of 'ã' decoded as Mac Roman);
# the intended character is restored here.
print("Arquivo de submissão criado com sucesso!")
