In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll use only numerical predictors
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)


In [2]:
# Shape of training data (num_rows, num_columns)
print(X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(1168, 36)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


# Drop columns with missing values

In [3]:
# Fill in the line below: get names of columns with missing values
missing_val_series = missing_val_count_by_column[missing_val_count_by_column > 0]
missing_val_cols = [col for col in X_train if X_train[col].isnull().any()]

# Fill in the lines below: drop columns in training and validation data

reduced_X_train = X_train.drop(missing_val_cols, axis=1)
reduced_X_valid = X_valid.drop(missing_val_cols, axis=1)


# Imputation

In [30]:
from sklearn.impute import SimpleImputer

# Fill in the lines below: imputation

my_imputer = SimpleImputer(strategy = 'median')
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

from sklearn.impute import SimpleImputer


In [103]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train,X_valid,y_train,y_valid ):
    model = RandomForestRegressor(n_estimators=60,min_samples_split=3, random_state=0)
    model.fit(final_X_train, y_train)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [104]:
print("MAE (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE (Imputation):
17671.529191422953


In [105]:
print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE (Drop columns with missing values):
17644.128476480393


In [106]:
my_imputer = SimpleImputer(strategy = 'median')
final_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
final_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Define and fit model
model = RandomForestRegressor(n_estimators=60,min_samples_split=3, random_state=0)
model.fit(final_X_train, y_train)

print("MAE (Drop columns with missing values):")
print(score_dataset(final_X_train, final_X_valid, y_train, y_valid))

MAE (Drop columns with missing values):
17671.529191422953


In [116]:
lista_colunas = list(X_train.columns)
                   
final_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),columns=lista_colunas)                
final_X_valid = pd.DataFrame(my_imputer.fit_transform(X_valid),columns=lista_colunas)

colunas_pouco_imp = ['3SsnPorch','PoolArea','MiscVal']

refinal_X_train = final_X_train.drop(colunas_pouco_imp, axis=1)
refinal_X_valid = final_X_valid.drop(colunas_pouco_imp, axis=1)

model = RandomForestRegressor(n_estimators=60,min_samples_split=3, random_state=0)
model.fit(refinal_X_train, y_train)
preds = model.predict(refinal_X_valid)
mean_absolute_error(y_valid, preds)

17886.66654553526

In [108]:
print("MAE (Drop columns with missing values):")
print(score_dataset(refinal_X_train, refinal_X_valid, y_train, y_valid)

SyntaxError: unexpected EOF while parsing (<ipython-input-108-af9e4b6643a6>, line 2)

In [117]:
# Vendo quantidade de valores diferentes de 0 por coluna

X_test.fillna(0).astype(bool).sum(axis=0)

MSSubClass       1459
LotFrontage      1232
LotArea          1459
OverallQual      1459
OverallCond      1459
YearBuilt        1459
YearRemodAdd     1459
MasVnrArea        567
BsmtFinSF1        996
BsmtFinSF2        180
BsmtUnfSF        1335
TotalBsmtSF      1417
1stFlrSF         1459
2ndFlrSF          620
LowQualFinSF       14
GrLivArea        1459
BsmtFullBath      608
BsmtHalfBath       93
FullBath         1456
HalfBath          538
BedroomAbvGr     1457
KitchenAbvGr     1457
TotRmsAbvGrd     1459
Fireplaces        729
GarageYrBlt      1381
GarageCars       1382
GarageArea       1382
WoodDeckSF        697
OpenPorchSF       817
EnclosedPorch     251
3SsnPorch          13
ScreenPorch       140
PoolArea            6
MiscVal            51
MoSold           1459
YrSold           1459
dtype: int64

In [None]:
set()