## House Prices Competition level2

In [245]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [246]:
# Load the data, and separate the target
iowa_file_path = 'input/train.csv'
home_data = pd.read_csv(iowa_file_path,index_col='Id')
test_data_path = 'input/test.csv'
# read test data file using pandas
test_data = pd.read_csv(test_data_path,index_col='Id')

In [247]:
# Remove rows with missing target, separate target from predictors
home_data.dropna(axis=0, subset=["SalePrice"], inplace=True)
y = home_data.SalePrice
home_data.drop(["SalePrice"], axis=1, inplace=True)

# To keep things simple, we'll use only numerical predictors
X = home_data.select_dtypes(exclude=['object'])
X_test = test_data.select_dtypes(exclude=['object'])


In [248]:
#features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

In [249]:
# Break off validation set from training data
train_X, val_X, train_y, val_y = train_test_split(X, y,train_size=0.8, test_size=0.2,random_state=0 )

In [250]:
train_X.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,90.0,11694,9,5,2007,2007,452.0,48,0,...,774,0,108,0,0,260,0,0,7,2007
871,20,60.0,6600,5,5,1962,1962,0.0,0,0,...,308,0,0,0,0,0,0,0,8,2009
93,30,80.0,13360,5,7,1921,2006,0.0,713,0,...,432,0,0,44,0,0,0,0,8,2009
818,20,,13265,8,5,2002,2002,148.0,1218,0,...,857,150,59,0,0,0,0,0,7,2008
303,20,118.0,13704,7,5,2001,2002,150.0,0,0,...,843,468,81,0,0,0,0,0,1,2006


In [251]:
print(train_X.shape)
#Number of missing values
missing_val_count_by_column = (train_X.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column>0])


(1168, 36)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [252]:
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [253]:
# columns with missing values 
col_with_missing = [col for col in train_X.columns if train_X[col].isnull().any()]
col_with_missing

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

### Score from Approach 1 (Drop Columns with Missing Values)
Since we are working with both training and validation sets, we are careful to drop the same columns in both DataFrames.

In [254]:
# Drop columns in training and validation data
reduced_train_X = train_X.drop(col_with_missing, axis=1)
reduced_val_X = val_X.drop(col_with_missing, axis=1)

In [255]:
print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_train_X, reduced_val_X, train_y, val_y))

MAE from Approach 1 (Drop columns with missing values):
17837.82570776256


### Score from Approach 2 (Imputation)¶
Next, we use SimpleImputer to replace missing values with the mean value along each column.

Although it's simple, filling in the mean value generally performs quite well (but this varies by dataset). While statisticians have experimented with more complex ways to determine imputed values (such as regression imputation, for instance), the complex strategies typically give no additional benefit once you plug the results into sophisticated machine learning models.

In [256]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
imputed_train_X = pd.DataFrame(my_imputer.fit_transform(train_X))
imputed_val_X = pd.DataFrame(my_imputer.fit_transform(val_X))
# Imputation removed column names; put them back
imputed_train_X.columns = train_X.columns
imputed_val_X.columns = val_X.columns

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_train_X, imputed_val_X, train_y, val_y))


MAE from Approach 2 (Imputation):
18056.85163242009


### Score from Approach 3 (An Extension to Imputation)¶
Next, we impute the missing values, while also keeping track of which values were imputed.

In [257]:
# Make copy to avoid changing original data (when imputing)
train_X_plus = train_X.copy()
val_X_plus = val_X.copy()

for col in col_with_missing:
    train_X_plus[col+"_was_missing"] = train_X_plus[col].isnull()
    val_X_plus[col+"_was_missing"] = val_X_plus[col].isnull()

my_imputer = SimpleImputer()
imputed_train_X_plus = pd.DataFrame(my_imputer.fit_transform(train_X_plus))
imputed_val_X_plus = pd.DataFrame(my_imputer.fit_transform(val_X_plus))
imputed_train_X_plus.columns = train_X_plus.columns
imputed_val_X_plus.columns = val_X_plus.columns
print("MAE from Approach 3 (An extension to imputation):")
print(score_dataset(imputed_train_X_plus, imputed_val_X_plus, train_y, val_y))

MAE from Approach 3 (An extension to imputation):
18135.90062785388


Let's implement approach 2 to give out result

In [258]:
def generate_sub(X,y,test_data,model):

    rf_model_on_full_data = model
    rf_model_on_full_data.fit(X,y)


    # make predictions which we will submit. 
    test_preds = rf_model_on_full_data.predict(test_data)

    ## create my submission file
    output = pd.DataFrame({'Id': test_data.index,
                        'SalePrice': test_preds})
    output.to_csv('out/submission.csv', index=False)

In [259]:
model = RandomForestRegressor(n_estimators=100, random_state=0)
#preprocessing the test data
my_imputer = SimpleImputer()
imputed_test_data = pd.DataFrame(my_imputer.fit_transform(X_test),index=X_test.index)
imputed_test_data.columns = X_test.columns
generate_sub(imputed_train_X,train_y,imputed_test_data,model)