In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [3]:
print(train.shape)
print(test.shape)

(1460, 81)
(1459, 80)


In [4]:
y=train.SalePrice
X_train = train.drop(['SalePrice'],axis=1)
X_test = test

In [5]:
missing_cols = [col for col in X_train.columns if X_train[col].isnull().any()] 
missing_cols

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [6]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer(strategy='most_frequent')
i_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
i_X_test = pd.DataFrame(my_imputer.transform(X_test))

# Imputation removed column names; put them back
i_X_train.columns = X_train.columns
i_X_test.columns = X_test.columns



In [7]:
i_X_train.columns[i_X_train.isnull().any()]

Index([], dtype='object')

In [8]:
X_train = i_X_train 
X_test = i_X_test 

In [9]:
object_cols = [col for col in X_train.columns if X_train[col].dtype=="object" and X_train[col].nunique()<10]
numerical_cols = [col for col in X_train.columns if train[col].dtype in ['float64','int64']]
cols = object_cols + numerical_cols
X_train = X_train[cols]
X_test = X_test[cols]

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Ensure all columns have string type
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)




In [11]:
X_train = OH_X_train
X_test = OH_X_test 

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

model = RandomForestRegressor(n_estimators=100, random_state=0)
scores = -1 * cross_val_score(model, X_train, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')
print("MAE scores:\n", scores)
print("MAE mean scores:\n", scores.mean())

MAE scores:
 [18205.41428082 17373.72928082 18046.56863014 16581.89267123
 19421.1135274 ]
MAE mean scores:
 17925.74367808219


In [13]:
model.fit(X_train,y)
preds = model.predict(X_test)

In [14]:
output = pd.DataFrame({'Id':X_test.Id, 'SalePrice':preds})

In [15]:
output.to_csv("my_submissions.csv",index=False)