In [81]:
#https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview

import sys
sys.path.append('/home/ec2-user/ml/myMLLib/src')
from myMLLib.src.utils import pandas_util
from myMLLib.src.custom_transformers.OrderedCategoricalDataEncoder import OrderedCategoricalDataEncoder
from myMLLib.src.custom_transformers.OneHotEncoder import OneHotEncoder
from myMLLib.src.custom_transformers.NumericAttributeStandardizer import NumericAttributeStandardizer
import pandas as pd

In [82]:
def prepare_data(df):
    # Missing data implies that house does not have lott frontage
    df['LotFrontage'].fillna(0, inplace=True)
    # Missing date can be assumed tthat house does not have an alley
    df['Alley'].fillna('NA', inplace=True)
    # Missing data can be assumed that MasVnrType does not exists
    df['MasVnrType'].fillna('None', inplace=True)
    # Missing data can be assumed that MasVnrArea is zero
    df['MasVnrArea'].fillna(0, inplace=True)
    # Missing data can be assumed that BsmtQual not exists
    df['BsmtQual'].fillna('NA', inplace=True)
    # Missing data can be assumed that BsmtCond not exists
    df['BsmtCond'].fillna('NA', inplace=True)
    # Fill one row with No if other basement features are available
    df.loc[df.BsmtQual.notna() & df.BsmtExposure.isna(), 'BsmtExposure'] = 'No'
    # Missing data can be assumed that BsmtExposure not exists
    df['BsmtExposure'].fillna('NA', inplace=True)
    df['BsmtFinType1'].fillna('NA', inplace=True)
    df.loc[332, 'BsmtFinType2'] = 'LwQ'
    df['BsmtFinType2'].fillna('NA', inplace=True)

    df['Electrical'].fillna('SBrkr', inplace=True)

    df['FireplaceQu'].fillna('NA', inplace=True)

    df['GarageType'].fillna('NA', inplace=True)
    df['GarageYrBlt'].fillna(df['GarageYrBlt'].min(), inplace=True)
    df['GarageFinish'].fillna('NA', inplace=True)
    df['GarageQual'].fillna('NA', inplace=True)
    df['GarageCond'].fillna('NA', inplace=True)
    df['PoolQC'].fillna('NA', inplace=True)
    df['Fence'].fillna('NA', inplace=True)
    df['MiscFeature'].fillna('NA', inplace=True)
    df['KitchenQual'].fillna('TA', inplace=True)
    
    df['MSZoning'].fillna(df['MSZoning'].value_counts().index[0], inplace=True)
    df['Utilities'].fillna(df['Utilities'].value_counts().index[0], inplace=True)
    df['Exterior1st'].fillna(df['Exterior1st'].value_counts().index[0], inplace=True)
    df['Exterior2nd'].fillna(df['Exterior2nd'].value_counts().index[0], inplace=True)
    df['BsmtFinSF1'].fillna(0, inplace=True)
    df['BsmtFinSF2'].fillna(0, inplace=True)
    df['BsmtUnfSF'].fillna(0, inplace=True)
    df['TotalBsmtSF'].fillna(0, inplace=True)
    df['BsmtFullBath'].fillna(0, inplace=True)
    df['BsmtHalfBath'].fillna(0, inplace=True)
    df['Functional'].fillna(df['Functional'].value_counts().index[0], inplace=True)
    df['GarageCars'].fillna(0, inplace=True)
    df['GarageArea'].fillna(0, inplace=True)
    df['SaleType'].fillna(df['SaleType'].value_counts().index[0], inplace=True)



    # Defining six columns for basement finished type: GLQ, ALQ, BLQ, Rec, LwQ, Unf. Along with area for each
    default_list = [0 for x in range(df.shape[0])]
    df['GLQ'] = pd.Series(default_list.copy(),  index=df.index)
    df['ALQ'] = pd.Series(default_list.copy(),  index=df.index)
    df['BLQ'] = pd.Series(default_list.copy(),  index=df.index)
    df['Rec'] = pd.Series(default_list.copy(),  index=df.index)
    df['LwQ'] = pd.Series(default_list.copy(),  index=df.index)

    for index, row in df.iterrows():
        if row['BsmtFinType1'] != 'NA' and row['BsmtFinType1'] != 'Unf':
            df.loc[index, row['BsmtFinType1']] = int(row['BsmtFinSF1'])
        if row['BsmtFinType2'] != 'NA' and row['BsmtFinType2'] != 'Unf':
            df.loc[index, row['BsmtFinType2']] += int(row['BsmtFinSF2'])

    # Convert date columns into numerical attribute by setting oldest date to 0 and increase by 1 for each year/month/week/day
    # YearBuilt, YearRemodAdd, GarageYrBlt, MoSold, YrSold
    df['YearBuilt'] = df['YearBuilt'] - df['YearBuilt'].min()
    df['YearRemodAdd'] = df['YearRemodAdd'] - df['YearRemodAdd'].min()
    df['GarageYrBlt'] = (df['GarageYrBlt'] - df['GarageYrBlt'].min()).astype(int)
    di = {1: "January", 2: "February", 3: "March", 4: "April", 5: "May", 6: "June", 7: "July", 8: "August", 9:"September", 10:"October", 11:"November", 12:"December" }
    df['MoSold'] = df['MoSold'].map(di)
    df['YrSold'] = df['YrSold'] - df['YrSold'].min()

df = pandas_util.load_data_frame_from_csv_file('/home/ec2-user/ml/house_prices_kaggle_train.csv')
prepare_data(df)
test_df = pandas_util.load_data_frame_from_csv_file('/home/ec2-user/ml/house_prices_kaggle_test.csv')
prepare_data(test_df)

In [83]:
one_hot_encoding_columns = ["MSSubClass", "MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig",
                            "LandSlope", "Neighborhood", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "ExterQual", "ExterCond",
                            "Foundation", "Heating", "CentralAir", "Electrical", "Functional", "PavedDrive", "SaleType", "SaleCondition", "MoSold"]
int_encoding_columns = ["BsmtQual", "BsmtCond", "BsmtExposure", "HeatingQC", "KitchenQual", "FireplaceQu", "PoolQC"]
standardize_columns = ["LotFrontage", "LotArea", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal"]
no_op_columns = ["OverallQual", "OverallCond"]
delete_columns = ["TotalBsmtSF"]

In [84]:
def get_X(df):
    df_integer_encoded = OrderedCategoricalDataEncoder().transform(df[int_encoding_columns])
    df_one_hot_encoded = OneHotEncoder().transform(df[one_hot_encoding_columns])
    df_standardized = NumericAttributeStandardizer().transform(df[standardize_columns])
    return pd.concat([df_integer_encoded, df_one_hot_encoded, df_standardized, df[no_op_columns]], axis=1)


{}

In [91]:
X = get_X(df)
y = df[['SalePrice']]

X_test = get_X(test_df)


train_columns = X.columns
test_columns = X_test.columns

default_list_test = [0 for x in range(X_test.shape[0])]

for column in (set(train_columns) - set(test_columns)):
    X_test[column] = pd.Series(default_list_test.copy(),  index=X_test.index)
    
default_list = [0 for x in range(X.shape[0])]

for column in (set(test_columns) - set(train_columns)):
    X[column] = pd.Series(default_list.copy(),  index=X.index)
    

from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X, y)




  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [92]:
predict_list = tree_reg.predict(X_test)
list_to_append = []
for index in range(0, len(predict_list)):
    list_to_append.append([test_df.iloc[index]['Id'], predict_list[index]])
    
pred_df = pd.DataFrame(list_to_append)

In [93]:
pred_df.to_csv('/home/ec2-user/ml/kag_sub.csv', index=False)