In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import category_encoders as ce
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import ShuffleSplit, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, PowerTransformer

from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the raw training and test tables from the local download folder.
train_raw = pd.read_csv('~/Downloads/house-pricing/train.csv')
test_raw = pd.read_csv('~/Downloads/house-pricing/test.csv')

In [None]:
import sys
sys.path.insert(0, '/home/amine/house-prices/helpers')

from transformers.ContinuousFeaturesImputer import ContinuousFeaturesImputer
from transformers.RareCategoriesImputer import RareCategoriesImputer

In [None]:

# Column inventory: every column of train_raw is assigned to one role
# (numerical, categorical, target or to_drop). The asserts at the bottom
# verify that the groups cover the raw frame.

# Street, Utilities, Condition2, RoofMatl: poorly distributed categories
# Alley, PoolQC: most values are missing


# MasVnrArea, BsmtFinSF1, 2ndFlrSF, LowQualFinSF, PoolArea: contain a lot of 0.0 values
date_related = ['GarageYrBlt', 'MoSold', 'YrSold']
boolean = ['CentralAir']
target = 'SalePrice'
# Columns removed from train/test once the engineered features have been derived.
to_drop = ['Id', 'PoolArea', 'PoolQC', 'YearBuilt', 'YearRemodAdd']

# Numeric columns; the date-related columns are appended to this group.
numerical = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
             'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
             'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'OverallCond', 'OverallQual'] + date_related
# Label-valued columns; the boolean column(s) are appended to this group.
categorical = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
              'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
              'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
              'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']  + boolean

# Count check: the three groups plus the single target column must add up to
# the number of raw columns (so no column is listed twice or forgotten).
assert len(categorical )+ len(numerical) + len(to_drop) + 1 == train_raw.shape[1]
# Name check: the union of all groups must equal the raw column set.
# (date_related and boolean are already contained in numerical / categorical.)
assert set(categorical + numerical + date_related + boolean + [target] + to_drop) == set(train_raw.columns)



In [None]:
def add_engineered_features(df, reference_year=2020):
    """Return a copy of ``df`` with the hand-crafted features appended.

    The same derivation is applied to the training and the test frame, so it
    lives in one place instead of two copy-pasted blocks.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the original area / year columns used below.
    reference_year : int, default 2020
        Year against which building age and time since remodelling are measured.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    out = df.copy()
    out['LivibleRatio'] = out['1stFlrSF'] / out['LotArea']
    # "Has veneer" means a positive veneer area, consistent with HasPool and
    # HasBasment below (the flag used to be computed as `== 0`, i.e. inverted).
    out['HasMasVnr'] = out['MasVnrArea'] > 0
    out['BuildingAge'] = reference_year - out['YearBuilt']
    out['LastRemodling'] = reference_year - out['YearRemodAdd']
    out['HasPool'] = out['PoolArea'] > 0
    # NOTE(review): same formula as LivibleRatio, so the two columns are duplicates.
    out['BuiltPercentage'] = out['1stFlrSF'] / out['LotArea']
    out['DiffBuitSold'] = out['YearBuilt'] - out['YrSold']
    # NOTE(review): this is rooms per square foot, not square feet per room as
    # the name suggests - confirm which one is intended before relying on it.
    out['AvrRoomSF'] = out['TotRmsAbvGrd'] / (out['1stFlrSF'] + out['2ndFlrSF'])
    out['2ndfloorPercentage'] = out['2ndFlrSF'] / out['1stFlrSF']
    out['HasBasment'] = out['TotalBsmtSF'] > 0
    out['DiffRemodBuilt'] = out['YearRemodAdd'] - out['YearBuilt']
    return out


train = add_engineered_features(train_raw)
test = add_engineered_features(test_raw)

# Names of the columns created by add_engineered_features.
# ('TotalArea' used to be listed here but no such column is ever created.)
engineered = ['LivibleRatio', 'HasMasVnr', 'BuildingAge', 'LastRemodling', 'HasPool', 'BuiltPercentage', 'DiffBuitSold', 'AvrRoomSF',
              '2ndfloorPercentage', 'HasBasment', 'DiffRemodBuilt']

to_normalize = ['DiffBuitSold', 'BuildingAge', 'LastRemodling', 'DiffRemodBuilt']

# The source columns are only dropped after the derived features exist.
train = train.drop(columns=to_drop)
test = test.drop(columns=to_drop)

In [None]:
# Finding discrete variables: numerical columns with fewer than 20 distinct
# values (a missing value counts as one distinct value here).

discrete = []
for var in numerical:
    distinct_values = train[var].unique()
    if len(distinct_values) < 20:
        print(var, ' values: ', distinct_values)
        discrete.append(var)

print('There are {} discrete variables'.format(len(discrete)))

In [None]:
# Missing values: collect the columns whose share of NaN exceeds the threshold.
treshold = 0.8
missing_share = train.isna().mean()  # per-column fraction of missing entries
mostly_missing = []
for var in train.columns:
    missing_prc = missing_share[var]
    if missing_prc > treshold:
        print(f"{var}: {train[var].unique()}, {missing_prc}.")
        mostly_missing.append(var)

In [None]:
# Distribution and outliers for continuous features:
# one boxplot and one distribution plot per non-discrete numerical column.
excluded = ['Id', 'SalePrice']
continuous = [var for var in numerical if not (var in discrete or var in excluded)]
for var in continuous:
    figure, (box_ax, dist_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: boxplot to spot outliers.
    sns.boxplot(y=train[var], ax=box_ax)
    box_ax.set_title('')
    box_ax.set_ylabel(var)

    # Right panel: distribution of the non-missing values.
    sns.distplot(train[var].dropna(), ax=dist_ax)
    dist_ax.set_ylabel('Number of houses')
    dist_ax.set_xlabel(var)

    plt.show()

In [None]:
# Hold out 20% of the training data; the target is modelled on a log scale.
X_train, X_test, y_train, y_test = train_test_split(
    train, train[target], test_size=0.2, random_state=43
)
# The split was done on the full frame, so strip the target from both feature sets.
for features in (X_train, X_test):
    features.pop(target)

y_train = np.log(y_train)
y_test = np.log(y_test)

In [None]:
# X_train = train.copy()
# y_train = np.log(X_train.pop(target))

# X_test = test.copy()

In [None]:
# Try out the project's ContinuousFeaturesImputer on the training features.
# NOTE(review): the transformer lives in the local helpers package; whether it
# mutates X_train in place is not visible from this notebook - confirm.
continuous_imputer = ContinuousFeaturesImputer(
    drop_features=False,
    create_is_0=True,
    impute_zeros=True,
    drop_outliers=True,
)
result = continuous_imputer.fit_transform(X_train)

In [None]:
# Dealing with missing values in continuous features.
# If most of the values are 0, then create a separate boolean column to indicate
# if 0, and replace the 0 values with the median, then apply a transformation.

for var in continuous:
    # A 1x4 grid is reserved but only the first two slots are drawn.
    figure = plt.figure(figsize=(20, 6))

    box_ax = figure.add_subplot(1, 4, 1)
    sns.boxplot(y=X_train[var], ax=box_ax)
    box_ax.set_title('')
    box_ax.set_ylabel(var)

    dist_ax = figure.add_subplot(1, 4, 2)
    sns.distplot(X_train[var].dropna(), ax=dist_ax)
    dist_ax.set_ylabel('Number of houses')
    dist_ax.set_xlabel(var)

    plt.show()

In [None]:
# Missing values in continuous variables:
# add a 0/1 indicator of missingness, then impute with the training-set median.
vars_with_na = ['LotFrontage', 'GarageYrBlt']
# Medians are taken once, from the training split only, before anything is filled,
# so the hold-out split never influences the imputation value.
train_medians = X_train[vars_with_na].median()
for df in [X_train, X_test]:
    for var in vars_with_na:
        df[var+'_NA'] = np.where(df[var].isnull(), 1, 0)
        # Assign the filled column back: `df[var].fillna(..., inplace=True)` acts
        # on a temporary object and does not update df under pandas Copy-on-Write.
        df[var] = df[var].fillna(train_medians[var])

# for df in [X_train, X_test]:
#     df.MasVnrArea.fillna(X_train.MasVnrArea.median(), inplace=True)

In [None]:
# Missing values in categorical data

# Adding missing label: NaN becomes its own explicit 'Missing' category.
for df in [X_train, X_test]:
    for var in categorical:
        # Assign the result back instead of using a chained inplace fillna, which
        # operates on a temporary object and is a no-op under pandas Copy-on-Write.
        df[var] = df[var].fillna('Missing')

In [None]:
# Restrict BOTH splits to the categorical columns. The encoder and scaler below
# are fitted on X_train and then applied to X_test, so the two frames must keep
# the same columns (previously only X_train was reduced).
# .copy() gives independent frames so the later .loc assignments are unambiguous.
X_train = X_train[categorical].copy()
X_test = X_test[categorical].copy()

# Reference output of the project's RareCategoriesImputer (0.03 = rarity threshold
# passed to it), compared against the manual implementation further down.
result = RareCategoriesImputer(0.03).fit_transform(X_train)

In [None]:
# Adding rare label
treshold = 0.03
def rare_imputation(variable):
    """Replace infrequent labels of ``variable`` with 'Rare' in X_train and X_test.

    A label is frequent when its share of the X_train rows is strictly greater
    than the module-level ``treshold``. The frequent labels are learned from
    X_train only and then applied to both frames, which are modified in place.

    Parameters
    ----------
    variable : str
        Name of a column present in both X_train and X_test.
    """
    # Share of training rows per label. Plain division replaces the former
    # `np.float(len(X_train))`: the np.float alias was removed in NumPy 1.24.
    frequencies = X_train[variable].value_counts() / len(X_train)
    frequent_cat = list(frequencies[frequencies > treshold].index)

    X_train.loc[:, variable] = np.where(X_train[variable].isin(frequent_cat), X_train[variable], 'Rare')
    X_test.loc[:, variable] = np.where(X_test[variable].isin(frequent_cat), X_test[variable], 'Rare')

# find infrequent labels in categorical variables
for var in categorical:
    rare_imputation(var)

In [None]:
# Element-wise comparison of `result` (the RareCategoriesImputer output) with
# X_train, reduced over both axes to a single boolean.
# NOTE(review): presumably both are DataFrames with identical labels - confirm.
(result == X_train).all().all()

In [None]:
# Encoding
encoder = ce.TargetEncoder(cols=categorical)
encoder.fit(X_train, y_train)
X_train, X_test = encoder.transform(X_train), encoder.transform(X_test)

In [None]:
# Scaling: fit a RobustScaler on the training split and apply it to both splits.
scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# First model: gradient-boosted depth-1 trees on the full training split.
param_dist = {
    'objective': 'reg:squarederror',
    'n_estimators': 1_000,
    'reg_lambda': 0.1,
    'max_depth': 1,
    'colsample_bynode': 0.5,
}
model1 = XGBRegressor(**param_dist)
model1.fit(X_train, y_train, eval_set=[(X_train, y_train)], eval_metric="rmse")

# `model` is the notebook-wide "current model" that the evaluation code reads.
model = model1

# Train / hold-out RMSE (on the log target), computed inline: the rmse() helper
# is only defined in a later cell, so calling it here fails on a fresh kernel,
# and its return value was discarded anyway.
train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
print(train_rmse, test_rmse)

# Overlay the distribution of true and predicted hold-out targets.
sns.distplot(y_test)
sns.distplot(model.predict(X_test))

In [None]:
def rmse(selected=False):
    """Return ``(train_rmse, test_rmse)`` of the notebook-global ``model``.

    Reads the globals ``model``, ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. When ``selected`` is truthy it is used to index both feature
    sets (``X_train[selected]`` / ``X_test[selected]``) before predicting;
    otherwise the full feature sets are used.
    """
    if selected:
        features_train, features_test = X_train[selected], X_test[selected]
    else:
        features_train, features_test = X_train, X_test

    train_pred = model.predict(features_train)
    test_pred = model.predict(features_test)

    train_mse = mean_squared_error(train_pred, y_train)
    test_mse = mean_squared_error(test_pred, y_test)
    return np.sqrt(train_mse), np.sqrt(test_mse)

In [None]:
model = RandomForestRegressor(n_jobs=-1)
model.fit(X_train, y_train)
rmse()

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.layers import LayerNormalization
from keras.regularizers import l2
from keras.optimizers import Adam


def _rmse(pred, true):
    """Keras metric: square root of the Keras mean-squared-error of the two tensors.

    NOTE(review): Keras calls metrics as (y_true, y_pred), so the parameter
    names are swapped relative to that convention; the squared error is
    symmetric, so the value is unaffected.
    """
    return tf.keras.backend.sqrt(tf.keras.losses.mean_squared_error(pred, true))


def build_model(input_dim):
    """Build and compile the dense regression network.

    Six ReLU Dense layers (128 down to 4 units), each L2-regularised with
    factor 0.1 on kernel and bias and followed by LayerNormalization, then a
    single linear output unit. Compiled with MSE loss, a default Adam
    optimizer and the ``_rmse`` metric.

    ``input_dim`` is currently unused: the only layer that consumed it (a
    256-unit first layer with l2(0.2)) is disabled, so no explicit input
    size is passed to any layer.
    """
    initializer = tf.keras.initializers.GlorotNormal()
    model = Sequential()
    # One shared initializer instance, fresh regularizer instances per layer.
    for units in (128, 64, 32, 16, 8, 4):
        hidden = Dense(
            units,
            kernel_initializer=initializer,
            activation='relu',
            kernel_regularizer=l2(0.1),
            bias_regularizer=l2(0.1),
        )
        model.add(hidden)
        model.add(LayerNormalization())
    model.add(Dense(1, kernel_initializer=initializer))
    model.compile(loss='mean_squared_error', optimizer=Adam(), metrics=[_rmse])
    return model

# Train the dense network on the scaled features, monitoring the hold-out split.
# (A stray trailing character on the build_model line used to make this cell a
# syntax error.)
model = build_model(X_train.shape[1])
model.fit(X_train, y_train, epochs=1000, verbose=1, batch_size=128, validation_data=(X_test, y_test))

# Y_train, Y_test = Y_train * std + mean, Y_test * std + mean
# Train / hold-out RMSE of the freshly trained network (displayed as cell output).
rmse()


In [None]:
# Rank hold-out rows by absolute prediction error (smallest error first).
# np.ravel flattens the predictions: Keras models return shape (n, 1), which
# cannot be subtracted from the 1-D y_test Series; for models that already
# return 1-D predictions the ravel is a no-op.
abs_errors = (np.ravel(model.predict(X_test)) - y_test).abs()
np.argsort(abs_errors)

In [None]:
# Log-price of the hold-out row at position 184.
# NOTE(review): presumably picked from the error ranking in the previous cell - confirm.
y_test.iat[184]

In [None]:
# Mean, maximum and minimum of the hold-out log-prices, in that order.
tuple(stat() for stat in (y_test.mean, y_test.max, y_test.min))

In [None]:
# Overlay the distributions of the true and the predicted hold-out log-prices.
dist_ax = sns.distplot(y_test)
sns.distplot(model.predict(X_test), ax=dist_ax)

In [None]:
# True vs. predicted hold-out log-price distributions, drawn on one shared axes.
overlay_ax = sns.distplot(y_test)
sns.distplot(model.predict(X_test), ax=overlay_ax)

In [None]:
# Build the submission file: predictions are made on the log scale, so they are
# exponentiated back to prices.
# Requires _predict, model1 and model to be defined in the kernel already.
predicted_price = np.exp(_predict(X_test))

# X_test must be the preprocessed competition test set here, not the 20% hold-out
# split: with the hold-out split the row count does not match test_raw and
# pandas fails with a much less helpful length error.
if len(predicted_price) != len(test_raw):
    raise ValueError(
        f"Got {len(predicted_price)} predictions for {len(test_raw)} test rows; "
        "X_test must hold the preprocessed rows of test_raw."
    )

submission = pd.DataFrame({'Id': test_raw.Id, 'SalePrice': predicted_price})
submission.to_csv("FirstSubmission.csv", index=False)

# Model for 11.5 and 12.5

In [None]:
# X_train, y_train, X_test, y_test = pd.DataFrame(X_train), pd.Series(y_train.reset_index(drop=True)), pd.DataFrame(X_test), pd.Series(y_test.reset_index(drop=True))


# Re-wrap the training features as a DataFrame and give the target a fresh
# 0..n-1 index so the two can be filtered with the same boolean mask.
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train.reset_index(drop=True))

In [None]:
# Keep only the houses whose log-price lies strictly between 11.5 and 12.5.
train_filter = (y_train > 11.5) & (y_train < 12.5)
X_train_fil, y_train_fil = X_train[train_filter], y_train[train_filter]

# The evaluation cells further down read X_test_fil / y_test_fil, so the hold-out
# split is filtered the same way (previously these names were never defined).
# The mask is converted to a plain boolean array so it applies by position,
# regardless of how X_test and y_test are indexed.
test_filter = ((y_test > 11.5) & (y_test < 12.5)).to_numpy()
X_test_fil = pd.DataFrame(X_test)[test_filter]
y_test_fil = y_test[test_filter]


In [None]:
# Shapes of the filtered training and hold-out feature sets.
tuple(part.shape for part in (X_train_fil, X_test_fil))

In [None]:
# Second model: same boosted-stump setup, trained only on the filtered price band
# and with a smaller per-node column sample (0.2).
param_dist = dict(
    objective='reg:squarederror',
    n_estimators=1_000,
    reg_lambda=0.1,
    max_depth=1,
    colsample_bynode=0.2,
)
model = XGBRegressor(**param_dist)
model.fit(X_train_fil, y_train_fil, eval_set=[(X_train_fil, y_train_fil)], eval_metric="rmse")


rmse()

# True vs. predicted log-prices on the filtered hold-out rows, on shared axes.
band_ax = sns.distplot(y_test_fil)
sns.distplot(model.predict(X_test_fil), ax=band_ax)

In [None]:
# RMSE of the current `model` on the filtered training and hold-out rows.
train_pred = model.predict(X_train_fil)
test_pred = model.predict(X_test_fil)

train_mse = mean_squared_error(train_pred, y_train_fil)
test_mse = mean_squared_error(test_pred, y_test_fil)
print(np.sqrt(train_mse), np.sqrt(test_mse))

In [None]:
# Display the two fitted estimators used by _predict: the full-range model and
# the current `model`.
(model1, model)

In [None]:
def _predict(df):
    """Blend the predictions of the two notebook-global models.

    ``model1`` predicts every row of ``df``. Wherever its prediction lies
    strictly between 11.5 and 12.5, the prediction of ``model`` (fed ``df``
    wrapped in a DataFrame) is used instead; everywhere else the ``model1``
    prediction is kept.
    """
    base_pred = model1.predict(df)
    band_pred = model.predict(pd.DataFrame(df))

    in_band = (base_pred > 11.5) & (base_pred < 12.5)

    return np.where(in_band, band_pred, base_pred)

In [None]:
# RMSE of the blended two-model prediction on the filtered training and hold-out rows.
train_pred = _predict(X_train_fil)
test_pred = _predict(X_test_fil)

train_mse = mean_squared_error(train_pred, y_train_fil)
test_mse = mean_squared_error(test_pred, y_test_fil)
print(np.sqrt(train_mse), np.sqrt(test_mse))
