In [274]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import csv
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from scipy import stats
from scipy.stats import norm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)


In [275]:
#df_train contains data as is
df_train = pd.read_csv('train.csv')

target = df_train[['SalePrice']].copy()
df_train_id = df_train[['Id']].copy()

df_train = df_train.drop('SalePrice', axis=1)

### Hyperparameters for coarse feature selecting

* if feature has missing values more then `missing_values_ratio` - feature will be deleted 
* ~~if feature has mostly unique values more then `common_values_ratio` - feature will be deleted~~
* if feature has more then `moda_0_ratio` - feature will be deleted

In [276]:
missing_values_ratio = 0.5
common_values_ratio = 0.9
moda_0_ratio = 0.9

In [277]:
#Remove Columns which contain too many Missing Values
df_train = df_train.loc[:,df_train.columns[df_train.isnull().mean() < missing_values_ratio]]

In [278]:
#Remove columns with mostly unique values
#for column in df_train[cat_cols]:    
#    if len(df_train[column].value_counts()) > common_values_ratio * df_train.shape[0]:
#        df_train = df_train.drop(column, axis=1)

In [279]:
#Remove colums with extremely scewed distributions
for column in df_train.columns:    
    if df_train[column].value_counts().values[0] > moda_0_ratio * df_train.shape[0]:
        df_train = df_train.drop(column, axis=1)

In [280]:
#в колонках GarageX одинаковое количество пропущенных значений
#при этом наиболее важная информация по гаражам содержится в колонке GarageCars
#следовательно, остальные колонки GarageX можно удалить
df_train = df_train.drop(['GarageType'], 1)
df_train = df_train.drop(['GarageYrBlt'], 1)
df_train = df_train.drop(['GarageFinish'], 1)
df_train = df_train.drop(['GarageQual'], 1)
df_train = df_train.drop(['GarageArea'], 1)

In [281]:
# удаляю переменные, имеющие значительную корреляцию с аналогичными по смыслу переменными
# переменные найдены на основе анализа тепловой карты, представленной ниже
df_train = df_train.drop(['TotRmsAbvGrd'], 1)
df_train = df_train.drop(['1stFlrSF'], 1)

### Distinguish numerical from categorical columns
* if feature has unique values less than `unique_value_number_for_numerical` - feature is categorical

In [282]:
unique_value_number_for_numerical = 30

In [283]:
cat_cols = []
num_cols = []
for column in df_train.columns:
    if df_train[column].nunique() < unique_value_number_for_numerical:
        cat_cols.append(column)
    else:
        num_cols.append(column)

print(f"categorical: {cat_cols}\nnumerical: {num_cols}")

categorical: ['MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageCars', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']
numerical: ['Id', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '2ndFlrSF', 'GrLivArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch']


privious automated approach to distinguish numerical and categorical features is not good.
let's do it by hand

In [284]:
num_cols = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '2ndFlrSF', 'GrLivArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr','Fireplaces','GarageCars','MoSold', 'YrSold'
]

cat_cols = ['MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'KitchenQual', 'FireplaceQu', 'SaleType', 'SaleCondition'
]

In [285]:
cols_final = df_train.columns
cols_final_test = [c for c in cols_final if c not in ['SalePrice']]
num_cols_test = [c for c in num_cols if c not in ['SalePrice']]

### Correlation matrix
тепловая карта. позволяет быстро оценить зависимость между переменными и выявить мультиколлинеарность (это плохо, модель не построить)
белые квадраты указвывают на зависимость между переменными

In [None]:

corrmat = df_train.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);

In [None]:
# NOTE: somehow, the heatmap of my dataset doesn't show the null value sin the columns 
# 'country', 'province' and 'variety'. Please help me to fix this.
def nullscan(df_check, save=False):
    '''
    df: a dataframe on which we want to perofrm the nullscan
    save: determines, whether you want to save the .png of the plot or not
    
    plots the rate of null values per column in a dataframe using 
    a seaborn heatmap and a barplot.
    '''    
    # a df with the same size of the original dataframe, containing True in cells containing NUll values.
    # and False in all the other cells.
    df_nulls = df_check.isna()
    # a series containing the sum of all values within a column having the column names as indices.
    # True is interpreted as 1 and False is interpreted as 0 
    nulls_per_col = df_nulls.sum(axis=0)
    # the rate makes it way more interpretable:
    nulls_per_col /= len(df_check.index)

    with plt.style.context('dark_background'):
        fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(8, 10))
    
        # ax1 is losely based on: https://www.kaggle.com/ipshitagh/wine-dataset-data-cleaning
        # NOTE: I could have used the cmap viridis or anything else instead, 
        # but I want to make clear that you can use any customized cmap as well.
        vir = matplotlib.cm.get_cmap('viridis')
        colormap = matplotlib.colors.ListedColormap([vir(0), 'darkorange'])
        sns.heatmap(df_check.isnull(), cmap=colormap, cbar=False, yticklabels=False, ax=ax1)
    
        nulls_per_col.plot(kind='bar', color='darkorange', x=nulls_per_col.values, 
                           y=nulls_per_col.index, ax=ax2, width=1, linewidth=1, 
                           edgecolor='black', align='edge', label='Null value rate')
        
        ax2.set_ylim((0,1))
        # centered labels
        labels=df_check.columns
        ticks = np.arange(0.5, len(labels))
        ax2.xaxis.set(ticks=ticks, ticklabels=labels)
    
        # hide spines:
        # NOTE: I could have used ax2.set_frameon(False), 
        # but I wanted the bottom and the left spine to stay white.
        ax2.spines['top'].set_color('black')
        ax2.spines['right'].set_color('black')
        
        
        
        # workaround to visualize very small amounts of null values per col
        #na_ticks = ticks[(nulls_per_col > 0) & (nulls_per_col < 0.05)]
        #if (len(na_ticks) > 0):
        #    ax2.plot(na_ticks, [0,]*len(na_ticks), 's', c='darkorange', markersize=10, 
        #             label='Very few missing values')
    
        fig.suptitle('Null Value Rate per Column', fontsize=30, y=1.05)
        ax2.legend()
        fig.tight_layout() 
        if(save):
            plt.savefig('nullscan.png')
        plt.show()
nullscan(df_train, save=False)

In [286]:
test = pd.read_csv('test.csv')[cols_final_test]
test_id = pd.read_csv('test.csv')['Id']

train_and_test = pd.concat([df_train, test], axis=0)

### Mean imputation
* for categorical variables - replace missing values by the most common value
* for numeric variables - replace by median value

In [287]:
# train
for column in df_train.columns:
    if column in cat_cols:
        df_train[column] = df_train[column].fillna(value=train_and_test[column].value_counts().index[0])
    else: 
        df_train[column] = df_train[column].fillna(value=train_and_test[column].median())

# test 
for column in test.columns:
    if column in cat_cols:
        test[column] = test[column].fillna(value=train_and_test[column].value_counts().index[0])
    else: 
        test[column] = test[column].fillna(value=train_and_test[column].median())

### Numerical features scaling (not for features with log-transform)
1. scaler fit on train_and_test
2. scaler transform numerical values to (-1,1) range

In [288]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

log_transformation_col = ['GrLivArea', 'TotalBsmtSF', 'BsmtUnfSF', 'WoodDeckSF', 'TotalPorchSF', 'LotArea']

col_for_scal = [col for col in num_cols_test if col not in log_transformation_col]

for column in col_for_scal:    
    scaler.fit(train_and_test[column].values.reshape(-1,1))
    df_train[column] = scaler.transform(df_train[column].values.reshape(-1,1))
    test[column] = scaler.transform(test[column].values.reshape(-1,1))

### We need to encode categorical data
1. first approach is to use OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train_and_test[cat_cols])

data_cat = enc.transform(df_train[cat_cols])
data_cat.columns = enc.get_feature_names(cat_cols)
  
data_cat = pd.DataFrame.sparse.from_spmatrix(data_cat)
data_cat.columns = enc.get_feature_names(cat_cols)
df_train = df_train.drop(cat_cols, axis=1)
df_train = pd.concat([df_train, data_cat], axis=1)
#df_train.head()

In [None]:
test_cat = test[cat_cols]
test_cat = enc.transform(test[cat_cols])
test_cat = pd.DataFrame(test_cat.todense())
test_cat.columns = enc.get_feature_names(cat_cols)

test = test.drop(cat_cols, axis=1)
test = test.reset_index()
test_cat = test_cat.reset_index()
test = pd.concat([test, test_cat], axis=1)
test = test.drop('index', axis=1)
#test.head()

2. Second approach is to use hand encoding based

In [289]:
#MSSubClass
# 10 - low SalePrice median 
# 20 - medium SalePrice median 
# 30 - high SalePrice median 
mssubclass_category_codes = {
    20: 30, 30: 10, 40: 20, 45: 10, 50: 30, 60: 30,
    70: 20, 75: 20, 80: 20, 85: 10, 90: 10, 120: 20,
    150: 20, 160: 20, 180: 10, 190: 20 
}
 
#MSZoning
#only this codes in dataset: RL, RM, FV, RH, C (all)
mszoning_codes = {'C (all)': 10, 'RM': 20, 'RH': 30, 'RL': 40, 'FV': 50}

#Street
street_codes = {'Grvl': 1, 'Pave': 2}

#LotShape
lotshape_codes = {'Reg': 10, 'IR1': 20, 'IR2': 30, 'IR3': 40}
#LandContour
landcontour_codes = {'Bnk': 10, 'Lvl': 20, 'Low': 30, 'HLS': 40}
#Utilities
utilities_codes = {'ELO': 1, 'NoSeWa': 2, 'NoSewr': 3, 'AllPub': 4}
#LotConfig
lotconfig_codes = {'CulDSac': 1, 'Corner': 2, 'FR2': 3, 'FR3': 4, 'Inside': 5}
#LandSlope
landslope_codes = {'Sev': 1, 'Mod': 2, 'Gtl': 3}
#Neighborhood
#inaccuracy in dataset:
# not 'Names' but 'NAmes'
neighborhood_codes = {
    #1
    'Blmngtn': 16, 'Blueste': 8, 'BrDale': 3, 'BrkSide': 6, 'ClearCr': 18, 'CollgCr': 17, 'Crawfor': 19, 'Edwards': 5,  'Gilbert': 14, 
    #10    
    'IDOTRR': 2, 'MeadowV': 1, 'Mitchel': 12, 'NAmes': 10, 'NoRidge': 24, 'NPkVill': 11,  'NridgHt': 25, 'NWAmes': 15,  'OldTown': 4, 'SWISU': 9, 
    #20
    'Sawyer': 7, 'SawyerW': 13, 'Somerst': 21, 'StoneBr': 23, 'Timber': 22,  'Veenker': 20
}

#Condition 1 and 2
condition_codes = {
    'Artery': 1, 'Feedr': 2, 'Norm': 3, 'RRNn': 4, 'RRAn': 5,
    'PosN': 6, 'PosA': 7, 'RRNe': 8, 'RRAe': 9
}

#OverallCond sort by style's median price
overallcond_codes = {
    1: 1, 2: 2, 9: 3, 3: 4, 4: 5,
    8: 6, 7: 7, 6: 8, 5: 9
}

#BldgType
#inaccuracy in dataset:
# not 'TwnhsI' but 'Twnhs'
# not '2FmCon' but '2fmCon'
# not 'Duplx' but 'Duplex'
bldgtype_codes = {'TwnhsE': 1, 'Twnhs': 2, '1Fam': 3, '2fmCon': 4, 'Duplex': 5}

#HouseStyle sort by style's median price
housestyle_codes = {'1.5Unf': 1, '1.5Fin': 2, '2.5Unf': 3, 'SFoyer': 4, '1Story': 5, 'SLvl': 6, '2Story': 7, '2.5Fin': 8}

#RoofStyle
roofstyle_codes = {'Flat': 1, 'Gable': 2, 'Gambrel': 3, 'Hip': 4, 'Mansard': 5, 'Shed': 6}

#RoofMatl
roofmatl_codes = {'Membran': 1, 'WdShake': 2, 'WdShngl': 3, 'Roll': 4, 'Tar&Grv': 5, 'Metal': 6, 'CompShg': 7, 'ClyTile': 8}

#Exterior 1st and 2nd 
#inaccuracy in dataset:
# neither 'WdShing' not 'Wd Sdng' but 'Wd Shng'
# not 'CemntBd' but 'CmentBd'
# not 'BrkComm' but 'Brk Cmn'
exterior_codes = {
    'WdShing': 1, 'Wd Shng': 1, 'Wd Sdng': 2, 'AsbShng': 3, 'AsphShn': 4, 'CBlock': 5, 'CmentBd': 6, 'CemntBd': 6,
    'HdBoard': 7, 'Stone': 8, 'PreCast': 9, 'Other': 10, 'Plywood': 11, 'BrkComm': 12, 'Brk Cmn': 12,
    'VinylSd': 13, 'MetalSd': 14, 'Stucco': 15, 'ImStucc': 16, 'BrkFace': 17
} 

#MasVnrType
masvnrtype_codes = {'None': 1, 'Stone': 2, 'CBlock': 3, 'BrkCmn': 4, 'BrkFace': 5}

#ExterQual
#ExterCond
#HeatingQC
#KitchenQual
fivelevel_codes = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5} 

#BsmtQual
#BsmtCond
#FireplaceQu
#GarageQual
#GarageCond
sixlevel_codes = {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

#Foundation
foundation_codes = {'Wood': 1, 'BrkTil': 2, 'CBlock': 3, 'Stone': 4, 'Slab': 5, 'PConc': 6} 

#BsmtExposure
bsmtexposure_codes = {'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5}

#BsmtFinType1 and BsmtFinType2
bsmtfintype_codes = {'NA': 1, 'Unf': 2, 'LwQ': 3, 'Rec': 4, 'BLQ': 5, 'ALQ': 6, 'GLQ': 7}

#Heating
heating_codes = {'Wall': 1, 'OthW': 2, 'Floor': 3, 'GasA': 4, 'Grav': 5, 'GasW': 6}

#CentralAir
yno_codes = {'N': 0, 'Y': 1}

#Electrical
electrical_codes = {'FuseP': 1, 'FuseF': 2, 'Mix': 3, 'FuseA': 4, 'SBrkr': 5}

#Functional
functional_codes = {'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8} 

# GarageType
garagetype_codes = {
    'NA': 1, 'CarPort': 2, 'Detchd': 3, 'Attchd': 4,
    'Basment': 5, 'BuiltIn': 6, '2Types': 7
}

#GarageFinish
garagefinish_codes = {'NA': 1, 'Unf': 2, 'RFn': 3, 'Fin': 4}

#PavedDrive
paveddrive_codes = {'N': 1, 'P': 2, 'Y': 3}

#PoolQC
poolqc_codes = {'NA': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5} 

#Fence
fence_codes = {'NA': 1, 'MnWw': 2, 'GdWo': 3, 'MnPrv': 4, 'GdPrv': 5} 

#MiscFeature
miscfeature_codes = {'NA': 0, 'Othr': 1, 'Shed': 1, 'Gar2': 1, 'Elev': 2, 'TenC': 2}

#SaleType sort by type's median price
saletype_codes = {
    'Oth': 1, 'ConLI': 2, 'COD': 3, 'ConLD': 4, 'VWD': 5, 'ConLw': 6,
    'WD': 7, 'CWD': 8, 'New': 9, 'Con': 10
} 

#SaleCondition sort by condition's median price
salecondition_codes = {'AdjLand': 1, 'Abnorml': 2, 'Family': 3, 'Alloca': 4, 'Normal': 5, 'Partial': 6} 

In [290]:
def handy_map(df):
    #df.MSSubClass = df.MSSubClass.map(mssubclass_codes)
    df['MSSubclass_category'] = df.MSSubClass.map(mssubclass_category_codes)
    df.MSSubClass = df.MSSubClass.astype(np.int64, copy=False)
    df.MSZoning = df.MSZoning.map(mszoning_codes).astype(np.int64, copy=False)
    #df.Street = df.Street.map(street_codes).astype(np.int64, copy=False)
    df.LotShape = df.LotShape.map(lotshape_codes).astype(np.int64, copy=False)
    df.LandContour = df.LandContour.map(landcontour_codes).astype(np.int64, copy=False)
    df.LotConfig = df.LotConfig.map(lotconfig_codes).astype(np.int64, copy=False)
    df.Condition1 = df.Condition1.map(condition_codes).astype(np.int64, copy=False)
    df.Neighborhood = df.Neighborhood.map(neighborhood_codes).astype(np.int64, copy=False)
    df.BldgType = df.BldgType.map(bldgtype_codes).astype(np.int64, copy=False)
    df.HouseStyle = df.HouseStyle.map(housestyle_codes).astype(np.int64, copy=False)
    df.RoofStyle = df.RoofStyle.map(roofstyle_codes).astype(np.int64, copy=False)
    df.Exterior1st = df.Exterior1st.map(exterior_codes).astype(np.int64, copy=False)
    df.Exterior2nd = df.Exterior2nd.map(exterior_codes).astype(np.int64, copy=False)
    df.MasVnrType = df.MasVnrType.map(masvnrtype_codes).astype(np.int64, copy=False)
    df.ExterQual = df.ExterQual.map(fivelevel_codes).astype(np.int64, copy=False)
    df.ExterCond = df.ExterCond.map(fivelevel_codes).astype(np.int64, copy=False)
    df.HeatingQC = df.HeatingQC.map(fivelevel_codes).astype(np.int64, copy=False)
    df.KitchenQual = df.KitchenQual.map(fivelevel_codes).astype(np.int64, copy=False)
    df.Foundation = df.Foundation.map(foundation_codes).astype(np.int64, copy=False)

    
    df.FireplaceQu = df.FireplaceQu.map(sixlevel_codes).astype(np.int64, copy=False)

    #df.CentralAir = df.CentralAir.map(yno_codes).astype(np.int64, copy=False)
    #df.Electrical = df.Electrical.map(electrical_codes).astype(np.int64, copy=False)
    #df.Functional = df.Functional.map(functional_codes).astype(np.int64, copy=False)
    #df.PavedDrive = df.PavedDrive.map(paveddrive_codes).astype(np.int64, copy=False)
    df.BsmtExposure = df.BsmtExposure.map(bsmtexposure_codes).astype(np.int64, copy=False)
    df.SaleCondition = df.SaleCondition.map(salecondition_codes).astype(np.int64, copy=False)
    df.SaleType = df.SaleType.map(saletype_codes).astype(np.int64, copy=False)
    
    #df.OverallCond = df.OverallCond.map(overallcond_codes).astype(np.int64, copy=False)

    df.BsmtQual = df.BsmtQual.map(sixlevel_codes).astype(np.int64, copy=False)
    df.BsmtCond = df.BsmtCond.map(sixlevel_codes).astype(np.int64, copy=False)
    df.BsmtFinType1 = df.BsmtFinType1.map(bsmtfintype_codes).astype(np.int64, copy=False)
    df.BsmtFinType2 = df.BsmtFinType2.map(bsmtfintype_codes).astype(np.int64, copy=False)
    '''
    df.BsmtFinSF1 = df.BsmtFinSF1.fillna(0)    
    df.BsmtFinSF2 = df.BsmtFinSF2.fillna(0)        
    df.BsmtUnfSF = df.BsmtUnfSF.fillna(0)    
    df.TotalBsmtSF = df.TotalBsmtSF.fillna(0)   
    df.GarageCars = df.GarageCars.fillna(0)
    
    cat_cols = ['MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'KitchenQual', 'FireplaceQu', 'SaleType', 'SaleCondition'
    ]    
    '''
    return df

df_train = handy_map(df_train)
test = handy_map(test)

In [291]:
def add_columns(df):
    # общее количество ванн
    df['TotalBath'] = df.FullBath + df.BsmtFullBath + .5 * (df.HalfBath)    
    df.TotalBath = df.TotalBath.fillna(0)
    #удаляем исходные колонки
    df = df.drop(['FullBath'],1)
    df = df.drop(['BsmtFullBath'],1)   
    df = df.drop(['HalfBath'],1)

    # общая площадь веранд
    df['TotalPorchSF'] = df.OpenPorchSF + df.EnclosedPorch + 1
    #удаляем исходные колонки
    df = df.drop(['OpenPorchSF'],1)
    df = df.drop(['EnclosedPorch'],1)    

    df['TotalQual'] = 5 * df.OverallQual * df.OverallCond + 13 * df.ExterQual * df.ExterCond + 13 * df.KitchenQual
    #удаляем исходные колонки
    df = df.drop(['OverallQual'],1)
    df = df.drop(['OverallCond'],1)
    df = df.drop(['ExterQual'],1)
    df = df.drop(['ExterCond'],1)
    df = df.drop(['KitchenQual'],1)

    df['BsmtTotalQual'] = df.BsmtQual * df.BsmtCond + \
        df.BsmtFinType1 * (df.BsmtFinSF1/(df.TotalBsmtSF + 1)) + \
        df.BsmtFinType2 * (df.BsmtFinSF2/(df.TotalBsmtSF + 1)) - \
        df.BsmtUnfSF/(df.TotalBsmtSF + 1)

    #df['BsmtQualFin_1and2'] = (df.BsmtFinType1 * np.log(df.BsmtFinSF1 + 1) + df.BsmtFinType2 * np.log(df.BsmtFinSF2 + 1)) * np.log(df.TotalBsmtSF + 1)

    #удаляем исходные колонки
    '''df = df.drop(['BsmtQual'],1)
    df = df.drop(['BsmtCond'],1)
    df = df.drop(['BsmtFinType1'],1)
    df = df.drop(['BsmtFinType2'],1)
    df = df.drop(['BsmtFinSF1'],1)
    df = df.drop(['BsmtFinSF2'],1)
    df = df.drop(['TotalBsmtSF'],1)
    df = df.drop(['BsmtUnfSF'],1)'''

    return df

df_train = add_columns(df_train)
test = add_columns(test)

In [292]:
df_train.describe(include='all')

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,YearBuilt,YearRemodAdd,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,2ndFlrSF,GrLivArea,BedroomAbvGr,Fireplaces,FireplaceQu,GarageCars,WoodDeckSF,MoSold,YrSold,SaleType,SaleCondition,MSSubclass_category,TotalBath,TotalPorchSF,TotalQual,BsmtTotalQual
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.9,37.14,0.02,10516.83,14.08,20.5,4.13,12.84,3.03,2.91,5.28,-0.0,0.03,2.41,10.05,9.96,2.34,0.01,4.26,3.57,3.01,3.34,4.57,0.0,2.27,-0.02,567.24,1057.43,4.15,0.02,1515.46,0.01,0.02,3.72,0.0,94.24,0.04,0.02,7.0,4.83,25.59,-0.01,0.97,181.35,10.18
std,421.61,42.3,7.96,0.94,9981.26,5.82,4.5,1.43,6.69,0.88,0.71,1.55,1.0,0.99,0.83,4.59,4.53,1.81,1.01,1.6,0.68,0.28,1.04,2.07,1.0,0.87,0.95,441.87,438.71,0.96,1.02,525.48,0.99,1.0,0.6,0.98,125.34,1.0,1.01,1.02,0.89,6.86,1.48,1.3,32.51,2.55
min,1.0,20.0,10.0,-2.07,1300.0,10.0,10.0,1.0,1.0,1.0,1.0,1.0,-3.28,-1.64,1.0,1.0,1.0,1.0,-0.57,1.0,2.0,1.0,1.0,2.0,-0.97,2.0,-0.29,0.0,0.0,1.0,-0.79,334.0,-3.48,-0.92,1.0,-2.32,0.0,-1.92,-1.36,1.0,1.0,10.0,-2.22,-0.06,81.76,1.0
25%,365.75,20.0,40.0,-0.4,7553.5,10.0,20.0,3.0,7.0,3.0,3.0,5.0,-0.57,-0.83,2.0,7.0,7.0,1.0,-0.57,3.0,3.0,3.0,3.0,2.0,-0.97,2.0,-0.29,223.0,795.75,3.0,-0.79,1129.5,-1.05,-0.92,3.0,-1.01,0.0,-0.45,-0.6,7.0,5.0,20.0,-1.23,-0.06,156.16,8.45
50%,730.5,50.0,40.0,-0.06,9478.5,10.0,20.0,5.0,13.0,3.0,3.0,5.0,0.06,0.47,2.0,13.0,13.0,1.0,-0.57,3.0,4.0,3.0,4.0,5.0,-0.13,2.0,-0.29,477.5,991.5,5.0,-0.79,1464.0,0.17,0.62,4.0,0.31,0.0,-0.08,0.16,7.0,5.0,30.0,-0.32,0.53,168.59,11.0
75%,1095.25,70.0,40.0,0.42,11601.5,20.0,20.0,5.0,17.0,3.0,3.0,7.0,0.95,0.94,2.0,13.0,13.0,5.0,0.35,6.0,4.0,3.0,4.0,7.0,0.59,2.0,-0.29,808.0,1298.25,5.0,0.91,1776.75,0.17,0.62,4.0,0.31,168.0,0.66,0.92,7.0,5.0,30.0,0.68,1.49,206.36,11.7
max,1460.0,190.0,50.0,10.44,215245.0,40.0,40.0,5.0,25.0,9.0,5.0,8.0,1.28,1.23,6.0,17.0,17.0,5.0,8.35,6.0,5.0,4.0,4.0,7.0,11.42,7.0,8.42,2336.0,6110.0,5.0,4.03,5642.0,6.25,3.72,5.0,2.93,857.0,2.13,1.68,10.0,6.0,30.0,7.11,9.47,419.82,19.98


In [293]:
def log_transform(df, col):
    zero = df[df[col] == 0].shape[0] 
    if zero:
        hascol = 'Has'+ col
        df[hascol] = pd.Series(len(df[col]), index=df.index)
        df[hascol] = 0 
        df.loc[df[col]>0, hascol] = 1
        df.loc[df[hascol]==1, col] = np.log(df[col])
        df = df.drop([hascol],1)
    else:
        df[col] = np.log(df[col])

log_transform(target, 'SalePrice')

for c in log_transformation_col:
    log_transform(df_train, c)
    log_transform(test, c)

## Feature Importance: Random Forest

In [297]:
df_train.describe(include='all')

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,YearBuilt,YearRemodAdd,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,2ndFlrSF,GrLivArea,BedroomAbvGr,Fireplaces,FireplaceQu,GarageCars,WoodDeckSF,MoSold,YrSold,SaleType,SaleCondition,MSSubclass_category,TotalBath,TotalPorchSF,TotalQual,BsmtTotalQual,HasTotalBsmtSF,HasBsmtUnfSF,HasWoodDeckSF
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,946.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.9,37.14,0.02,9.11,14.08,20.5,4.13,12.84,3.03,2.91,5.28,-0.0,0.03,2.41,10.05,9.96,2.34,0.01,4.26,3.57,3.01,3.34,4.57,0.0,2.27,-0.02,5.65,6.75,4.15,0.02,7.27,0.01,0.02,3.72,0.0,2.45,0.04,0.02,7.0,4.83,25.59,-0.01,0.09,181.35,10.18,0.97,0.92,0.48
std,421.61,42.3,7.96,0.94,0.52,5.82,4.5,1.43,6.69,0.88,0.71,1.55,1.0,0.99,0.83,4.59,4.53,1.81,1.01,1.6,0.68,0.28,1.04,2.07,1.0,0.87,0.95,1.85,1.15,0.96,1.02,0.33,0.99,1.0,0.6,0.98,2.59,1.0,1.01,1.02,0.89,6.86,1.48,0.84,32.51,2.55,0.16,0.27,0.5
min,1.0,20.0,10.0,-2.07,7.17,10.0,10.0,1.0,1.0,1.0,1.0,1.0,-3.28,-1.64,1.0,1.0,1.0,1.0,-0.57,1.0,2.0,1.0,1.0,2.0,-0.97,2.0,-0.29,0.0,0.0,1.0,-0.79,5.81,-3.48,-0.92,1.0,-2.32,0.0,-1.92,-1.36,1.0,1.0,10.0,-2.22,-2.46,81.76,1.0,0.0,0.0,0.0
25%,365.75,20.0,40.0,-0.4,8.93,10.0,20.0,3.0,7.0,3.0,3.0,5.0,-0.57,-0.83,2.0,7.0,7.0,1.0,-0.57,3.0,3.0,3.0,3.0,2.0,-0.97,2.0,-0.29,5.41,6.68,3.0,-0.79,7.03,-1.05,-0.92,3.0,-1.01,0.0,-0.45,-0.6,7.0,5.0,20.0,-1.23,-0.53,156.16,8.45,1.0,1.0,0.0
50%,730.5,50.0,40.0,-0.06,9.16,10.0,20.0,5.0,13.0,3.0,3.0,5.0,0.06,0.47,2.0,13.0,13.0,1.0,-0.57,3.0,4.0,3.0,4.0,5.0,-0.13,2.0,-0.29,6.17,6.9,5.0,-0.79,7.29,0.17,0.62,4.0,0.31,0.0,-0.08,0.16,7.0,5.0,30.0,-0.32,0.08,168.59,11.0,1.0,1.0,0.0
75%,1095.25,70.0,40.0,0.42,9.36,20.0,20.0,5.0,17.0,3.0,3.0,7.0,0.95,0.94,2.0,13.0,13.0,5.0,0.35,6.0,4.0,3.0,4.0,7.0,0.59,2.0,-0.29,6.69,7.17,5.0,0.91,7.48,0.17,0.62,4.0,0.31,5.12,0.66,0.92,7.0,5.0,30.0,0.68,0.73,206.36,11.7,1.0,1.0,1.0
max,1460.0,190.0,50.0,10.44,12.28,40.0,40.0,5.0,25.0,9.0,5.0,8.0,1.28,1.23,6.0,17.0,17.0,5.0,8.35,6.0,5.0,4.0,4.0,7.0,11.42,7.0,8.42,7.76,8.72,5.0,4.03,8.64,6.25,3.72,5.0,2.93,6.75,2.13,1.68,10.0,6.0,30.0,7.11,2.25,419.82,19.98,1.0,1.0,1.0


In [298]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

regr = RandomForestRegressor(random_state=42)

params = {'max_features': [13,15], 'max_depth': [25,28], 'n_estimators': [300, 350]}
gs = GridSearchCV(regr, params)
gs.fit(df_train, target)
regr = gs.best_estimator_
regr_score = gs.best_score_

regr_importance = pd.Series(regr.feature_importances_)
sorted_importance = regr_importance.sort_values(ascending=False)
sorted_colnames = df_train.columns[sorted_importance.index]
pd.Series(index=sorted_colnames, data=sorted_importance.values).head(30).plot.bar();

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
print(regr)

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(random_state=42)

params = {'loss':('ls', 'huber'), 'n_estimators': [200, 300], 'learning_rate': [0.1, 0.2]}
gs = GridSearchCV(gbr, params)
gs.fit(df_train, target)
gbr = gs.best_estimator_
gbr_score = gs.best_score_

gbr_importance = permutation_importance(gbr, df_train, target, scoring='neg_mean_squared_error').importances_mean
pd.Series(data=gbr_importance, index=df_train.columns).sort_values(ascending=False).head(30).plot.bar();


In [None]:
print(gbr)

In [None]:
gbr_score

In [None]:
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(random_state=42)

params = {'n_estimators':(50, 100, 200), 'learning_rate':(1, 1.1, 0.9), 'loss': ['linear', 'square', 'exponential']}
gs = GridSearchCV(ada, params)
gs.fit(df_train, target)
ada = gs.best_estimator_
ada_score = gs.best_score_

ada_importance = permutation_importance(ada, df_train, target, scoring='neg_mean_squared_error').importances_mean
pd.Series(data=ada_importance, index=df_train.columns).sort_values(ascending=False).head(30).plot.bar();

In [None]:
print(ada)


In [None]:
ada_score

In [None]:
from sklearn.linear_model import ElasticNet

en = ElasticNet(random_state=42)

params = {'selection':('cyclic', 'random'), 'alpha':(1.0, 0.8), 'l1_ratio':(.3, .5, .7)}
gs = GridSearchCV(en, params)
gs.fit(df_train, target)
en = gs.best_estimator_
en_score = gs.best_score_

en_importance = permutation_importance(en, df_train, target, scoring='neg_mean_squared_error').importances_mean
pd.Series(data=en_importance, index=df_train.columns).sort_values(ascending=False).head(30).plot.bar();

In [None]:
en

In [None]:
[regr_score, en_score, ada_score, gbr_score]

In [None]:
# the best column names are stored in best.index
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
regr_importance_s = pd.Series(minmax.fit_transform(regr_importance.values.reshape((-1,1))).flat)
en_importance_s = pd.Series(minmax.fit_transform(en_importance.reshape((-1,1))).flat)
ada_importance_s = pd.Series(minmax.fit_transform(ada_importance.reshape((-1,1))).flat)
gbr_importance_s = pd.Series(minmax.fit_transform(gbr_importance.reshape((-1,1))).flat)

importance_data = pd.Series(1/4*(regr_importance_s*regr_score + en_importance_s*en_score + ada_importance_s*ada_score + gbr_importance_s*gbr_score))
importance = pd.Series(data=importance_data.values, index=df_train.columns)
best = importance.sort_values(ascending=False).head(30)
plot = best.plot.bar();

plot.set_title('Best scoring Features')
plot.set_xlabel('Feature')
plot.set_ylabel('Sum of scores, weighted by model performance')

In [None]:
final = GradientBoostingRegressor(random_state=42)

params = {'loss':('ls', 'huber'), 'n_estimators': [200, 300], 'learning_rate': [0.1, 0.2]}
gs = GridSearchCV(final, params)
gs.fit(df_train[best.index], target)
final = gs.best_estimator_
final_score = gs.best_score_
final_score

In [None]:
rf_final = RandomForestRegressor(random_state=42)

params = {'max_features': [10,12], 'max_depth': [25], 'n_estimators': [300]}
gs = GridSearchCV(rf_final, params)
gs.fit(df_train[best.index], target)
rf_final = gs.best_estimator_
rf_final_score = gs.best_score_
rf_final_score

In [None]:
ada_final = AdaBoostRegressor(random_state=42, learning_rate=1.1, loss='exponential')

params = {'n_estimators':(200, 250)}
gs = GridSearchCV(ada_final, params)
gs.fit(df_train[best.index], target)
ada_final = gs.best_estimator_
ada_final_score = gs.best_score_
ada_final_score

In [None]:
en_final = ElasticNet(alpha=0.8, l1_ratio=0.7, random_state=42)

params = {'selection':('cyclic', 'random')}
gs = GridSearchCV(en_final, params)
gs.fit(df_train[best.index], target)
en_final = gs.best_estimator_
en_final_score = gs.best_score_
en_final_score

In [None]:
preds = 1/2*(final.predict(test[best.index]) + rf_final.predict(test[best.index]))
preds

In [None]:
df_result_x_test = pd.DataFrame(data={'Id': test['Id'], 'SalePrice': preds})

df_result_x_test['SalePrice'] = df_result_x_test['SalePrice'].astype(np.int64, copy=False)
df_result_x_test.to_csv('rf_grid_submission_4.csv', index=False)

In [None]:

#save datasets
#target['SalePrice'] = np.log(target['SalePrice'])   

df_train = pd.concat([df_train_id, df_train, target], axis=1)
df_train.to_csv('train_norm_3.csv', index=False, na_rep='NA')

test = pd.concat([test_id, test], axis=1)
test.to_csv('test_norm_3.csv', index=False, na_rep='NA')