In [1]:
import numpy as np 
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler,StandardScaler

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score

In [2]:
# Load the training set; the 'Id' column becomes the row index.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
data = pd.read_csv('train.csv', index_col=['Id'])

SOME FEATURE ENGINEERING TRICKS

In [3]:
# Cast MSSubClass to string so it is picked up with the other object-dtype
# (categorical) columns later on.
data['MSSubClass'] = data['MSSubClass'].astype(str)

fill_not_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
age_cols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
drop_cols = ['BsmtFinSF2', '2ndFlrSF', 'LowQualFinSF', 'EnclosedPorch',
             '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MoSold']

# TotAge = (YrSold - YearBuilt) + (YrSold - YearRemodAdd) + (YrSold - GarageYrBlt).
# A missing GarageYrBlt makes TotAge NaN; it is listed in change_to_mean below.
data['TotAge'] = sum(data['YrSold'] - data[year_col] for year_col in age_cols)
data.drop(columns=age_cols, inplace=True)

# Replace the 'Id' index with a plain 0..n-1 range.
data.reset_index(drop=True, inplace=True)

# Work on a copy so `data` keeps the engineered-but-unimputed frame.
X_full = data.copy()

# Imputation plan, by strategy.
change_to_mean = ['TotAge', 'LotFrontage']
change_to_zero = ['MasVnrArea']
cat_change_to_none = ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
cat_to_most_frequent = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
]

IMPUTING

In [4]:

# NOTE: every fill is written back with `X_full[col] = ...`. The previous
# `X_full[col].fillna(..., inplace=True)` form operates on a temporary Series;
# it emits a FutureWarning on pandas 2.x and silently does nothing once
# Copy-on-Write is enabled.

# Numeric columns where a missing value is replaced by 0.
for col in change_to_zero:
    X_full[col] = X_full[col].fillna(0)

# Numeric columns filled with the column mean; the mean used is remembered
# per column in col_val_mean.
col_val_mean = {}
for col in change_to_mean:
    col_val_mean[col] = X_full[col].mean()
    X_full[col] = X_full[col].fillna(col_val_mean[col])

# Categorical columns where a missing value becomes the literal category 'None'.
for col in cat_change_to_none:
    X_full[col] = X_full[col].fillna('None')

# Categorical columns filled with their most frequent value; remembered per
# column in col_val_frequent.
col_val_frequent = {}
for col in cat_to_most_frequent:
    col_val_frequent[col] = X_full[col].mode()[0]
    X_full[col] = X_full[col].fillna(col_val_frequent[col])

# Column groups by dtype, taken after imputation.
cat_cols = [col for col in X_full.columns if X_full[col].dtype == 'object']
num_cols = [col for col in X_full.columns if X_full[col].dtype in ['float64', 'int64']]

CUSTOM ENCODER 
- It encodes each category label according to its average sale price (the target): the higher the mean price, the higher the encoded number.

In [5]:
class Enc():
    """Ordinal encoder that ranks categories by the mean of a target column.

    For every fitted column, the categories are sorted by the mean target value
    of their rows (ascending) and numbered 1, 2, 3, ...; the number is written
    to a new ``<column>_E`` column. Frames are modified in place and also
    returned for convenience.

    Attributes set by ``fit_transform``:
        dict_list: one ``{category: rank}`` dict per fitted column, in fit order.
        new_feat_names: the ``<column>_E`` names, in fit order.
        train, train_columns: the arguments of the last ``fit_transform`` call.
    Attributes set by ``transform``:
        test, test_cols: the arguments of the last ``transform`` call.
    """

    def __init__(self, target='SalePrice'):
        """Create an unfitted encoder.

        Args:
            target: name of the column whose per-category mean defines the
                ordering. Defaults to 'SalePrice'.
        """
        self.target = target
        self.dict_list = []
        self.new_feat_names = []
        # Same mappings as dict_list, keyed by column name, so that transform
        # does not depend on the order of the columns it is given.
        self._maps_by_col = {}

    def fit_transform(self, train, train_columns):
        """Learn the category ranks from ``train`` and add the encoded columns.

        Args:
            train: DataFrame containing ``train_columns`` and the target column.
                It is modified in place.
            train_columns: iterable of column names to encode.

        Returns:
            The same ``train`` object, now holding one ``<column>_E`` float
            column per encoded column.
        """
        self.train = train
        self.train_columns = train_columns
        self.dict_list = []
        self.new_feat_names = []
        self._maps_by_col = {}

        for col in train_columns:
            # sort=False keeps groups in order of first appearance, and the
            # stable sort then breaks ties in the mean by that same order.
            # groupby ignores NaN keys, so missing values get no rank.
            means = (train.groupby(col, sort=False)[self.target]
                     .mean()
                     .sort_values(kind='stable'))
            mapping = {cat: rank for rank, cat in enumerate(means.index, start=1)}

            self.dict_list.append(mapping)
            self._maps_by_col[col] = mapping
            self.new_feat_names.append(col + '_E')

            # Values without a rank (NaN) stay NaN; float dtype keeps that possible.
            train[col + '_E'] = train[col].map(mapping).astype(float)

        return train

    def transform(self, test, test_cols):
        """Add encoded columns to ``test`` using the ranks learnt in fit_transform.

        Mappings are looked up by column name. If a name was not seen during
        fitting, the mapping at the same position in ``dict_list`` is used.
        Categories that were not present during fitting become NaN.

        Args:
            test: DataFrame containing ``test_cols``. It is modified in place.
            test_cols: sequence of column names to encode.

        Returns:
            The same ``test`` object with the ``<column>_E`` columns added.

        Raises:
            KeyError: if a column has neither a by-name nor a positional mapping.
        """
        self.test = test
        self.test_cols = test_cols

        for i, col in enumerate(test_cols):
            if col in self._maps_by_col:
                mapping = self._maps_by_col[col]
            elif i < len(self.dict_list):
                mapping = self.dict_list[i]
            else:
                raise KeyError(
                    "no fitted mapping for column {!r}; call fit_transform first".format(col))
            test[col + '_E'] = test[col].map(mapping).astype(float)

        return test

    def show_dic(self):
        """Return the list of ``{category: rank}`` dicts, one per fitted column."""
        return self.dict_list

In [6]:
# Fit the target-mean encoder on the training frame. fit_transform writes one
# '<col>_E' column per categorical column straight into X_full.
train_encode = Enc()
train_encoded = train_encode.fit_transform(X_full, cat_cols)

# The raw string columns are dropped once their encoded counterparts exist.
X_full.drop(columns=cat_cols, inplace=True)

DROPPING OUTLIERS

In [7]:
# Upper bound per feature; a row that exceeds any single bound is removed.
outlier_limits = {
    'LotFrontage': 200,
    'LotArea': 10**5,
    'MasVnrArea': 1200,
    'BsmtFinSF1': 3000,
    'BsmtUnfSF': 4000,
    '1stFlrSF': 3000,
    'GrLivArea': 4500,
    'GarageArea': 1200,
    'OpenPorchSF': 450,
    'EnclosedPorch': 400,
}

outlier_mask = pd.Series(False, index=X_full.index)
for feature, upper in outlier_limits.items():
    outlier_mask = outlier_mask | (X_full[feature] > upper)

X_full.drop(index=X_full.index[outlier_mask.to_numpy()], inplace=True)

# Combined floor area of the first and second floors.
X_full['TotSF'] = X_full['1stFlrSF'] + X_full['2ndFlrSF']

FEATURE ENGINEERING AGAIN

In [8]:
# Binary flags: 1 when the source measurement is strictly positive, else 0.
# No flag is created for 3SsnPorch or PoolArea.
indicator_sources = {
    '2f_ind': '2ndFlrSF',
    'Garage_ind': 'GarageArea',
    'WoodDeck_ind': 'WoodDeckSF',
    'OpenPorch_ind': 'OpenPorchSF',
    'EnclosedPorch_ind': 'EnclosedPorch',
    'ScreenPorch_ind': 'ScreenPorch',
    'MiscVal_ind': 'MiscVal',
}
for flag_name, source_col in indicator_sources.items():
    X_full[flag_name] = (X_full[source_col] > 0).astype('int64')

# Columns removed from the feature set after the derived features exist.
X_full.drop(
    columns=['1stFlrSF', '2ndFlrSF', 'LandContour_E', 'Utilities_E', 'BsmtHalfBath'],
    inplace=True,
)

TRAIN/VALIDATION SPLIT and XGB MODEL TRAINING (with hyperparameters tuned beforehand)

In [9]:
# Separate the target from the features; pop removes the column from X_full.
y = X_full.pop('SalePrice')

# 80/20 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=1
)

model = XGBRegressor(n_estimators=350, learning_rate=0.05, random_state=1)
model.fit(X_train, y_train)

# Mean absolute error on the hold-out part.
preds = model.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
print("error", mae)

# 5-fold cross-validated MAE on the training part. The scorer returns negated
# errors, so the sign is flipped back.
cross_val_scores = -cross_val_score(
    model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error'
)
crss_val_score = cross_val_scores.mean()

print('cross_val error', crss_val_score)

error 15452.891476535468
cross_val error 14290.04499882364


TEST DATA PREDICTION 
- The test set goes through the same preprocessing steps as the training set. The only difference is the encoding: here we call just the "transform" method of the already fitted Enc instance instead of "fit_transform".

In [10]:
# --- Load -------------------------------------------------------------------
# test_data_raw is kept untouched: its 'Id' index is needed for the submission.
# An explicit copy guarantees the edits below cannot reach it.
test_data_raw = pd.read_csv('test.csv', index_col=['Id'])
test_data = test_data_raw.copy()

# --- Feature engineering (mirrors the training cell) --------------------------
test_data['MSSubClass'] = test_data['MSSubClass'].astype(str)

age_cols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
# TotAge = (YrSold - YearBuilt) + (YrSold - YearRemodAdd) + (YrSold - GarageYrBlt)
test_data['TotAge'] = sum(test_data['YrSold'] - test_data[year_col] for year_col in age_cols)
test_data.drop(columns=age_cols, inplace=True)
test_data.reset_index(drop=True, inplace=True)

test_full = test_data.copy()

# --- Imputation ---------------------------------------------------------------
# Fills are assigned back (`frame[col] = frame[col].fillna(...)`); calling
# fillna(..., inplace=True) on `frame[col]` acts on a temporary and is a no-op
# under pandas Copy-on-Write.
# Means and most-frequent values come from the TRAINING data (col_val_mean /
# col_val_frequent filled in the training imputation cell), so the test set is
# preprocessed with the same values the model was trained on and the training
# dictionaries are not overwritten.
test_full['MasVnrArea'] = test_full['MasVnrArea'].fillna(0)

for col in change_to_mean:
    test_full[col] = test_full[col].fillna(col_val_mean[col])

for col in cat_change_to_none:
    test_full[col] = test_full[col].fillna('None')

for col in cat_to_most_frequent:
    test_full[col] = test_full[col].fillna(col_val_frequent[col])

# Encode exactly the columns the encoder was fitted on, in the same order.
cat_cols = list(train_encode.train_columns)
num_cols = [col for col in test_full.columns if test_full[col].dtype in ['float64', 'int64']]


def fill_missing_with_mode(frame):
    """Fill, in place, every column of `frame` that still has NaNs with that column's mode."""
    for name in frame.columns[frame.isnull().any()]:
        frame[name] = frame[name].fillna(frame[name].mode()[0])


# Columns not covered by the explicit rules above may still contain NaNs.
fill_missing_with_mode(test_full)

# --- Encoding -----------------------------------------------------------------
test_encoded = train_encode.transform(test_full, cat_cols)
test_full.drop(columns=cat_cols, inplace=True)

# Categories the encoder did not see during fitting come out as NaN.
fill_missing_with_mode(test_full)

# --- Derived features (mirrors the training cells) ----------------------------
test_full['TotSF'] = test_full['1stFlrSF'] + test_full['2ndFlrSF']

# Binary flags: 1 when the source measurement is strictly positive, else 0.
for flag_name, source_col in [
    ('2f_ind', '2ndFlrSF'),
    ('Garage_ind', 'GarageArea'),
    ('WoodDeck_ind', 'WoodDeckSF'),
    ('OpenPorch_ind', 'OpenPorchSF'),
    ('EnclosedPorch_ind', 'EnclosedPorch'),
    ('ScreenPorch_ind', 'ScreenPorch'),
    ('MiscVal_ind', 'MiscVal'),
]:
    test_full[flag_name] = (test_full[source_col] > 0).astype('int64')

test_full.drop(
    columns=['1stFlrSF', '2ndFlrSF', 'LandContour_E', 'Utilities_E', 'BsmtHalfBath'],
    inplace=True,
)

APPLYING THE MODEL (PREDICTION)

In [11]:
# Select the test columns in the exact order the model was trained on, so the
# prediction does not rely on the two preprocessing pipelines happening to
# produce identical column orders. A missing feature raises a KeyError here.
test_pred = model.predict(test_full[X_train.columns])

PREDICTIONS EXPORT

In [12]:
# Pair each original test 'Id' with its predicted price and write the CSV
# without the positional index.
sub = pd.DataFrame({
    'Id': test_data_raw.index,
    'SalePrice': test_pred,
})
sub.to_csv('XGB_submission.csv', index=False)

In [13]:
# Display the submission frame (the cell's last expression is rendered as output).
sub

Unnamed: 0,Id,SalePrice
0,1461,125210.281250
1,1462,161066.046875
2,1463,185555.156250
3,1464,195264.578125
4,1465,196461.656250
...,...,...
1454,2915,85024.906250
1455,2916,86633.617188
1456,2917,170497.515625
1457,2918,117955.601562
