In [1]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

try:
    # scikit-learn >= 0.20 ships the imputer here (the old class was removed in 0.22)
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Fallback for older scikit-learn releases
    from sklearn.preprocessing import Imputer

%matplotlib inline



# Data load

In [2]:
# Load the training data (path is relative to the notebook's working directory).
train_df = pd.read_csv('../dat/train.csv')
print(train_df.shape)
# Preview the first three rows.
train_df.head(3)

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [3]:
# List every column name in the training frame.
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
# Keep the target aside, then remove it so train and test share the same columns.
# NOTE: this cell is not re-runnable on its own -- after one run 'SalePrice' is gone from train_df.
y = train_df['SalePrice']
train_df = train_df.drop(labels=['SalePrice'], axis=1)
# Number of training rows.
train_idx = train_df.shape[0]

In [6]:
# Load the test data and confirm it has as many columns as the (target-free) training frame.
test_df = pd.read_csv('../dat/test.csv')
print(test_df.shape)
test_df.shape[1] == train_df.shape[1]

(1459, 80)


True

In [8]:
# Stack train and test rows into one frame. The row index is not reset, so labels
# from both frames repeat. NOTE(review): `df` is not used by any later cell visible here.
df = pd.concat([train_df, test_df], axis=0)
df.shape

(2919, 80)

# Missing values

In [None]:
# Number of columns with nulls
def check_nulls(df):
    """Print the per-column null counts and return how many columns contain nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    int
        Number of columns that have at least one missing value.
    """
    null_counts = df.isnull().sum()
    print(null_counts)
    # Series.nonzero() was removed in pandas 1.0; a boolean mask works on every version.
    return int((null_counts > 0).sum())

In [None]:
# Report how many columns of the training frame contain nulls.
null_column_count = check_nulls(train_df)
print('Number of colomns with nulls: ', null_column_count)

## Fill in missing values for object-type columns:

In [None]:
# Keep only the object-dtype (categorical/string) columns.
train_df_obj = train_df.select_dtypes(include=['object'])
print(train_df_obj.shape)


In [None]:
# Report how many object-dtype columns contain nulls before filling.
null_column_count = check_nulls(train_df_obj)
print('Number of colomns with nulls: ', null_column_count)

## Filling everything with "None" is not always a good approach. Sometimes there is a specific default value per column.

In [None]:
# Replace every missing categorical entry with the literal string "None".
train_df_obj = train_df_obj.fillna("None")

In [None]:
# Confirm that no object-dtype column contains nulls after filling.
null_column_count = check_nulls(train_df_obj)
print('Number of colomns with nulls: ', null_column_count)

## Numerical missing values handling:

In [None]:
# Imputer built with its default settings; it is fitted further down on the numeric training columns.
# NOTE(review): the default is presumably mean imputation -- confirm for the installed scikit-learn.
imp = Imputer()
# Keep only the non-object (numeric) columns.
train_df_non_obj = train_df.select_dtypes(exclude=['object'])
print(train_df_non_obj.shape)

## You must drop the target column

In [None]:
# Make sure the target is not among the features to impute/transform.
# 'SalePrice' has already been removed from train_df in the data-load section, so a
# plain drop raises KeyError on a fresh run; errors='ignore' makes this a safe no-op.
train_df_non_obj = train_df_non_obj.drop('SalePrice', axis=1, errors='ignore')
print(train_df_non_obj.shape)

In [None]:
# Report how many numeric columns contain nulls before imputation.
null_column_count = check_nulls(train_df_non_obj)
print('Number of colomns with nulls: ', null_column_count)

In [None]:
# Fit the imputer on the numeric training columns and fill their gaps.
# fit_transform returns a bare array, so wrap it back into a DataFrame
# to keep the original row index and column names.
imputed_values = imp.fit_transform(train_df_non_obj)
train_df_non_obj = pd.DataFrame(
    imputed_values,
    index=train_df_non_obj.index,
    columns=train_df_non_obj.columns,
)


In [None]:
# Confirm that no numeric column contains nulls after imputation.
null_column_count = check_nulls(train_df_non_obj)
print('Number of colomns with nulls: ', null_column_count)

## Log-transform the skewed columns

In [None]:

# Apply log1p to skewed numeric features to reduce the influence of outliers.
# Idea taken from Alexandru Papiu's script:
# https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
# Rule of thumb: |skewness| > 0.5 counts as at least moderately skewed.
# NOTE: re-running this cell applies the transform a second time.
skewness = train_df_non_obj.apply(skew)
skewness = skewness[skewness.abs() > 0.5]
skewed_features = skewness.index
print(str(len(skewed_features)) + " skewed numerical features to log transform")
train_df_non_obj[skewed_features] = np.log1p(train_df_non_obj[skewed_features])

## Now put the two transforms together

In [None]:
# Recombine the numeric and categorical parts and put the target back.
# The target was split off into `y` in the data-load section and no longer exists
# on train_df, so `train_df.SalePrice` would raise AttributeError here; `y` is a
# Series named 'SalePrice' on the same row index, so it lines up column-wise.
train_df = pd.concat([train_df_non_obj, train_df_obj, y], axis=1)

In [None]:
# Shape after recombining the numeric part, the categorical part and the target.
train_df.shape

In [None]:
# Confirm that the recombined training frame contains no nulls.
null_column_count = check_nulls(train_df)
print('Number of colomns with nulls: ', null_column_count)

# Handle categorical features via one-hot encoding (OHE)

In [None]:
# One-hot encode the categorical (object) columns; get_dummies replaces each of them
# with one indicator column per category value.
train_df = pd.get_dummies(train_df)

In [None]:
# Preview the first three rows of the encoded frame.
train_df.head(3)

In [None]:
# Shape after one-hot encoding.
train_df.shape

# Why did the number of columns increase?

In [None]:
# Print every column name of the encoded frame, one per line.
for column_name in list(train_df.columns):
    print(column_name)

Because each categorical column is now expanded into one column per category value: PavedDrive --> PavedDrive_N, PavedDrive_P, PavedDrive_Y, each of which holds only 1 or 0.

Note that if you do the same after filling NA with "None", you get more columns because of the extra _None category.
    
Another way to do it is LabelEncoder. But it is only appropriate for ordinal variables, not nominal categorical ones; otherwise the model may read larger codes as meaning a higher rank.

In [None]:
# Inspect the dtypes after encoding to check that no object columns are left.
train_df.dtypes # No objects

# Target variable

In [None]:
# Put the target on the log1p scale for the official scoring.
# NOTE: re-running this cell applies the transform a second time.
train_df['SalePrice'] = np.log1p(train_df['SalePrice'])


# Train test split

In [None]:
# Separate the (log-scaled) target from the feature matrix.
Y = train_df['SalePrice']
X = train_df.drop(labels=['SalePrice'], axis=1)

In [None]:
# Hold out 25% of the rows for validation.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
# A fixed random_state makes the split (and every metric below) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y.values, test_size=0.25, random_state=42)

In [None]:
# Shapes of the full data and of each split, in the order X, Y, X_train, Y_train, X_test, Y_test.
for part in (X, Y, X_train, Y_train, X_test, Y_test):
    print(part.shape)


In [None]:


# Gradient-boosted regressor: up to 1000 trees with a learning rate of 0.05.
XGB = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# Pass verbose=False to fit() to avoid printing the evaluation metric after every boosting round.
# NOTE(review): the hold-out split is also the early-stopping set, so metrics computed on it
# afterwards are optimistic. Newer xgboost releases may expect early_stopping_rounds on the
# constructor instead of fit() -- confirm against the installed version.
XGB.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=True, early_stopping_rounds=5)

In [None]:
# Predict on the hold-out split and report the errors.
# The target is on the log1p scale, so these are errors in log-price.
predictions = XGB.predict(X_test)

mae = mean_absolute_error(Y_test, predictions)
mse = mean_squared_error(Y_test, predictions)
print("Mean Absolute Error : " + str(mae))
print("Mean Squared Error : " + str(mse))
print("Root Mean Squared Error : " + str(np.sqrt(mse)))

# Submit

In [None]:
# Reload the raw test set so preprocessing starts from the untouched file.
test_df = pd.read_csv('../dat/test.csv')
print(test_df.shape)


In [None]:
# Print per-column null counts for the raw test set; the cell output is the number of columns with nulls.
check_nulls(test_df)

In [None]:
# Apply the training-time preprocessing to the test set.
test_df_obj = test_df.select_dtypes(include=['object']).fillna("None")
test_df_non_obj = test_df.select_dtypes(exclude=['object'])
# Reuse the imputer exactly as it was fitted on the raw numeric training columns.
# Do NOT re-fit it on train_df_non_obj here: by now that frame is already imputed and
# log-transformed, so its column statistics are on the wrong scale for raw test values.
test_df_non_obj = pd.DataFrame(imp.transform(test_df_non_obj), index=test_df_non_obj.index, columns=test_df_non_obj.columns)

# Log-transform the columns that were found to be skewed on train (not re-detected on test).
test_df_non_obj[skewed_features] = np.log1p(test_df_non_obj[skewed_features])

test_df = pd.concat([test_df_non_obj, test_df_obj], axis=1)

# One-hot encode like the training frame, then align to the training feature columns:
# categories unseen in test become all-zero columns, categories unseen in train are dropped,
# and the column order matches the matrix the model was fitted on.
test_df = pd.get_dummies(test_df).reindex(columns=X.columns, fill_value=0)

In [None]:
# Shape of the preprocessed test frame.
test_df.shape

In [None]:
# Re-check nulls on the preprocessed test frame; the cell output is the number of columns with nulls.
check_nulls(test_df)

In [None]:
#test_df.shape

In [None]:
# make predictions
#predictions = XGB.predict(test_df)


