# importing libraries

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
%matplotlib inline

# importing dataset

In [15]:
df = pd.read_csv('train.csv')
df_copy = df.copy()

In [29]:
df_copy.sort_values(by='YrSold',ascending=True,ignore_index=True,inplace=True)
df_copy;


X,Y = df_copy.drop('SalePrice',axis=1),df_copy.SalePrice
train_opt = (df_copy['YrSold']!=2010)
X_train,Y_train = X[train_opt],Y[train_opt]
valid_opt = (df_copy['YrSold']==2010)
X_valid,Y_valid = X[valid_opt],Y[valid_opt]

# preprocessing dataset for modelling

In [17]:
for column,contents in df_copy.items():
    if pd.api.types.is_numeric_dtype(contents):
        if pd.isnull(contents).sum():
            df_copy[column +'_is_missing'] = pd.isnull(contents)
            df_copy[column] = contents.fillna(contents.median())
            for column,contents in df_copy.items():
                if pd.api.types.is_string_dtype(contents):
                    df_copy[column +'_is_missing'] = pd.isnull(contents)
                    df_copy[column] = pd.Categorical(contents).codes +1

In [19]:
df_copy

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,GarageQual_is_missing,GarageCond_is_missing,PavedDrive_is_missing,PoolQC_is_missing,Fence_is_missing,MiscFeature_is_missing,SaleType_is_missing,SaleCondition_is_missing,MasVnrArea_is_missing,GarageYrBlt_is_missing
0,782,60,4,65.0,7153,2,0,4,4,1,...,False,False,False,True,True,True,False,False,False,False
1,1220,160,5,21.0,1680,2,0,4,4,1,...,True,True,False,True,True,True,False,False,False,True
2,282,20,2,60.0,7200,2,2,4,4,1,...,False,False,False,True,True,True,False,False,False,False
3,570,90,4,69.0,7032,2,0,1,4,1,...,False,False,False,True,True,True,False,False,False,False
4,1221,20,4,66.0,7800,2,0,1,4,1,...,False,False,False,True,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,354,30,5,60.0,8520,2,0,4,4,1,...,False,False,False,True,True,True,False,False,False,False
1456,1323,60,4,107.0,10186,2,0,1,4,1,...,False,False,False,True,True,True,False,False,False,False
1457,163,20,4,95.0,12182,2,0,4,4,1,...,False,False,False,True,True,True,False,False,False,False
1458,159,60,2,100.0,12552,2,0,4,4,1,...,False,False,False,True,False,True,False,False,False,False


In [23]:
pd.api.types.is_categorical_dtype(df_copy.Street)

False

# modeling

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error

n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]
max_features = [0.5,1,'auto','sqrt']
max_depth = [int(x) for x in np.linspace(10,110,num=11)]
max_depth.append(None)
min_sample_split = [2,5,10]
min_sample_leaf = [1,2,4]
bootstrap = [True,False]

grid = {'n_estimators':n_estimators,
        'max_features':max_features,
        'max_depth':max_depth,
        'min_sample_split':min_sample_split,
        'min_sample_leaf':min_sample_leaf,
        'bootstrap':bootstrap}
#print(grid)
RFR = RandomForestRegressor()

RandomCV = RandomizedSearchCV(estimator=RFR,
                              param_distributions=grid,
                              n_iter=100,
                              cv=5,
                              random_state=45,
                              n_jobs=-1,
                              verbose=True)

GridCV = GridSearchCV(estimator=RFR,
                      cv=5,
                      n_jobs=-1,
                      param_grid=grid,
                      verbose=True)

def RMSLE(y_true,y_pred):
    '''
    returns root mean square log error
    '''
    return np.sqrt(mean_squared_log_error(y_true,y_pred))
    
def MAE(y_true,y_pred):
    '''
    returns mean absolute error
    '''
    return mean_absolute_error(y_true,y_pred)

In [25]:
model_score = {}

def fit_prediction_and_score(X_train,Y_train,X_valid,Y_valid,model):
    model.fit(X_train,Y_train)
    score = {'train_MAE':MAE(Y_train,model.predict(X_train)),
             'valid_MAE':MAE(Y_valid,model.predict(X_valid)),
             'train_RMSLE':RMSLE(Y_train,model.predict(X_train)),
             'valid_RMSLE':RMSLE(Y_train,model.predict(X_train)),
             'train_score':model.score(X_train,Y_train),
             'valid_score':model.score(X_valid,Y_valid)
            }
    model_score = score
    return model_score

In [46]:
fit_prediction_and_score(X_train,Y_train,X_valid,Y_valid,model=
                        GridCV)

Fitting 5 folds for each of 8640 candidates, totalling 43200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter min_sample_leaf for estimator RandomForestRegressor(max_depth=10, max_features=0.5). Check the list of available parameters with `estimator.get_params().keys()`.

In [11]:
RFR.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [41]:
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
mask = (yhat!=-1).astype(int)
Y_train = Y_train[mask]
X_train = X_train[mask,]

MemoryError: Unable to allocate 396. MiB for an array with shape (51940000,) and data type int64

In [30]:
X_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,GarageQual_is_missing,GarageCond_is_missing,PavedDrive_is_missing,PoolQC_is_missing,Fence_is_missing,MiscFeature_is_missing,SaleType_is_missing,SaleCondition_is_missing,MasVnrArea_is_missing,GarageYrBlt_is_missing
0,782,60,4,65.0,7153,2,0,4,4,1,...,False,False,False,True,True,True,False,False,False,False
1,558,50,1,60.0,11040,2,0,4,3,1,...,False,False,False,True,True,True,False,False,False,False
2,1220,160,5,21.0,1680,2,0,4,4,1,...,True,True,False,True,True,True,False,False,False,True
3,282,20,2,60.0,7200,2,2,4,4,1,...,False,False,False,True,True,True,False,False,False,False
4,570,90,4,69.0,7032,2,0,1,4,1,...,False,False,False,True,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1280,1001,20,4,74.0,10206,2,0,4,4,1,...,False,False,False,True,True,True,False,False,False,False
1281,1008,160,5,21.0,2217,2,0,4,4,1,...,False,False,False,True,True,True,False,False,False,False
1282,440,50,4,67.0,12354,2,1,4,4,1,...,False,False,False,True,False,False,False,False,False,False
1283,438,45,5,50.0,6000,2,0,4,4,1,...,False,False,False,True,True,True,False,False,False,False


In [37]:
X_train.shape

(1285, 126)

In [38]:
X_train.size

161910

In [40]:
1285*127

163195