<span style="color:darkblue"><font size="5"> DeCockHousePrice Dataset: SalePrice Prediction</font></span> 
    
**All together**

# Data

In [4]:
df=pd.read_csv('house_price.csv')

In [5]:
df.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,SaleType_COD,SaleType_New,SaleType_Other,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,5,7,5,2003,2003,196.0,4,3,...,0,0,0,1,0,0,0,0,1,0
1,80.0,9600,7,6,8,1976,1976,0.0,3,3,...,0,0,0,1,0,0,0,0,1,0
2,68.0,11250,5,7,5,2001,2002,162.0,4,3,...,0,0,0,1,0,0,0,0,1,0
3,60.0,9550,6,7,5,1915,1970,0.0,3,3,...,0,0,0,1,1,0,0,0,0,0
4,84.0,14260,10,8,5,2000,2000,350.0,4,3,...,0,0,0,1,0,0,0,0,1,0


In [21]:
X=df.drop('SalePrice',axis=1).values
y=df.loc[:,'SalePrice'].values
y=np.array([1 if i>163000 else 0 for i in y])

# broadly Model Selection

In [75]:
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier,AdaBoostClassifier,ExtraTreesClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from pipelinehelper import PipelineHelper
from sklearn.model_selection import GridSearchCV,cross_val_score,RandomizedSearchCV

In [109]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X,y,random_state=0)
print(X_trainval.shape)
y_trainval[0]

(1095, 212)

In [111]:
seedNum=42
models=[('tree',DecisionTreeClassifier(random_state=seedNum)),
        ('abc',AdaBoostClassifier(random_state=seedNum)),\
        ('rfc',RandomForestClassifier(random_state=seedNum)),
        ('etc',ExtraTreesClassifier(random_state=seedNum)),
        ('xgbc',XGBClassifier(random_state=seedNum)),
        ('gbc', GradientBoostingClassifier(random_state=seedNum))]

In [112]:
results=[]
names=[]
metrics=[]
for name, model in models:
    cv_results=cross_val_score(model,X_trainval,y_trainval,cv=5)
    results.append(cv_results)
    metrics.append(cv_results.mean())
    
    print('%s: %f\n'%(name,cv_results.mean()))
best_model=models[metrics.index(max(metrics))]
print(f'The Best Model is:\n {best_model}')

tree: 0.871233

abc: 0.899543

rfc: 0.925114

etc: 0.916895

xgbc: 0.915068

gbc: 0.918721

The Best Model is:
 ('rfc', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False))


In [123]:
# searchching for best parameters
pipe = Pipeline([('classifier', \
 PipelineHelper([('rfc',RandomForestClassifier(n_jobs=-1)),('xgbc', XGBClassifier(learning_rate=0.02,\
                                             n_estimators=100, objective='binary:logistic',nthread=4))]))])
search_space = [{'classifier':[XGBClassifier(learning_rate=0.02,\
                                             n_estimators=100, objective='binary:logistic',nthread=4)],
               'classifier__min_child_weight': [1, 5, 10],
                 'classifier__max_depth' : [3,5,7],
                  'classifier__subsample': [0.6, 0.8, 1.0],
                'classifier__gamma': [0.5, 1, 1.5, 2, 5],
                 'classifier__colsample_bytree': [0.6, 0.8, 1.0]
                },
                 {'classifier': [RandomForestClassifier(n_jobs=-1)],
                 'classifier__max_depth': [10, 30, None],
                 'classifier__min_samples_leaf': [1, 2, 4],
                 'classifier__n_estimators': [10, 100,200]}]
# Create grid search 
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)
best_model = clf.fit(X_trainval, y_trainval)

In [125]:
best_model.best_estimator_.get_params()['classifier']

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [126]:
print(f'Average accuracy of Cross Validation: {best_model.best_score_}')

Average accuracy of Cross Validation: 0.9242009132420093


In [132]:
print(f'Performance on unseen test set: {best_model.score(X_test,y_test)}')

Performance on unseen test set: 0.9424657534246575


# Compared with the other models
Performance of **RandomForestClassifier** on unseen data:
    $accuracy=94.25$%

Performance of **LogisticRegression** on unseen data: 
    $accuracy=91.78$%

Performance of **SVC** on unseen data: 
    $accuracy=91.78$%

- **Performance of RandomForestClassifier is better than LogisticRegression and SVM**


- **Bias still exists**

# VotingClassifier

   **Could VotingClassifier do better?** 

In [128]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

In [131]:
gbc=GradientBoostingClassifier()
param_grid = {'n_estimators': [100, 250, 200],
                 'max_depth' : [3,5,7],
                  'subsample': [0.6, 0.8, 1.0],
             'min_samples_split':[2,3,4],
    'min_samples_leaf':[1,2,3]}
grid_search=GridSearchCV(gbc,param_grid,cv=5,n_jobs=-1)
grid_search.fit(X_trainval,y_trainval)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.6}
0.9223744292237444


In [143]:
grid_search.score(X_test,y_test)

0.9287671232876712

In [155]:
estimators = [ 
    ('rfc',RandomForestClassifier(max_depth=10, min_samples_leaf=2,n_estimators=200)),
    ('gbc',GradientBoostingClassifier(max_depth=3, subsample=0.6,min_samples_leaf=3, \
                                      min_samples_split=2, n_estimators=100,)),
    ('etc',ExtraTreesClassifier(random_state=seedNum)),
        ('xgbc',XGBClassifier(random_state=seedNum))]
vc_hard = VotingClassifier(estimators=estimators, n_jobs=-1,voting='hard')
vc_soft = VotingClassifier(estimators=estimators, n_jobs=-1,voting='soft')
vc_hard.fit(X_trainval,y_trainval)
vc_soft.fit(X_trainval,y_trainval)

In [157]:
print(f'accuracy of hard voting: {vc_hard.score(X_test,y_test)}')
print(f'accuracy of soft voting: {vc_soft.score(X_test,y_test)}')

accuracy of hard voting: 0.9315068493150684
accuracy of soft voting: 0.9397260273972603


In [159]:
print('Hard Voting:')
for est,name in zip(vc_hard.estimators_,vc_hard.estimators):
    print (name[0], est.score(X_test,y_test))
print('\nSoft Voting:')
for est,name in zip(vc_soft.estimators_,vc_soft.estimators):
    print (name[0], est.score(X_test,y_test))   


Hard Voting:
rfc 0.9452054794520548
gbc 0.9342465753424658
etc 0.9315068493150684
xgbc 0.936986301369863

Soft Voting:
rfc 0.9506849315068493
gbc 0.9315068493150684
etc 0.9315068493150684
xgbc 0.936986301369863


# Comparing VotingClassifier with RandomForestClassifier
Performance of **RandomForestClassifier** on unseen data:
    $accuracy=94.25$%
    
    
Performance of **Soft VotingClassifier** on unseen data:
    $accuracy=93.97$%
   