In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import time
import statistics

In [2]:
from sklearn import linear_model, metrics, model_selection, preprocessing

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
from scipy import sparse

In [4]:
AmesClean = pd.read_csv('AmesCleanDataSet.csv')
AmesClean.drop(['Unnamed: 0'],axis=1,inplace=True)
print('Ames clean dataset has size of :' + str(AmesClean.shape))

Ames clean dataset has size of :(1156, 59)


In [5]:
np.sum(np.sum(AmesClean.isna()))

0

In [6]:
AmesClean['salePriceCat'] = pd.cut(AmesClean.SalePrice,bins=10,labels = range(10)).values

In [7]:
categorical_features = ['MS_SubClass','MS_Zoning','Lot_Shape','Land_Contour','Lot_Config','Land_Slope']
categorical_features.extend(['Neighborhood','Condition_1','Bldg_Type','House_Style','Roof_Style'])
categorical_features.extend(['Mas_Vnr_Type','Exter_Qual','Exter_Cond','Foundation','Bsmt_Qual','Bsmt_Exposure'])
categorical_features.extend(['basement_type','Heating_QC','Garage_Finish','Mo_Sold','Sale_Type','Sale_Condition','Kitchen_Qual'])
categorical_features.extend(['exterior','Fireplace_Qu','Garage_Type','Garage_Qual'])
Ordinal_featues = ['Overall_Qual','Overall_Cond']
Continous_features = ['Lot_Frontage','Lot_Area','age','remodeled_age','Mas_Vnr_Area','basement_area']
Continous_features.extend(['Bsmt_Unf_SF','Total_Bsmt_SF','1st_Flr_SF','2nd_Flr_SF','Low_Qual_Fin_SF'])
Continous_features.extend(['Gr_Liv_Area','Bsmt_Full_Bath','Bsmt_Half_Bath','Full_Bath','Half_Bath'])
Continous_features.extend(['Bedroom_AbvGr','Kitchen_AbvGr','TotRms_AbvGrd','Fireplaces'])               
Continous_features.extend(['Garage_Cars','Garage_Area'])
Continous_features.extend(['Wood_Deck_SF','Open_Porch_SF','Enclosed_Porch','Screen_Porch','Pool_Area'])
Continous_features.extend(['SalePrice','garage_age'])

In [8]:
AmesCleanFinal = AmesClean.drop(categorical_features,axis=1)
AmesCleanFinalSP = sparse.csr_matrix(AmesCleanFinal.values)
AmesCleanColSP = AmesCleanFinal.columns

for ifeature in categorical_features:
    
    temp_df = pd.get_dummies(AmesClean.loc[:,ifeature],prefix=ifeature,prefix_sep='_')
    AmesCleanColSP = list(AmesCleanColSP) + list(temp_df.columns)
    temp_data_sp = sparse.csr_matrix(temp_df.values)
    AmesCleanFinalSP = sparse.hstack([AmesCleanFinalSP,temp_data_sp])
    
AmesCleanDumCleanSP = pd.DataFrame.sparse.from_spmatrix(AmesCleanFinalSP, columns = AmesCleanColSP)

In [9]:
print('The size of cleaned datanase in sparse mode is :' + str(AmesCleanDumCleanSP.shape))

The size of cleaned datanase in sparse mode is :(1156, 241)


In [10]:
X_sp = AmesCleanDumCleanSP.drop(['SalePrice','salePriceCat'],axis=1)
Y_sp = AmesCleanDumCleanSP.SalePrice

In [11]:
AmesCleanFinal = AmesClean.drop(categorical_features,axis=1)
AmesCleanFinalData = AmesCleanFinal.values
AmesCleanCol = AmesCleanFinal.columns

for ifeature in categorical_features:
    
    temp_df = pd.get_dummies(AmesClean.loc[:,ifeature],prefix=ifeature,prefix_sep='_')
    AmesCleanCol = list(AmesCleanCol) + list(temp_df.columns)
    temp_data = temp_df.values
    AmesCleanFinalData = np.hstack([AmesCleanFinalData,temp_data])
    
AmesCleanDumClean = pd.DataFrame(AmesCleanFinalData, columns = AmesCleanCol)

In [12]:
X = AmesCleanDumClean.drop(['SalePrice','salePriceCat'],axis=1)
Y = AmesCleanDumClean.SalePrice

In [13]:
import sys
print(sys.getsizeof(X)/sys.getsizeof(X_sp))

3.211096390302861


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
X_train_sp, X_test_sp, Y_train_sp, Y_test_sp = train_test_split(X_sp,Y_sp,test_size = 0.33,stratify = AmesCleanDumCleanSP.salePriceCat)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.33,stratify = AmesCleanDumClean.salePriceCat)

In [17]:
random_forest_model = RandomForestRegressor()

In [18]:
random_forest_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [20]:
random_forest_model.set_params(n_jobs = -1,oob_score = True, random_state = 144)

depth_list = np.arange(10,40,10)
estimator_list = [int(10**i) for i in np.arange(2,4,1)]
feature_list = np.arange(5,20,5)

random_forest_params = {'max_depth':depth_list,'max_features':feature_list,'n_estimators':estimator_list}
grid = GridSearchCV(random_forest_model,random_forest_params,cv=5,return_train_score=True)

In [21]:
%time grid.fit(X_train,Y_train)

CPU times: total: 1min 25s
Wall time: 1min 20s


GridSearchCV(cv=5,
             estimator=RandomForestRegressor(n_jobs=-1, oob_score=True,
                                             random_state=144),
             param_grid={'max_depth': array([10, 20, 30]),
                         'max_features': array([ 5, 10, 15]),
                         'n_estimators': [100, 1000]},
             return_train_score=True)

In [22]:
random_forest_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': True,
 'random_state': 144,
 'verbose': 0,
 'warm_start': False}

In [23]:
pd.DataFrame(grid.cv_results_).to_csv('GridSearchResult.csv')

In [24]:
grid.best_estimator_

RandomForestRegressor(max_depth=20, max_features=15, n_estimators=1000,
                      n_jobs=-1, oob_score=True, random_state=144)

In [25]:
df =pd.DataFrame(grid.cv_results_)

In [26]:
df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_max_features', 'param_n_estimators', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'split0_train_score',
       'split1_train_score', 'split2_train_score', 'split3_train_score',
       'split4_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')

In [28]:
df.loc[:,['param_max_depth','param_max_features','param_n_estimators','mean_test_score','mean_train_score','std_test_score',
          'std_train_score']]

Unnamed: 0,param_max_depth,param_max_features,param_n_estimators,mean_test_score,mean_train_score,std_test_score,std_train_score
0,10,5,100,0.799128,0.934519,0.0377,0.002906
1,10,5,1000,0.801454,0.937031,0.032371,0.002541
2,10,10,100,0.831915,0.962288,0.041777,0.00237
3,10,10,1000,0.833716,0.962775,0.040748,0.002315
4,10,15,100,0.845942,0.969762,0.046001,0.001323
5,10,15,1000,0.847454,0.970299,0.045034,0.002193
6,20,5,100,0.814609,0.973331,0.033754,0.001582
7,20,5,1000,0.818609,0.974905,0.033475,0.001283
8,20,10,100,0.835978,0.977731,0.042623,0.001889
9,20,10,1000,0.838909,0.978087,0.040982,0.001457


In [29]:
min_diff = np.abs(df.mean_test_score.values-df.mean_train_score.values)
min_index = np.argmin(df.mean_test_score.values-df.mean_train_score.values)
df.iloc[min_index,:]

mean_fit_time                                                  0.239961
std_fit_time                                                   0.014151
mean_score_time                                                0.026745
std_score_time                                                  0.00233
param_max_depth                                                      20
param_max_features                                                    5
param_n_estimators                                                  100
params                {'max_depth': 20, 'max_features': 5, 'n_estima...
split0_test_score                                              0.761097
split1_test_score                                              0.806396
split2_test_score                                              0.816932
split3_test_score                                              0.821997
split4_test_score                                              0.866625
mean_test_score                                                0

In [35]:
max_depth     = 100
max_features  = 5
max_estimator = 100

random_forest_model.set_params(n_jobs = -1,oob_score = True, random_state = 144, max_depth = max_depth,
                              max_features = max_features, n_estimators = max_estimator)

sample_split = np.arange(1,40,5)

random_forest_params = {'min_samples_split':sample_split}
grid = GridSearchCV(random_forest_model,random_forest_params,cv=5,return_train_score=True)
%time grid.fit(X_train,Y_train)

CPU times: total: 9.08 s
Wall time: 22.8 s


5 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 436, in _process_worker
    r = call_item()
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 288, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 

GridSearchCV(cv=5,
             estimator=RandomForestRegressor(max_depth=100, max_features=5,
                                             n_jobs=-1, oob_score=True,
                                             random_state=144),
             param_grid={'min_samples_split': array([ 1,  6, 11, 16, 21, 26, 31, 36])},
             return_train_score=True)

In [33]:
df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_min_samples_split', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score', 'split0_train_score', 'split1_train_score',
       'split2_train_score', 'split3_train_score', 'split4_train_score',
       'mean_train_score', 'std_train_score'],
      dtype='object')

In [36]:
df = pd.DataFrame(grid.cv_results_)
df.loc[:,['param_min_samples_split','mean_test_score','mean_train_score','std_test_score',
          'std_train_score']]

Unnamed: 0,param_min_samples_split,mean_test_score,mean_train_score,std_test_score,std_train_score
0,1,,,,
1,6,0.804927,0.9228,0.033737,0.00209
2,11,0.790417,0.887598,0.035363,0.00353
3,16,0.779025,0.860106,0.031914,0.007801
4,21,0.770651,0.841383,0.035122,0.003268
5,26,0.755378,0.817928,0.034518,0.005437
6,31,0.746673,0.804655,0.033233,0.007477
7,36,0.732828,0.786903,0.033503,0.008063


Based on the above observation, I can conclude that "min sample per split" plays a critical role for overfitting

In [37]:
import warnings
warnings.filterwarnings("ignore")

In [38]:
random_forest_model.set_params(n_jobs = -1,oob_score = True, random_state = 144)

depth_list = np.arange(10,40,10)
estimator_list = [int(10**i) for i in np.arange(2,4,1)]
feature_list = np.arange(5,20,5)
sample_split = np.arange(1,40,5)
min_samples_leaf = np.arange(5,40,5)

random_forest_params = {'max_depth':depth_list,'max_features':feature_list,'n_estimators':estimator_list,
                        'min_samples_split':sample_split,'min_samples_leaf':min_samples_leaf}
grid = GridSearchCV(random_forest_model,random_forest_params,cv=5,return_train_score=True)

In [39]:
%time grid.fit(X_train,Y_train)

CPU times: total: 55min 29s
Wall time: 1h 27min 44s


GridSearchCV(cv=5,
             estimator=RandomForestRegressor(max_depth=100, max_features=5,
                                             n_jobs=-1, oob_score=True,
                                             random_state=144),
             param_grid={'max_depth': array([10, 20, 30]),
                         'max_features': array([ 5, 10, 15]),
                         'min_samples_leaf': array([ 5, 10, 15, 20, 25, 30, 35]),
                         'min_samples_split': array([ 1,  6, 11, 16, 21, 26, 31, 36]),
                         'n_estimators': [100, 1000]},
             return_train_score=True)

In [40]:
pd.DataFrame(grid.cv_results_).to_csv('GridSearchResultFull.csv')

In [42]:
pd.DataFrame(grid.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_max_features', 'param_min_samples_leaf',
       'param_min_samples_split', 'param_n_estimators', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'split0_train_score',
       'split1_train_score', 'split2_train_score', 'split3_train_score',
       'split4_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')

In [43]:
pd.DataFrame(grid.cv_results_).loc[:,['param_max_depth','param_max_features','param_min_samples_leaf',
                                     'param_min_samples_split','param_n_estimators','mean_train_score','mean_test_score']]

Unnamed: 0,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,mean_train_score,mean_test_score
0,10,5,5,1,100,,
1,10,5,5,1,1000,,
2,10,5,5,6,100,0.774589,0.728419
3,10,5,5,6,1000,0.777718,0.728014
4,10,5,5,11,100,0.771909,0.725699
...,...,...,...,...,...,...,...
1003,30,15,35,26,1000,0.719794,0.703115
1004,30,15,35,31,100,0.722562,0.706230
1005,30,15,35,31,1000,0.719794,0.703115
1006,30,15,35,36,100,0.722562,0.706230


In [45]:
print(np.max(pd.DataFrame(grid.cv_results_).loc[:,'mean_train_score']))
print(np.max(pd.DataFrame(grid.cv_results_).loc[:,'mean_test_score']))

0.8825426153267039
0.8210643754367337


In [46]:
df = pd.DataFrame(grid.cv_results_)
index_max_train = np.argmax(df.loc[:,'mean_train_score'].values)
index_max_test  = np.argmax(df.loc[:,'mean_test_score'].values)
print(str(np.max(df.loc[:,'mean_train_score'])) + str(df.loc[index_max_train,'mean_test_score']))
print(str(np.max(df.loc[:,'mean_test_score'])) + str(df.loc[index_max_test,'mean_train_score']))

0.8825426153267039nan
0.8210643754367337nan


In [None]:
grid_feature_importance = zip(X_train.columns,grid.best_estimator_.feature_importances_)
grid_feature_importance = sorted(grid_feature_importance,reverse=True,key = lambda x : x[1])
pd.DataFrame(grid_feature_importance,columns=['Feature','Importance']).to_csv('GridsearchFeatureImportance.csv')

In [None]:
pd.DataFrame(grid_feature_importance,columns=['Feature','Importance']).iloc[1:10,:]

In [None]:
df = pd.DataFrame(grid_feature_importance,columns=['Feature','Importance'])
feature_picked = df.loc[df['Importance'] > 0.02,'Feature'].values

In [None]:
feature_picked

In [None]:
imputation_error = pd.read_csv('KNNImputation.csv')

In [None]:
plt.figure(figsize=(12,8))
for ifeature in feature_picked:
    try:
        plt.plot(imputation_error.index.values,imputation_error.loc[:,ifeature].values,label = ifeature)
    except:
        continue
ax = plt.gca()
ax.set_xticks(np.arange(1, max(imputation_error.index.values),2),fontsize = 18)
ax.set_yticks(np.arange(-40, 81,20),fontsize = 18)
plt.grid()
plt.xlabel('# of neighborhoods',fontsize = 24)
plt.ylabel('Accuracy',fontsize = 24)
plt.legend(loc='upper right')
plt.show()

In [None]:
grid.best_estimator_.score(X_test,Y_test)

In [None]:
grid.best_estimator_.score(X_train,Y_train)

In [None]:
grid.best_params_

In [None]:
tree_mode_vif = RandomForestRegressor()
tree_mode_vif.set_params(max_depth = 20, max_features = 15, n_estimators = 1000)

In [None]:
vif_randomForest = {}
for ifeature in X_train.columns:
    X_temp = X_train.drop(ifeature,axis=1)
    Y_temp = X_train.loc[:,ifeature]
    tree_mode_vif.fit(X_temp,Y_temp)
    vif_randomForest[ifeature] = 1/(1 - tree_mode_vif.score(X_temp, Y_temp)**2 + 0.0001)
    #print('Size of data are :' + str(X_temp.shape))

In [None]:
plt.figure(figsize=(12,8))
ax = plt.gca()
plt.bar(list(vif_randomForest.keys()),list(vif_randomForest.values()))
plt.xlabel('Feature Name :')
plt.ylabel('VIF')
ax.tick_params(rotation=90.0)
plt.show()

In [None]:
val = list(vif_randomForest.values())
key = list(vif_randomForest.keys())
test_features = [key[i] for i in range(len(val)) if val[i] > 50.]

In [None]:
key[0]

In [None]:
print(grid.best_estimator_.score(X_test,Y_test))
print(grid.best_estimator_.score(X_train,Y_train))

In [None]:
threshold = 50
val = list(vif_randomForest.values())
key = list(vif_randomForest.keys())
test_features = [key[i] for i in range(len(val)) if val[i] > threshold]

X_train_vif = X_train.drop(test_features,axis=1)
X_test_vif = X_test.drop(test_features,axis=1)

%time grid.fit(X_train_vif,Y_train)

print(grid.best_estimator_.score(X_test_vif,Y_test))
print(grid.best_estimator_.score(X_train_vif,Y_train))

In [None]:
df = pd.DataFrame(grid.cv_results_)
np.min(df.mean_train_score.values-df.mean_test_score.values)

In [None]:
n_feature = 0

In [None]:
for ith in np.arange(10,10000,50):
    
    val = list(vif_randomForest.values())
    key = list(vif_randomForest.keys())
    test_features = [key[i] for i in range(len(val)) if val[i] > ith]
    if (len(test_features) == n_feature) :
        continue
    else:
        n_feature = len(test_features)
    X_train_vif = X_train.drop(test_features,axis=1)
    X_test_vif = X_test.drop(test_features,axis=1)
    grid.fit(X_train_vif,Y_train)
    df = pd.DataFrame(grid.cv_results_)
    min_diff = np.min(df.mean_train_score.values-df.mean_test_score.values)
    min_index = np.argmin(df.mean_train_score.values-df.mean_test_score.values)
    test_score = df.mean_test_score.values[min_index]
    train_score = df.mean_train_score.values[min_index]
    print(str(ith) + '/' + str(min_diff) + '/' + str(test_score)+'/'+str(train_score))